View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.parse;
3   
4   import java.io.File;
5   import java.io.IOException;
6   import java.util.ArrayList;
7   import java.util.HashMap;
8   import java.util.HashSet;
9   import java.util.List;
10  import org.galagosearch.core.types.WordCount;
11  import org.galagosearch.tupleflow.InputClass;
12  import org.galagosearch.tupleflow.OutputClass;
13  import org.galagosearch.tupleflow.Reducer;
14  import org.galagosearch.tupleflow.StandardStep;
15  import org.galagosearch.tupleflow.TupleFlowParameters;
16  import org.galagosearch.tupleflow.Utility;
17  import org.galagosearch.tupleflow.execution.Verified;
18  
19  /***
20   *
21   * @author trevor
22   */
23  @Verified
24  @InputClass(className = "org.galagosearch.core.parse.Document")
25  @OutputClass(className = "org.galagosearch.core.types.WordCount")
26  public class WordCounter extends StandardStep<Document, WordCount> implements Reducer<WordCount> {
27      int maxWidth = 1;
28      HashSet<String> filterWords;
29  
30      public WordCounter(TupleFlowParameters parameters) throws IOException {
31          maxWidth = (int) parameters.getXML().get("width", 1);
32          String filename = parameters.getXML().get("filter", (String) null);
33          if (filename != null) {
34              filterWords = Utility.readFileToStringSet(new File(filename));
35          } else {
36              filterWords = null;
37          }
38      }
39  
40      public void process(Document document) throws IOException {
41          List<String> tokens = document.terms;
42          HashMap<String, WordCount> countObjects = new HashMap<String, WordCount>();
43  
44          for (int i = 0; i < tokens.size(); i++) {
45              String token = tokens.get(i);
46  
47              if (token == null) {
48                  continue;
49              }
50              updateCounts(token, countObjects);
51  
52              if (maxWidth > 1) {
53                  StringBuilder builder = new StringBuilder();
54                  builder.append(token);
55  
56                  int end = Math.min(i + maxWidth, tokens.size());
57                  for (int j = i + 1; j < end; j++) {
58                      token = tokens.get(j);
59  
60                      if (token == null) {
61                          break;
62                      }
63                      builder.append(' ');
64                      builder.append(token);
65  
66                      updateCounts(builder.toString(), countObjects);
67                  }
68              }
69          }
70  
71          for (WordCount count : countObjects.values()) {
72              assert count != null;
73              assert count.word != null;
74              processor.process(count);
75          }
76      }
77  
78      public ArrayList<WordCount> reduce(List<WordCount> input) throws IOException {
79          HashMap<String, WordCount> countObjects = new HashMap<String, WordCount>(input.size() / 5);
80  
81          for (WordCount wordCount : input) {
82              WordCount original = countObjects.get(wordCount.word);
83  
84              if (original == null) {
85                  countObjects.put(wordCount.word, original);
86              } else {
87                  original.documents += wordCount.documents;
88                  original.count += wordCount.count;
89              }
90          }
91  
92          return new ArrayList<WordCount>(countObjects.values());
93      }
94  
95      void updateCounts(String token, HashMap<String, WordCount> countObjects) {
96          WordCount wordCount = countObjects.get(token);
97  
98          if (filterWords != null && !filterWords.contains(token)) {
99              return;
100         }
101         if (wordCount != null) {
102             wordCount.count += 1;
103         } else {
104             wordCount = new WordCount(new String(token), 1, 1);
105             countObjects.put(token, wordCount);
106         }
107     }
108 }