1
2 package org.galagosearch.core.parse;
3
4 import java.io.File;
5 import java.io.IOException;
6 import java.util.ArrayList;
7 import java.util.HashMap;
8 import java.util.HashSet;
9 import java.util.List;
10 import org.galagosearch.core.types.WordCount;
11 import org.galagosearch.tupleflow.InputClass;
12 import org.galagosearch.tupleflow.OutputClass;
13 import org.galagosearch.tupleflow.Reducer;
14 import org.galagosearch.tupleflow.StandardStep;
15 import org.galagosearch.tupleflow.TupleFlowParameters;
16 import org.galagosearch.tupleflow.Utility;
17 import org.galagosearch.tupleflow.execution.Verified;
18
19 /***
20 *
21 * @author trevor
22 */
23 @Verified
24 @InputClass(className = "org.galagosearch.core.parse.Document")
25 @OutputClass(className = "org.galagosearch.core.types.WordCount")
26 public class WordCounter extends StandardStep<Document, WordCount> implements Reducer<WordCount> {
27 int maxWidth = 1;
28 HashSet<String> filterWords;
29
30 public WordCounter(TupleFlowParameters parameters) throws IOException {
31 maxWidth = (int) parameters.getXML().get("width", 1);
32 String filename = parameters.getXML().get("filter", (String) null);
33 if (filename != null) {
34 filterWords = Utility.readFileToStringSet(new File(filename));
35 } else {
36 filterWords = null;
37 }
38 }
39
40 public void process(Document document) throws IOException {
41 List<String> tokens = document.terms;
42 HashMap<String, WordCount> countObjects = new HashMap<String, WordCount>();
43
44 for (int i = 0; i < tokens.size(); i++) {
45 String token = tokens.get(i);
46
47 if (token == null) {
48 continue;
49 }
50 updateCounts(token, countObjects);
51
52 if (maxWidth > 1) {
53 StringBuilder builder = new StringBuilder();
54 builder.append(token);
55
56 int end = Math.min(i + maxWidth, tokens.size());
57 for (int j = i + 1; j < end; j++) {
58 token = tokens.get(j);
59
60 if (token == null) {
61 break;
62 }
63 builder.append(' ');
64 builder.append(token);
65
66 updateCounts(builder.toString(), countObjects);
67 }
68 }
69 }
70
71 for (WordCount count : countObjects.values()) {
72 assert count != null;
73 assert count.word != null;
74 processor.process(count);
75 }
76 }
77
78 public ArrayList<WordCount> reduce(List<WordCount> input) throws IOException {
79 HashMap<String, WordCount> countObjects = new HashMap<String, WordCount>(input.size() / 5);
80
81 for (WordCount wordCount : input) {
82 WordCount original = countObjects.get(wordCount.word);
83
84 if (original == null) {
85 countObjects.put(wordCount.word, original);
86 } else {
87 original.documents += wordCount.documents;
88 original.count += wordCount.count;
89 }
90 }
91
92 return new ArrayList<WordCount>(countObjects.values());
93 }
94
95 void updateCounts(String token, HashMap<String, WordCount> countObjects) {
96 WordCount wordCount = countObjects.get(token);
97
98 if (filterWords != null && !filterWords.contains(token)) {
99 return;
100 }
101 if (wordCount != null) {
102 wordCount.count += 1;
103 } else {
104 wordCount = new WordCount(new String(token), 1, 1);
105 countObjects.put(token, wordCount);
106 }
107 }
108 }