| 1 | |
|
| 2 | |
package org.galagosearch.core.parse; |
| 3 | |
|
| 4 | |
import java.io.File; |
| 5 | |
import java.io.IOException; |
| 6 | |
import java.util.ArrayList; |
| 7 | |
import java.util.HashMap; |
| 8 | |
import java.util.HashSet; |
| 9 | |
import java.util.List; |
| 10 | |
import org.galagosearch.core.types.WordCount; |
| 11 | |
import org.galagosearch.tupleflow.InputClass; |
| 12 | |
import org.galagosearch.tupleflow.OutputClass; |
| 13 | |
import org.galagosearch.tupleflow.Reducer; |
| 14 | |
import org.galagosearch.tupleflow.StandardStep; |
| 15 | |
import org.galagosearch.tupleflow.TupleFlowParameters; |
| 16 | |
import org.galagosearch.tupleflow.Utility; |
| 17 | |
import org.galagosearch.tupleflow.execution.Verified; |
| 18 | |
|
| 19 | |
|
| 20 | |
|
| 21 | |
|
| 22 | |
|
| 23 | |
@Verified |
| 24 | |
@InputClass(className = "org.galagosearch.core.parse.Document") |
| 25 | |
@OutputClass(className = "org.galagosearch.core.types.WordCount") |
| 26 | 0 | public class WordCounter extends StandardStep<Document, WordCount> implements Reducer<WordCount> { |
| 27 | 0 | int maxWidth = 1; |
| 28 | |
HashSet<String> filterWords; |
| 29 | |
|
| 30 | 0 | public WordCounter(TupleFlowParameters parameters) throws IOException { |
| 31 | 0 | maxWidth = (int) parameters.getXML().get("width", 1); |
| 32 | 0 | String filename = parameters.getXML().get("filter", (String) null); |
| 33 | 0 | if (filename != null) { |
| 34 | 0 | filterWords = Utility.readFileToStringSet(new File(filename)); |
| 35 | |
} else { |
| 36 | 0 | filterWords = null; |
| 37 | |
} |
| 38 | 0 | } |
| 39 | |
|
| 40 | |
public void process(Document document) throws IOException { |
| 41 | 0 | List<String> tokens = document.terms; |
| 42 | 0 | HashMap<String, WordCount> countObjects = new HashMap<String, WordCount>(); |
| 43 | |
|
| 44 | 0 | for (int i = 0; i < tokens.size(); i++) { |
| 45 | 0 | String token = tokens.get(i); |
| 46 | |
|
| 47 | 0 | if (token == null) { |
| 48 | 0 | continue; |
| 49 | |
} |
| 50 | 0 | updateCounts(token, countObjects); |
| 51 | |
|
| 52 | 0 | if (maxWidth > 1) { |
| 53 | 0 | StringBuilder builder = new StringBuilder(); |
| 54 | 0 | builder.append(token); |
| 55 | |
|
| 56 | 0 | int end = Math.min(i + maxWidth, tokens.size()); |
| 57 | 0 | for (int j = i + 1; j < end; j++) { |
| 58 | 0 | token = tokens.get(j); |
| 59 | |
|
| 60 | 0 | if (token == null) { |
| 61 | 0 | break; |
| 62 | |
} |
| 63 | 0 | builder.append(' '); |
| 64 | 0 | builder.append(token); |
| 65 | |
|
| 66 | 0 | updateCounts(builder.toString(), countObjects); |
| 67 | |
} |
| 68 | |
} |
| 69 | |
} |
| 70 | |
|
| 71 | 0 | for (WordCount count : countObjects.values()) { |
| 72 | 0 | assert count != null; |
| 73 | 0 | assert count.word != null; |
| 74 | 0 | processor.process(count); |
| 75 | |
} |
| 76 | 0 | } |
| 77 | |
|
| 78 | |
public ArrayList<WordCount> reduce(List<WordCount> input) throws IOException { |
| 79 | 0 | HashMap<String, WordCount> countObjects = new HashMap<String, WordCount>(input.size() / 5); |
| 80 | |
|
| 81 | 0 | for (WordCount wordCount : input) { |
| 82 | 0 | WordCount original = countObjects.get(wordCount.word); |
| 83 | |
|
| 84 | 0 | if (original == null) { |
| 85 | 0 | countObjects.put(wordCount.word, original); |
| 86 | |
} else { |
| 87 | 0 | original.documents += wordCount.documents; |
| 88 | 0 | original.count += wordCount.count; |
| 89 | |
} |
| 90 | 0 | } |
| 91 | |
|
| 92 | 0 | return new ArrayList<WordCount>(countObjects.values()); |
| 93 | |
} |
| 94 | |
|
| 95 | |
void updateCounts(String token, HashMap<String, WordCount> countObjects) { |
| 96 | 0 | WordCount wordCount = countObjects.get(token); |
| 97 | |
|
| 98 | 0 | if (filterWords != null && !filterWords.contains(token)) { |
| 99 | 0 | return; |
| 100 | |
} |
| 101 | 0 | if (wordCount != null) { |
| 102 | 0 | wordCount.count += 1; |
| 103 | |
} else { |
| 104 | 0 | wordCount = new WordCount(new String(token), 1, 1); |
| 105 | 0 | countObjects.put(token, wordCount); |
| 106 | |
} |
| 107 | 0 | } |
| 108 | |
} |