1
2 package org.galagosearch.core.parse;
3
4 import java.io.File;
5 import java.io.IOException;
6 import java.util.HashSet;
7 import java.util.List;
8 import java.util.Set;
9 import org.galagosearch.tupleflow.IncompatibleProcessorException;
10 import org.galagosearch.tupleflow.InputClass;
11 import org.galagosearch.tupleflow.Linkage;
12 import org.galagosearch.tupleflow.NullProcessor;
13 import org.galagosearch.tupleflow.OutputClass;
14 import org.galagosearch.tupleflow.Processor;
15 import org.galagosearch.tupleflow.Source;
16 import org.galagosearch.tupleflow.Step;
17 import org.galagosearch.tupleflow.TupleFlowParameters;
18 import org.galagosearch.tupleflow.Utility;
19 import org.galagosearch.tupleflow.execution.ErrorHandler;
20
21 /***
22 * WordFilter filters out unnecessary words from documents. Typically this object
23 * takes a stopword list as parameters and removes all the listed words. However,
24 * this can also be used to keep only the specified list of words in the index, which
25 * can be used to create an index that is tailored for only a small set
26 * of experimental queries.
27 *
28 * @author trevor
29 */
30 @InputClass(className = "org.galagosearch.core.parse.Document")
31 @OutputClass(className = "org.galagosearch.core.parse.Document")
32 public class WordFilter implements Processor<Document>, Source<Document> {
33 Set<String> stopwords = new HashSet<String>();
34 boolean keepListWords = false;
35 public Processor<Document> processor = new NullProcessor(Document.class);
36
37 public WordFilter(HashSet<String> words) {
38 stopwords = words;
39 }
40
41 public WordFilter(TupleFlowParameters params) throws IOException {
42 if (params.getXML().containsKey("filename")) {
43 String filename = params.getXML().get("filename");
44 stopwords = Utility.readFileToStringSet(new File(filename));
45 } else {
46 stopwords = new HashSet(params.getXML().stringList("word"));
47 }
48
49 keepListWords = params.getXML().get("keepListWords", false);
50 }
51
52 public void process(Document document) throws IOException {
53 List<String> words = document.terms;
54
55 for (int i = 0; i < words.size(); i++) {
56 String word = words.get(i);
57 boolean wordInList = stopwords.contains(word);
58 boolean removeWord = wordInList != keepListWords;
59
60 if (removeWord) {
61 words.set(i, null);
62 }
63 }
64
65 processor.process(document);
66 }
67
68 public void close() throws IOException {
69 processor.close();
70 }
71
72 public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
73 if (parameters.getXML().containsKey("filename")) {
74 return;
75 }
76 if (parameters.getXML().stringList("word").size() == 0) {
77 handler.addWarning("Couldn't find any words in the stopword list.");
78 }
79 }
80
81 public void setProcessor(Step processor) throws IncompatibleProcessorException {
82 Linkage.link(this, processor);
83 }
84 }