View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.parse;
3   
4   import java.io.File;
5   import java.io.IOException;
6   import java.util.HashSet;
7   import java.util.List;
8   import java.util.Set;
9   import org.galagosearch.tupleflow.IncompatibleProcessorException;
10  import org.galagosearch.tupleflow.InputClass;
11  import org.galagosearch.tupleflow.Linkage;
12  import org.galagosearch.tupleflow.NullProcessor;
13  import org.galagosearch.tupleflow.OutputClass;
14  import org.galagosearch.tupleflow.Processor;
15  import org.galagosearch.tupleflow.Source;
16  import org.galagosearch.tupleflow.Step;
17  import org.galagosearch.tupleflow.TupleFlowParameters;
18  import org.galagosearch.tupleflow.Utility;
19  import org.galagosearch.tupleflow.execution.ErrorHandler;
20  
21  /***
22   * WordFilter filters out unnecessary words from documents.  Typically this object
23   * takes a stopword list as parameters and removes all the listed words.  However, 
24   * this can also be used to keep only the specified list of words in the index, which
25   * can be used to create an index that is tailored for only a small set
26   * of experimental queries.
27   * 
28   * @author trevor
29   */
30  @InputClass(className = "org.galagosearch.core.parse.Document")
31  @OutputClass(className = "org.galagosearch.core.parse.Document")
32  public class WordFilter implements Processor<Document>, Source<Document> {
33      Set<String> stopwords = new HashSet<String>();
34      boolean keepListWords = false;
35      public Processor<Document> processor = new NullProcessor(Document.class);
36  
37      public WordFilter(HashSet<String> words) {
38          stopwords = words;
39      }
40  
41      public WordFilter(TupleFlowParameters params) throws IOException {
42          if (params.getXML().containsKey("filename")) {
43              String filename = params.getXML().get("filename");
44              stopwords = Utility.readFileToStringSet(new File(filename));
45          } else {
46              stopwords = new HashSet(params.getXML().stringList("word"));
47          }
48  
49          keepListWords = params.getXML().get("keepListWords", false);
50      }
51  
52      public void process(Document document) throws IOException {
53          List<String> words = document.terms;
54  
55          for (int i = 0; i < words.size(); i++) {
56              String word = words.get(i);
57              boolean wordInList = stopwords.contains(word);
58              boolean removeWord = wordInList != keepListWords;
59  
60              if (removeWord) {
61                  words.set(i, null);
62              }
63          }
64  
65          processor.process(document);
66      }
67  
68      public void close() throws IOException {
69          processor.close();
70      }
71  
72      public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
73          if (parameters.getXML().containsKey("filename")) {
74              return;
75          }
76          if (parameters.getXML().stringList("word").size() == 0) {
77              handler.addWarning("Couldn't find any words in the stopword list.");
78          }
79      }
80  
81      public void setProcessor(Step processor) throws IncompatibleProcessorException {
82          Linkage.link(this, processor);
83      }
84  }