Coverage Report - org.galagosearch.core.parse.WordFilter
 
Classes in this File Line Coverage Branch Coverage Complexity
WordFilter
0%
0/33
0%
0/12
0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 package org.galagosearch.core.parse;
 3  
 
 4  
 import java.io.File;
 5  
 import java.io.IOException;
 6  
 import java.util.HashSet;
 7  
 import java.util.List;
 8  
 import java.util.Set;
 9  
 import org.galagosearch.tupleflow.IncompatibleProcessorException;
 10  
 import org.galagosearch.tupleflow.InputClass;
 11  
 import org.galagosearch.tupleflow.Linkage;
 12  
 import org.galagosearch.tupleflow.NullProcessor;
 13  
 import org.galagosearch.tupleflow.OutputClass;
 14  
 import org.galagosearch.tupleflow.Processor;
 15  
 import org.galagosearch.tupleflow.Source;
 16  
 import org.galagosearch.tupleflow.Step;
 17  
 import org.galagosearch.tupleflow.TupleFlowParameters;
 18  
 import org.galagosearch.tupleflow.Utility;
 19  
 import org.galagosearch.tupleflow.execution.ErrorHandler;
 20  
 
 21  
 /**
 22  
  * WordFilter filters out unnecessary words from documents.  Typically this object
 23  
  * takes a stopword list as parameters and removes all the listed words.  However, 
 24  
  * this can also be used to keep only the specified list of words in the index, which
 25  
  * can be used to create an index that is tailored for only a small set
 26  
  * of experimental queries.
 27  
  * 
 28  
  * @author trevor
 29  
  */
 30  
 @InputClass(className = "org.galagosearch.core.parse.Document")
 31  
 @OutputClass(className = "org.galagosearch.core.parse.Document")
 32  0
 public class WordFilter implements Processor<Document>, Source<Document> {
 33  0
     Set<String> stopwords = new HashSet<String>();
 34  0
     boolean keepListWords = false;
 35  0
     public Processor<Document> processor = new NullProcessor(Document.class);
 36  
 
 37  0
     public WordFilter(HashSet<String> words) {
 38  0
         stopwords = words;
 39  0
     }
 40  
 
 41  0
     public WordFilter(TupleFlowParameters params) throws IOException {
 42  0
         if (params.getXML().containsKey("filename")) {
 43  0
             String filename = params.getXML().get("filename");
 44  0
             stopwords = Utility.readFileToStringSet(new File(filename));
 45  0
         } else {
 46  0
             stopwords = new HashSet(params.getXML().stringList("word"));
 47  
         }
 48  
 
 49  0
         keepListWords = params.getXML().get("keepListWords", false);
 50  0
     }
 51  
 
 52  
     public void process(Document document) throws IOException {
 53  0
         List<String> words = document.terms;
 54  
 
 55  0
         for (int i = 0; i < words.size(); i++) {
 56  0
             String word = words.get(i);
 57  0
             boolean wordInList = stopwords.contains(word);
 58  0
             boolean removeWord = wordInList != keepListWords;
 59  
 
 60  0
             if (removeWord) {
 61  0
                 words.set(i, null);
 62  
             }
 63  
         }
 64  
 
 65  0
         processor.process(document);
 66  0
     }
 67  
 
 68  
     public void close() throws IOException {
 69  0
         processor.close();
 70  0
     }
 71  
 
 72  
     public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
 73  0
         if (parameters.getXML().containsKey("filename")) {
 74  0
             return;
 75  
         }
 76  0
         if (parameters.getXML().stringList("word").size() == 0) {
 77  0
             handler.addWarning("Couldn't find any words in the stopword list.");
 78  
         }
 79  0
     }
 80  
 
 81  
     public void setProcessor(Step processor) throws IncompatibleProcessorException {
 82  0
         Linkage.link(this, processor);
 83  0
     }
 84  
 }