View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.parse;
4   
5   import org.galagosearch.tupleflow.IncompatibleProcessorException;
6   import org.galagosearch.tupleflow.InputClass;
7   import org.galagosearch.tupleflow.Linkage;
8   import org.galagosearch.tupleflow.NullProcessor;
9   import org.galagosearch.tupleflow.OutputClass;
10  import org.galagosearch.tupleflow.Processor;
11  import org.galagosearch.tupleflow.Source;
12  import org.galagosearch.tupleflow.Step;
13  import org.galagosearch.tupleflow.execution.Verified;
14  import java.io.IOException;
15  import java.util.HashMap;
16  import java.util.List;
17  import org.tartarus.snowball.ext.englishStemmer;
18  
19  /***
20   *
21   * @author trevor
22   */
23  @Verified
24  @InputClass(className = "org.galagosearch.core.parse.Document")
25  @OutputClass(className = "org.galagosearch.core.parse.Document")
26  public class Porter2Stemmer implements Processor<Document>, Source<Document> {
27      englishStemmer stemmer = new englishStemmer();
28      HashMap<String, String> cache = new HashMap();
29      public Processor<Document> processor = new NullProcessor(Document.class);
30  
31      public void process(Document document) throws IOException {
32          List<String> words = document.terms;
33  
34          for (int i = 0; i < words.size(); i++) {
35              String word = words.get(i);
36  
37              if (word != null) {
38                  if (cache.containsKey(word)) {
39                      words.set(i, cache.get(word));
40                  } else {
41                      stemmer.setCurrent(word);
42                      if (stemmer.stem()) {
43                          String stem = stemmer.getCurrent();
44                          words.set(i, stem);
45                          cache.put(word, stem);
46                      } else {
47                          cache.put(word, word);
48                      }
49                  }
50  
51                  if (cache.size() > 50000) {
52                      cache.clear();
53                  }
54              }
55          }
56  
57          processor.process(document);
58      }
59  
60      public void setProcessor(Step processor) throws IncompatibleProcessorException {
61          Linkage.link(this, processor);
62      }
63  
64      public void close() throws IOException {
65          processor.close();
66      }
67  }