1
2
3 package org.galagosearch.core.parse;
4
5 import org.galagosearch.tupleflow.IncompatibleProcessorException;
6 import org.galagosearch.tupleflow.InputClass;
7 import org.galagosearch.tupleflow.Linkage;
8 import org.galagosearch.tupleflow.NullProcessor;
9 import org.galagosearch.tupleflow.OutputClass;
10 import org.galagosearch.tupleflow.Processor;
11 import org.galagosearch.tupleflow.Source;
12 import org.galagosearch.tupleflow.Step;
13 import org.galagosearch.tupleflow.execution.Verified;
14 import java.io.IOException;
15 import java.util.HashMap;
16 import java.util.List;
17 import org.tartarus.snowball.ext.englishStemmer;
18
19 /***
20 *
21 * @author trevor
22 */
23 @Verified
24 @InputClass(className = "org.galagosearch.core.parse.Document")
25 @OutputClass(className = "org.galagosearch.core.parse.Document")
26 public class Porter2Stemmer implements Processor<Document>, Source<Document> {
27 englishStemmer stemmer = new englishStemmer();
28 HashMap<String, String> cache = new HashMap();
29 public Processor<Document> processor = new NullProcessor(Document.class);
30
31 public void process(Document document) throws IOException {
32 List<String> words = document.terms;
33
34 for (int i = 0; i < words.size(); i++) {
35 String word = words.get(i);
36
37 if (word != null) {
38 if (cache.containsKey(word)) {
39 words.set(i, cache.get(word));
40 } else {
41 stemmer.setCurrent(word);
42 if (stemmer.stem()) {
43 String stem = stemmer.getCurrent();
44 words.set(i, stem);
45 cache.put(word, stem);
46 } else {
47 cache.put(word, word);
48 }
49 }
50
51 if (cache.size() > 50000) {
52 cache.clear();
53 }
54 }
55 }
56
57 processor.process(document);
58 }
59
60 public void setProcessor(Step processor) throws IncompatibleProcessorException {
61 Linkage.link(this, processor);
62 }
63
64 public void close() throws IOException {
65 processor.close();
66 }
67 }