View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.scoring;
3   
4   import java.io.IOException;
5   import java.util.HashMap;
6   import org.galagosearch.core.types.DocumentLengthWordCount;
7   import org.galagosearch.core.types.DocumentWordProbability;
8   import org.galagosearch.core.types.WordProbability;
9   import org.galagosearch.tupleflow.StandardStep;
10  import org.galagosearch.tupleflow.TupleFlowParameters;
11  import org.galagosearch.tupleflow.TypeReader;
12  import org.galagosearch.tupleflow.Utility;
13  import org.galagosearch.tupleflow.execution.ErrorHandler;
14  import org.galagosearch.tupleflow.execution.Verification;
15  
16  /***
17   *
18   * @author trevor
19   */
20  public class DirichletSmoother extends StandardStep<DocumentLengthWordCount, DocumentWordProbability>
21          implements DistributionSmoother {
22      double mu;
23      HashMap<String, Double> backgrounds;
24  
25      public DirichletSmoother(double mu, HashMap<String, Double> backgrounds) {
26          this.mu = mu;
27          this.backgrounds = backgrounds;
28      }
29  
30      public DirichletSmoother(TupleFlowParameters parameters) throws IOException {
31          this.mu = parameters.getXML().get("mu", 1500);
32          TypeReader<WordProbability> backgroundReader = parameters.getTypeReader("background");
33          WordProbability backgroundObject = null;
34          this.backgrounds = new HashMap<String, Double>();
35  
36          while ((backgroundObject = backgroundReader.read()) != null) {
37              backgrounds.put(backgroundObject.word, backgroundObject.probability);
38          }
39      }
40  
41      public void process(DocumentLengthWordCount object) throws IOException {
42          double probability = smooth(object.word, object.count, object.length);
43          processor.process(new DocumentWordProbability(object.document,
44                                                        Utility.makeBytes(object.word), probability));
45      }
46  
47      public double smooth(double background, int count, int length) {
48          double numerator = count + mu * background;
49          double denominator = length + mu;
50  
51          return numerator / denominator;
52      }
53  
54      public double smooth(String word, int count, int length) {
55          Double background = backgrounds.get(word);
56          assert background != null : "Couldn't find " + word + " in backgrounds: " + backgrounds.size();
57          return smooth(background, count, length);
58      }
59  
60      public Class<DocumentLengthWordCount> getInputClass() {
61          return DocumentLengthWordCount.class;
62      }
63  
64      public Class<DocumentWordProbability> getOutputClass() {
65          return DocumentWordProbability.class;
66      }
67  
68      public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
69          Verification.verifyTypeReader("background", WordProbability.class, parameters, handler);
70      }
71  }