1
2 package org.galagosearch.core.scoring;
3
4 import java.io.IOException;
5 import java.util.HashMap;
6 import org.galagosearch.core.types.DocumentLengthWordCount;
7 import org.galagosearch.core.types.DocumentWordProbability;
8 import org.galagosearch.core.types.WordProbability;
9 import org.galagosearch.tupleflow.StandardStep;
10 import org.galagosearch.tupleflow.TupleFlowParameters;
11 import org.galagosearch.tupleflow.TypeReader;
12 import org.galagosearch.tupleflow.Utility;
13 import org.galagosearch.tupleflow.execution.ErrorHandler;
14 import org.galagosearch.tupleflow.execution.Verification;
15
16 /***
17 *
18 * @author trevor
19 */
20 public class DirichletSmoother extends StandardStep<DocumentLengthWordCount, DocumentWordProbability>
21 implements DistributionSmoother {
22 double mu;
23 HashMap<String, Double> backgrounds;
24
25 public DirichletSmoother(double mu, HashMap<String, Double> backgrounds) {
26 this.mu = mu;
27 this.backgrounds = backgrounds;
28 }
29
30 public DirichletSmoother(TupleFlowParameters parameters) throws IOException {
31 this.mu = parameters.getXML().get("mu", 1500);
32 TypeReader<WordProbability> backgroundReader = parameters.getTypeReader("background");
33 WordProbability backgroundObject = null;
34 this.backgrounds = new HashMap<String, Double>();
35
36 while ((backgroundObject = backgroundReader.read()) != null) {
37 backgrounds.put(backgroundObject.word, backgroundObject.probability);
38 }
39 }
40
41 public void process(DocumentLengthWordCount object) throws IOException {
42 double probability = smooth(object.word, object.count, object.length);
43 processor.process(new DocumentWordProbability(object.document,
44 Utility.makeBytes(object.word), probability));
45 }
46
47 public double smooth(double background, int count, int length) {
48 double numerator = count + mu * background;
49 double denominator = length + mu;
50
51 return numerator / denominator;
52 }
53
54 public double smooth(String word, int count, int length) {
55 Double background = backgrounds.get(word);
56 assert background != null : "Couldn't find " + word + " in backgrounds: " + backgrounds.size();
57 return smooth(background, count, length);
58 }
59
60 public Class<DocumentLengthWordCount> getInputClass() {
61 return DocumentLengthWordCount.class;
62 }
63
64 public Class<DocumentWordProbability> getOutputClass() {
65 return DocumentWordProbability.class;
66 }
67
68 public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
69 Verification.verifyTypeReader("background", WordProbability.class, parameters, handler);
70 }
71 }