| 1 | |
|
| 2 | |
package org.galagosearch.core.scoring; |
| 3 | |
|
| 4 | |
import java.io.IOException; |
| 5 | |
import org.galagosearch.core.retrieval.structured.CountIterator; |
| 6 | |
import org.galagosearch.core.retrieval.structured.RequiredStatistics; |
| 7 | |
import org.galagosearch.core.retrieval.structured.ScoringFunctionIterator; |
| 8 | |
import org.galagosearch.tupleflow.Parameters; |
| 9 | |
|
| 10 | |
|
| 11 | |
|
| 12 | |
|
| 13 | |
|
| 14 | |
@RequiredStatistics(statistics = {"collectionLength"}) |
| 15 | |
public class DirichletScorer extends ScoringFunctionIterator { |
| 16 | |
double background; |
| 17 | |
double mu; |
| 18 | |
|
| 19 | |
public DirichletScorer(Parameters parameters, CountIterator iterator) throws IOException { |
| 20 | 8 | super(iterator); |
| 21 | |
|
| 22 | 8 | mu = parameters.get("mu", 1500); |
| 23 | 8 | if (parameters.containsKey("collectionProbability")) { |
| 24 | 0 | background = parameters.get("collectionProbability", 0.0001); |
| 25 | |
} else { |
| 26 | 8 | long collectionLength = parameters.get("collectionLength", (long)0); |
| 27 | 8 | long count = 0; |
| 28 | |
|
| 29 | 36 | while (!iterator.isDone()) { |
| 30 | 28 | count += iterator.count(); |
| 31 | 28 | iterator.nextDocument(); |
| 32 | |
} |
| 33 | |
|
| 34 | 8 | background = (double)count / (double)collectionLength; |
| 35 | 8 | iterator.reset(); |
| 36 | |
} |
| 37 | 8 | } |
| 38 | |
|
| 39 | |
public double scoreCount(int count, int length) { |
| 40 | 40 | double numerator = count + mu * background; |
| 41 | 40 | double denominator = length + mu; |
| 42 | |
|
| 43 | 40 | return Math.log(numerator / denominator); |
| 44 | |
} |
| 45 | |
} |
| 46 | |
|