1
2 package org.galagosearch.core.retrieval.structured;
3
4 import java.io.IOException;
5 import java.util.Arrays;
6 import java.util.HashSet;
7 import org.galagosearch.core.index.StructuredIndex;
8 import org.galagosearch.tupleflow.Parameters;
9
10 /***
11 *
12 * @author trevor
13 */
14 public class StatisticsGatherer {
15 CountIterator iterator;
16 long documentCount;
17 long termCount;
18 long collectionLength;
19 double averageDocumentLength;
20 HashSet<String> required;
21
22 public StatisticsGatherer(CountIterator iterator) {
23 this.iterator = iterator;
24 this.documentCount = 0;
25 this.termCount = 0;
26 this.collectionLength = 0;
27 this.averageDocumentLength = 0;
28 this.required = new HashSet<String>();
29 }
30
31 public StatisticsGatherer(StructuredIndex index, CountIterator iterator, String[] required) {
32 this(iterator);
33
34 this.required.addAll(Arrays.asList(required));
35 collectionLength = index.getCollectionLength();
36 averageDocumentLength = (double) index.getCollectionLength() / (double) index.
37 getDocumentCount();
38 }
39
40 public void run() throws IOException {
41 while (!iterator.isDone()) {
42 documentCount += 1;
43 termCount += iterator.count();
44
45 iterator.nextDocument();
46 }
47 }
48
49 public long getDocumentCount() {
50 return documentCount;
51 }
52
53 public long getTermCount() {
54 return termCount;
55 }
56
57 public double getAverageDocumentLength() {
58 return averageDocumentLength;
59 }
60
61 public long getCollectionLength() {
62 return collectionLength;
63 }
64
65 public double getCollectionProbability() {
66 return Math.max((double) termCount, 0.5) / (double) collectionLength;
67 }
68
69 public void store(Parameters p) {
70 if (!p.containsKey("collectionLength")) {
71 p.add("collectionLength", Long.toString(
72 getCollectionLength()));
73 }
74 if (!p.containsKey("averageDocumentLength")) {
75 p.add("averageDocumentLength", Double.toString(
76 getAverageDocumentLength()));
77 }
78 if (!p.containsKey("termCount")) {
79 p.add("termCount", Long.toString(getTermCount()));
80 }
81 if (!p.containsKey("documentCount")) {
82 p.add("documentCount",
83 Long.toString(getDocumentCount()));
84 }
85 if (!p.containsKey("collectionProbability")) {
86 p.add("collectionProbability", Double.toString(
87 getCollectionProbability()));
88 }
89 }
90 }