View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.retrieval.structured;
3   
4   import java.io.IOException;
5   import java.util.Arrays;
6   import java.util.HashSet;
7   import org.galagosearch.core.index.StructuredIndex;
8   import org.galagosearch.tupleflow.Parameters;
9   
10  /***
11   *
12   * @author trevor
13   */
14  public class StatisticsGatherer {
15      CountIterator iterator;
16      long documentCount;
17      long termCount;
18      long collectionLength;
19      double averageDocumentLength;
20      HashSet<String> required;
21  
22      public StatisticsGatherer(CountIterator iterator) {
23          this.iterator = iterator;
24          this.documentCount = 0;
25          this.termCount = 0;
26          this.collectionLength = 0;
27          this.averageDocumentLength = 0;
28          this.required = new HashSet<String>();
29      }
30  
31      public StatisticsGatherer(StructuredIndex index, CountIterator iterator, String[] required) {
32          this(iterator);
33  
34          this.required.addAll(Arrays.asList(required));
35          collectionLength = index.getCollectionLength();
36          averageDocumentLength = (double) index.getCollectionLength() / (double) index.
37                  getDocumentCount();
38      }
39  
40      public void run() throws IOException {
41          while (!iterator.isDone()) {
42              documentCount += 1;
43              termCount += iterator.count();
44  
45              iterator.nextDocument();
46          }
47      }
48  
49      public long getDocumentCount() {
50          return documentCount;
51      }
52  
53      public long getTermCount() {
54          return termCount;
55      }
56  
57      public double getAverageDocumentLength() {
58          return averageDocumentLength;
59      }
60  
61      public long getCollectionLength() {
62          return collectionLength;
63      }
64  
65      public double getCollectionProbability() {
66          return Math.max((double) termCount, 0.5) / (double) collectionLength;
67      }
68  
69      public void store(Parameters p) {
70          if (!p.containsKey("collectionLength")) {
71              p.add("collectionLength", Long.toString(
72                                                        getCollectionLength()));
73          }
74          if (!p.containsKey("averageDocumentLength")) {
75              p.add("averageDocumentLength", Double.toString(
76                                                             getAverageDocumentLength()));
77          }
78          if (!p.containsKey("termCount")) {
79              p.add("termCount", Long.toString(getTermCount()));
80          }
81          if (!p.containsKey("documentCount")) {
82              p.add("documentCount",
83                                                     Long.toString(getDocumentCount()));
84          }
85          if (!p.containsKey("collectionProbability")) {
86              p.add("collectionProbability", Double.toString(
87                                                             getCollectionProbability()));
88          }
89      }
90  }