View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.index;
3   
4   import java.io.DataInput;
5   import java.io.FileNotFoundException;
6   import java.io.IOException;
7   import java.io.RandomAccessFile;
8   import java.util.HashMap;
9   import java.util.List;
10  import java.util.Map;
11  import org.galagosearch.core.parse.Document;
12  import org.galagosearch.core.index.IndexReader;
13  import org.galagosearch.core.retrieval.query.Node;
14  import org.galagosearch.core.retrieval.query.NodeType;
15  import org.galagosearch.core.retrieval.structured.CountIterator;
16  import org.galagosearch.core.retrieval.structured.ExtentIterator;
17  import org.galagosearch.core.retrieval.structured.IndexIterator;
18  import org.galagosearch.core.util.ExtentArray;
19  import org.galagosearch.tupleflow.BufferedFileDataStream;
20  import org.galagosearch.tupleflow.Processor;
21  import org.galagosearch.tupleflow.VByteInput;
22  
23  /***
24   * Reads a simple positions-based index, where each inverted list in the
25   * index contains both term count information and term position information.
26   * The term counts data is stored separately from term position information for
27   * faster query processing when no positions are needed.
28   * 
29   * For now, the iterator loads everything into memory before starting query 
30   * processing, which is not a workable solution for larger collections.
31   * 
32   * @author trevor
33   */
34  public class PositionIndexReader implements StructuredIndexPartReader {
35      public class Iterator extends ExtentIterator implements IndexIterator {
36          int documentCount;
37          int totalPositionCount;
38          VByteInput documents;
39          VByteInput counts;
40          VByteInput positions;
41          int documentIndex;
42          int currentDocument;
43          int currentCount;
44          ExtentArray extentArray;
45          IndexReader.Iterator iterator;
46  
47          Iterator(IndexReader.Iterator iterator) throws IOException {
48              this.iterator = iterator;
49              load();
50          }
51  
52          private void load() throws IOException {
53              long startPosition = iterator.getValueStart();
54              long endPosition = iterator.getValueEnd();
55  
56              RandomAccessFile input = reader.getInput();
57              input.seek(startPosition);
58              DataInput stream = new VByteInput(reader.getInput());
59  
60              int options = stream.readInt();
61              documentCount = stream.readInt();
62              totalPositionCount = stream.readInt();
63  
64              long documentByteLength = stream.readLong();
65              long countsByteLength = stream.readLong();
66              long positionsByteLength = stream.readLong();
67  
68              long documentStart = input.getFilePointer();
69              long documentEnd = documentStart + documentByteLength;
70  
71              long countsStart = documentEnd;
72              long countsEnd = countsStart + countsByteLength;
73  
74              long positionsStart = countsEnd;
75              long positionsEnd = positionsStart + positionsByteLength;
76  
77              assert positionsEnd == endPosition;
78  
79              // create streams for each kind of data
80              documents = new VByteInput(new BufferedFileDataStream(input, documentStart, documentEnd));
81              counts = new VByteInput(new BufferedFileDataStream(input, countsStart, countsEnd));
82              positions = new VByteInput(new BufferedFileDataStream(input, positionsStart, positionsEnd));
83  
84              extentArray = new ExtentArray();
85              documentIndex = 0;
86              loadExtents();
87          }
88  
89          private void loadExtents() throws IOException {
90              currentDocument += documents.readInt();
91              currentCount = counts.readInt();
92              extentArray.reset();
93  
94              int position = 0;
95              for (int i = 0; i < currentCount; i++) {
96                  position += positions.readInt();
97                  extentArray.add(currentDocument, position, position + 1);
98              }
99          }
100         
101         public String getRecordString() {
102             StringBuilder builder = new StringBuilder();
103             
104             builder.append(iterator.getKey());
105             builder.append(",");
106             builder.append(currentDocument);
107             for (int i = 0; i < extentArray.getPosition(); ++i) {
108                 builder.append(",");
109                 builder.append(extentArray.getBuffer()[i].begin);
110             }
111             
112             return builder.toString();
113         }
114 
115         public void reset() throws IOException {
116             currentDocument = 0;
117             currentCount = 0;
118             extentArray.reset();
119 
120             load();
121         }
122 
123         public long getByteLength() throws IOException {
124             return iterator.getValueLength();
125         }
126 
127         public String getCurrentTerm() throws IOException {
128             return iterator.getKey();
129         }
130 
131         public void nextDocument() throws IOException {
132             documentIndex += 1;
133 
134             if (!isDone()) {
135                 loadExtents();
136             }
137         }
138 
139         public boolean nextRecord() throws IOException {
140             nextDocument();
141             if (!isDone()) return true;
142             if (iterator.nextKey()) {
143                 load();
144                 return true;
145             }
146             return false;
147         }
148 
149         public boolean isDone() {
150             return documentIndex >= documentCount;
151         }
152 
153         public ExtentArray extents() {
154             return extentArray;
155         }
156 
157         public int document() {
158             return currentDocument;
159         }
160 
161         public int count() {
162             return currentCount;
163         }
164     }
165     IndexReader reader;
166 
167     public PositionIndexReader(IndexReader reader) throws IOException {
168         this.reader = reader;
169     }
170     
171     public PositionIndexReader(String pathname) throws FileNotFoundException, IOException {
172         reader = new IndexReader(pathname);
173     }
174 
175     /***
176      * Returns an iterator pointing at the first term in the index.
177      */
178     public Iterator getIterator() throws IOException {
179         return new Iterator(reader.getIterator());
180     }
181 
182     /***
183      * Returns an iterator pointing at the specified term, or 
184      * null if the term doesn't exist in the inverted file.
185      */
186     public Iterator getTermExtents(String term) throws IOException {
187         IndexReader.Iterator iterator = reader.getIterator(term);
188 
189         if (iterator != null) {
190             return new Iterator(iterator);
191         }
192         return null;
193     }
194 
195     List<Processor<Document>> transformations() {
196         return DocumentTransformationFactory.instance(reader.getManifest());
197     }
198 
199     List<Processor<Document>> transformations(String field) {
200         return transformations();
201     }
202 
203     public void close() throws IOException {
204         reader.close();
205     }
206 
207     public Map<String, NodeType> getNodeTypes() {
208         HashMap<String, NodeType> types = new HashMap<String, NodeType>();
209         types.put("counts", new NodeType(Iterator.class));
210         types.put("extents", new NodeType(Iterator.class));
211         return types;
212     }
213 
214     public IndexIterator getIterator(Node node) throws IOException {
215         // TODO(strohman): handle stemming!!
216         return getTermExtents(node.getDefaultParameter("term"));
217     }
218 }