1
2 package org.galagosearch.core.index;
3
4 import java.io.DataInput;
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7 import java.io.RandomAccessFile;
8 import java.util.HashMap;
9 import java.util.List;
10 import java.util.Map;
11 import org.galagosearch.core.parse.Document;
12 import org.galagosearch.core.index.IndexReader;
13 import org.galagosearch.core.retrieval.query.Node;
14 import org.galagosearch.core.retrieval.query.NodeType;
15 import org.galagosearch.core.retrieval.structured.CountIterator;
16 import org.galagosearch.core.retrieval.structured.ExtentIterator;
17 import org.galagosearch.core.retrieval.structured.IndexIterator;
18 import org.galagosearch.core.util.ExtentArray;
19 import org.galagosearch.tupleflow.BufferedFileDataStream;
20 import org.galagosearch.tupleflow.Processor;
21 import org.galagosearch.tupleflow.VByteInput;
22
23 /***
24 * Reads a simple positions-based index, where each inverted list in the
25 * index contains both term count information and term position information.
26 * The term counts data is stored separately from term position information for
27 * faster query processing when no positions are needed.
28 *
29 * For now, the iterator loads everything into memory before starting query
30 * processing, which is not a workable solution for larger collections.
31 *
32 * @author trevor
33 */
34 public class PositionIndexReader implements StructuredIndexPartReader {
35 public class Iterator extends ExtentIterator implements IndexIterator {
36 int documentCount;
37 int totalPositionCount;
38 VByteInput documents;
39 VByteInput counts;
40 VByteInput positions;
41 int documentIndex;
42 int currentDocument;
43 int currentCount;
44 ExtentArray extentArray;
45 IndexReader.Iterator iterator;
46
47 Iterator(IndexReader.Iterator iterator) throws IOException {
48 this.iterator = iterator;
49 load();
50 }
51
52 private void load() throws IOException {
53 long startPosition = iterator.getValueStart();
54 long endPosition = iterator.getValueEnd();
55
56 RandomAccessFile input = reader.getInput();
57 input.seek(startPosition);
58 DataInput stream = new VByteInput(reader.getInput());
59
60 int options = stream.readInt();
61 documentCount = stream.readInt();
62 totalPositionCount = stream.readInt();
63
64 long documentByteLength = stream.readLong();
65 long countsByteLength = stream.readLong();
66 long positionsByteLength = stream.readLong();
67
68 long documentStart = input.getFilePointer();
69 long documentEnd = documentStart + documentByteLength;
70
71 long countsStart = documentEnd;
72 long countsEnd = countsStart + countsByteLength;
73
74 long positionsStart = countsEnd;
75 long positionsEnd = positionsStart + positionsByteLength;
76
77 assert positionsEnd == endPosition;
78
79
80 documents = new VByteInput(new BufferedFileDataStream(input, documentStart, documentEnd));
81 counts = new VByteInput(new BufferedFileDataStream(input, countsStart, countsEnd));
82 positions = new VByteInput(new BufferedFileDataStream(input, positionsStart, positionsEnd));
83
84 extentArray = new ExtentArray();
85 documentIndex = 0;
86 loadExtents();
87 }
88
89 private void loadExtents() throws IOException {
90 currentDocument += documents.readInt();
91 currentCount = counts.readInt();
92 extentArray.reset();
93
94 int position = 0;
95 for (int i = 0; i < currentCount; i++) {
96 position += positions.readInt();
97 extentArray.add(currentDocument, position, position + 1);
98 }
99 }
100
101 public String getRecordString() {
102 StringBuilder builder = new StringBuilder();
103
104 builder.append(iterator.getKey());
105 builder.append(",");
106 builder.append(currentDocument);
107 for (int i = 0; i < extentArray.getPosition(); ++i) {
108 builder.append(",");
109 builder.append(extentArray.getBuffer()[i].begin);
110 }
111
112 return builder.toString();
113 }
114
115 public void reset() throws IOException {
116 currentDocument = 0;
117 currentCount = 0;
118 extentArray.reset();
119
120 load();
121 }
122
123 public long getByteLength() throws IOException {
124 return iterator.getValueLength();
125 }
126
127 public String getCurrentTerm() throws IOException {
128 return iterator.getKey();
129 }
130
131 public void nextDocument() throws IOException {
132 documentIndex += 1;
133
134 if (!isDone()) {
135 loadExtents();
136 }
137 }
138
139 public boolean nextRecord() throws IOException {
140 nextDocument();
141 if (!isDone()) return true;
142 if (iterator.nextKey()) {
143 load();
144 return true;
145 }
146 return false;
147 }
148
149 public boolean isDone() {
150 return documentIndex >= documentCount;
151 }
152
153 public ExtentArray extents() {
154 return extentArray;
155 }
156
157 public int document() {
158 return currentDocument;
159 }
160
161 public int count() {
162 return currentCount;
163 }
164 }
165 IndexReader reader;
166
167 public PositionIndexReader(IndexReader reader) throws IOException {
168 this.reader = reader;
169 }
170
171 public PositionIndexReader(String pathname) throws FileNotFoundException, IOException {
172 reader = new IndexReader(pathname);
173 }
174
175 /***
176 * Returns an iterator pointing at the first term in the index.
177 */
178 public Iterator getIterator() throws IOException {
179 return new Iterator(reader.getIterator());
180 }
181
182 /***
183 * Returns an iterator pointing at the specified term, or
184 * null if the term doesn't exist in the inverted file.
185 */
186 public Iterator getTermExtents(String term) throws IOException {
187 IndexReader.Iterator iterator = reader.getIterator(term);
188
189 if (iterator != null) {
190 return new Iterator(iterator);
191 }
192 return null;
193 }
194
195 List<Processor<Document>> transformations() {
196 return DocumentTransformationFactory.instance(reader.getManifest());
197 }
198
199 List<Processor<Document>> transformations(String field) {
200 return transformations();
201 }
202
203 public void close() throws IOException {
204 reader.close();
205 }
206
207 public Map<String, NodeType> getNodeTypes() {
208 HashMap<String, NodeType> types = new HashMap<String, NodeType>();
209 types.put("counts", new NodeType(Iterator.class));
210 types.put("extents", new NodeType(Iterator.class));
211 return types;
212 }
213
214 public IndexIterator getIterator(Node node) throws IOException {
215
216 return getTermExtents(node.getDefaultParameter("term"));
217 }
218 }