1
2 package org.galagosearch.core.index;
3
4 import java.io.ByteArrayInputStream;
5 import java.io.DataInputStream;
6 import java.io.EOFException;
7 import java.io.File;
8 import java.io.FileNotFoundException;
9 import java.io.IOException;
10 import java.io.RandomAccessFile;
11 import java.util.zip.GZIPInputStream;
12 import org.galagosearch.core.index.IndexWriter;
13 import org.galagosearch.core.index.VocabularyReader.TermSlot;
14 import org.galagosearch.core.index.VocabularyReader;
15 import org.galagosearch.tupleflow.BufferedFileDataStream;
16 import org.galagosearch.tupleflow.DataStream;
17 import org.galagosearch.tupleflow.MemoryDataStream;
18 import org.galagosearch.tupleflow.Parameters;
19 import org.galagosearch.tupleflow.Utility;
20
21 /***
22 * <p>This implements the core functionality for all inverted list readers. It can
23 * also be used as a read-only TreeMap for disk-based data structures. In Galago,
24 * it is used both to store index data and to store documents.</p>
25 *
26 * <p>An index is a mapping from String to byte[]. If compression is turned on, the
27 * value must be small enough that it fits in memory. If compression is off, values
28 * are streamed directly from disk so there is no size restriction. Indexes support
29 * iteration over all keys, or direct lookup of a single key. The structure is optimized
30 * to support fast random lookup on disks.</p>
31 *
32 * <p>Data is stored in blocks, typically 32K each. Each block has a prefix-compressed
33 * set of keys at the beginning, followed by a block of value data. IndexWriter/IndexReader
34 * can GZip compress that value data, or it can be stored uncompressed. For inverted list
35 * data it's best to use your own compression, but for text data the GZip compression
36 * is a good choice.</p>
37 *
38 * <p>Typically this class is extended by composition instead of inheritance.</p>
39 *
40 * @author trevor
41 */
42 public class IndexReader {
43 VocabularyReader vocabulary;
44 RandomAccessFile input;
45 Parameters manifest;
46 int blockSize = 65536;
47 int vocabGroup = 16;
48 long vocabularyOffset;
49 long manifestOffset;
50 long footerOffset;
51 boolean isCompressed;
52
53 private static class VocabularyBlock {
54 long startFileOffset;
55 long startValueFileOffset;
56 long endValueFileOffset;
57
58 long[] endValueOffsets;
59 String[] keys;
60
61 public VocabularyBlock(
62 long startFileOffset,
63 long startValueFileOffset,
64 long endValueFileOffset,
65 long[] endValueOffsets, String[] keys) {
66 this.keys = keys;
67 this.endValueOffsets = endValueOffsets;
68 this.startFileOffset = startFileOffset;
69 this.startValueFileOffset = startValueFileOffset;
70 this.endValueFileOffset = endValueFileOffset;
71 }
72
73 public long getValuesStart() {
74 return startValueFileOffset;
75 }
76
77 public long getValuesEnd() {
78 return endValueFileOffset;
79 }
80
81 public long getBlockEnd() {
82 return getValuesEnd();
83 }
84
85 public long getListStart(int index) {
86 if (index == 0) {
87 return startValueFileOffset;
88 }
89 return endValueFileOffset - endValueOffsets[index - 1];
90 }
91
92 public long getListEnd(int index) {
93 return endValueFileOffset - endValueOffsets[index];
94 }
95
96 public long getUncompressedEndOffset(int index) {
97 return endValueOffsets[index];
98 }
99
100 public boolean hasMore(int index) {
101 return endValueOffsets.length > (index + 1);
102 }
103
104 public int findIndex(String term) {
105
106
107 for (int i = 0; i < keys.length; i++) {
108 if (keys[i].equals(term)) {
109 return i;
110 }
111 }
112 return -1;
113 }
114
115 private String getKey(int termIndex) {
116 return this.keys[termIndex];
117 }
118 }
119
120 public class Iterator {
121 VocabularyBlock block;
122 byte[] decompressedData;
123 String key;
124 int keyIndex;
125 boolean done;
126
127 Iterator(VocabularyBlock block, int index) throws IOException {
128 this.block = block;
129 keyIndex = index;
130 done = false;
131
132 loadIndex();
133 decompressedData = null;
134 }
135
136 void loadIndex() throws IOException {
137 key = "";
138
139 if (block == null || keyIndex < 0) {
140 done = true;
141 return;
142 }
143 key = block.getKey(keyIndex);
144 }
145
146 void invalidateBlock() {
147 decompressedData = null;
148 }
149
150 void decompressBlock() throws IOException {
151 int blockLength = (int) (block.getValuesEnd() - block.getValuesStart());
152 byte[] data = new byte[blockLength];
153 input.seek(block.getValuesStart());
154 input.readFully(data);
155
156 ByteArrayInputStream in = new ByteArrayInputStream(data);
157 DataInputStream dataIn = new DataInputStream(in);
158 int uncompressedLength = dataIn.readInt();
159
160 GZIPInputStream stream = new GZIPInputStream(in);
161 decompressedData = new byte[uncompressedLength];
162 int totalRead = 0;
163 while (totalRead < uncompressedLength) {
164 int remaining = decompressedData.length - totalRead;
165 int bytesRead = stream.read(decompressedData, totalRead, remaining);
166 if (bytesRead <= 0) {
167 throw new EOFException("Too little data was found.");
168 }
169 totalRead += bytesRead;
170 }
171 }
172
173 public void skipTo(byte[] key) throws IOException {
174 TermSlot slot = vocabulary.get(key);
175 if (slot == null) {
176 done = true;
177 } else {
178 invalidateBlock();
179 block = readVocabularyBlock(slot.begin);
180 keyIndex = 0;
181 while (keyIndex < block.keys.length) {
182 byte[] blockKey = Utility.makeBytes(block.keys[keyIndex]);
183 if (Utility.compare(key, blockKey) <= 0) {
184 loadIndex();
185 return;
186 }
187 keyIndex++;
188 }
189 done = true;
190 }
191 }
192
193 /***
194 * Returns true if no more keys remain to be read.
195 */
196 public boolean isDone() {
197 return done;
198 }
199
200 /***
201 * Returns the byte offset of the end of this block.
202 */
203 public long getFilePosition() {
204 return block.getBlockEnd();
205 }
206
207 /***
208 * Returns the key associated with the current inverted list.
209 */
210 public String getKey() {
211 return key;
212 }
213
214 /***
215 * Returns the value as a buffered stream.
216 */
217 public DataStream getValueStream() throws IOException {
218 if (isCompressed) {
219
220
221 if (decompressedData == null) {
222 decompressBlock();
223 }
224
225 int start = 0;
226 int end = (int) (decompressedData.length - block.getUncompressedEndOffset(keyIndex));
227 if (keyIndex > 0) {
228 start = (int) (decompressedData.length - block.getUncompressedEndOffset(keyIndex - 1));
229 }
230 int length = end - start;
231
232 return new MemoryDataStream(decompressedData, start, length);
233 } else {
234 return blockStream(this);
235 }
236 }
237
238 /***
239 * Returns the length of the value, in bytes.
240 */
241 public long getValueLength() throws IOException {
242 return getValueStream().length();
243 }
244
245 /***
246 * Returns the value as a string.
247 */
248
249 public String getValueString() throws IOException {
250 DataStream stream = getValueStream();
251 assert stream.length() < Integer.MAX_VALUE;
252 byte[] data = new byte[(int) stream.length()];
253 stream.readFully(data);
254 return Utility.makeString(data);
255 }
256
257 /***
258 * Returns the byte offset
259 * of the beginning of the current inverted list,
260 * relative to the start of the whole inverted file.
261 */
262 public long getValueStart() {
263 return block.getListStart(keyIndex);
264 }
265
266 public long getDataStart() {
267 if (isCompressed) return -1;
268 return block.getListStart(keyIndex);
269 }
270
271 public long getDataEnd() {
272 if (isCompressed) return -1;
273 return block.getListEnd(keyIndex);
274 }
275
276 /***
277 * Returns the byte offset
278 * of the end of the current inverted list,
279 * relative to the start of the whole inverted file.
280 */
281 public long getValueEnd() {
282 return block.getListEnd(keyIndex);
283 }
284
285 /***
286 * Advances to the next key in the index.
287 *
288 * @return true if the advance was successful, false if no more keys remain.
289 * @throws java.io.IOException
290 */
291 public boolean nextKey() throws IOException {
292 if (block.hasMore(keyIndex)) {
293 keyIndex++;
294 } else if (block.getBlockEnd() >= IndexReader.this.vocabularyOffset) {
295 invalidateBlock();
296 done = true;
297 return false;
298 } else {
299 invalidateBlock();
300 block = readVocabularyBlock(block.getBlockEnd());
301 keyIndex = 0;
302 }
303
304 loadIndex();
305 return true;
306 }
307 }
308
309 /***
310 * Opens an index found in the at pathname.
311 *
312 * @param pathname Filename of the index to open.
313 * @throws FileNotFoundException
314 * @throws IOException
315 */
316 public IndexReader(String pathname) throws FileNotFoundException, IOException {
317 input = new RandomAccessFile(pathname, "r");
318
319
320 long length = input.length();
321 footerOffset = length - 2*Integer.SIZE/8 - 3*Long.SIZE/8 - 1;
322 input.seek(footerOffset);
323
324
325 vocabularyOffset = input.readLong();
326 manifestOffset = input.readLong();
327 blockSize = input.readInt();
328 vocabGroup = input.readInt();
329 isCompressed = input.readBoolean();
330 long magicNumber = input.readLong();
331
332 if (magicNumber != IndexWriter.MAGIC_NUMBER) {
333 throw new IOException("This does not appear to be an index file (wrong magic number)");
334 }
335 long invertedListLength = vocabularyOffset;
336 long vocabularyLength = manifestOffset - vocabularyOffset;
337
338 input.seek(vocabularyOffset);
339 vocabulary = new VocabularyReader(input, invertedListLength, vocabularyLength);
340
341 input.seek(manifestOffset);
342 byte[] xmlData = new byte[(int) (footerOffset - manifestOffset)];
343 input.read(xmlData);
344 manifest = new Parameters(xmlData);
345 }
346
347 /***
348 * Returns true if the file specified by this pathname was probably written by IndexWriter.
349 * If this method returns false, the file is definitely not readable by IndexReader.
350 *
351 * @param pathname
352 * @return
353 * @throws java.io.FileNotFoundException
354 * @throws java.io.IOException
355 */
356 public static boolean isIndexFile(String pathname) throws FileNotFoundException, IOException {
357 RandomAccessFile f = new RandomAccessFile(pathname, "r");
358 long length = f.length();
359 long magicNumber = 0;
360
361 if (length > Long.SIZE/8) {
362 f.seek(length-Long.SIZE/8);
363 magicNumber = f.readLong();
364 }
365 f.close();
366
367 boolean result = (magicNumber == IndexWriter.MAGIC_NUMBER);
368 return result;
369 }
370
371 /***
372 * Identical to the {@link #IndexReader(String) other constructor}, except this
373 * one takes a File object instead of a string as the parameter.
374 *
375 * @param pathname
376 * @throws java.io.FileNotFoundException
377 * @throws java.io.IOException
378 */
379 public IndexReader(File pathname) throws FileNotFoundException, IOException {
380 this(pathname.toString());
381 }
382
383 /***
384 * Returns the vocabulary structure for this IndexReader. Note that the vocabulary
385 * contains only the first key in each block.
386 */
387 public VocabularyReader getVocabulary() {
388 return vocabulary;
389 }
390
391 /***
392 * Returns an iterator pointing to the very first key in the index.
393 * This is typically used for iterating through the entire index,
394 * which might be useful for testing and debugging tools, but probably
395 * not for traditional document retrieval.
396 */
397 public Iterator getIterator() throws IOException {
398 VocabularyBlock block = readVocabularyBlock(0);
399 Iterator result = new Iterator(block, 0);
400 result.loadIndex();
401 return result;
402 }
403
404 /***
405 * Returns an iterator pointing at a specific key. Returns
406 * null if the key is not found in the index.
407 */
408 public Iterator getIterator(String key) throws IOException {
409
410 VocabularyReader.TermSlot slot = vocabulary.get(key);
411
412 if (slot == null) {
413 return null;
414 }
415 VocabularyBlock block = readVocabularyBlock(slot.begin);
416 int index = block.findIndex(key);
417
418 if (index >= 0) {
419 return new Iterator(block, index);
420 }
421 return null;
422 }
423
424 /***
425 * Gets the value stored in the index associated with this key.
426 * @param key
427 * @return The index value for this key, or null if there is no such value.
428 * @throws java.io.IOException
429 */
430 public String getValueString(String key) throws IOException {
431 Iterator iter = getIterator(key);
432
433 if (iter == null) {
434 return null;
435 }
436 return iter.getValueString();
437 }
438
439 /***
440 * Gets the value stored in the index associated with this key.
441 *
442 * @param key
443 * @return The index value for this key, or null if there is no such value.
444 * @throws java.io.IOException
445 */
446
447 public DataStream getValueStream(String key) throws IOException {
448 Iterator iter = getIterator(key);
449
450 if (iter == null) {
451 return null;
452 }
453 return iter.getValueStream();
454 }
455
456 /***
457 * Returns a Parameters object that contains metadata about
458 * the contents of the index. This is the place to store important
459 * data about the index contents, like what stemmer was used or the
460 * total number of terms in the collection.
461 */
462 public Parameters getManifest() {
463 return manifest;
464 }
465
466 /***
467 * Like the other blockStream variant, but this one uses
468 * the current file location as the starting offset.
469 */
470 public DataStream blockStream(long len) throws IOException {
471 return blockStream(input.getFilePointer(), len);
472 }
473
474 /***
475 * This convenience method returns a DataStream for
476 * a region of an inverted file.
477 */
478 public DataStream blockStream(long offset, long length) throws IOException {
479 long fileLength = input.length();
480 assert offset <= fileLength;
481 length = Math.min(fileLength - offset, length);
482
483 return new BufferedFileDataStream(input, offset, length + offset);
484 }
485
486 /***
487 * This convenience method returns a DataStream for
488 * the region of the inverted file pointed to by the iterator.
489 */
490 public DataStream blockStream(Iterator iter) throws IOException {
491 return new BufferedFileDataStream(input, iter.getDataStart(), iter.getDataEnd());
492 }
493
494 /***
495 * Returns the file object for the inverted file. This is useful for actually
496 * reading the data from a byte range returned by the iterator.
497 */
498 public RandomAccessFile getInput() {
499 return input;
500 }
501
502 /***
503 * Closes all files associated with the IndexReader.
504 */
505 public void close() throws IOException {
506 input.close();
507 }
508
509 /***
510 * Reads vocabulary data from a block of the inverted file.
511 *
512 * The inverted file is structured into blocks which contain a little
513 * bit of compressed vocabulary information followed by inverted list
514 * information. The inverted list information is application specific,
515 * and is not handled by this class. This method reads in the vocabulary
516 * information for a particular block and returns it in a VocabularyBlock
517 * object, which allows for quick navigation to a particular key.
518 *
519 * The C++ version of this is much more efficient, because it makes no
520 * attempt to decode the whole block of information. However, for the
521 * Java version I thought that simplicity (and testability) was more
522 * important than speed. The VocabularyBlock structure helps make iteration
523 * over the entire inverted file possible.
524 */
525 VocabularyBlock readVocabularyBlock(long slotBegin) throws IOException {
526
527 DataStream blockStream = blockStream(slotBegin, blockSize);
528
529
530 long endBlock = blockStream.readLong();
531 long wordCount = blockStream.readLong();
532
533 int prefixLength = blockStream.readUnsignedByte();
534 byte[] prefixBytes = new byte[prefixLength];
535 blockStream.readFully(prefixBytes);
536
537 int wordBlockCount = (int) Math.ceil((double) wordCount / vocabGroup);
538 short[] wordBlockEnds = new short[wordBlockCount];
539 String[] words = new String[(int) wordCount];
540 long[] invertedListEnds = new long[(int) wordCount];
541
542 for (int i = 0; i < wordBlockCount; i++) {
543 wordBlockEnds[i] = blockStream.readShort();
544 }
545
546 for (int i = 0; i < wordCount; i++) {
547 invertedListEnds[i] = blockStream.readShort();
548 }
549
550 for (int i = 0; i < wordCount; i += vocabGroup) {
551 int suffixLength = blockStream.readUnsignedByte();
552 int wordLength = suffixLength + prefixLength;
553 byte[] wordBytes = new byte[wordLength];
554 byte[] lastWordBytes = wordBytes;
555 System.arraycopy(prefixBytes, 0, wordBytes, 0, prefixBytes.length);
556 int end = (int) Math.min(wordCount, i + vocabGroup);
557
558 blockStream.readFully(wordBytes, prefixBytes.length,
559 wordBytes.length - prefixBytes.length);
560 words[i] = Utility.makeString(wordBytes);
561
562 for (int j = i + 1; j < end; j++) {
563 int common = blockStream.readUnsignedByte();
564 wordLength = blockStream.readUnsignedByte();
565 assert wordLength >= 0 : "Negative word length: " + wordLength + " " + j;
566 assert wordLength >= common : "word length too small: " + wordLength + " " + common + " " + j;
567 wordBytes = new byte[wordLength];
568
569 try {
570 System.arraycopy(lastWordBytes, 0, wordBytes, 0, common);
571 blockStream.readFully(wordBytes, common, wordLength - common);
572 } catch (ArrayIndexOutOfBoundsException e) {
573 System.out.println("wl: " + wordLength + " c: " + common);
574 throw e;
575 }
576 words[j] = Utility.makeString(wordBytes);
577 lastWordBytes = wordBytes;
578 }
579 }
580
581 int suffixBytes = wordBlockEnds[wordBlockEnds.length - 1];
582 long headerLength = 8 +
583 8 +
584 1 + prefixLength +
585 2 * wordBlockCount +
586 2 * wordCount +
587 suffixBytes;
588
589 long startInvertedLists = slotBegin + headerLength;
590 return new VocabularyBlock(slotBegin, startInvertedLists, endBlock, invertedListEnds, words);
591 }
592 }