View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.index;
3   
4   import java.io.ByteArrayInputStream;
5   import java.io.DataInputStream;
6   import java.io.EOFException;
7   import java.io.File;
8   import java.io.FileNotFoundException;
9   import java.io.IOException;
10  import java.io.RandomAccessFile;
11  import java.util.zip.GZIPInputStream;
12  import org.galagosearch.core.index.IndexWriter;
13  import org.galagosearch.core.index.VocabularyReader.TermSlot;
14  import org.galagosearch.core.index.VocabularyReader;
15  import org.galagosearch.tupleflow.BufferedFileDataStream;
16  import org.galagosearch.tupleflow.DataStream;
17  import org.galagosearch.tupleflow.MemoryDataStream;
18  import org.galagosearch.tupleflow.Parameters;
19  import org.galagosearch.tupleflow.Utility;
20  
21  /***
22   * <p>This implements the core functionality for all inverted list readers.  It can
23   * also be used as a read-only TreeMap for disk-based data structures.  In Galago,
24   * it is used both to store index data and to store documents.</p>
25   * 
26   * <p>An index is a mapping from String to byte[].  If compression is turned on, the
27   * value must be small enough that it fits in memory.  If compression is off, values
28   * are streamed directly from disk so there is no size restriction.  Indexes support
29   * iteration over all keys, or direct lookup of a single key.  The structure is optimized
30   * to support fast random lookup on disks.</p>
31   * 
32   * <p>Data is stored in blocks, typically 32K each.  Each block has a prefix-compressed
33   * set of keys at the beginning, followed by a block of value data.  IndexWriter/IndexReader
34   * can GZip compress that value data, or it can be stored uncompressed.  For inverted list
35   * data it's best to use your own compression, but for text data the GZip compression
36   * is a good choice.</p>
37   * 
38   * <p>Typically this class is extended by composition instead of inheritance.</p>
39   * 
40   * @author trevor
41   */
42  public class IndexReader {
43      VocabularyReader vocabulary;
44      RandomAccessFile input;
45      Parameters manifest;
46      int blockSize = 65536;
47      int vocabGroup = 16;
48      long vocabularyOffset;
49      long manifestOffset;
50      long footerOffset;
51      boolean isCompressed;
52      
53      private static class VocabularyBlock {
54          long startFileOffset;
55          long startValueFileOffset;
56          long endValueFileOffset;
57  
58          long[] endValueOffsets;
59          String[] keys;
60  
61          public VocabularyBlock(
62                  long startFileOffset,
63                  long startValueFileOffset,
64                  long endValueFileOffset,
65                  long[] endValueOffsets, String[] keys) {
66              this.keys = keys;
67              this.endValueOffsets = endValueOffsets;
68              this.startFileOffset = startFileOffset;
69              this.startValueFileOffset = startValueFileOffset;
70              this.endValueFileOffset = endValueFileOffset;
71          }
72          
73          public long getValuesStart() {
74              return startValueFileOffset;
75          }
76          
77          public long getValuesEnd() {
78              return endValueFileOffset;
79          }
80  
81          public long getBlockEnd() {
82              return getValuesEnd();
83          }
84  
85          public long getListStart(int index) {
86              if (index == 0) {
87                  return startValueFileOffset;
88              }
89              return endValueFileOffset - endValueOffsets[index - 1];
90          }
91  
92          public long getListEnd(int index) {
93              return endValueFileOffset - endValueOffsets[index];
94          }
95          
96          public long getUncompressedEndOffset(int index) {
97              return endValueOffsets[index];
98          }
99  
100         public boolean hasMore(int index) {
101             return endValueOffsets.length > (index + 1);
102         }
103 
104         public int findIndex(String term) {
105             // Need to do a linear search here because the vocabulary order
106             // does not necessarily match Java's idea of alphabetical order
107             for (int i = 0; i < keys.length; i++) {
108                 if (keys[i].equals(term)) {
109                     return i;
110                 }
111             }
112             return -1;
113         }
114 
115         private String getKey(int termIndex) {
116             return this.keys[termIndex];
117         }
118     }
119 
120     public class Iterator {
121         VocabularyBlock block;
122         byte[] decompressedData;
123         String key;
124         int keyIndex;
125         boolean done;
126 
127         Iterator(VocabularyBlock block, int index) throws IOException {
128             this.block = block;
129             keyIndex = index;
130             done = false;
131 
132             loadIndex();
133             decompressedData = null;
134         }
135 
136         void loadIndex() throws IOException {
137             key = "";
138 
139             if (block == null || keyIndex < 0) {
140                 done = true;
141                 return;
142             }
143             key = block.getKey(keyIndex);
144         }
145 
146         void invalidateBlock() {
147             decompressedData = null;
148         }
149         
150         void decompressBlock() throws IOException {
151             int blockLength = (int) (block.getValuesEnd() - block.getValuesStart());
152             byte[] data = new byte[blockLength];
153             input.seek(block.getValuesStart());
154             input.readFully(data);
155             
156             ByteArrayInputStream in = new ByteArrayInputStream(data);
157             DataInputStream dataIn = new DataInputStream(in);
158             int uncompressedLength = dataIn.readInt();
159             
160             GZIPInputStream stream = new GZIPInputStream(in);
161             decompressedData = new byte[uncompressedLength];
162             int totalRead = 0;
163             while (totalRead < uncompressedLength) {
164                 int remaining = decompressedData.length - totalRead;
165                 int bytesRead = stream.read(decompressedData, totalRead, remaining);
166                 if (bytesRead <= 0) {
167                     throw new EOFException("Too little data was found.");
168                 }
169                 totalRead += bytesRead;
170             }
171         }
172         
173         public void skipTo(byte[] key) throws IOException {
174             TermSlot slot = vocabulary.get(key);
175             if (slot == null) {
176                 done = true;
177             } else {
178                 invalidateBlock();
179                 block = readVocabularyBlock(slot.begin);
180                 keyIndex = 0;
181                 while (keyIndex < block.keys.length) {
182                     byte[] blockKey = Utility.makeBytes(block.keys[keyIndex]);
183                     if (Utility.compare(key, blockKey) <= 0) {
184                         loadIndex();
185                         return;
186                     }
187                     keyIndex++;
188                 }
189                 done = true;
190             }
191         }
192 
193         /***
194          * Returns true if no more keys remain to be read.
195          */
196         public boolean isDone() {
197             return done;
198         }
199         
200         /***
201          * Returns the byte offset of the end of this block.
202          */
203         public long getFilePosition() {
204             return block.getBlockEnd();
205         }
206         
207         /***
208          * Returns the key associated with the current inverted list.
209          */
210         public String getKey() {
211             return key;
212         }
213         
214         /***
215          * Returns the value as a buffered stream.
216          */
217         public DataStream getValueStream() throws IOException {
218             if (isCompressed) {
219                 // Lazy decompression allows fast scans over the key space
220                 // of a table without decompressing all the values.
221                 if (decompressedData == null) {
222                     decompressBlock();
223                 }
224 
225                 int start = 0;
226                 int end = (int) (decompressedData.length - block.getUncompressedEndOffset(keyIndex));
227                 if (keyIndex > 0) {
228                     start = (int) (decompressedData.length - block.getUncompressedEndOffset(keyIndex - 1));
229                 }
230                 int length = end - start;
231                 
232                 return new MemoryDataStream(decompressedData, start, length);
233             } else {
234                 return blockStream(this);
235             }
236         }
237         
238         /***
239          * Returns the length of the value, in bytes.
240          */
241         public long getValueLength() throws IOException {
242             return getValueStream().length();
243         }
244 
245         /***
246          * Returns the value as a string.
247          */
248         
249         public String getValueString() throws IOException {
250             DataStream stream = getValueStream();
251             assert stream.length() < Integer.MAX_VALUE;
252             byte[] data = new byte[(int) stream.length()];
253             stream.readFully(data);
254             return Utility.makeString(data);
255         }
256         
257         /***
258          * Returns the byte offset
259          * of the beginning of the current inverted list,
260          * relative to the start of the whole inverted file.
261          */
262         public long getValueStart() {
263             return block.getListStart(keyIndex);
264         }
265         
266         public long getDataStart() {
267             if (isCompressed) return -1;
268             return block.getListStart(keyIndex);
269         }
270         
271         public long getDataEnd() {
272             if (isCompressed) return -1;
273             return block.getListEnd(keyIndex);
274         }
275 
276         /***
277          * Returns the byte offset
278          * of the end of the current inverted list,
279          * relative to the start of the whole inverted file.
280          */
281         public long getValueEnd() {
282             return block.getListEnd(keyIndex);
283         }
284 
285         /***
286          * Advances to the next key in the index.
287          * 
288          * @return true if the advance was successful, false if no more keys remain.
289          * @throws java.io.IOException
290          */
291         public boolean nextKey() throws IOException {
292             if (block.hasMore(keyIndex)) {
293                 keyIndex++;
294             } else if (block.getBlockEnd() >= IndexReader.this.vocabularyOffset) {
295                 invalidateBlock();
296                 done = true;
297                 return false;
298             } else {
299                 invalidateBlock();
300                 block = readVocabularyBlock(block.getBlockEnd());
301                 keyIndex = 0;
302             }
303 
304             loadIndex();
305             return true;
306         }
307     }
308 
309     /***
310      * Opens an index found in the at pathname.
311      * 
312      * @param pathname Filename of the index to open.
313      * @throws FileNotFoundException
314      * @throws IOException
315      */
316     public IndexReader(String pathname) throws FileNotFoundException, IOException {
317         input = new RandomAccessFile(pathname, "r");
318 
319         // Seek to the end of the file
320         long length = input.length();
321         footerOffset = length - 2*Integer.SIZE/8 - 3*Long.SIZE/8 - 1; 
322         input.seek(footerOffset);
323         
324         // Now, read metadata values:
325         vocabularyOffset = input.readLong();
326         manifestOffset = input.readLong();
327         blockSize = input.readInt();
328         vocabGroup = input.readInt();
329         isCompressed = input.readBoolean();
330         long magicNumber = input.readLong();
331         
332         if (magicNumber != IndexWriter.MAGIC_NUMBER) {
333             throw new IOException("This does not appear to be an index file (wrong magic number)");
334         }
335         long invertedListLength = vocabularyOffset;
336         long vocabularyLength = manifestOffset - vocabularyOffset;
337         
338         input.seek(vocabularyOffset);
339         vocabulary = new VocabularyReader(input, invertedListLength, vocabularyLength);
340         
341         input.seek(manifestOffset);
342         byte[] xmlData = new byte[(int) (footerOffset - manifestOffset)];
343         input.read(xmlData);
344         manifest = new Parameters(xmlData);
345     }
346 
347     /***
348      * Returns true if the file specified by this pathname was probably written by IndexWriter.
349      * If this method returns false, the file is definitely not readable by IndexReader.
350      * 
351      * @param pathname
352      * @return
353      * @throws java.io.FileNotFoundException
354      * @throws java.io.IOException
355      */
356     public static boolean isIndexFile(String pathname) throws FileNotFoundException, IOException {
357         RandomAccessFile f = new RandomAccessFile(pathname, "r");
358         long length = f.length();
359         long magicNumber = 0;
360         
361         if (length > Long.SIZE/8) {
362             f.seek(length-Long.SIZE/8);
363             magicNumber = f.readLong();
364         }
365         f.close();
366         
367         boolean result = (magicNumber == IndexWriter.MAGIC_NUMBER);
368         return result;
369     }
370     
371     /***
372      * Identical to the {@link #IndexReader(String) other constructor}, except this
373      * one takes a File object instead of a string as the parameter.
374      * 
375      * @param pathname
376      * @throws java.io.FileNotFoundException
377      * @throws java.io.IOException
378      */
379     public IndexReader(File pathname) throws FileNotFoundException, IOException {
380         this(pathname.toString());
381     }
382 
383     /***
384      * Returns the vocabulary structure for this IndexReader.  Note that the vocabulary
385      * contains only the first key in each block.
386      */
387     public VocabularyReader getVocabulary() {
388         return vocabulary;
389     }
390     
391     /***
392      * Returns an iterator pointing to the very first key in the index.
393      * This is typically used for iterating through the entire index,
394      * which might be useful for testing and debugging tools, but probably
395      * not for traditional document retrieval.
396      */
397     public Iterator getIterator() throws IOException {
398         VocabularyBlock block = readVocabularyBlock(0);
399         Iterator result = new Iterator(block, 0);
400         result.loadIndex();
401         return result;
402     }
403 
404     /***
405      * Returns an iterator pointing at a specific key.  Returns
406      * null if the key is not found in the index.
407      */
408     public Iterator getIterator(String key) throws IOException {
409         // read from offset to offset in the vocab structure (right?)
410         VocabularyReader.TermSlot slot = vocabulary.get(key);
411 
412         if (slot == null) {
413             return null;
414         }
415         VocabularyBlock block = readVocabularyBlock(slot.begin);
416         int index = block.findIndex(key);
417 
418         if (index >= 0) {
419             return new Iterator(block, index);
420         }
421         return null;
422     }
423     
424     /***
425      * Gets the value stored in the index associated with this key.
426      * @param key
427      * @return The index value for this key, or null if there is no such value.
428      * @throws java.io.IOException
429      */
430     public String getValueString(String key) throws IOException {
431         Iterator iter = getIterator(key);
432         
433         if (iter == null) {
434             return null;
435         }
436         return iter.getValueString();
437     }
438     
439     /***
440      * Gets the value stored in the index associated with this key.
441      * 
442      * @param key
443      * @return The index value for this key, or null if there is no such value.
444      * @throws java.io.IOException
445      */
446     
447     public DataStream getValueStream(String key) throws IOException {
448         Iterator iter = getIterator(key);
449         
450         if (iter == null) {
451             return null;
452         }
453         return iter.getValueStream();
454     }
455     
456     /***
457      * Returns a Parameters object that contains metadata about
458      * the contents of the index.  This is the place to store important
459      * data about the index contents, like what stemmer was used or the
460      * total number of terms in the collection.
461      */
462     public Parameters getManifest() {
463         return manifest;
464     }
465 
466     /***
467      * Like the other blockStream variant, but this one uses
468      * the current file location as the starting offset.
469      */
470     public DataStream blockStream(long len) throws IOException {
471         return blockStream(input.getFilePointer(), len);
472     }
473 
474     /***
475      * This convenience method returns a DataStream for
476      * a region of an inverted file.
477      */
478     public DataStream blockStream(long offset, long length) throws IOException {
479         long fileLength = input.length();
480         assert offset <= fileLength;
481         length = Math.min(fileLength - offset, length);
482 
483         return new BufferedFileDataStream(input, offset, length + offset);
484     }
485 
486     /***
487      * This convenience method returns a DataStream for
488      * the region of the inverted file pointed to by the iterator.
489      */
490     public DataStream blockStream(Iterator iter) throws IOException {
491         return new BufferedFileDataStream(input, iter.getDataStart(), iter.getDataEnd());
492     }
493 
494     /***
495      * Returns the file object for the inverted file.  This is useful for actually
496      * reading the data from a byte range returned by the iterator.
497      */
498     public RandomAccessFile getInput() {
499         return input;
500     }
501 
502     /***
503      * Closes all files associated with the IndexReader.
504      */
505     public void close() throws IOException {
506         input.close();
507     }
508     
509     /***
510      * Reads vocabulary data from a block of the inverted file.
511      * 
512      * The inverted file is structured into blocks which contain a little
513      * bit of compressed vocabulary information followed by inverted list
514      * information.  The inverted list information is application specific,
515      * and is not handled by this class.  This method reads in the vocabulary
516      * information for a particular block and returns it in a VocabularyBlock
517      * object, which allows for quick navigation to a particular key.
518      * 
519      * The C++ version of this is much more efficient, because it makes no
520      * attempt to decode the whole block of information.  However, for the 
521      * Java version I thought that simplicity (and testability) was more
522      * important than speed.  The VocabularyBlock structure helps make iteration
523      * over the entire inverted file possible.
524      */
525     VocabularyBlock readVocabularyBlock(long slotBegin) throws IOException {
526         // read in a block of data here
527         DataStream blockStream = blockStream(slotBegin, blockSize);
528 
529         // now we decode everything from the stream
530         long endBlock = blockStream.readLong();
531         long wordCount = blockStream.readLong();
532 
533         int prefixLength = blockStream.readUnsignedByte();
534         byte[] prefixBytes = new byte[prefixLength];
535         blockStream.readFully(prefixBytes);
536 
537         int wordBlockCount = (int) Math.ceil((double) wordCount / vocabGroup);
538         short[] wordBlockEnds = new short[wordBlockCount];
539         String[] words = new String[(int) wordCount];
540         long[] invertedListEnds = new long[(int) wordCount];
541 
542         for (int i = 0; i < wordBlockCount; i++) {
543             wordBlockEnds[i] = blockStream.readShort();
544         }
545 
546         for (int i = 0; i < wordCount; i++) {
547             invertedListEnds[i] = blockStream.readShort();
548         }
549 
550         for (int i = 0; i < wordCount; i += vocabGroup) {
551             int suffixLength = blockStream.readUnsignedByte();
552             int wordLength = suffixLength + prefixLength;
553             byte[] wordBytes = new byte[wordLength];
554             byte[] lastWordBytes = wordBytes;
555             System.arraycopy(prefixBytes, 0, wordBytes, 0, prefixBytes.length);
556             int end = (int) Math.min(wordCount, i + vocabGroup);
557 
558             blockStream.readFully(wordBytes, prefixBytes.length,
559                                   wordBytes.length - prefixBytes.length);
560             words[i] = Utility.makeString(wordBytes);
561 
562             for (int j = i + 1; j < end; j++) {
563                 int common = blockStream.readUnsignedByte();
564                 wordLength = blockStream.readUnsignedByte();
565                 assert wordLength >= 0 : "Negative word length: " + wordLength + " " + j;
566                 assert wordLength >= common : "word length too small: " + wordLength + " " + common + " " + j;
567                 wordBytes = new byte[wordLength];
568 
569                 try {
570                     System.arraycopy(lastWordBytes, 0, wordBytes, 0, common);
571                     blockStream.readFully(wordBytes, common, wordLength - common);
572                 } catch (ArrayIndexOutOfBoundsException e) {
573                     System.out.println("wl: " + wordLength + " c: " + common);
574                     throw e;
575                 }
576                 words[j] = Utility.makeString(wordBytes);
577                 lastWordBytes = wordBytes;
578             }
579         }
580 
581         int suffixBytes = wordBlockEnds[wordBlockEnds.length - 1];
582         long headerLength = 8 + // word count
583                 8 + // block end
584                 1 + prefixLength + // word prefix bytes
585                 2 * wordBlockCount + // word lengths 
586                 2 * wordCount + // inverted list endings
587                 suffixBytes;          // suffix storage 
588 
589         long startInvertedLists = slotBegin + headerLength;
590         return new VocabularyBlock(slotBegin, startInvertedLists, endBlock, invertedListEnds, words);
591     }
592 }