Coverage Report - org.galagosearch.core.index.IndexReader
 
Classes in this File Line Coverage Branch Coverage Complexity
IndexReader
86%
93/108
59%
20/34
2.024
IndexReader$Iterator
89%
76/85
69%
22/32
2.024
IndexReader$VocabularyBlock
95%
21/22
88%
7/8
2.024
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 package org.galagosearch.core.index;
 3  
 
 4  
 import java.io.ByteArrayInputStream;
 5  
 import java.io.DataInputStream;
 6  
 import java.io.EOFException;
 7  
 import java.io.File;
 8  
 import java.io.FileNotFoundException;
 9  
 import java.io.IOException;
 10  
 import java.io.RandomAccessFile;
 11  
 import java.util.zip.GZIPInputStream;
 12  
 import org.galagosearch.core.index.IndexWriter;
 13  
 import org.galagosearch.core.index.VocabularyReader.TermSlot;
 14  
 import org.galagosearch.core.index.VocabularyReader;
 15  
 import org.galagosearch.tupleflow.BufferedFileDataStream;
 16  
 import org.galagosearch.tupleflow.DataStream;
 17  
 import org.galagosearch.tupleflow.MemoryDataStream;
 18  
 import org.galagosearch.tupleflow.Parameters;
 19  
 import org.galagosearch.tupleflow.Utility;
 20  
 
 21  
 /**
 22  
  * <p>This implements the core functionality for all inverted list readers.  It can
 23  
  * also be used as a read-only TreeMap for disk-based data structures.  In Galago,
 24  
  * it is used both to store index data and to store documents.</p>
 25  
  * 
 26  
  * <p>An index is a mapping from String to byte[].  If compression is turned on, the
 27  
  * value must be small enough that it fits in memory.  If compression is off, values
 28  
  * are streamed directly from disk so there is no size restriction.  Indexes support
 29  
  * iteration over all keys, or direct lookup of a single key.  The structure is optimized
 30  
  * to support fast random lookup on disks.</p>
 31  
  * 
 32  
  * <p>Data is stored in blocks, typically 32K each.  Each block has a prefix-compressed
 33  
  * set of keys at the beginning, followed by a block of value data.  IndexWriter/IndexReader
 34  
  * can GZip compress that value data, or it can be stored uncompressed.  For inverted list
 35  
  * data it's best to use your own compression, but for text data the GZip compression
 36  
  * is a good choice.</p>
 37  
  * 
 38  
  * <p>Typically this class is extended by composition instead of inheritance.</p>
 39  
  * 
 40  
  * @author trevor
 41  
  */
 42  4
 public class IndexReader {
 43  
     VocabularyReader vocabulary;
 44  
     RandomAccessFile input;
 45  
     Parameters manifest;
 46  88
     int blockSize = 65536;
 47  88
     int vocabGroup = 16;
 48  
     long vocabularyOffset;
 49  
     long manifestOffset;
 50  
     long footerOffset;
 51  
     boolean isCompressed;
 52  
     
 53  8120
     private static class VocabularyBlock {
 54  
         long startFileOffset;
 55  
         long startValueFileOffset;
 56  
         long endValueFileOffset;
 57  
 
 58  
         long[] endValueOffsets;
 59  
         String[] keys;
 60  
 
 61  
         public VocabularyBlock(
 62  
                 long startFileOffset,
 63  
                 long startValueFileOffset,
 64  
                 long endValueFileOffset,
 65  8092
                 long[] endValueOffsets, String[] keys) {
 66  8092
             this.keys = keys;
 67  8092
             this.endValueOffsets = endValueOffsets;
 68  8092
             this.startFileOffset = startFileOffset;
 69  8092
             this.startValueFileOffset = startValueFileOffset;
 70  8092
             this.endValueFileOffset = endValueFileOffset;
 71  8092
         }
 72  
         
 73  
         public long getValuesStart() {
 74  8016
             return startValueFileOffset;
 75  
         }
 76  
         
 77  
         public long getValuesEnd() {
 78  4024
             return endValueFileOffset;
 79  
         }
 80  
 
 81  
         public long getBlockEnd() {
 82  16
             return getValuesEnd();
 83  
         }
 84  
 
 85  
         public long getListStart(int index) {
 86  4060
             if (index == 0) {
 87  2032
                 return startValueFileOffset;
 88  
             }
 89  2028
             return endValueFileOffset - endValueOffsets[index - 1];
 90  
         }
 91  
 
 92  
         public long getListEnd(int index) {
 93  4060
             return endValueFileOffset - endValueOffsets[index];
 94  
         }
 95  
         
 96  
         public long getUncompressedEndOffset(int index) {
 97  6008
             return endValueOffsets[index];
 98  
         }
 99  
 
 100  
         public boolean hasMore(int index) {
 101  24
             return endValueOffsets.length > (index + 1);
 102  
         }
 103  
 
 104  
         public int findIndex(String term) {
 105  
             // Need to do a linear search here because the vocabulary order
 106  
             // does not necessarily match Java's idea of alphabetical order
 107  12060
             for (int i = 0; i < keys.length; i++) {
 108  12060
                 if (keys[i].equals(term)) {
 109  8044
                     return i;
 110  
                 }
 111  
             }
 112  0
             return -1;
 113  
         }
 114  
 
 115  
         private String getKey(int termIndex) {
 116  8120
             return this.keys[termIndex];
 117  
         }
 118  
     }
 119  
 
 120  4
     public class Iterator {
 121  
         VocabularyBlock block;
 122  
         byte[] decompressedData;
 123  
         String key;
 124  
         int keyIndex;
 125  
         boolean done;
 126  
 
 127  8072
         Iterator(VocabularyBlock block, int index) throws IOException {
 128  8072
             this.block = block;
 129  8072
             keyIndex = index;
 130  8072
             done = false;
 131  
 
 132  8072
             loadIndex();
 133  8072
             decompressedData = null;
 134  8072
         }
 135  
 
 136  
         void loadIndex() throws IOException {
 137  8120
             key = "";
 138  
 
 139  8120
             if (block == null || keyIndex < 0) {
 140  0
                 done = true;
 141  0
                 return;
 142  
             }
 143  8120
             key = block.getKey(keyIndex);
 144  8120
         }
 145  
 
 146  
         void invalidateBlock() {
 147  36
             decompressedData = null;
 148  36
         }
 149  
         
 150  
         void decompressBlock() throws IOException {
 151  4008
             int blockLength = (int) (block.getValuesEnd() - block.getValuesStart());
 152  4008
             byte[] data = new byte[blockLength];
 153  4008
             input.seek(block.getValuesStart());
 154  4008
             input.readFully(data);
 155  
             
 156  4008
             ByteArrayInputStream in = new ByteArrayInputStream(data);
 157  4008
             DataInputStream dataIn = new DataInputStream(in);
 158  4008
             int uncompressedLength = dataIn.readInt();
 159  
             
 160  4008
             GZIPInputStream stream = new GZIPInputStream(in);
 161  4008
             decompressedData = new byte[uncompressedLength];
 162  4008
             int totalRead = 0;
 163  8016
             while (totalRead < uncompressedLength) {
 164  4008
                 int remaining = decompressedData.length - totalRead;
 165  4008
                 int bytesRead = stream.read(decompressedData, totalRead, remaining);
 166  4008
                 if (bytesRead <= 0) {
 167  0
                     throw new EOFException("Too little data was found.");
 168  
                 }
 169  4008
                 totalRead += bytesRead;
 170  4008
             }
 171  4008
         }
 172  
         
 173  
         public void skipTo(byte[] key) throws IOException {
 174  20
             TermSlot slot = vocabulary.get(key);
 175  20
             if (slot == null) {
 176  0
                 done = true;
 177  
             } else {
 178  20
                 invalidateBlock();
 179  20
                 block = readVocabularyBlock(slot.begin);
 180  20
                 keyIndex = 0;
 181  36
                 while (keyIndex < block.keys.length) {
 182  28
                     byte[] blockKey = Utility.makeBytes(block.keys[keyIndex]);
 183  28
                     if (Utility.compare(key, blockKey) <= 0) {
 184  12
                         loadIndex();
 185  12
                         return;
 186  
                     }
 187  16
                     keyIndex++;
 188  16
                 }
 189  8
                 done = true;
 190  
             }
 191  8
         }
 192  
 
 193  
         /**
 194  
          * Returns true if no more keys remain to be read.
 195  
          */
 196  
         public boolean isDone() {
 197  32
             return done;
 198  
         }
 199  
         
 200  
         /**
 201  
          * Returns the byte offset of the end of this block.
 202  
          */
 203  
         public long getFilePosition() {
 204  0
             return block.getBlockEnd();
 205  
         }
 206  
         
 207  
         /**
 208  
          * Returns the key associated with the current inverted list.
 209  
          */
 210  
         public String getKey() {
 211  32
             return key;
 212  
         }
 213  
         
 214  
         /**
 215  
          * Returns the value as a buffered stream.
 216  
          */
 217  
         public DataStream getValueStream() throws IOException {
 218  8044
             if (isCompressed) {
 219  
                 // Lazy decompression allows fast scans over the key space
 220  
                 // of a table without decompressing all the values.
 221  4008
                 if (decompressedData == null) {
 222  4008
                     decompressBlock();
 223  
                 }
 224  
 
 225  4008
                 int start = 0;
 226  4008
                 int end = (int) (decompressedData.length - block.getUncompressedEndOffset(keyIndex));
 227  4008
                 if (keyIndex > 0) {
 228  2000
                     start = (int) (decompressedData.length - block.getUncompressedEndOffset(keyIndex - 1));
 229  
                 }
 230  4008
                 int length = end - start;
 231  
                 
 232  4008
                 return new MemoryDataStream(decompressedData, start, length);
 233  
             } else {
 234  4036
                 return blockStream(this);
 235  
             }
 236  
         }
 237  
         
 238  
         /**
 239  
          * Returns the length of the value, in bytes.
 240  
          */
 241  
         public long getValueLength() throws IOException {
 242  0
             return getValueStream().length();
 243  
         }
 244  
 
 245  
         /**
 246  
          * Returns the value as a string.
 247  
          */
 248  
         
 249  
         public String getValueString() throws IOException {
 250  8012
             DataStream stream = getValueStream();
 251  8012
             assert stream.length() < Integer.MAX_VALUE;
 252  8012
             byte[] data = new byte[(int) stream.length()];
 253  8012
             stream.readFully(data);
 254  8012
             return Utility.makeString(data);
 255  
         }
 256  
         
 257  
         /**
 258  
          * Returns the byte offset
 259  
          * of the beginning of the current inverted list,
 260  
          * relative to the start of the whole inverted file.
 261  
          */
 262  
         public long getValueStart() {
 263  24
             return block.getListStart(keyIndex);
 264  
         }
 265  
         
 266  
         public long getDataStart() {
 267  4036
             if (isCompressed) return -1;
 268  4036
             return block.getListStart(keyIndex);
 269  
         }
 270  
         
 271  
         public long getDataEnd() {
 272  4036
             if (isCompressed) return -1;
 273  4036
             return block.getListEnd(keyIndex);
 274  
         }
 275  
 
 276  
         /**
 277  
          * Returns the byte offset
 278  
          * of the end of the current inverted list,
 279  
          * relative to the start of the whole inverted file.
 280  
          */
 281  
         public long getValueEnd() {
 282  24
             return block.getListEnd(keyIndex);
 283  
         }
 284  
 
 285  
         /**
 286  
          * Advances to the next key in the index.
 287  
          * 
 288  
          * @return true if the advance was successful, false if no more keys remain.
 289  
          * @throws java.io.IOException
 290  
          */
 291  
         public boolean nextKey() throws IOException {
 292  24
             if (block.hasMore(keyIndex)) {
 293  8
                 keyIndex++;
 294  16
             } else if (block.getBlockEnd() >= IndexReader.this.vocabularyOffset) {
 295  16
                 invalidateBlock();
 296  16
                 done = true;
 297  16
                 return false;
 298  
             } else {
 299  0
                 invalidateBlock();
 300  0
                 block = readVocabularyBlock(block.getBlockEnd());
 301  0
                 keyIndex = 0;
 302  
             }
 303  
 
 304  8
             loadIndex();
 305  8
             return true;
 306  
         }
 307  
     }
 308  
 
 309  
     /**
 310  
      * Opens an index found in the at pathname.
 311  
      * 
 312  
      * @param pathname Filename of the index to open.
 313  
      * @throws FileNotFoundException
 314  
      * @throws IOException
 315  
      */
 316  88
     public IndexReader(String pathname) throws FileNotFoundException, IOException {
 317  88
         input = new RandomAccessFile(pathname, "r");
 318  
 
 319  
         // Seek to the end of the file
 320  88
         long length = input.length();
 321  88
         footerOffset = length - 2*Integer.SIZE/8 - 3*Long.SIZE/8 - 1; 
 322  88
         input.seek(footerOffset);
 323  
         
 324  
         // Now, read metadata values:
 325  88
         vocabularyOffset = input.readLong();
 326  88
         manifestOffset = input.readLong();
 327  88
         blockSize = input.readInt();
 328  88
         vocabGroup = input.readInt();
 329  88
         isCompressed = input.readBoolean();
 330  88
         long magicNumber = input.readLong();
 331  
         
 332  88
         if (magicNumber != IndexWriter.MAGIC_NUMBER) {
 333  0
             throw new IOException("This does not appear to be an index file (wrong magic number)");
 334  
         }
 335  88
         long invertedListLength = vocabularyOffset;
 336  88
         long vocabularyLength = manifestOffset - vocabularyOffset;
 337  
         
 338  88
         input.seek(vocabularyOffset);
 339  88
         vocabulary = new VocabularyReader(input, invertedListLength, vocabularyLength);
 340  
         
 341  88
         input.seek(manifestOffset);
 342  88
         byte[] xmlData = new byte[(int) (footerOffset - manifestOffset)];
 343  88
         input.read(xmlData);
 344  88
         manifest = new Parameters(xmlData);
 345  88
     }
 346  
 
 347  
     /**
 348  
      * Returns true if the file specified by this pathname was probably written by IndexWriter.
 349  
      * If this method returns false, the file is definitely not readable by IndexReader.
 350  
      * 
 351  
      * @param pathname
 352  
      * @return
 353  
      * @throws java.io.FileNotFoundException
 354  
      * @throws java.io.IOException
 355  
      */
 356  
     public static boolean isIndexFile(String pathname) throws FileNotFoundException, IOException {
 357  56
         RandomAccessFile f = new RandomAccessFile(pathname, "r");
 358  56
         long length = f.length();
 359  56
         long magicNumber = 0;
 360  
         
 361  56
         if (length > Long.SIZE/8) {
 362  56
             f.seek(length-Long.SIZE/8);
 363  56
             magicNumber = f.readLong();
 364  
         }
 365  56
         f.close();
 366  
         
 367  56
         boolean result = (magicNumber == IndexWriter.MAGIC_NUMBER);
 368  56
         return result;
 369  
     }
 370  
     
 371  
     /**
 372  
      * Identical to the {@link #IndexReader(String) other constructor}, except this
 373  
      * one takes a File object instead of a string as the parameter.
 374  
      * 
 375  
      * @param pathname
 376  
      * @throws java.io.FileNotFoundException
 377  
      * @throws java.io.IOException
 378  
      */
 379  
     public IndexReader(File pathname) throws FileNotFoundException, IOException {
 380  0
         this(pathname.toString());
 381  0
     }
 382  
 
 383  
     /**
 384  
      * Returns the vocabulary structure for this IndexReader.  Note that the vocabulary
 385  
      * contains only the first key in each block.
 386  
      */
 387  
     public VocabularyReader getVocabulary() {
 388  0
         return vocabulary;
 389  
     }
 390  
     
 391  
     /**
 392  
      * Returns an iterator pointing to the very first key in the index.
 393  
      * This is typically used for iterating through the entire index,
 394  
      * which might be useful for testing and debugging tools, but probably
 395  
      * not for traditional document retrieval.
 396  
      */
 397  
     public Iterator getIterator() throws IOException {
 398  28
         VocabularyBlock block = readVocabularyBlock(0);
 399  28
         Iterator result = new Iterator(block, 0);
 400  28
         result.loadIndex();
 401  28
         return result;
 402  
     }
 403  
 
 404  
     /**
 405  
      * Returns an iterator pointing at a specific key.  Returns
 406  
      * null if the key is not found in the index.
 407  
      */
 408  
     public Iterator getIterator(String key) throws IOException {
 409  
         // read from offset to offset in the vocab structure (right?)
 410  8044
         VocabularyReader.TermSlot slot = vocabulary.get(key);
 411  
 
 412  8044
         if (slot == null) {
 413  0
             return null;
 414  
         }
 415  8044
         VocabularyBlock block = readVocabularyBlock(slot.begin);
 416  8044
         int index = block.findIndex(key);
 417  
 
 418  8044
         if (index >= 0) {
 419  8044
             return new Iterator(block, index);
 420  
         }
 421  0
         return null;
 422  
     }
 423  
     
 424  
     /**
 425  
      * Gets the value stored in the index associated with this key.
 426  
      * @param key
 427  
      * @return The index value for this key, or null if there is no such value.
 428  
      * @throws java.io.IOException
 429  
      */
 430  
     public String getValueString(String key) throws IOException {
 431  8008
         Iterator iter = getIterator(key);
 432  
         
 433  8008
         if (iter == null) {
 434  0
             return null;
 435  
         }
 436  8008
         return iter.getValueString();
 437  
     }
 438  
     
 439  
     /**
 440  
      * Gets the value stored in the index associated with this key.
 441  
      * 
 442  
      * @param key
 443  
      * @return The index value for this key, or null if there is no such value.
 444  
      * @throws java.io.IOException
 445  
      */
 446  
     
 447  
     public DataStream getValueStream(String key) throws IOException {
 448  0
         Iterator iter = getIterator(key);
 449  
         
 450  0
         if (iter == null) {
 451  0
             return null;
 452  
         }
 453  0
         return iter.getValueStream();
 454  
     }
 455  
     
 456  
     /**
 457  
      * Returns a Parameters object that contains metadata about
 458  
      * the contents of the index.  This is the place to store important
 459  
      * data about the index contents, like what stemmer was used or the
 460  
      * total number of terms in the collection.
 461  
      */
 462  
     public Parameters getManifest() {
 463  48
         return manifest;
 464  
     }
 465  
 
 466  
     /**
 467  
      * Like the other blockStream variant, but this one uses
 468  
      * the current file location as the starting offset.
 469  
      */
 470  
     public DataStream blockStream(long len) throws IOException {
 471  0
         return blockStream(input.getFilePointer(), len);
 472  
     }
 473  
 
 474  
     /**
 475  
      * This convenience method returns a DataStream for
 476  
      * a region of an inverted file.
 477  
      */
 478  
     public DataStream blockStream(long offset, long length) throws IOException {
 479  8092
         long fileLength = input.length();
 480  8092
         assert offset <= fileLength;
 481  8092
         length = Math.min(fileLength - offset, length);
 482  
 
 483  8092
         return new BufferedFileDataStream(input, offset, length + offset);
 484  
     }
 485  
 
 486  
     /**
 487  
      * This convenience method returns a DataStream for
 488  
      * the region of the inverted file pointed to by the iterator.
 489  
      */
 490  
     public DataStream blockStream(Iterator iter) throws IOException {
 491  4036
         return new BufferedFileDataStream(input, iter.getDataStart(), iter.getDataEnd());
 492  
     }
 493  
 
 494  
     /**
 495  
      * Returns the file object for the inverted file.  This is useful for actually
 496  
      * reading the data from a byte range returned by the iterator.
 497  
      */
 498  
     public RandomAccessFile getInput() {
 499  48
         return input;
 500  
     }
 501  
 
 502  
     /**
 503  
      * Closes all files associated with the IndexReader.
 504  
      */
 505  
     public void close() throws IOException {
 506  40
         input.close();
 507  40
     }
 508  
     
 509  
     /**
 510  
      * Reads vocabulary data from a block of the inverted file.
 511  
      * 
 512  
      * The inverted file is structured into blocks which contain a little
 513  
      * bit of compressed vocabulary information followed by inverted list
 514  
      * information.  The inverted list information is application specific,
 515  
      * and is not handled by this class.  This method reads in the vocabulary
 516  
      * information for a particular block and returns it in a VocabularyBlock
 517  
      * object, which allows for quick navigation to a particular key.
 518  
      * 
 519  
      * The C++ version of this is much more efficient, because it makes no
 520  
      * attempt to decode the whole block of information.  However, for the 
 521  
      * Java version I thought that simplicity (and testability) was more
 522  
      * important than speed.  The VocabularyBlock structure helps make iteration
 523  
      * over the entire inverted file possible.
 524  
      */
 525  
     VocabularyBlock readVocabularyBlock(long slotBegin) throws IOException {
 526  
         // read in a block of data here
 527  8092
         DataStream blockStream = blockStream(slotBegin, blockSize);
 528  
 
 529  
         // now we decode everything from the stream
 530  8092
         long endBlock = blockStream.readLong();
 531  8092
         long wordCount = blockStream.readLong();
 532  
 
 533  8092
         int prefixLength = blockStream.readUnsignedByte();
 534  8092
         byte[] prefixBytes = new byte[prefixLength];
 535  8092
         blockStream.readFully(prefixBytes);
 536  
 
 537  8092
         int wordBlockCount = (int) Math.ceil((double) wordCount / vocabGroup);
 538  8092
         short[] wordBlockEnds = new short[wordBlockCount];
 539  8092
         String[] words = new String[(int) wordCount];
 540  8092
         long[] invertedListEnds = new long[(int) wordCount];
 541  
 
 542  16184
         for (int i = 0; i < wordBlockCount; i++) {
 543  8092
             wordBlockEnds[i] = blockStream.readShort();
 544  
         }
 545  
 
 546  24244
         for (int i = 0; i < wordCount; i++) {
 547  16152
             invertedListEnds[i] = blockStream.readShort();
 548  
         }
 549  
 
 550  16184
         for (int i = 0; i < wordCount; i += vocabGroup) {
 551  8092
             int suffixLength = blockStream.readUnsignedByte();
 552  8092
             int wordLength = suffixLength + prefixLength;
 553  8092
             byte[] wordBytes = new byte[wordLength];
 554  8092
             byte[] lastWordBytes = wordBytes;
 555  8092
             System.arraycopy(prefixBytes, 0, wordBytes, 0, prefixBytes.length);
 556  8092
             int end = (int) Math.min(wordCount, i + vocabGroup);
 557  
 
 558  8092
             blockStream.readFully(wordBytes, prefixBytes.length,
 559  
                                   wordBytes.length - prefixBytes.length);
 560  8092
             words[i] = Utility.makeString(wordBytes);
 561  
 
 562  16152
             for (int j = i + 1; j < end; j++) {
 563  8060
                 int common = blockStream.readUnsignedByte();
 564  8060
                 wordLength = blockStream.readUnsignedByte();
 565  8060
                 assert wordLength >= 0 : "Negative word length: " + wordLength + " " + j;
 566  8060
                 assert wordLength >= common : "word length too small: " + wordLength + " " + common + " " + j;
 567  8060
                 wordBytes = new byte[wordLength];
 568  
 
 569  
                 try {
 570  8060
                     System.arraycopy(lastWordBytes, 0, wordBytes, 0, common);
 571  8060
                     blockStream.readFully(wordBytes, common, wordLength - common);
 572  0
                 } catch (ArrayIndexOutOfBoundsException e) {
 573  0
                     System.out.println("wl: " + wordLength + " c: " + common);
 574  0
                     throw e;
 575  8060
                 }
 576  8060
                 words[j] = Utility.makeString(wordBytes);
 577  8060
                 lastWordBytes = wordBytes;
 578  
             }
 579  
         }
 580  
 
 581  8092
         int suffixBytes = wordBlockEnds[wordBlockEnds.length - 1];
 582  8092
         long headerLength = 8 + // word count
 583  
                 8 + // block end
 584  
                 1 + prefixLength + // word prefix bytes
 585  
                 2 * wordBlockCount + // word lengths 
 586  
                 2 * wordCount + // inverted list endings
 587  
                 suffixBytes;          // suffix storage 
 588  
 
 589  8092
         long startInvertedLists = slotBegin + headerLength;
 590  8092
         return new VocabularyBlock(slotBegin, startInvertedLists, endBlock, invertedListEnds, words);
 591  
     }
 592  
 }