Coverage Report - org.galagosearch.core.index.IndexWriter
 
Classes in this File Line Coverage Branch Coverage Complexity
IndexWriter
88%
110/125
59%
48/82
0
IndexWriter$CompressedListData
100%
20/20
100%
4/4
0
IndexWriter$ListData
N/A
N/A
0
IndexWriter$UncompressedListData
100%
11/11
100%
4/4
0
IndexWriter$VocabularyHeader
100%
55/55
75%
18/24
0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 package org.galagosearch.core.index;
 3  
 
 4  
 import java.io.BufferedOutputStream;
 5  
 import java.io.ByteArrayOutputStream;
 6  
 import java.io.DataOutputStream;
 7  
 import java.io.File;
 8  
 import java.io.FileNotFoundException;
 9  
 import java.io.FileOutputStream;
 10  
 import java.io.IOException;
 11  
 import java.io.OutputStream;
 12  
 import java.util.ArrayList;
 13  
 import java.util.List;
 14  
 import java.util.zip.GZIPOutputStream;
 15  
 import org.galagosearch.tupleflow.Counter;
 16  
 import org.galagosearch.tupleflow.Parameters;
 17  
 import org.galagosearch.tupleflow.TupleFlowParameters;
 18  
 import org.galagosearch.tupleflow.Utility;
 19  
 
 20  
 /**
 21  
  * This class writes index files, which are used for most Galago indexes.
 22  
  * 
 23  
  * An index is a mapping between a key and a value, much like a TreeMap.  The keys are
 24  
  * sorted to allow iteration over the whole file.  Keys are stored using prefix
 25  
  * compression to save space.  The structure is designed for fast random access on disk.
 26  
  * 
 27  
  * For indexes, we assume that the data in each value is already compressed, so IndexWriter
 28  
  * does no additional compression.  However, if the isCompressed flag is set, IndexWriter
 29  
  * will compress the value data.  This is convenient for storing documents in an index.
 30  
  * 
 31  
  * Keys cannot be longer than 256 bytes, and they must be added in sorted order.
 32  
  * 
 33  
  * @author trevor
 34  
  */
 35  4
 public class IndexWriter {
 36  
     public static final long MAGIC_NUMBER = 0x1a2b3c4d5e6f7a8bL;
 37  
 
 38  
     DataOutputStream output;
 39  
     final VocabularyWriter vocabulary;
 40  
     Parameters manifest;
 41  
     ArrayList<IndexElement> lists;
 42  
 
 43  88
     int blockSize = 32768;
 44  88
     int vocabGroup = 16;
 45  88
     long filePosition = 0;
 46  88
     long listBytes = 0;
 47  
     // compression isn't supported yet
 48  88
     boolean isCompressed = false;
 49  
 
 50  88
     Counter recordsWritten = null;
 51  88
     Counter blocksWritten = null;
 52  
 
 53  
     /**
 54  
      * Creates a new instance of IndexWriter
 55  
      */
 56  
     public IndexWriter(String outputFilename, Parameters parameters)
 57  88
             throws FileNotFoundException, IOException {
 58  
         // Create the parent directory:
 59  88
         new File(outputFilename).getParentFile().mkdirs();
 60  
 
 61  88
         blockSize = (int) parameters.get("blockSize", 32768);
 62  88
         isCompressed = parameters.get("isCompressed", false);
 63  88
         output = new DataOutputStream(new BufferedOutputStream(
 64  
                                       new FileOutputStream(outputFilename)));
 65  88
         vocabulary = new VocabularyWriter();
 66  88
         manifest = new Parameters();
 67  88
         manifest.copy(parameters);
 68  88
         lists = new ArrayList<IndexElement>();
 69  88
     }
 70  
     
 71  
     public IndexWriter(String outputFilename)
 72  0
             throws FileNotFoundException, IOException {
 73  0
         output = new DataOutputStream(new BufferedOutputStream(
 74  
                                       new FileOutputStream(outputFilename)));
 75  0
         vocabulary = new VocabularyWriter();
 76  0
         manifest = new Parameters();
 77  0
         lists = new ArrayList<IndexElement>();
 78  0
     }
 79  
 
 80  
     public IndexWriter(TupleFlowParameters parameters) throws FileNotFoundException, IOException {
 81  56
         this(parameters.getXML().get("filename"), parameters.getXML());
 82  56
         recordsWritten = parameters.getCounter("Records Written");
 83  56
         blocksWritten = parameters.getCounter("Blocks Written");
 84  56
     }
 85  
     
 86  
     /**
 87  
      * Returns the current copy of the manifest, which will be stored in
 88  
      * the completed index file.  This data is not written until close() is called.
 89  
      */
 90  
     
 91  
     public Parameters getManifest() {
 92  112
         return manifest;
 93  
     }
 94  
 
 95  
     /** 
 96  
      * Gives a conservative estimate of the buffered size of the data,
 97  
      * excluding the most recent inverted list.
 98  
      * Does not include savings due to key overlap compression.
 99  
      */
 100  
     public long bufferedSize() {
 101  12208
         long extra = 8 + // end of block
 102  
                 8 + // key count
 103  
                 1; // overlap length
 104  
 
 105  12208
         return listBytes + extra;
 106  
     }
 107  
 
 108  
     public void updateBufferedSize(IndexElement list) {
 109  8128
         long extra = 1 + // byte for key length
 110  
                 1;  // byte for overlap with previous key
 111  
 
 112  8128
         listBytes += invertedListLength(list);
 113  8128
         listBytes += extra;
 114  8128
     }
 115  
 
 116  
     private long invertedListLength(IndexElement list) {
 117  16256
         long listLength = 0;
 118  
 
 119  16256
         listLength += list.key().length;
 120  16256
         listLength += 2; // key length bytes
 121  16256
         listLength += 2; // file offset bytes
 122  
 
 123  16256
         listLength += list.dataLength();
 124  16256
         return listLength;
 125  
     }
 126  
 
 127  
     /**
 128  
      * Flush all lists out to disk.
 129  
      */
 130  
     public void flush() throws IOException {
 131  
         // if there aren't any lists, quit now
 132  4080
         if (lists.size() == 0) {
 133  0
             return;        // write everything out
 134  
         }
 135  4080
         writeBlock(lists, bufferedSize());
 136  
 
 137  
         // remove all of the current data
 138  4080
         lists = new ArrayList<IndexElement>();
 139  4080
         listBytes = 0;
 140  4080
     }
 141  
 
 142  
     public long getBlockSize() {
 143  0
         return blockSize;
 144  
     }
 145  
 
 146  
     private boolean lessThanOrEqualTo(byte[] one, byte[] two) {
 147  4048
         boolean isOneShorterOrEqualLength = (one.length <= two.length);
 148  4048
         int commonLength = Math.min(one.length, two.length);
 149  
         
 150  20048
         for (int i = 0; i < commonLength; i++) {
 151  20048
             int a = one[i];
 152  20048
             int b = two[i];
 153  20048
             a &= 0xFF;
 154  20048
             b &= 0xFF;
 155  20048
             if (a < b) {
 156  4048
                 return true;
 157  
             }
 158  16000
             if (b < a) {
 159  0
                 return false;
 160  
             }
 161  
         }
 162  
         
 163  0
         return isOneShorterOrEqualLength;
 164  
     }
 165  
 
 166  
     /**
 167  
      * Returns true if the lists are sorted in ascending order by
 168  
      * key.
 169  
      * 
 170  
      * @param blockLists
 171  
      */
 172  
     public boolean wordsInOrder(List<IndexElement> blockLists) {
 173  8128
         for (int i = 0; i < blockLists.size() - 1; i++) {
 174  4048
             boolean result = lessThanOrEqualTo(blockLists.get(i).key(),
 175  
                                                blockLists.get(i + 1).key());
 176  4048
             if (result == false) {
 177  0
                 return false;
 178  
             }
 179  
         }
 180  4080
         return true;
 181  
     }
 182  
     
 183  
     interface ListData {
 184  
         long length();
 185  
         long encodedLength();
 186  
         void write(OutputStream stream) throws IOException;
 187  
     }
 188  
     
 189  
     class UncompressedListData implements ListData {
 190  
         List<IndexElement> blockLists;
 191  
         
 192  2064
         UncompressedListData(List<IndexElement> blockLists) {
 193  2064
             this.blockLists = blockLists;
 194  2064
         }
 195  
         
 196  
         public long length() {
 197  4128
             long totalLength = 0;
 198  4128
             for (IndexElement e : blockLists) {
 199  8224
                 totalLength += e.dataLength();
 200  
             }
 201  4128
             return totalLength;
 202  
         }
 203  
         
 204  
         public long encodedLength() {
 205  2064
             return length();
 206  
         }
 207  
         
 208  
         public void write(OutputStream stream) throws IOException {
 209  2064
             for (IndexElement e : blockLists) {
 210  4112
                 e.write(stream);
 211  
             }
 212  2064
         }
 213  
     }
 214  
     
 215  
     class CompressedListData implements ListData {
 216  
         List<IndexElement> blockLists;
 217  
         byte[] compressedData;
 218  
         
 219  2016
         CompressedListData(List<IndexElement> blockLists) throws IOException {
 220  2016
             this.blockLists = blockLists;
 221  2016
             compress();
 222  2016
         }
 223  
         
 224  
         void compress() throws IOException {
 225  2016
             ByteArrayOutputStream stream = new ByteArrayOutputStream();
 226  
             
 227  
             // write the uncompressed length here
 228  2016
             DataOutputStream s = new DataOutputStream(stream);
 229  2016
             s.writeInt((int)length());
 230  
             
 231  2016
             GZIPOutputStream gzipStream = new GZIPOutputStream(stream);
 232  2016
             for (IndexElement element : blockLists) {
 233  4016
                 element.write(gzipStream);
 234  
             }
 235  
             
 236  2016
             gzipStream.close();
 237  2016
             compressedData = stream.toByteArray();
 238  2016
         }
 239  
         
 240  
         public long length() {
 241  4032
             long totalLength = 0;
 242  4032
             for (IndexElement e : blockLists) {
 243  8032
                 totalLength += e.dataLength();
 244  
             }
 245  4032
             return totalLength;
 246  
         }
 247  
         
 248  
         public long encodedLength() {
 249  2016
             return compressedData.length;
 250  
         }
 251  
         
 252  
         public void write(OutputStream stream) throws IOException {
 253  2016
             stream.write(compressedData);
 254  2016
         }
 255  
     }
 256  
     
 257  4
     static class VocabularyHeader {
 258  
         ArrayList<byte[]> keys;
 259  
         short[] ends;
 260  4080
         ByteArrayOutputStream wordByteStream = new ByteArrayOutputStream();
 261  4080
         DataOutputStream vocabOutput = new DataOutputStream(wordByteStream);
 262  
         int blockOverlap;
 263  
         int groupCount;
 264  
         int vocabGroupSize;
 265  
         
 266  4080
         VocabularyHeader(List<IndexElement> blockLists, int vocabGroupSize) {
 267  4080
             keys = new ArrayList<byte[]>();
 268  4080
             this.vocabGroupSize = vocabGroupSize;
 269  4080
             for (IndexElement list : blockLists) {
 270  8128
                 keys.add(list.key());
 271  
             }
 272  4080
         }
 273  
 
 274  
         int prefixOverlap(byte[] firstTerm, byte[] lastTerm, int start) {
 275  8128
             int maximum = Math.min(firstTerm.length - start, lastTerm.length - start);
 276  8128
             maximum = Math.min(Byte.MAX_VALUE - 1, maximum);
 277  
 
 278  40380
             for (int i = start; i < maximum; i++) {
 279  40348
                 if (firstTerm[i] != lastTerm[i]) {
 280  8096
                     return i - start;
 281  
                 }
 282  
             }
 283  
 
 284  32
             return maximum;
 285  
         }
 286  
 
 287  
         int prefixOverlap(byte[] firstTerm, byte[] secondTerm) {
 288  8128
             return prefixOverlap(firstTerm, secondTerm, 0);
 289  
         }
 290  
         
 291  
         void calculateBlockPrefix() {
 292  
             // vocabulary group (prefix sharing)
 293  4080
             byte[] firstWord = keys.get(0);
 294  4080
             byte[] lastWord = keys.get(keys.size() - 1);
 295  
 
 296  
             // determine how many prefix characters are in common among all terms in this block
 297  4080
             blockOverlap = prefixOverlap(firstWord, lastWord);
 298  4080
         }
 299  
 
 300  
         void build() throws IOException {
 301  4080
             calculateBlockPrefix();
 302  
 
 303  4080
             groupCount = (int) Math.ceil((float) keys.size() / vocabGroupSize);
 304  4080
             ends = new short[groupCount];
 305  
 
 306  
             // write key data: outer loop is for each vocabulary group
 307  8160
             for (int i = 0; i < keys.size(); i += vocabGroupSize) {
 308  4080
                 byte[] word = keys.get(i);
 309  4080
                 byte[] lastWord = word;
 310  
                 assert word.length >= blockOverlap :
 311  
                     "Overlap: " + blockOverlap + " too small for " + word.length +
 312  4080
                     " (" + Utility.makeString(word) + ")";
 313  4080
                 assert word.length < 256;
 314  
 
 315  
                 // this is the first word in the group
 316  4080
                 vocabOutput.writeByte(word.length - blockOverlap);
 317  4080
                 vocabOutput.write(word, blockOverlap, word.length - blockOverlap);
 318  4080
                 int end = Math.min(keys.size(), i + vocabGroupSize);
 319  
 
 320  
                 // inner loop is for the remaining terms in each vocabulary group
 321  8128
                 for (int j = i + 1; j < end; j++) {
 322  4048
                     assert word.length < 256;
 323  
 
 324  
                     // write only new data (reference the previous key for prefix compression)
 325  4048
                     word = keys.get(j);
 326  4048
                     int common = this.prefixOverlap(lastWord, word);
 327  4048
                     vocabOutput.writeByte((byte) common);
 328  4048
                     vocabOutput.writeByte(word.length);
 329  4048
                     vocabOutput.write(word, common, word.length - common);
 330  4048
                     lastWord = word;
 331  
                 }
 332  
 
 333  4080
                 ends[i / vocabGroupSize] = (short) vocabOutput.size();
 334  
             }
 335  4080
             vocabOutput.close();
 336  4080
         }
 337  
 
 338  
         int getBlockOverlap() {
 339  4080
             return blockOverlap;
 340  
         }
 341  
         
 342  
         int getGroupCount() {
 343  4080
             return groupCount;
 344  
         }
 345  
 
 346  
         int getKeyCount() {
 347  8160
             return keys.size();
 348  
         }
 349  
         
 350  
         int getKeyDataLength() {
 351  4080
             return wordByteStream.size();
 352  
         }
 353  
         
 354  
         byte[] getFirstWord() {
 355  8160
             return keys.get(0);
 356  
         }
 357  
         
 358  
         void writeKeyHeader(DataOutputStream output) throws IOException {
 359  
             // write key count
 360  4080
             output.writeLong(getKeyCount());
 361  
 
 362  
             // write key prefix
 363  4080
             output.writeByte((byte) blockOverlap);
 364  4080
             output.write(getFirstWord(), 0, blockOverlap);
 365  
 
 366  
             // write key block lengths
 367  8160
             for (short wordBlockEnd : ends) {
 368  4080
                 output.writeShort(wordBlockEnd);
 369  
             }
 370  4080
         }
 371  
         
 372  
         void writeKeyData(DataOutputStream output) throws IOException {
 373  4080
             output.write(wordByteStream.toByteArray());
 374  4080
         }
 375  
     }
 376  
 
 377  
     public void writeBlock(List<IndexElement> blockLists, long length) throws IOException {
 378  4080
         assert length <= blockSize || blockLists.size() == 1;
 379  4080
         assert wordsInOrder(blockLists);
 380  
 
 381  4080
         if (blockLists.size() == 0) {
 382  0
             return;
 383  
         }
 384  
         
 385  4080
         VocabularyHeader vocabHeader = new VocabularyHeader(blockLists, vocabGroup);
 386  4080
         vocabHeader.build();
 387  
 
 388  
         // -- compute the length of the block --
 389  
         ListData listData;
 390  4080
         if (isCompressed) {
 391  2016
             listData = new CompressedListData(blockLists);
 392  
         } else {
 393  2064
             listData = new UncompressedListData(blockLists);
 394  
         }
 395  
         
 396  4080
         long headerBytes = 8 + // key count
 397  
                 8 + // block end
 398  
                 1 + vocabHeader.getBlockOverlap() + // key prefix bytes
 399  
                 2 * vocabHeader.getGroupCount() + // key lengths 
 400  
                 2 * vocabHeader.getKeyCount() + // inverted list endings
 401  
                 vocabHeader.getKeyDataLength();    // key data 
 402  
 
 403  4080
         long startPosition = filePosition;
 404  4080
         long endPosition = filePosition + headerBytes + listData.encodedLength();
 405  4080
         assert endPosition <= startPosition + length || isCompressed;
 406  4080
         assert endPosition > startPosition || isCompressed;
 407  4080
         assert filePosition >= Integer.MAX_VALUE || filePosition == output.size();
 408  
 
 409  
         // -- begin writing the block -- 
 410  4080
         vocabulary.add(vocabHeader.getFirstWord(), startPosition);
 411  
 
 412  
         // write block data end
 413  4080
         output.writeLong(endPosition);
 414  4080
         vocabHeader.writeKeyHeader(output);
 415  
 
 416  
         // write inverted list end positions
 417  4080
         long totalListData = listData.length();
 418  4080
         long invertedListBytes = 0;
 419  4080
         for (IndexElement list : blockLists) {
 420  8128
             invertedListBytes += list.dataLength();
 421  8128
             assert totalListData - invertedListBytes < Short.MAX_VALUE;
 422  8128
             assert totalListData >= invertedListBytes;
 423  8128
             output.writeShort((short) (totalListData - invertedListBytes));
 424  
         }
 425  
 
 426  
         // key data
 427  4080
         vocabHeader.writeKeyData(output);
 428  
 
 429  
         // write inverted list binary data
 430  4080
         listData.write(output);
 431  
 
 432  4080
         filePosition = endPosition;
 433  4080
         assert filePosition >= Integer.MAX_VALUE || filePosition == output.size();
 434  4080
         assert endPosition - startPosition <= blockSize || blockLists.size() == 1 || isCompressed;
 435  
 
 436  4080
         if (blocksWritten != null) {
 437  0
             blocksWritten.increment();
 438  
         }
 439  4080
     }
 440  
 
 441  
     private boolean needsFlush(IndexElement list) {
 442  8128
         long listExtra = 1 + // byte for key length
 443  
                 1;  // byte for overlap with previous key
 444  
 
 445  8128
         long bufferedBytes = bufferedSize() +
 446  
                 invertedListLength(list) +
 447  
                 listExtra;
 448  
 
 449  8128
         return bufferedBytes >= blockSize;
 450  
     }
 451  
 
 452  
     public void add(IndexElement list) throws IOException {
 453  8128
         if (list.key().length >= 256 || list.key().length >= blockSize / 4) {
 454  0
             throw new IOException("Key is too long.");
 455  
         }
 456  8128
         if (needsFlush(list)) {
 457  3992
             flush();
 458  
         }
 459  8128
         lists.add(list);
 460  8128
         updateBufferedSize(list);
 461  8128
         if (recordsWritten != null) {
 462  0
             recordsWritten.increment();
 463  
         }
 464  8128
     }
 465  
 
 466  
     public void close() throws IOException {
 467  88
         flush();
 468  
         
 469  88
         byte[] vocabularyData = vocabulary.data();
 470  88
         byte[] xmlData = manifest.toString().getBytes("UTF-8");
 471  88
         long vocabularyOffset = filePosition;
 472  88
         long manifestOffset = filePosition + vocabularyData.length;
 473  
         
 474  88
         output.write(vocabularyData);
 475  88
         output.write(xmlData);
 476  
         
 477  88
         output.writeLong(vocabularyOffset);
 478  88
         output.writeLong(manifestOffset);
 479  88
         output.writeInt(blockSize);
 480  88
         output.writeInt(vocabGroup);
 481  88
         output.writeBoolean(isCompressed);
 482  88
         output.writeLong(MAGIC_NUMBER);
 483  
         
 484  88
         output.close();
 485  88
     }
 486  
 }