View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.index;
3   
4   import java.io.BufferedOutputStream;
5   import java.io.ByteArrayOutputStream;
6   import java.io.DataOutputStream;
7   import java.io.File;
8   import java.io.FileNotFoundException;
9   import java.io.FileOutputStream;
10  import java.io.IOException;
11  import java.io.OutputStream;
12  import java.util.ArrayList;
13  import java.util.List;
14  import java.util.zip.GZIPOutputStream;
15  import org.galagosearch.tupleflow.Counter;
16  import org.galagosearch.tupleflow.Parameters;
17  import org.galagosearch.tupleflow.TupleFlowParameters;
18  import org.galagosearch.tupleflow.Utility;
19  
20  /***
21   * This class writes index files, which are used for most Galago indexes.
22   * 
23   * An index is a mapping between a key and a value, much like a TreeMap.  The keys are
24   * sorted to allow iteration over the whole file.  Keys are stored using prefix
25   * compression to save space.  The structure is designed for fast random access on disk.
26   * 
27   * For indexes, we assume that the data in each value is already compressed, so IndexWriter
28   * does no additional compression.  However, if the isCompressed flag is set, IndexWriter
29   * will compress the value data.  This is convenient for storing documents in an index.
30   * 
31   * Keys cannot be longer than 256 bytes, and they must be added in sorted order.
32   * 
33   * @author trevor
34   */
35  public class IndexWriter {
36      public static final long MAGIC_NUMBER = 0x1a2b3c4d5e6f7a8bL;
37  
38      DataOutputStream output;
39      final VocabularyWriter vocabulary;
40      Parameters manifest;
41      ArrayList<IndexElement> lists;
42  
43      int blockSize = 32768;
44      int vocabGroup = 16;
45      long filePosition = 0;
46      long listBytes = 0;
47      // compression isn't supported yet
48      boolean isCompressed = false;
49  
50      Counter recordsWritten = null;
51      Counter blocksWritten = null;
52  
53      /***
54       * Creates a new instance of IndexWriter
55       */
56      public IndexWriter(String outputFilename, Parameters parameters)
57              throws FileNotFoundException, IOException {
58          // Create the parent directory:
59          new File(outputFilename).getParentFile().mkdirs();
60  
61          blockSize = (int) parameters.get("blockSize", 32768);
62          isCompressed = parameters.get("isCompressed", false);
63          output = new DataOutputStream(new BufferedOutputStream(
64                                        new FileOutputStream(outputFilename)));
65          vocabulary = new VocabularyWriter();
66          manifest = new Parameters();
67          manifest.copy(parameters);
68          lists = new ArrayList<IndexElement>();
69      }
70      
71      public IndexWriter(String outputFilename)
72              throws FileNotFoundException, IOException {
73          output = new DataOutputStream(new BufferedOutputStream(
74                                        new FileOutputStream(outputFilename)));
75          vocabulary = new VocabularyWriter();
76          manifest = new Parameters();
77          lists = new ArrayList<IndexElement>();
78      }
79  
80      public IndexWriter(TupleFlowParameters parameters) throws FileNotFoundException, IOException {
81          this(parameters.getXML().get("filename"), parameters.getXML());
82          recordsWritten = parameters.getCounter("Records Written");
83          blocksWritten = parameters.getCounter("Blocks Written");
84      }
85      
86      /***
87       * Returns the current copy of the manifest, which will be stored in
88       * the completed index file.  This data is not written until close() is called.
89       */
90      
91      public Parameters getManifest() {
92          return manifest;
93      }
94  
95      /*** 
96       * Gives a conservative estimate of the buffered size of the data,
97       * excluding the most recent inverted list.
98       * Does not include savings due to key overlap compression.
99       */
100     public long bufferedSize() {
101         long extra = 8 + // end of block
102                 8 + // key count
103                 1; // overlap length
104 
105         return listBytes + extra;
106     }
107 
108     public void updateBufferedSize(IndexElement list) {
109         long extra = 1 + // byte for key length
110                 1;  // byte for overlap with previous key
111 
112         listBytes += invertedListLength(list);
113         listBytes += extra;
114     }
115 
116     private long invertedListLength(IndexElement list) {
117         long listLength = 0;
118 
119         listLength += list.key().length;
120         listLength += 2; // key length bytes
121         listLength += 2; // file offset bytes
122 
123         listLength += list.dataLength();
124         return listLength;
125     }
126 
127     /***
128      * Flush all lists out to disk.
129      */
130     public void flush() throws IOException {
131         // if there aren't any lists, quit now
132         if (lists.size() == 0) {
133             return;        // write everything out
134         }
135         writeBlock(lists, bufferedSize());
136 
137         // remove all of the current data
138         lists = new ArrayList<IndexElement>();
139         listBytes = 0;
140     }
141 
142     public long getBlockSize() {
143         return blockSize;
144     }
145 
146     private boolean lessThanOrEqualTo(byte[] one, byte[] two) {
147         boolean isOneShorterOrEqualLength = (one.length <= two.length);
148         int commonLength = Math.min(one.length, two.length);
149         
150         for (int i = 0; i < commonLength; i++) {
151             int a = one[i];
152             int b = two[i];
153             a &= 0xFF;
154             b &= 0xFF;
155             if (a < b) {
156                 return true;
157             }
158             if (b < a) {
159                 return false;
160             }
161         }
162         
163         return isOneShorterOrEqualLength;
164     }
165 
166     /***
167      * Returns true if the lists are sorted in ascending order by
168      * key.
169      * 
170      * @param blockLists
171      */
172     public boolean wordsInOrder(List<IndexElement> blockLists) {
173         for (int i = 0; i < blockLists.size() - 1; i++) {
174             boolean result = lessThanOrEqualTo(blockLists.get(i).key(),
175                                                blockLists.get(i + 1).key());
176             if (result == false) {
177                 return false;
178             }
179         }
180         return true;
181     }
182     
183     interface ListData {
184         long length();
185         long encodedLength();
186         void write(OutputStream stream) throws IOException;
187     }
188     
189     class UncompressedListData implements ListData {
190         List<IndexElement> blockLists;
191         
192         UncompressedListData(List<IndexElement> blockLists) {
193             this.blockLists = blockLists;
194         }
195         
196         public long length() {
197             long totalLength = 0;
198             for (IndexElement e : blockLists) {
199                 totalLength += e.dataLength();
200             }
201             return totalLength;
202         }
203         
204         public long encodedLength() {
205             return length();
206         }
207         
208         public void write(OutputStream stream) throws IOException {
209             for (IndexElement e : blockLists) {
210                 e.write(stream);
211             }
212         }
213     }
214     
215     class CompressedListData implements ListData {
216         List<IndexElement> blockLists;
217         byte[] compressedData;
218         
219         CompressedListData(List<IndexElement> blockLists) throws IOException {
220             this.blockLists = blockLists;
221             compress();
222         }
223         
224         void compress() throws IOException {
225             ByteArrayOutputStream stream = new ByteArrayOutputStream();
226             
227             // write the uncompressed length here
228             DataOutputStream s = new DataOutputStream(stream);
229             s.writeInt((int)length());
230             
231             GZIPOutputStream gzipStream = new GZIPOutputStream(stream);
232             for (IndexElement element : blockLists) {
233                 element.write(gzipStream);
234             }
235             
236             gzipStream.close();
237             compressedData = stream.toByteArray();
238         }
239         
240         public long length() {
241             long totalLength = 0;
242             for (IndexElement e : blockLists) {
243                 totalLength += e.dataLength();
244             }
245             return totalLength;
246         }
247         
248         public long encodedLength() {
249             return compressedData.length;
250         }
251         
252         public void write(OutputStream stream) throws IOException {
253             stream.write(compressedData);
254         }
255     }
256     
257     static class VocabularyHeader {
258         ArrayList<byte[]> keys;
259         short[] ends;
260         ByteArrayOutputStream wordByteStream = new ByteArrayOutputStream();
261         DataOutputStream vocabOutput = new DataOutputStream(wordByteStream);
262         int blockOverlap;
263         int groupCount;
264         int vocabGroupSize;
265         
266         VocabularyHeader(List<IndexElement> blockLists, int vocabGroupSize) {
267             keys = new ArrayList<byte[]>();
268             this.vocabGroupSize = vocabGroupSize;
269             for (IndexElement list : blockLists) {
270                 keys.add(list.key());
271             }
272         }
273 
274         int prefixOverlap(byte[] firstTerm, byte[] lastTerm, int start) {
275             int maximum = Math.min(firstTerm.length - start, lastTerm.length - start);
276             maximum = Math.min(Byte.MAX_VALUE - 1, maximum);
277 
278             for (int i = start; i < maximum; i++) {
279                 if (firstTerm[i] != lastTerm[i]) {
280                     return i - start;
281                 }
282             }
283 
284             return maximum;
285         }
286 
287         int prefixOverlap(byte[] firstTerm, byte[] secondTerm) {
288             return prefixOverlap(firstTerm, secondTerm, 0);
289         }
290         
291         void calculateBlockPrefix() {
292             // vocabulary group (prefix sharing)
293             byte[] firstWord = keys.get(0);
294             byte[] lastWord = keys.get(keys.size() - 1);
295 
296             // determine how many prefix characters are in common among all terms in this block
297             blockOverlap = prefixOverlap(firstWord, lastWord);
298         }
299 
300         void build() throws IOException {
301             calculateBlockPrefix();
302 
303             groupCount = (int) Math.ceil((float) keys.size() / vocabGroupSize);
304             ends = new short[groupCount];
305 
306             // write key data: outer loop is for each vocabulary group
307             for (int i = 0; i < keys.size(); i += vocabGroupSize) {
308                 byte[] word = keys.get(i);
309                 byte[] lastWord = word;
310                 assert word.length >= blockOverlap :
311                     "Overlap: " + blockOverlap + " too small for " + word.length +
312                     " (" + Utility.makeString(word) + ")";
313                 assert word.length < 256;
314 
315                 // this is the first word in the group
316                 vocabOutput.writeByte(word.length - blockOverlap);
317                 vocabOutput.write(word, blockOverlap, word.length - blockOverlap);
318                 int end = Math.min(keys.size(), i + vocabGroupSize);
319 
320                 // inner loop is for the remaining terms in each vocabulary group
321                 for (int j = i + 1; j < end; j++) {
322                     assert word.length < 256;
323 
324                     // write only new data (reference the previous key for prefix compression)
325                     word = keys.get(j);
326                     int common = this.prefixOverlap(lastWord, word);
327                     vocabOutput.writeByte((byte) common);
328                     vocabOutput.writeByte(word.length);
329                     vocabOutput.write(word, common, word.length - common);
330                     lastWord = word;
331                 }
332 
333                 ends[i / vocabGroupSize] = (short) vocabOutput.size();
334             }
335             vocabOutput.close();
336         }
337 
338         int getBlockOverlap() {
339             return blockOverlap;
340         }
341         
342         int getGroupCount() {
343             return groupCount;
344         }
345 
346         int getKeyCount() {
347             return keys.size();
348         }
349         
350         int getKeyDataLength() {
351             return wordByteStream.size();
352         }
353         
354         byte[] getFirstWord() {
355             return keys.get(0);
356         }
357         
358         void writeKeyHeader(DataOutputStream output) throws IOException {
359             // write key count
360             output.writeLong(getKeyCount());
361 
362             // write key prefix
363             output.writeByte((byte) blockOverlap);
364             output.write(getFirstWord(), 0, blockOverlap);
365 
366             // write key block lengths
367             for (short wordBlockEnd : ends) {
368                 output.writeShort(wordBlockEnd);
369             }
370         }
371         
372         void writeKeyData(DataOutputStream output) throws IOException {
373             output.write(wordByteStream.toByteArray());
374         }
375     }
376 
377     public void writeBlock(List<IndexElement> blockLists, long length) throws IOException {
378         assert length <= blockSize || blockLists.size() == 1;
379         assert wordsInOrder(blockLists);
380 
381         if (blockLists.size() == 0) {
382             return;
383         }
384         
385         VocabularyHeader vocabHeader = new VocabularyHeader(blockLists, vocabGroup);
386         vocabHeader.build();
387 
388         // -- compute the length of the block --
389         ListData listData;
390         if (isCompressed) {
391             listData = new CompressedListData(blockLists);
392         } else {
393             listData = new UncompressedListData(blockLists);
394         }
395         
396         long headerBytes = 8 + // key count
397                 8 + // block end
398                 1 + vocabHeader.getBlockOverlap() + // key prefix bytes
399                 2 * vocabHeader.getGroupCount() + // key lengths 
400                 2 * vocabHeader.getKeyCount() + // inverted list endings
401                 vocabHeader.getKeyDataLength();    // key data 
402 
403         long startPosition = filePosition;
404         long endPosition = filePosition + headerBytes + listData.encodedLength();
405         assert endPosition <= startPosition + length || isCompressed;
406         assert endPosition > startPosition || isCompressed;
407         assert filePosition >= Integer.MAX_VALUE || filePosition == output.size();
408 
409         // -- begin writing the block -- 
410         vocabulary.add(vocabHeader.getFirstWord(), startPosition);
411 
412         // write block data end
413         output.writeLong(endPosition);
414         vocabHeader.writeKeyHeader(output);
415 
416         // write inverted list end positions
417         long totalListData = listData.length();
418         long invertedListBytes = 0;
419         for (IndexElement list : blockLists) {
420             invertedListBytes += list.dataLength();
421             assert totalListData - invertedListBytes < Short.MAX_VALUE;
422             assert totalListData >= invertedListBytes;
423             output.writeShort((short) (totalListData - invertedListBytes));
424         }
425 
426         // key data
427         vocabHeader.writeKeyData(output);
428 
429         // write inverted list binary data
430         listData.write(output);
431 
432         filePosition = endPosition;
433         assert filePosition >= Integer.MAX_VALUE || filePosition == output.size();
434         assert endPosition - startPosition <= blockSize || blockLists.size() == 1 || isCompressed;
435 
436         if (blocksWritten != null) {
437             blocksWritten.increment();
438         }
439     }
440 
441     private boolean needsFlush(IndexElement list) {
442         long listExtra = 1 + // byte for key length
443                 1;  // byte for overlap with previous key
444 
445         long bufferedBytes = bufferedSize() +
446                 invertedListLength(list) +
447                 listExtra;
448 
449         return bufferedBytes >= blockSize;
450     }
451 
452     public void add(IndexElement list) throws IOException {
453         if (list.key().length >= 256 || list.key().length >= blockSize / 4) {
454             throw new IOException("Key is too long.");
455         }
456         if (needsFlush(list)) {
457             flush();
458         }
459         lists.add(list);
460         updateBufferedSize(list);
461         if (recordsWritten != null) {
462             recordsWritten.increment();
463         }
464     }
465 
466     public void close() throws IOException {
467         flush();
468         
469         byte[] vocabularyData = vocabulary.data();
470         byte[] xmlData = manifest.toString().getBytes("UTF-8");
471         long vocabularyOffset = filePosition;
472         long manifestOffset = filePosition + vocabularyData.length;
473         
474         output.write(vocabularyData);
475         output.write(xmlData);
476         
477         output.writeLong(vocabularyOffset);
478         output.writeLong(manifestOffset);
479         output.writeInt(blockSize);
480         output.writeInt(vocabGroup);
481         output.writeBoolean(isCompressed);
482         output.writeLong(MAGIC_NUMBER);
483         
484         output.close();
485     }
486 }