1
2 package org.galagosearch.core.index;
3
4 import java.io.BufferedOutputStream;
5 import java.io.ByteArrayOutputStream;
6 import java.io.DataOutputStream;
7 import java.io.File;
8 import java.io.FileNotFoundException;
9 import java.io.FileOutputStream;
10 import java.io.IOException;
11 import java.io.OutputStream;
12 import java.util.ArrayList;
13 import java.util.List;
14 import java.util.zip.GZIPOutputStream;
15 import org.galagosearch.tupleflow.Counter;
16 import org.galagosearch.tupleflow.Parameters;
17 import org.galagosearch.tupleflow.TupleFlowParameters;
18 import org.galagosearch.tupleflow.Utility;
19
20 /***
21 * This class writes index files, which are used for most Galago indexes.
22 *
23 * An index is a mapping between a key and a value, much like a TreeMap. The keys are
24 * sorted to allow iteration over the whole file. Keys are stored using prefix
25 * compression to save space. The structure is designed for fast random access on disk.
26 *
27 * For indexes, we assume that the data in each value is already compressed, so IndexWriter
28 * does no additional compression. However, if the isCompressed flag is set, IndexWriter
29 * will compress the value data. This is convenient for storing documents in an index.
30 *
31 * Keys cannot be longer than 256 bytes, and they must be added in sorted order.
32 *
33 * @author trevor
34 */
35 public class IndexWriter {
36 public static final long MAGIC_NUMBER = 0x1a2b3c4d5e6f7a8bL;
37
38 DataOutputStream output;
39 final VocabularyWriter vocabulary;
40 Parameters manifest;
41 ArrayList<IndexElement> lists;
42
43 int blockSize = 32768;
44 int vocabGroup = 16;
45 long filePosition = 0;
46 long listBytes = 0;
47
48 boolean isCompressed = false;
49
50 Counter recordsWritten = null;
51 Counter blocksWritten = null;
52
53 /***
54 * Creates a new instance of IndexWriter
55 */
56 public IndexWriter(String outputFilename, Parameters parameters)
57 throws FileNotFoundException, IOException {
58
59 new File(outputFilename).getParentFile().mkdirs();
60
61 blockSize = (int) parameters.get("blockSize", 32768);
62 isCompressed = parameters.get("isCompressed", false);
63 output = new DataOutputStream(new BufferedOutputStream(
64 new FileOutputStream(outputFilename)));
65 vocabulary = new VocabularyWriter();
66 manifest = new Parameters();
67 manifest.copy(parameters);
68 lists = new ArrayList<IndexElement>();
69 }
70
71 public IndexWriter(String outputFilename)
72 throws FileNotFoundException, IOException {
73 output = new DataOutputStream(new BufferedOutputStream(
74 new FileOutputStream(outputFilename)));
75 vocabulary = new VocabularyWriter();
76 manifest = new Parameters();
77 lists = new ArrayList<IndexElement>();
78 }
79
80 public IndexWriter(TupleFlowParameters parameters) throws FileNotFoundException, IOException {
81 this(parameters.getXML().get("filename"), parameters.getXML());
82 recordsWritten = parameters.getCounter("Records Written");
83 blocksWritten = parameters.getCounter("Blocks Written");
84 }
85
86 /***
87 * Returns the current copy of the manifest, which will be stored in
88 * the completed index file. This data is not written until close() is called.
89 */
90
91 public Parameters getManifest() {
92 return manifest;
93 }
94
95 /***
96 * Gives a conservative estimate of the buffered size of the data,
97 * excluding the most recent inverted list.
98 * Does not include savings due to key overlap compression.
99 */
100 public long bufferedSize() {
101 long extra = 8 +
102 8 +
103 1;
104
105 return listBytes + extra;
106 }
107
108 public void updateBufferedSize(IndexElement list) {
109 long extra = 1 +
110 1;
111
112 listBytes += invertedListLength(list);
113 listBytes += extra;
114 }
115
116 private long invertedListLength(IndexElement list) {
117 long listLength = 0;
118
119 listLength += list.key().length;
120 listLength += 2;
121 listLength += 2;
122
123 listLength += list.dataLength();
124 return listLength;
125 }
126
127 /***
128 * Flush all lists out to disk.
129 */
130 public void flush() throws IOException {
131
132 if (lists.size() == 0) {
133 return;
134 }
135 writeBlock(lists, bufferedSize());
136
137
138 lists = new ArrayList<IndexElement>();
139 listBytes = 0;
140 }
141
142 public long getBlockSize() {
143 return blockSize;
144 }
145
146 private boolean lessThanOrEqualTo(byte[] one, byte[] two) {
147 boolean isOneShorterOrEqualLength = (one.length <= two.length);
148 int commonLength = Math.min(one.length, two.length);
149
150 for (int i = 0; i < commonLength; i++) {
151 int a = one[i];
152 int b = two[i];
153 a &= 0xFF;
154 b &= 0xFF;
155 if (a < b) {
156 return true;
157 }
158 if (b < a) {
159 return false;
160 }
161 }
162
163 return isOneShorterOrEqualLength;
164 }
165
166 /***
167 * Returns true if the lists are sorted in ascending order by
168 * key.
169 *
170 * @param blockLists
171 */
172 public boolean wordsInOrder(List<IndexElement> blockLists) {
173 for (int i = 0; i < blockLists.size() - 1; i++) {
174 boolean result = lessThanOrEqualTo(blockLists.get(i).key(),
175 blockLists.get(i + 1).key());
176 if (result == false) {
177 return false;
178 }
179 }
180 return true;
181 }
182
183 interface ListData {
184 long length();
185 long encodedLength();
186 void write(OutputStream stream) throws IOException;
187 }
188
189 class UncompressedListData implements ListData {
190 List<IndexElement> blockLists;
191
192 UncompressedListData(List<IndexElement> blockLists) {
193 this.blockLists = blockLists;
194 }
195
196 public long length() {
197 long totalLength = 0;
198 for (IndexElement e : blockLists) {
199 totalLength += e.dataLength();
200 }
201 return totalLength;
202 }
203
204 public long encodedLength() {
205 return length();
206 }
207
208 public void write(OutputStream stream) throws IOException {
209 for (IndexElement e : blockLists) {
210 e.write(stream);
211 }
212 }
213 }
214
215 class CompressedListData implements ListData {
216 List<IndexElement> blockLists;
217 byte[] compressedData;
218
219 CompressedListData(List<IndexElement> blockLists) throws IOException {
220 this.blockLists = blockLists;
221 compress();
222 }
223
224 void compress() throws IOException {
225 ByteArrayOutputStream stream = new ByteArrayOutputStream();
226
227
228 DataOutputStream s = new DataOutputStream(stream);
229 s.writeInt((int)length());
230
231 GZIPOutputStream gzipStream = new GZIPOutputStream(stream);
232 for (IndexElement element : blockLists) {
233 element.write(gzipStream);
234 }
235
236 gzipStream.close();
237 compressedData = stream.toByteArray();
238 }
239
240 public long length() {
241 long totalLength = 0;
242 for (IndexElement e : blockLists) {
243 totalLength += e.dataLength();
244 }
245 return totalLength;
246 }
247
248 public long encodedLength() {
249 return compressedData.length;
250 }
251
252 public void write(OutputStream stream) throws IOException {
253 stream.write(compressedData);
254 }
255 }
256
257 static class VocabularyHeader {
258 ArrayList<byte[]> keys;
259 short[] ends;
260 ByteArrayOutputStream wordByteStream = new ByteArrayOutputStream();
261 DataOutputStream vocabOutput = new DataOutputStream(wordByteStream);
262 int blockOverlap;
263 int groupCount;
264 int vocabGroupSize;
265
266 VocabularyHeader(List<IndexElement> blockLists, int vocabGroupSize) {
267 keys = new ArrayList<byte[]>();
268 this.vocabGroupSize = vocabGroupSize;
269 for (IndexElement list : blockLists) {
270 keys.add(list.key());
271 }
272 }
273
274 int prefixOverlap(byte[] firstTerm, byte[] lastTerm, int start) {
275 int maximum = Math.min(firstTerm.length - start, lastTerm.length - start);
276 maximum = Math.min(Byte.MAX_VALUE - 1, maximum);
277
278 for (int i = start; i < maximum; i++) {
279 if (firstTerm[i] != lastTerm[i]) {
280 return i - start;
281 }
282 }
283
284 return maximum;
285 }
286
287 int prefixOverlap(byte[] firstTerm, byte[] secondTerm) {
288 return prefixOverlap(firstTerm, secondTerm, 0);
289 }
290
291 void calculateBlockPrefix() {
292
293 byte[] firstWord = keys.get(0);
294 byte[] lastWord = keys.get(keys.size() - 1);
295
296
297 blockOverlap = prefixOverlap(firstWord, lastWord);
298 }
299
300 void build() throws IOException {
301 calculateBlockPrefix();
302
303 groupCount = (int) Math.ceil((float) keys.size() / vocabGroupSize);
304 ends = new short[groupCount];
305
306
307 for (int i = 0; i < keys.size(); i += vocabGroupSize) {
308 byte[] word = keys.get(i);
309 byte[] lastWord = word;
310 assert word.length >= blockOverlap :
311 "Overlap: " + blockOverlap + " too small for " + word.length +
312 " (" + Utility.makeString(word) + ")";
313 assert word.length < 256;
314
315
316 vocabOutput.writeByte(word.length - blockOverlap);
317 vocabOutput.write(word, blockOverlap, word.length - blockOverlap);
318 int end = Math.min(keys.size(), i + vocabGroupSize);
319
320
321 for (int j = i + 1; j < end; j++) {
322 assert word.length < 256;
323
324
325 word = keys.get(j);
326 int common = this.prefixOverlap(lastWord, word);
327 vocabOutput.writeByte((byte) common);
328 vocabOutput.writeByte(word.length);
329 vocabOutput.write(word, common, word.length - common);
330 lastWord = word;
331 }
332
333 ends[i / vocabGroupSize] = (short) vocabOutput.size();
334 }
335 vocabOutput.close();
336 }
337
338 int getBlockOverlap() {
339 return blockOverlap;
340 }
341
342 int getGroupCount() {
343 return groupCount;
344 }
345
346 int getKeyCount() {
347 return keys.size();
348 }
349
350 int getKeyDataLength() {
351 return wordByteStream.size();
352 }
353
354 byte[] getFirstWord() {
355 return keys.get(0);
356 }
357
358 void writeKeyHeader(DataOutputStream output) throws IOException {
359
360 output.writeLong(getKeyCount());
361
362
363 output.writeByte((byte) blockOverlap);
364 output.write(getFirstWord(), 0, blockOverlap);
365
366
367 for (short wordBlockEnd : ends) {
368 output.writeShort(wordBlockEnd);
369 }
370 }
371
372 void writeKeyData(DataOutputStream output) throws IOException {
373 output.write(wordByteStream.toByteArray());
374 }
375 }
376
377 public void writeBlock(List<IndexElement> blockLists, long length) throws IOException {
378 assert length <= blockSize || blockLists.size() == 1;
379 assert wordsInOrder(blockLists);
380
381 if (blockLists.size() == 0) {
382 return;
383 }
384
385 VocabularyHeader vocabHeader = new VocabularyHeader(blockLists, vocabGroup);
386 vocabHeader.build();
387
388
389 ListData listData;
390 if (isCompressed) {
391 listData = new CompressedListData(blockLists);
392 } else {
393 listData = new UncompressedListData(blockLists);
394 }
395
396 long headerBytes = 8 +
397 8 +
398 1 + vocabHeader.getBlockOverlap() +
399 2 * vocabHeader.getGroupCount() +
400 2 * vocabHeader.getKeyCount() +
401 vocabHeader.getKeyDataLength();
402
403 long startPosition = filePosition;
404 long endPosition = filePosition + headerBytes + listData.encodedLength();
405 assert endPosition <= startPosition + length || isCompressed;
406 assert endPosition > startPosition || isCompressed;
407 assert filePosition >= Integer.MAX_VALUE || filePosition == output.size();
408
409
410 vocabulary.add(vocabHeader.getFirstWord(), startPosition);
411
412
413 output.writeLong(endPosition);
414 vocabHeader.writeKeyHeader(output);
415
416
417 long totalListData = listData.length();
418 long invertedListBytes = 0;
419 for (IndexElement list : blockLists) {
420 invertedListBytes += list.dataLength();
421 assert totalListData - invertedListBytes < Short.MAX_VALUE;
422 assert totalListData >= invertedListBytes;
423 output.writeShort((short) (totalListData - invertedListBytes));
424 }
425
426
427 vocabHeader.writeKeyData(output);
428
429
430 listData.write(output);
431
432 filePosition = endPosition;
433 assert filePosition >= Integer.MAX_VALUE || filePosition == output.size();
434 assert endPosition - startPosition <= blockSize || blockLists.size() == 1 || isCompressed;
435
436 if (blocksWritten != null) {
437 blocksWritten.increment();
438 }
439 }
440
441 private boolean needsFlush(IndexElement list) {
442 long listExtra = 1 +
443 1;
444
445 long bufferedBytes = bufferedSize() +
446 invertedListLength(list) +
447 listExtra;
448
449 return bufferedBytes >= blockSize;
450 }
451
452 public void add(IndexElement list) throws IOException {
453 if (list.key().length >= 256 || list.key().length >= blockSize / 4) {
454 throw new IOException("Key is too long.");
455 }
456 if (needsFlush(list)) {
457 flush();
458 }
459 lists.add(list);
460 updateBufferedSize(list);
461 if (recordsWritten != null) {
462 recordsWritten.increment();
463 }
464 }
465
466 public void close() throws IOException {
467 flush();
468
469 byte[] vocabularyData = vocabulary.data();
470 byte[] xmlData = manifest.toString().getBytes("UTF-8");
471 long vocabularyOffset = filePosition;
472 long manifestOffset = filePosition + vocabularyData.length;
473
474 output.write(vocabularyData);
475 output.write(xmlData);
476
477 output.writeLong(vocabularyOffset);
478 output.writeLong(manifestOffset);
479 output.writeInt(blockSize);
480 output.writeInt(vocabGroup);
481 output.writeBoolean(isCompressed);
482 output.writeLong(MAGIC_NUMBER);
483
484 output.close();
485 }
486 }