| 1 | |
|
| 2 | |
package org.galagosearch.core.index; |
| 3 | |
|
| 4 | |
import java.io.BufferedOutputStream; |
| 5 | |
import java.io.ByteArrayOutputStream; |
| 6 | |
import java.io.DataOutputStream; |
| 7 | |
import java.io.File; |
| 8 | |
import java.io.FileNotFoundException; |
| 9 | |
import java.io.FileOutputStream; |
| 10 | |
import java.io.IOException; |
| 11 | |
import java.io.OutputStream; |
| 12 | |
import java.util.ArrayList; |
| 13 | |
import java.util.List; |
| 14 | |
import java.util.zip.GZIPOutputStream; |
| 15 | |
import org.galagosearch.tupleflow.Counter; |
| 16 | |
import org.galagosearch.tupleflow.Parameters; |
| 17 | |
import org.galagosearch.tupleflow.TupleFlowParameters; |
| 18 | |
import org.galagosearch.tupleflow.Utility; |
| 19 | |
|
| 20 | |
|
| 21 | |
|
| 22 | |
|
| 23 | |
|
| 24 | |
|
| 25 | |
|
| 26 | |
|
| 27 | |
|
| 28 | |
|
| 29 | |
|
| 30 | |
|
| 31 | |
|
| 32 | |
|
| 33 | |
|
| 34 | |
|
| 35 | 4 | public class IndexWriter { |
| 36 | |
public static final long MAGIC_NUMBER = 0x1a2b3c4d5e6f7a8bL; |
| 37 | |
|
| 38 | |
DataOutputStream output; |
| 39 | |
final VocabularyWriter vocabulary; |
| 40 | |
Parameters manifest; |
| 41 | |
ArrayList<IndexElement> lists; |
| 42 | |
|
| 43 | 88 | int blockSize = 32768; |
| 44 | 88 | int vocabGroup = 16; |
| 45 | 88 | long filePosition = 0; |
| 46 | 88 | long listBytes = 0; |
| 47 | |
|
| 48 | 88 | boolean isCompressed = false; |
| 49 | |
|
| 50 | 88 | Counter recordsWritten = null; |
| 51 | 88 | Counter blocksWritten = null; |
| 52 | |
|
| 53 | |
|
| 54 | |
|
| 55 | |
|
| 56 | |
public IndexWriter(String outputFilename, Parameters parameters) |
| 57 | 88 | throws FileNotFoundException, IOException { |
| 58 | |
|
| 59 | 88 | new File(outputFilename).getParentFile().mkdirs(); |
| 60 | |
|
| 61 | 88 | blockSize = (int) parameters.get("blockSize", 32768); |
| 62 | 88 | isCompressed = parameters.get("isCompressed", false); |
| 63 | 88 | output = new DataOutputStream(new BufferedOutputStream( |
| 64 | |
new FileOutputStream(outputFilename))); |
| 65 | 88 | vocabulary = new VocabularyWriter(); |
| 66 | 88 | manifest = new Parameters(); |
| 67 | 88 | manifest.copy(parameters); |
| 68 | 88 | lists = new ArrayList<IndexElement>(); |
| 69 | 88 | } |
| 70 | |
|
| 71 | |
public IndexWriter(String outputFilename) |
| 72 | 0 | throws FileNotFoundException, IOException { |
| 73 | 0 | output = new DataOutputStream(new BufferedOutputStream( |
| 74 | |
new FileOutputStream(outputFilename))); |
| 75 | 0 | vocabulary = new VocabularyWriter(); |
| 76 | 0 | manifest = new Parameters(); |
| 77 | 0 | lists = new ArrayList<IndexElement>(); |
| 78 | 0 | } |
| 79 | |
|
| 80 | |
public IndexWriter(TupleFlowParameters parameters) throws FileNotFoundException, IOException { |
| 81 | 56 | this(parameters.getXML().get("filename"), parameters.getXML()); |
| 82 | 56 | recordsWritten = parameters.getCounter("Records Written"); |
| 83 | 56 | blocksWritten = parameters.getCounter("Blocks Written"); |
| 84 | 56 | } |
| 85 | |
|
| 86 | |
|
| 87 | |
|
| 88 | |
|
| 89 | |
|
| 90 | |
|
| 91 | |
public Parameters getManifest() { |
| 92 | 112 | return manifest; |
| 93 | |
} |
| 94 | |
|
| 95 | |
|
| 96 | |
|
| 97 | |
|
| 98 | |
|
| 99 | |
|
| 100 | |
public long bufferedSize() { |
| 101 | 12208 | long extra = 8 + |
| 102 | |
8 + |
| 103 | |
1; |
| 104 | |
|
| 105 | 12208 | return listBytes + extra; |
| 106 | |
} |
| 107 | |
|
| 108 | |
public void updateBufferedSize(IndexElement list) { |
| 109 | 8128 | long extra = 1 + |
| 110 | |
1; |
| 111 | |
|
| 112 | 8128 | listBytes += invertedListLength(list); |
| 113 | 8128 | listBytes += extra; |
| 114 | 8128 | } |
| 115 | |
|
| 116 | |
private long invertedListLength(IndexElement list) { |
| 117 | 16256 | long listLength = 0; |
| 118 | |
|
| 119 | 16256 | listLength += list.key().length; |
| 120 | 16256 | listLength += 2; |
| 121 | 16256 | listLength += 2; |
| 122 | |
|
| 123 | 16256 | listLength += list.dataLength(); |
| 124 | 16256 | return listLength; |
| 125 | |
} |
| 126 | |
|
| 127 | |
|
| 128 | |
|
| 129 | |
|
| 130 | |
public void flush() throws IOException { |
| 131 | |
|
| 132 | 4080 | if (lists.size() == 0) { |
| 133 | 0 | return; |
| 134 | |
} |
| 135 | 4080 | writeBlock(lists, bufferedSize()); |
| 136 | |
|
| 137 | |
|
| 138 | 4080 | lists = new ArrayList<IndexElement>(); |
| 139 | 4080 | listBytes = 0; |
| 140 | 4080 | } |
| 141 | |
|
| 142 | |
public long getBlockSize() { |
| 143 | 0 | return blockSize; |
| 144 | |
} |
| 145 | |
|
| 146 | |
private boolean lessThanOrEqualTo(byte[] one, byte[] two) { |
| 147 | 4048 | boolean isOneShorterOrEqualLength = (one.length <= two.length); |
| 148 | 4048 | int commonLength = Math.min(one.length, two.length); |
| 149 | |
|
| 150 | 20048 | for (int i = 0; i < commonLength; i++) { |
| 151 | 20048 | int a = one[i]; |
| 152 | 20048 | int b = two[i]; |
| 153 | 20048 | a &= 0xFF; |
| 154 | 20048 | b &= 0xFF; |
| 155 | 20048 | if (a < b) { |
| 156 | 4048 | return true; |
| 157 | |
} |
| 158 | 16000 | if (b < a) { |
| 159 | 0 | return false; |
| 160 | |
} |
| 161 | |
} |
| 162 | |
|
| 163 | 0 | return isOneShorterOrEqualLength; |
| 164 | |
} |
| 165 | |
|
| 166 | |
|
| 167 | |
|
| 168 | |
|
| 169 | |
|
| 170 | |
|
| 171 | |
|
| 172 | |
public boolean wordsInOrder(List<IndexElement> blockLists) { |
| 173 | 8128 | for (int i = 0; i < blockLists.size() - 1; i++) { |
| 174 | 4048 | boolean result = lessThanOrEqualTo(blockLists.get(i).key(), |
| 175 | |
blockLists.get(i + 1).key()); |
| 176 | 4048 | if (result == false) { |
| 177 | 0 | return false; |
| 178 | |
} |
| 179 | |
} |
| 180 | 4080 | return true; |
| 181 | |
} |
| 182 | |
|
| 183 | |
interface ListData { |
| 184 | |
long length(); |
| 185 | |
long encodedLength(); |
| 186 | |
void write(OutputStream stream) throws IOException; |
| 187 | |
} |
| 188 | |
|
| 189 | |
class UncompressedListData implements ListData { |
| 190 | |
List<IndexElement> blockLists; |
| 191 | |
|
| 192 | 2064 | UncompressedListData(List<IndexElement> blockLists) { |
| 193 | 2064 | this.blockLists = blockLists; |
| 194 | 2064 | } |
| 195 | |
|
| 196 | |
public long length() { |
| 197 | 4128 | long totalLength = 0; |
| 198 | 4128 | for (IndexElement e : blockLists) { |
| 199 | 8224 | totalLength += e.dataLength(); |
| 200 | |
} |
| 201 | 4128 | return totalLength; |
| 202 | |
} |
| 203 | |
|
| 204 | |
public long encodedLength() { |
| 205 | 2064 | return length(); |
| 206 | |
} |
| 207 | |
|
| 208 | |
public void write(OutputStream stream) throws IOException { |
| 209 | 2064 | for (IndexElement e : blockLists) { |
| 210 | 4112 | e.write(stream); |
| 211 | |
} |
| 212 | 2064 | } |
| 213 | |
} |
| 214 | |
|
| 215 | |
class CompressedListData implements ListData { |
| 216 | |
List<IndexElement> blockLists; |
| 217 | |
byte[] compressedData; |
| 218 | |
|
| 219 | 2016 | CompressedListData(List<IndexElement> blockLists) throws IOException { |
| 220 | 2016 | this.blockLists = blockLists; |
| 221 | 2016 | compress(); |
| 222 | 2016 | } |
| 223 | |
|
| 224 | |
void compress() throws IOException { |
| 225 | 2016 | ByteArrayOutputStream stream = new ByteArrayOutputStream(); |
| 226 | |
|
| 227 | |
|
| 228 | 2016 | DataOutputStream s = new DataOutputStream(stream); |
| 229 | 2016 | s.writeInt((int)length()); |
| 230 | |
|
| 231 | 2016 | GZIPOutputStream gzipStream = new GZIPOutputStream(stream); |
| 232 | 2016 | for (IndexElement element : blockLists) { |
| 233 | 4016 | element.write(gzipStream); |
| 234 | |
} |
| 235 | |
|
| 236 | 2016 | gzipStream.close(); |
| 237 | 2016 | compressedData = stream.toByteArray(); |
| 238 | 2016 | } |
| 239 | |
|
| 240 | |
public long length() { |
| 241 | 4032 | long totalLength = 0; |
| 242 | 4032 | for (IndexElement e : blockLists) { |
| 243 | 8032 | totalLength += e.dataLength(); |
| 244 | |
} |
| 245 | 4032 | return totalLength; |
| 246 | |
} |
| 247 | |
|
| 248 | |
public long encodedLength() { |
| 249 | 2016 | return compressedData.length; |
| 250 | |
} |
| 251 | |
|
| 252 | |
public void write(OutputStream stream) throws IOException { |
| 253 | 2016 | stream.write(compressedData); |
| 254 | 2016 | } |
| 255 | |
} |
| 256 | |
|
| 257 | 4 | static class VocabularyHeader { |
| 258 | |
ArrayList<byte[]> keys; |
| 259 | |
short[] ends; |
| 260 | 4080 | ByteArrayOutputStream wordByteStream = new ByteArrayOutputStream(); |
| 261 | 4080 | DataOutputStream vocabOutput = new DataOutputStream(wordByteStream); |
| 262 | |
int blockOverlap; |
| 263 | |
int groupCount; |
| 264 | |
int vocabGroupSize; |
| 265 | |
|
| 266 | 4080 | VocabularyHeader(List<IndexElement> blockLists, int vocabGroupSize) { |
| 267 | 4080 | keys = new ArrayList<byte[]>(); |
| 268 | 4080 | this.vocabGroupSize = vocabGroupSize; |
| 269 | 4080 | for (IndexElement list : blockLists) { |
| 270 | 8128 | keys.add(list.key()); |
| 271 | |
} |
| 272 | 4080 | } |
| 273 | |
|
| 274 | |
int prefixOverlap(byte[] firstTerm, byte[] lastTerm, int start) { |
| 275 | 8128 | int maximum = Math.min(firstTerm.length - start, lastTerm.length - start); |
| 276 | 8128 | maximum = Math.min(Byte.MAX_VALUE - 1, maximum); |
| 277 | |
|
| 278 | 40380 | for (int i = start; i < maximum; i++) { |
| 279 | 40348 | if (firstTerm[i] != lastTerm[i]) { |
| 280 | 8096 | return i - start; |
| 281 | |
} |
| 282 | |
} |
| 283 | |
|
| 284 | 32 | return maximum; |
| 285 | |
} |
| 286 | |
|
| 287 | |
int prefixOverlap(byte[] firstTerm, byte[] secondTerm) { |
| 288 | 8128 | return prefixOverlap(firstTerm, secondTerm, 0); |
| 289 | |
} |
| 290 | |
|
| 291 | |
void calculateBlockPrefix() { |
| 292 | |
|
| 293 | 4080 | byte[] firstWord = keys.get(0); |
| 294 | 4080 | byte[] lastWord = keys.get(keys.size() - 1); |
| 295 | |
|
| 296 | |
|
| 297 | 4080 | blockOverlap = prefixOverlap(firstWord, lastWord); |
| 298 | 4080 | } |
| 299 | |
|
| 300 | |
void build() throws IOException { |
| 301 | 4080 | calculateBlockPrefix(); |
| 302 | |
|
| 303 | 4080 | groupCount = (int) Math.ceil((float) keys.size() / vocabGroupSize); |
| 304 | 4080 | ends = new short[groupCount]; |
| 305 | |
|
| 306 | |
|
| 307 | 8160 | for (int i = 0; i < keys.size(); i += vocabGroupSize) { |
| 308 | 4080 | byte[] word = keys.get(i); |
| 309 | 4080 | byte[] lastWord = word; |
| 310 | |
assert word.length >= blockOverlap : |
| 311 | |
"Overlap: " + blockOverlap + " too small for " + word.length + |
| 312 | 4080 | " (" + Utility.makeString(word) + ")"; |
| 313 | 4080 | assert word.length < 256; |
| 314 | |
|
| 315 | |
|
| 316 | 4080 | vocabOutput.writeByte(word.length - blockOverlap); |
| 317 | 4080 | vocabOutput.write(word, blockOverlap, word.length - blockOverlap); |
| 318 | 4080 | int end = Math.min(keys.size(), i + vocabGroupSize); |
| 319 | |
|
| 320 | |
|
| 321 | 8128 | for (int j = i + 1; j < end; j++) { |
| 322 | 4048 | assert word.length < 256; |
| 323 | |
|
| 324 | |
|
| 325 | 4048 | word = keys.get(j); |
| 326 | 4048 | int common = this.prefixOverlap(lastWord, word); |
| 327 | 4048 | vocabOutput.writeByte((byte) common); |
| 328 | 4048 | vocabOutput.writeByte(word.length); |
| 329 | 4048 | vocabOutput.write(word, common, word.length - common); |
| 330 | 4048 | lastWord = word; |
| 331 | |
} |
| 332 | |
|
| 333 | 4080 | ends[i / vocabGroupSize] = (short) vocabOutput.size(); |
| 334 | |
} |
| 335 | 4080 | vocabOutput.close(); |
| 336 | 4080 | } |
| 337 | |
|
| 338 | |
int getBlockOverlap() { |
| 339 | 4080 | return blockOverlap; |
| 340 | |
} |
| 341 | |
|
| 342 | |
int getGroupCount() { |
| 343 | 4080 | return groupCount; |
| 344 | |
} |
| 345 | |
|
| 346 | |
int getKeyCount() { |
| 347 | 8160 | return keys.size(); |
| 348 | |
} |
| 349 | |
|
| 350 | |
int getKeyDataLength() { |
| 351 | 4080 | return wordByteStream.size(); |
| 352 | |
} |
| 353 | |
|
| 354 | |
byte[] getFirstWord() { |
| 355 | 8160 | return keys.get(0); |
| 356 | |
} |
| 357 | |
|
| 358 | |
void writeKeyHeader(DataOutputStream output) throws IOException { |
| 359 | |
|
| 360 | 4080 | output.writeLong(getKeyCount()); |
| 361 | |
|
| 362 | |
|
| 363 | 4080 | output.writeByte((byte) blockOverlap); |
| 364 | 4080 | output.write(getFirstWord(), 0, blockOverlap); |
| 365 | |
|
| 366 | |
|
| 367 | 8160 | for (short wordBlockEnd : ends) { |
| 368 | 4080 | output.writeShort(wordBlockEnd); |
| 369 | |
} |
| 370 | 4080 | } |
| 371 | |
|
| 372 | |
void writeKeyData(DataOutputStream output) throws IOException { |
| 373 | 4080 | output.write(wordByteStream.toByteArray()); |
| 374 | 4080 | } |
| 375 | |
} |
| 376 | |
|
| 377 | |
public void writeBlock(List<IndexElement> blockLists, long length) throws IOException { |
| 378 | 4080 | assert length <= blockSize || blockLists.size() == 1; |
| 379 | 4080 | assert wordsInOrder(blockLists); |
| 380 | |
|
| 381 | 4080 | if (blockLists.size() == 0) { |
| 382 | 0 | return; |
| 383 | |
} |
| 384 | |
|
| 385 | 4080 | VocabularyHeader vocabHeader = new VocabularyHeader(blockLists, vocabGroup); |
| 386 | 4080 | vocabHeader.build(); |
| 387 | |
|
| 388 | |
|
| 389 | |
ListData listData; |
| 390 | 4080 | if (isCompressed) { |
| 391 | 2016 | listData = new CompressedListData(blockLists); |
| 392 | |
} else { |
| 393 | 2064 | listData = new UncompressedListData(blockLists); |
| 394 | |
} |
| 395 | |
|
| 396 | 4080 | long headerBytes = 8 + |
| 397 | |
8 + |
| 398 | |
1 + vocabHeader.getBlockOverlap() + |
| 399 | |
2 * vocabHeader.getGroupCount() + |
| 400 | |
2 * vocabHeader.getKeyCount() + |
| 401 | |
vocabHeader.getKeyDataLength(); |
| 402 | |
|
| 403 | 4080 | long startPosition = filePosition; |
| 404 | 4080 | long endPosition = filePosition + headerBytes + listData.encodedLength(); |
| 405 | 4080 | assert endPosition <= startPosition + length || isCompressed; |
| 406 | 4080 | assert endPosition > startPosition || isCompressed; |
| 407 | 4080 | assert filePosition >= Integer.MAX_VALUE || filePosition == output.size(); |
| 408 | |
|
| 409 | |
|
| 410 | 4080 | vocabulary.add(vocabHeader.getFirstWord(), startPosition); |
| 411 | |
|
| 412 | |
|
| 413 | 4080 | output.writeLong(endPosition); |
| 414 | 4080 | vocabHeader.writeKeyHeader(output); |
| 415 | |
|
| 416 | |
|
| 417 | 4080 | long totalListData = listData.length(); |
| 418 | 4080 | long invertedListBytes = 0; |
| 419 | 4080 | for (IndexElement list : blockLists) { |
| 420 | 8128 | invertedListBytes += list.dataLength(); |
| 421 | 8128 | assert totalListData - invertedListBytes < Short.MAX_VALUE; |
| 422 | 8128 | assert totalListData >= invertedListBytes; |
| 423 | 8128 | output.writeShort((short) (totalListData - invertedListBytes)); |
| 424 | |
} |
| 425 | |
|
| 426 | |
|
| 427 | 4080 | vocabHeader.writeKeyData(output); |
| 428 | |
|
| 429 | |
|
| 430 | 4080 | listData.write(output); |
| 431 | |
|
| 432 | 4080 | filePosition = endPosition; |
| 433 | 4080 | assert filePosition >= Integer.MAX_VALUE || filePosition == output.size(); |
| 434 | 4080 | assert endPosition - startPosition <= blockSize || blockLists.size() == 1 || isCompressed; |
| 435 | |
|
| 436 | 4080 | if (blocksWritten != null) { |
| 437 | 0 | blocksWritten.increment(); |
| 438 | |
} |
| 439 | 4080 | } |
| 440 | |
|
| 441 | |
private boolean needsFlush(IndexElement list) { |
| 442 | 8128 | long listExtra = 1 + |
| 443 | |
1; |
| 444 | |
|
| 445 | 8128 | long bufferedBytes = bufferedSize() + |
| 446 | |
invertedListLength(list) + |
| 447 | |
listExtra; |
| 448 | |
|
| 449 | 8128 | return bufferedBytes >= blockSize; |
| 450 | |
} |
| 451 | |
|
| 452 | |
public void add(IndexElement list) throws IOException { |
| 453 | 8128 | if (list.key().length >= 256 || list.key().length >= blockSize / 4) { |
| 454 | 0 | throw new IOException("Key is too long."); |
| 455 | |
} |
| 456 | 8128 | if (needsFlush(list)) { |
| 457 | 3992 | flush(); |
| 458 | |
} |
| 459 | 8128 | lists.add(list); |
| 460 | 8128 | updateBufferedSize(list); |
| 461 | 8128 | if (recordsWritten != null) { |
| 462 | 0 | recordsWritten.increment(); |
| 463 | |
} |
| 464 | 8128 | } |
| 465 | |
|
| 466 | |
public void close() throws IOException { |
| 467 | 88 | flush(); |
| 468 | |
|
| 469 | 88 | byte[] vocabularyData = vocabulary.data(); |
| 470 | 88 | byte[] xmlData = manifest.toString().getBytes("UTF-8"); |
| 471 | 88 | long vocabularyOffset = filePosition; |
| 472 | 88 | long manifestOffset = filePosition + vocabularyData.length; |
| 473 | |
|
| 474 | 88 | output.write(vocabularyData); |
| 475 | 88 | output.write(xmlData); |
| 476 | |
|
| 477 | 88 | output.writeLong(vocabularyOffset); |
| 478 | 88 | output.writeLong(manifestOffset); |
| 479 | 88 | output.writeInt(blockSize); |
| 480 | 88 | output.writeInt(vocabGroup); |
| 481 | 88 | output.writeBoolean(isCompressed); |
| 482 | 88 | output.writeLong(MAGIC_NUMBER); |
| 483 | |
|
| 484 | 88 | output.close(); |
| 485 | 88 | } |
| 486 | |
} |