1
2
3 package org.galagosearch.core.parse;
4
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7 import org.galagosearch.core.types.DocumentSplit;
8 import org.galagosearch.tupleflow.Utility;
9
10 /***
11 * Reads Document data from an index file. Typically you'd use this parser by
12 * including UniversalParser in a TupleFlow Job.
13 *
14 * @author trevor
15 */
16 public class IndexReaderSplitParser implements DocumentStreamParser {
17 DocumentIndexReader.Iterator iterator;
18 DocumentSplit split;
19
20 public IndexReaderSplitParser(DocumentSplit split) throws FileNotFoundException, IOException {
21 DocumentIndexReader reader = new DocumentIndexReader(split.fileName);
22 iterator = reader.getIterator();
23 iterator.skipTo(split.startKey);
24 this.split = split;
25 }
26
27 public Document nextDocument() throws IOException {
28 if (iterator.isDone()) {
29 return null;
30 }
31
32 String key = iterator.getKey();
33 byte[] keyBytes = Utility.makeBytes(key);
34
35
36 if (split.endKey.length > 0 && Utility.compare(keyBytes, split.endKey) >= 0) {
37 return null;
38 }
39
40 Document document = iterator.getDocument();
41 iterator.nextDocument();
42 return document;
43 }
44 }