View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.parse;
4   
5   import java.io.FileNotFoundException;
6   import java.io.IOException;
7   import org.galagosearch.core.types.DocumentSplit;
8   import org.galagosearch.tupleflow.Utility;
9   
10  /***
11   * Reads Document data from an index file.  Typically you'd use this parser by
12   * including UniversalParser in a TupleFlow Job.
13   * 
14   * @author trevor
15   */
16  public class IndexReaderSplitParser implements DocumentStreamParser {
17      DocumentIndexReader.Iterator iterator;
18      DocumentSplit split;
19      
20      public IndexReaderSplitParser(DocumentSplit split) throws FileNotFoundException, IOException {
21          DocumentIndexReader reader = new DocumentIndexReader(split.fileName);
22          iterator = reader.getIterator();
23          iterator.skipTo(split.startKey);
24          this.split = split;
25      }
26      
27      public Document nextDocument() throws IOException {
28          if (iterator.isDone()) {
29              return null;
30          }
31          
32          String key = iterator.getKey();
33          byte[] keyBytes = Utility.makeBytes(key);
34          
35          // Don't go past the end of the split.
36          if (split.endKey.length > 0 && Utility.compare(keyBytes, split.endKey) >= 0) {
37              return null;
38          }
39  
40          Document document = iterator.getDocument();
41          iterator.nextDocument();
42          return document;
43      }
44  }