View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.parse;
4   
5   import java.io.ByteArrayOutputStream;
6   import java.io.DataOutputStream;
7   import java.io.FileNotFoundException;
8   import java.io.IOException;
9   import java.util.Map;
10  import org.galagosearch.core.index.GenericElement;
11  import org.galagosearch.core.index.IndexWriter;
12  import org.galagosearch.tupleflow.Counter;
13  import org.galagosearch.tupleflow.InputClass;
14  import org.galagosearch.tupleflow.Parameters;
15  import org.galagosearch.tupleflow.Processor;
16  import org.galagosearch.tupleflow.TupleFlowParameters;
17  import org.galagosearch.tupleflow.VByteOutput;
18  import org.galagosearch.tupleflow.execution.ErrorHandler;
19  import org.galagosearch.tupleflow.execution.Verification;
20  
21  /***
22   * Writes document text and metadata to an index file.  The output files
23   * are in '.corpus' format, which can be fed to UniversalParser as an input
24   * to indexing.  The '.corpus' format is also convenient for quickly
25   * finding individual documents.
26   * 
27   * @author trevor
28   */
29  @InputClass(className = "org.galagosearch.core.parse.Document")
30  public class DocumentIndexWriter implements Processor<Document> {
31      IndexWriter writer;
32      Counter documentsWritten;
33      
34      public DocumentIndexWriter(TupleFlowParameters parameters) throws FileNotFoundException, IOException {
35          Parameters p = new Parameters();
36          p.add("isCompressed", "true");
37          writer = new IndexWriter(parameters.getXML().get("filename"), p);
38          documentsWritten = parameters.getCounter("Documents Written");
39      }
40      
41      public void close() throws IOException {
42          writer.close();
43      }
44  
45      public void process(Document document) throws IOException {
46          ByteArrayOutputStream stream = new ByteArrayOutputStream();
47          VByteOutput output = new VByteOutput(new DataOutputStream(stream));
48          
49          output.writeString(document.text);
50          for (Map.Entry<String, String> entry : document.metadata.entrySet()) {
51              output.writeString(entry.getKey());
52              output.writeString(entry.getValue());
53          }
54          
55          writer.add(new GenericElement(document.identifier, stream.toByteArray()));
56          if (documentsWritten != null)
57              documentsWritten.increment();
58      }
59  
60      public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
61          if (!parameters.getXML().containsKey("filename")) {
62              handler.addError("DocumentIndexWriter requires an 'filename' parameter.");
63              return;
64          }
65  
66          String index = parameters.getXML().get("filename");
67          Verification.requireWriteableFile(index, handler);
68      }
69  }