1
2
3 package org.galagosearch.core.parse;
4
5 import java.io.ByteArrayOutputStream;
6 import java.io.DataOutputStream;
7 import java.io.FileNotFoundException;
8 import java.io.IOException;
9 import java.util.Map;
10 import org.galagosearch.core.index.GenericElement;
11 import org.galagosearch.core.index.IndexWriter;
12 import org.galagosearch.tupleflow.Counter;
13 import org.galagosearch.tupleflow.InputClass;
14 import org.galagosearch.tupleflow.Parameters;
15 import org.galagosearch.tupleflow.Processor;
16 import org.galagosearch.tupleflow.TupleFlowParameters;
17 import org.galagosearch.tupleflow.VByteOutput;
18 import org.galagosearch.tupleflow.execution.ErrorHandler;
19 import org.galagosearch.tupleflow.execution.Verification;
20
21 /***
22 * Writes document text and metadata to an index file. The output files
23 * are in '.corpus' format, which can be fed to UniversalParser as an input
24 * to indexing. The '.corpus' format is also convenient for quickly
25 * finding individual documents.
26 *
27 * @author trevor
28 */
29 @InputClass(className = "org.galagosearch.core.parse.Document")
30 public class DocumentIndexWriter implements Processor<Document> {
31 IndexWriter writer;
32 Counter documentsWritten;
33
34 public DocumentIndexWriter(TupleFlowParameters parameters) throws FileNotFoundException, IOException {
35 Parameters p = new Parameters();
36 p.add("isCompressed", "true");
37 writer = new IndexWriter(parameters.getXML().get("filename"), p);
38 documentsWritten = parameters.getCounter("Documents Written");
39 }
40
41 public void close() throws IOException {
42 writer.close();
43 }
44
45 public void process(Document document) throws IOException {
46 ByteArrayOutputStream stream = new ByteArrayOutputStream();
47 VByteOutput output = new VByteOutput(new DataOutputStream(stream));
48
49 output.writeString(document.text);
50 for (Map.Entry<String, String> entry : document.metadata.entrySet()) {
51 output.writeString(entry.getKey());
52 output.writeString(entry.getValue());
53 }
54
55 writer.add(new GenericElement(document.identifier, stream.toByteArray()));
56 if (documentsWritten != null)
57 documentsWritten.increment();
58 }
59
60 public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
61 if (!parameters.getXML().containsKey("filename")) {
62 handler.addError("DocumentIndexWriter requires an 'filename' parameter.");
63 return;
64 }
65
66 String index = parameters.getXML().get("filename");
67 Verification.requireWriteableFile(index, handler);
68 }
69 }