| 1 | |
|
| 2 | |
|
| 3 | |
package org.galagosearch.core.parse; |
| 4 | |
|
| 5 | |
import java.io.ByteArrayOutputStream; |
| 6 | |
import java.io.DataOutputStream; |
| 7 | |
import java.io.FileNotFoundException; |
| 8 | |
import java.io.IOException; |
| 9 | |
import java.util.Map; |
| 10 | |
import org.galagosearch.core.index.GenericElement; |
| 11 | |
import org.galagosearch.core.index.IndexWriter; |
| 12 | |
import org.galagosearch.tupleflow.Counter; |
| 13 | |
import org.galagosearch.tupleflow.InputClass; |
| 14 | |
import org.galagosearch.tupleflow.Parameters; |
| 15 | |
import org.galagosearch.tupleflow.Processor; |
| 16 | |
import org.galagosearch.tupleflow.TupleFlowParameters; |
| 17 | |
import org.galagosearch.tupleflow.VByteOutput; |
| 18 | |
import org.galagosearch.tupleflow.execution.ErrorHandler; |
| 19 | |
import org.galagosearch.tupleflow.execution.Verification; |
| 20 | |
|
| 21 | |
|
| 22 | |
|
| 23 | |
|
| 24 | |
|
| 25 | |
|
| 26 | |
|
| 27 | |
|
| 28 | |
|
| 29 | |
@InputClass(className = "org.galagosearch.core.parse.Document") |
| 30 | 0 | public class DocumentIndexWriter implements Processor<Document> { |
| 31 | |
IndexWriter writer; |
| 32 | |
Counter documentsWritten; |
| 33 | |
|
| 34 | 12 | public DocumentIndexWriter(TupleFlowParameters parameters) throws FileNotFoundException, IOException { |
| 35 | 12 | Parameters p = new Parameters(); |
| 36 | 12 | p.add("isCompressed", "true"); |
| 37 | 12 | writer = new IndexWriter(parameters.getXML().get("filename"), p); |
| 38 | 12 | documentsWritten = parameters.getCounter("Documents Written"); |
| 39 | 12 | } |
| 40 | |
|
| 41 | |
public void close() throws IOException { |
| 42 | 12 | writer.close(); |
| 43 | 12 | } |
| 44 | |
|
| 45 | |
public void process(Document document) throws IOException { |
| 46 | 12 | ByteArrayOutputStream stream = new ByteArrayOutputStream(); |
| 47 | 12 | VByteOutput output = new VByteOutput(new DataOutputStream(stream)); |
| 48 | |
|
| 49 | 12 | output.writeString(document.text); |
| 50 | 12 | for (Map.Entry<String, String> entry : document.metadata.entrySet()) { |
| 51 | 24 | output.writeString(entry.getKey()); |
| 52 | 24 | output.writeString(entry.getValue()); |
| 53 | |
} |
| 54 | |
|
| 55 | 12 | writer.add(new GenericElement(document.identifier, stream.toByteArray())); |
| 56 | 12 | if (documentsWritten != null) |
| 57 | 0 | documentsWritten.increment(); |
| 58 | 12 | } |
| 59 | |
|
| 60 | |
public static void verify(TupleFlowParameters parameters, ErrorHandler handler) { |
| 61 | 4 | if (!parameters.getXML().containsKey("filename")) { |
| 62 | 0 | handler.addError("DocumentIndexWriter requires an 'filename' parameter."); |
| 63 | 0 | return; |
| 64 | |
} |
| 65 | |
|
| 66 | 4 | String index = parameters.getXML().get("filename"); |
| 67 | 4 | Verification.requireWriteableFile(index, handler); |
| 68 | 4 | } |
| 69 | |
} |