1
2
3 package org.galagosearch.core.parse;
4
5 import java.io.IOException;
6 import org.galagosearch.core.types.AdditionalDocumentText;
7 import org.galagosearch.tupleflow.InputClass;
8 import org.galagosearch.tupleflow.OutputClass;
9 import org.galagosearch.tupleflow.StandardStep;
10 import org.galagosearch.tupleflow.TupleFlowParameters;
11 import org.galagosearch.tupleflow.TypeReader;
12 import org.galagosearch.tupleflow.Utility;
13 import org.galagosearch.tupleflow.execution.ErrorHandler;
14 import org.galagosearch.tupleflow.execution.Verification;
15
16 /***
17 * Adds tuples of type AdditionalDocumentText to the end of the text field in
18 * a document. The AdditionalDocumentText stream is specified in the
19 * textSource parameter. This stage should be used before document tokenizing.
20 *
21 * @author trevor
22 */
23 @InputClass(className = "org.galagosearch.core.parse.Document")
24 @OutputClass(className = "org.galagosearch.core.parse.Document")
25 public class AdditionalTextCombiner extends StandardStep<Document, Document> {
26 TypeReader<AdditionalDocumentText> text;
27 AdditionalDocumentText last;
28
29 @SuppressWarnings("unchecked")
30 public AdditionalTextCombiner(TupleFlowParameters parameters) throws IOException {
31 String readerName = parameters.getXML().get("textSource");
32 text = parameters.getTypeReader(readerName);
33 last = text.read();
34 }
35
36 public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
37 if (!Verification.requireParameters(new String[] { "textSource" }, parameters.getXML(), handler))
38 return;
39
40 String readerName = parameters.getXML().get("textSource");
41 Verification.verifyTypeReader(readerName, AdditionalDocumentText.class, parameters, handler);
42 }
43
44 @Override
45 public void process(Document document) throws IOException {
46 while (last != null && Utility.compare(last.identifier, document.identifier) < 0) {
47 last = text.read();
48 }
49
50 if (last != null && last.identifier.equals(document.identifier)) {
51 document.text += last.text;
52 }
53
54 processor.process(document);
55 }
56 }