1
2
3 package org.galagosearch.core.parse;
4
5 import org.galagosearch.tupleflow.ExNihiloSource;
6 import org.galagosearch.tupleflow.IncompatibleProcessorException;
7 import org.galagosearch.tupleflow.Linkage;
8 import org.galagosearch.tupleflow.OutputClass;
9 import org.galagosearch.tupleflow.TupleFlowParameters;
10 import org.galagosearch.tupleflow.Processor;
11 import org.galagosearch.tupleflow.Step;
12 import org.galagosearch.tupleflow.TypeReader;
13 import org.galagosearch.tupleflow.execution.ErrorHandler;
14 import org.galagosearch.tupleflow.execution.Verification;
15 import org.galagosearch.core.types.ExtractedLink;
16 import org.galagosearch.core.types.IdentifiedLink;
17 import java.io.IOException;
18 import org.galagosearch.core.types.DocumentData;
19
20 /***
21 *
22 * @author trevor
23 */
24 @OutputClass(className = "org.galagosearch.core.parse.DocumentLinkData")
25 public class LinkCombiner implements ExNihiloSource<IdentifiedLink>, IdentifiedLink.Source {
26 TypeReader<ExtractedLink> extractedLinks;
27 TypeReader<DocumentData> documentDatas;
28 DocumentLinkData linkData;
29 public Processor<DocumentLinkData> processor;
30
31 @SuppressWarnings("unchecked")
32 public LinkCombiner(TupleFlowParameters parameters) throws IOException {
33 String extractedLinksName = parameters.getXML().get("extractedLinks");
34 String documentDatasName = parameters.getXML().get("documentDatas");
35
36 extractedLinks = parameters.getTypeReader(extractedLinksName);
37 documentDatas = parameters.getTypeReader(documentDatasName);
38 }
39
40 public void setProcessor(Step processor) throws IncompatibleProcessorException {
41 Linkage.link(this, processor);
42 }
43
44 void match(DocumentData docData, ExtractedLink link) {
45 if (linkData == null) {
46 linkData = new DocumentLinkData();
47 linkData.identifier = docData.identifier;
48 linkData.url = docData.url;
49 linkData.textLength = docData.textLength;
50 }
51
52 linkData.links.add(link);
53 }
54
55 void flush() throws IOException {
56 if (linkData != null) {
57 processor.process(linkData);
58 }
59 }
60
61 public void run() throws IOException {
62 ExtractedLink link = extractedLinks.read();
63 DocumentData docData = documentDatas.read();
64
65 while (docData != null && link != null) {
66 int result = link.destUrl.compareTo(docData.url);
67 if (result == 0) {
68 match(docData, link);
69 link = extractedLinks.read();
70 } else {
71 if (result < 0) {
72 link = extractedLinks.read();
73 } else {
74 docData = documentDatas.read();
75 }
76 }
77 }
78
79 processor.close();
80 }
81
82 public Class<IdentifiedLink> getOutputClass() {
83 return IdentifiedLink.class;
84 }
85
86 public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
87 if (!Verification.requireParameters(new String[] { "extractedLinks", "documentDatas" },
88 parameters.getXML(), handler)) {
89 return;
90 }
91
92 String extractedLinksName = parameters.getXML().get("extractedLinks");
93 String documentDatasName = parameters.getXML().get("documentDatas");
94
95 Verification.verifyTypeReader(extractedLinksName, ExtractedLink.class, parameters, handler);
96 Verification.verifyTypeReader(documentDatasName, DocumentData.class, parameters, handler);
97 }
98 }