View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.parse;
4   
5   import org.galagosearch.tupleflow.ExNihiloSource;
6   import org.galagosearch.tupleflow.IncompatibleProcessorException;
7   import org.galagosearch.tupleflow.Linkage;
8   import org.galagosearch.tupleflow.OutputClass;
9   import org.galagosearch.tupleflow.TupleFlowParameters;
10  import org.galagosearch.tupleflow.Processor;
11  import org.galagosearch.tupleflow.Step;
12  import org.galagosearch.tupleflow.TypeReader;
13  import org.galagosearch.tupleflow.execution.ErrorHandler;
14  import org.galagosearch.tupleflow.execution.Verification;
15  import org.galagosearch.core.types.ExtractedLink;
16  import org.galagosearch.core.types.IdentifiedLink;
17  import java.io.IOException;
18  import org.galagosearch.core.types.DocumentData;
19  
20  /***
21   *
22   * @author trevor
23   */
24  @OutputClass(className = "org.galagosearch.core.parse.DocumentLinkData")
25  public class LinkCombiner implements ExNihiloSource<IdentifiedLink>, IdentifiedLink.Source {
26      TypeReader<ExtractedLink> extractedLinks;
27      TypeReader<DocumentData> documentDatas;
28      DocumentLinkData linkData;
29      public Processor<DocumentLinkData> processor;
30  
31      @SuppressWarnings("unchecked")
32      public LinkCombiner(TupleFlowParameters parameters) throws IOException {
33          String extractedLinksName = parameters.getXML().get("extractedLinks");
34          String documentDatasName = parameters.getXML().get("documentDatas");
35  
36          extractedLinks = parameters.getTypeReader(extractedLinksName);
37          documentDatas = parameters.getTypeReader(documentDatasName);
38      }
39  
40      public void setProcessor(Step processor) throws IncompatibleProcessorException {
41          Linkage.link(this, processor);
42      }
43  
44      void match(DocumentData docData, ExtractedLink link) {
45          if (linkData == null) {
46              linkData = new DocumentLinkData();
47              linkData.identifier = docData.identifier;
48              linkData.url = docData.url;
49              linkData.textLength = docData.textLength;
50          }
51          
52          linkData.links.add(link);
53      }
54      
55      void flush() throws IOException {
56          if (linkData != null) {
57              processor.process(linkData);
58          }
59      }
60      
61      public void run() throws IOException {
62          ExtractedLink link = extractedLinks.read();
63          DocumentData docData = documentDatas.read();
64  
65          while (docData != null && link != null) {
66              int result = link.destUrl.compareTo(docData.url);
67              if (result == 0) {
68                  match(docData, link);
69                  link = extractedLinks.read();
70              } else {
71                  if (result < 0) {
72                      link = extractedLinks.read();
73                  } else {
74                      docData = documentDatas.read();
75                  }
76              }
77          }
78  
79          processor.close();
80      }
81  
82      public Class<IdentifiedLink> getOutputClass() {
83          return IdentifiedLink.class;
84      }
85  
86      public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
87          if (!Verification.requireParameters(new String[] { "extractedLinks", "documentDatas" },
88                                              parameters.getXML(), handler)) {
89              return;
90          }
91  
92          String extractedLinksName = parameters.getXML().get("extractedLinks");
93          String documentDatasName = parameters.getXML().get("documentDatas");
94  
95          Verification.verifyTypeReader(extractedLinksName, ExtractedLink.class, parameters, handler);
96          Verification.verifyTypeReader(documentDatasName, DocumentData.class, parameters, handler);
97      }
98  }