View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.parse;
4   
5   import java.io.IOException;
6   import java.util.ArrayList;
7   import java.util.HashMap;
8   import org.galagosearch.tupleflow.InputClass;
9   import org.galagosearch.tupleflow.OutputClass;
10  import org.galagosearch.tupleflow.StandardStep;
11  import org.galagosearch.tupleflow.execution.Verified;
12  import org.galagosearch.core.types.IdentifiedLink;
13  
14  /***
15   * From an IdentifiedLink object, this class constructs a document containing
16   * only anchor text.
17   * 
18   * @author trevor
19   */
20  @Verified
21  @InputClass(className = "org.galagosearch.core.types.IdentifiedLink")
22  @OutputClass(className = "org.galagosearch.core.parse.Document")
23  public class AnchorTextDocumentCreator extends StandardStep<IdentifiedLink, Document> {
24      TagTokenizer tokenizer = new TagTokenizer();
25      ArrayList<IdentifiedLink> links = new ArrayList<IdentifiedLink>();
26      String lastDocument = null;
27  
28      /***
29       * This method takes the text from a link object, tokenizes it,
30       * then adds it to a document object.
31       */
32      public void process(IdentifiedLink link) throws IOException {
33          if (lastDocument != null && !lastDocument.equals(link.identifier)) {
34              flush();
35          }
36          links.add(link);
37          lastDocument = link.identifier;
38      }
39  
40      public void flush() throws IOException {
41          Document document = new Document();
42  
43          if (links.size() == 0) {
44              return;
45          } else if (links.size() == 1) {
46              IdentifiedLink link = links.get(0);
47  
48              document.text = link.anchorText;
49              document.identifier = link.identifier;
50          } else {
51              StringBuilder builder = new StringBuilder();
52  
53              for (IdentifiedLink link : links) {
54                  builder.append(link.anchorText);
55                  builder.append(' ');
56              }
57  
58              document.text = builder.toString();
59              document.identifier = links.get(0).identifier;
60          }
61  
62          document.terms = null;
63          document.metadata = new HashMap<String, String>();
64          document.tags = new ArrayList<Tag>();
65  
66          // parse the text into pieces
67          tokenizer.process(document);
68  
69          // send it on to the next stage
70          processor.process(document);
71          lastDocument = null;
72          links.clear();
73      }
74  
75      @Override
76      public void close() throws IOException {
77          flush();
78          super.close();
79      }
80  
81      public Class<IdentifiedLink> getInputClass() {
82          return IdentifiedLink.class;
83      }
84  
85      public Class<Document> getOutputClass() {
86          return Document.class;
87      }
88  }