1
2
3 package org.galagosearch.core.parse;
4
5 import java.io.IOException;
6 import java.util.ArrayList;
7 import java.util.HashMap;
8 import org.galagosearch.tupleflow.InputClass;
9 import org.galagosearch.tupleflow.OutputClass;
10 import org.galagosearch.tupleflow.StandardStep;
11 import org.galagosearch.tupleflow.execution.Verified;
12 import org.galagosearch.core.types.IdentifiedLink;
13
14 /***
15 * From an IdentifiedLink object, this class constructs a document containing
16 * only anchor text.
17 *
18 * @author trevor
19 */
20 @Verified
21 @InputClass(className = "org.galagosearch.core.types.IdentifiedLink")
22 @OutputClass(className = "org.galagosearch.core.parse.Document")
23 public class AnchorTextDocumentCreator extends StandardStep<IdentifiedLink, Document> {
24 TagTokenizer tokenizer = new TagTokenizer();
25 ArrayList<IdentifiedLink> links = new ArrayList<IdentifiedLink>();
26 String lastDocument = null;
27
28 /***
29 * This method takes the text from a link object, tokenizes it,
30 * then adds it to a document object.
31 */
32 public void process(IdentifiedLink link) throws IOException {
33 if (lastDocument != null && !lastDocument.equals(link.identifier)) {
34 flush();
35 }
36 links.add(link);
37 lastDocument = link.identifier;
38 }
39
40 public void flush() throws IOException {
41 Document document = new Document();
42
43 if (links.size() == 0) {
44 return;
45 } else if (links.size() == 1) {
46 IdentifiedLink link = links.get(0);
47
48 document.text = link.anchorText;
49 document.identifier = link.identifier;
50 } else {
51 StringBuilder builder = new StringBuilder();
52
53 for (IdentifiedLink link : links) {
54 builder.append(link.anchorText);
55 builder.append(' ');
56 }
57
58 document.text = builder.toString();
59 document.identifier = links.get(0).identifier;
60 }
61
62 document.terms = null;
63 document.metadata = new HashMap<String, String>();
64 document.tags = new ArrayList<Tag>();
65
66
67 tokenizer.process(document);
68
69
70 processor.process(document);
71 lastDocument = null;
72 links.clear();
73 }
74
75 @Override
76 public void close() throws IOException {
77 flush();
78 super.close();
79 }
80
81 public Class<IdentifiedLink> getInputClass() {
82 return IdentifiedLink.class;
83 }
84
85 public Class<Document> getOutputClass() {
86 return Document.class;
87 }
88 }