1
2 package org.galagosearch.core.parse;
3
4 import java.io.IOException;
5 import java.net.URL;
6 import org.galagosearch.core.types.ExtractedLink;
7 import org.galagosearch.tupleflow.InputClass;
8 import org.galagosearch.tupleflow.OutputClass;
9 import org.galagosearch.tupleflow.StandardStep;
10 import org.galagosearch.tupleflow.TupleFlowParameters;
11 import org.galagosearch.tupleflow.execution.Verified;
12
13 /***
14 * Extracts links from documents (anchor text, URLs).
15 *
16 * @author trevor
17 */
18 @Verified
19 @InputClass(className = "org.galagosearch.core.parse.Document")
20 @OutputClass(className = "org.galagosearch.core.types.ExtractedLink")
21 public class LinkExtractor extends StandardStep<Document, ExtractedLink> {
22 private boolean acceptLocalLinks;
23 private boolean acceptNoFollowLinks;
24
25 public LinkExtractor(TupleFlowParameters parameters) {
26 acceptLocalLinks = parameters.getXML().get("acceptLocalLinks", false);
27 acceptNoFollowLinks = parameters.getXML().get("acceptNoFollowLinks", false);
28 }
29
30 public String scrubUrl(String url) {
31
32 if (url.charAt(url.length() - 1) == '#') {
33 url = url.substring(0, url.length() - 1);
34 }
35 url = url.toLowerCase();
36
37
38 url = url.replace(":80/", "/");
39 if (url.endsWith(":80")) {
40 url = url.replace(":80", "");
41 }
42
43 while (url.charAt(url.length() - 1) == '/') {
44 url = url.substring(0, url.length() - 1);
45 }
46 return url;
47 }
48
49 public void process(Document document) throws IOException {
50 String sourceUrl = document.metadata.get("url");
51
52 if (sourceUrl == null) {
53 return;
54 }
55 URL base = new URL(sourceUrl);
56
57 for (Tag t : document.tags) {
58 if (t.name.equals("base")) {
59 try {
60 base = new URL(base, t.attributes.get("href"));
61 } catch (Exception e) {
62
63 base = new URL(sourceUrl);
64 continue;
65 }
66 } else if (t.name.equals("a")) {
67 String destSpec = t.attributes.get("href");
68 URL destUrlObject = null;
69 String destUrl = null;
70
71 try {
72 destUrlObject = new URL(base, destSpec);
73 destUrl = destUrlObject.toString();
74 } catch (Exception e) {
75
76 continue;
77 }
78
79 boolean linkIsLocal = destUrlObject.getHost().equals(base.getHost());
80
81
82 if (linkIsLocal && acceptLocalLinks == false) {
83 continue;
84 }
85 ExtractedLink link = new ExtractedLink();
86
87 link.srcUrl = sourceUrl;
88 link.destUrl = scrubUrl(destUrl);
89
90 StringBuilder builder = new StringBuilder();
91
92 for (int i = t.begin; i < t.end && i < document.terms.size(); i++) {
93 String term = document.terms.get(i);
94
95 if (term != null) {
96 builder.append(term);
97 builder.append(' ');
98 }
99 }
100
101 link.anchorText = builder.toString().trim();
102
103 if (t.attributes.containsKey("rel") && t.attributes.get("rel").equals("nofollow")) {
104 link.noFollow = true;
105 } else {
106 link.noFollow = false;
107 }
108
109 boolean acceptable = (acceptNoFollowLinks || link.noFollow == false) &&
110 (acceptLocalLinks || linkIsLocal == false);
111
112 if (acceptable) {
113 processor.process(link);
114 }
115 }
116 }
117 }
118
119 public Class<Document> getInputClass() {
120 return Document.class;
121 }
122
123 public Class<ExtractedLink> getOutputClass() {
124 return ExtractedLink.class;
125 }
126 }