View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.parse;
3   
4   import java.io.IOException;
5   import java.net.URL;
6   import org.galagosearch.core.types.ExtractedLink;
7   import org.galagosearch.tupleflow.InputClass;
8   import org.galagosearch.tupleflow.OutputClass;
9   import org.galagosearch.tupleflow.StandardStep;
10  import org.galagosearch.tupleflow.TupleFlowParameters;
11  import org.galagosearch.tupleflow.execution.Verified;
12  
13  /***
14   * Extracts links from documents (anchor text, URLs).
15   * 
16   * @author trevor
17   */
18  @Verified
19  @InputClass(className = "org.galagosearch.core.parse.Document")
20  @OutputClass(className = "org.galagosearch.core.types.ExtractedLink")
21  public class LinkExtractor extends StandardStep<Document, ExtractedLink> {
22      private boolean acceptLocalLinks;
23      private boolean acceptNoFollowLinks;
24  
25      public LinkExtractor(TupleFlowParameters parameters) {
26          acceptLocalLinks = parameters.getXML().get("acceptLocalLinks", false);
27          acceptNoFollowLinks = parameters.getXML().get("acceptNoFollowLinks", false);
28      }
29  
30      public String scrubUrl(String url) {
31          // remove a leading pound sign
32          if (url.charAt(url.length() - 1) == '#') {
33              url = url.substring(0, url.length() - 1);        // make it lowercase
34          }
35          url = url.toLowerCase();
36  
37          // remove a port number, if it's the default number
38          url = url.replace(":80/", "/");
39          if (url.endsWith(":80")) {
40              url = url.replace(":80", "");
41          }
42          // remove trailing slashes
43          while (url.charAt(url.length() - 1) == '/') {
44              url = url.substring(0, url.length() - 1);
45          }
46          return url;
47      }
48  
49      public void process(Document document) throws IOException {
50          String sourceUrl = document.metadata.get("url");
51  
52          if (sourceUrl == null) {
53              return;
54          }
55          URL base = new URL(sourceUrl);
56  
57          for (Tag t : document.tags) {
58              if (t.name.equals("base")) {
59                  try {
60                      base = new URL(base, t.attributes.get("href"));
61                  } catch (Exception e) {
62                      // this can happen when the link protocol is unknown
63                      base = new URL(sourceUrl);
64                      continue;
65                  }
66              } else if (t.name.equals("a")) {
67                  String destSpec = t.attributes.get("href");
68                  URL destUrlObject = null;
69                  String destUrl = null;
70  
71                  try {
72                      destUrlObject = new URL(base, destSpec);
73                      destUrl = destUrlObject.toString();
74                  } catch (Exception e) {
75                      // this can happen when the link protocol is unknown
76                      continue;
77                  }
78  
79                  boolean linkIsLocal = destUrlObject.getHost().equals(base.getHost());
80  
81                  // if we're filtering out local links, there's no need to continue
82                  if (linkIsLocal && acceptLocalLinks == false) {
83                      continue;
84                  }
85                  ExtractedLink link = new ExtractedLink();
86  
87                  link.srcUrl = sourceUrl;
88                  link.destUrl = scrubUrl(destUrl);
89  
90                  StringBuilder builder = new StringBuilder();
91  
92                  for (int i = t.begin; i < t.end && i < document.terms.size(); i++) {
93                      String term = document.terms.get(i);
94  
95                      if (term != null) {
96                          builder.append(term);
97                          builder.append(' ');
98                      }
99                  }
100 
101                 link.anchorText = builder.toString().trim();
102 
103                 if (t.attributes.containsKey("rel") && t.attributes.get("rel").equals("nofollow")) {
104                     link.noFollow = true;
105                 } else {
106                     link.noFollow = false;
107                 }
108 
109                 boolean acceptable = (acceptNoFollowLinks || link.noFollow == false) &&
110                         (acceptLocalLinks || linkIsLocal == false);
111 
112                 if (acceptable) {
113                     processor.process(link);
114                 }
115             }
116         }
117     }
118 
119     public Class<Document> getInputClass() {
120         return Document.class;
121     }
122 
123     public Class<ExtractedLink> getOutputClass() {
124         return ExtractedLink.class;
125     }
126 }