Coverage Report - org.galagosearch.core.parse.LinkExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
LinkExtractor
0%
0/57
0%
0/38
0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 package org.galagosearch.core.parse;
 3  
 
 4  
 import java.io.IOException;
 5  
 import java.net.URL;
 6  
 import org.galagosearch.core.types.ExtractedLink;
 7  
 import org.galagosearch.tupleflow.InputClass;
 8  
 import org.galagosearch.tupleflow.OutputClass;
 9  
 import org.galagosearch.tupleflow.StandardStep;
 10  
 import org.galagosearch.tupleflow.TupleFlowParameters;
 11  
 import org.galagosearch.tupleflow.execution.Verified;
 12  
 
 13  
 /**
 14  
  * Extracts links from documents (anchor text, URLs).
 15  
  * 
 16  
  * @author trevor
 17  
  */
 18  
 @Verified
 19  
 @InputClass(className = "org.galagosearch.core.parse.Document")
 20  
 @OutputClass(className = "org.galagosearch.core.types.ExtractedLink")
 21  0
 public class LinkExtractor extends StandardStep<Document, ExtractedLink> {
 22  
     private boolean acceptLocalLinks;
 23  
     private boolean acceptNoFollowLinks;
 24  
 
 25  0
     public LinkExtractor(TupleFlowParameters parameters) {
 26  0
         acceptLocalLinks = parameters.getXML().get("acceptLocalLinks", false);
 27  0
         acceptNoFollowLinks = parameters.getXML().get("acceptNoFollowLinks", false);
 28  0
     }
 29  
 
 30  
     public String scrubUrl(String url) {
 31  
         // remove a leading pound sign
 32  0
         if (url.charAt(url.length() - 1) == '#') {
 33  0
             url = url.substring(0, url.length() - 1);        // make it lowercase
 34  
         }
 35  0
         url = url.toLowerCase();
 36  
 
 37  
         // remove a port number, if it's the default number
 38  0
         url = url.replace(":80/", "/");
 39  0
         if (url.endsWith(":80")) {
 40  0
             url = url.replace(":80", "");
 41  
         }
 42  
         // remove trailing slashes
 43  0
         while (url.charAt(url.length() - 1) == '/') {
 44  0
             url = url.substring(0, url.length() - 1);
 45  
         }
 46  0
         return url;
 47  
     }
 48  
 
 49  
     public void process(Document document) throws IOException {
 50  0
         String sourceUrl = document.metadata.get("url");
 51  
 
 52  0
         if (sourceUrl == null) {
 53  0
             return;
 54  
         }
 55  0
         URL base = new URL(sourceUrl);
 56  
 
 57  0
         for (Tag t : document.tags) {
 58  0
             if (t.name.equals("base")) {
 59  
                 try {
 60  0
                     base = new URL(base, t.attributes.get("href"));
 61  0
                 } catch (Exception e) {
 62  
                     // this can happen when the link protocol is unknown
 63  0
                     base = new URL(sourceUrl);
 64  0
                     continue;
 65  0
                 }
 66  0
             } else if (t.name.equals("a")) {
 67  0
                 String destSpec = t.attributes.get("href");
 68  0
                 URL destUrlObject = null;
 69  0
                 String destUrl = null;
 70  
 
 71  
                 try {
 72  0
                     destUrlObject = new URL(base, destSpec);
 73  0
                     destUrl = destUrlObject.toString();
 74  0
                 } catch (Exception e) {
 75  
                     // this can happen when the link protocol is unknown
 76  0
                     continue;
 77  0
                 }
 78  
 
 79  0
                 boolean linkIsLocal = destUrlObject.getHost().equals(base.getHost());
 80  
 
 81  
                 // if we're filtering out local links, there's no need to continue
 82  0
                 if (linkIsLocal && acceptLocalLinks == false) {
 83  0
                     continue;
 84  
                 }
 85  0
                 ExtractedLink link = new ExtractedLink();
 86  
 
 87  0
                 link.srcUrl = sourceUrl;
 88  0
                 link.destUrl = scrubUrl(destUrl);
 89  
 
 90  0
                 StringBuilder builder = new StringBuilder();
 91  
 
 92  0
                 for (int i = t.begin; i < t.end && i < document.terms.size(); i++) {
 93  0
                     String term = document.terms.get(i);
 94  
 
 95  0
                     if (term != null) {
 96  0
                         builder.append(term);
 97  0
                         builder.append(' ');
 98  
                     }
 99  
                 }
 100  
 
 101  0
                 link.anchorText = builder.toString().trim();
 102  
 
 103  0
                 if (t.attributes.containsKey("rel") && t.attributes.get("rel").equals("nofollow")) {
 104  0
                     link.noFollow = true;
 105  
                 } else {
 106  0
                     link.noFollow = false;
 107  
                 }
 108  
 
 109  0
                 boolean acceptable = (acceptNoFollowLinks || link.noFollow == false) &&
 110  
                         (acceptLocalLinks || linkIsLocal == false);
 111  
 
 112  0
                 if (acceptable) {
 113  0
                     processor.process(link);
 114  
                 }
 115  0
             }
 116  
         }
 117  0
     }
 118  
 
 119  
     public Class<Document> getInputClass() {
 120  0
         return Document.class;
 121  
     }
 122  
 
 123  
     public Class<ExtractedLink> getOutputClass() {
 124  0
         return ExtractedLink.class;
 125  
     }
 126  
 }