Coverage Report - org.galagosearch.core.parse.AnchorTextDocumentCreator
 
Classes in this File Line Coverage Branch Coverage Complexity
AnchorTextDocumentCreator
0%
0/36
0%
0/10
0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 
 3  
 package org.galagosearch.core.parse;
 4  
 
 5  
 import java.io.IOException;
 6  
 import java.util.ArrayList;
 7  
 import java.util.HashMap;
 8  
 import org.galagosearch.tupleflow.InputClass;
 9  
 import org.galagosearch.tupleflow.OutputClass;
 10  
 import org.galagosearch.tupleflow.StandardStep;
 11  
 import org.galagosearch.tupleflow.execution.Verified;
 12  
 import org.galagosearch.core.types.IdentifiedLink;
 13  
 
 14  
 /**
 15  
  * From an IdentifiedLink object, this class constructs a document containing
 16  
  * only anchor text.
 17  
  * 
 18  
  * @author trevor
 19  
  */
 20  
 @Verified
 21  
 @InputClass(className = "org.galagosearch.core.types.IdentifiedLink")
 22  
 @OutputClass(className = "org.galagosearch.core.parse.Document")
 23  0
 public class AnchorTextDocumentCreator extends StandardStep<IdentifiedLink, Document> {
 24  0
     TagTokenizer tokenizer = new TagTokenizer();
 25  0
     ArrayList<IdentifiedLink> links = new ArrayList<IdentifiedLink>();
 26  0
     String lastDocument = null;
 27  
 
 28  
     /**
 29  
      * This method takes the text from a link object, tokenizes it,
 30  
      * then adds it to a document object.
 31  
      */
 32  
     public void process(IdentifiedLink link) throws IOException {
 33  0
         if (lastDocument != null && !lastDocument.equals(link.identifier)) {
 34  0
             flush();
 35  
         }
 36  0
         links.add(link);
 37  0
         lastDocument = link.identifier;
 38  0
     }
 39  
 
 40  
     public void flush() throws IOException {
 41  0
         Document document = new Document();
 42  
 
 43  0
         if (links.size() == 0) {
 44  0
             return;
 45  0
         } else if (links.size() == 1) {
 46  0
             IdentifiedLink link = links.get(0);
 47  
 
 48  0
             document.text = link.anchorText;
 49  0
             document.identifier = link.identifier;
 50  0
         } else {
 51  0
             StringBuilder builder = new StringBuilder();
 52  
 
 53  0
             for (IdentifiedLink link : links) {
 54  0
                 builder.append(link.anchorText);
 55  0
                 builder.append(' ');
 56  
             }
 57  
 
 58  0
             document.text = builder.toString();
 59  0
             document.identifier = links.get(0).identifier;
 60  
         }
 61  
 
 62  0
         document.terms = null;
 63  0
         document.metadata = new HashMap<String, String>();
 64  0
         document.tags = new ArrayList<Tag>();
 65  
 
 66  
         // parse the text into pieces
 67  0
         tokenizer.process(document);
 68  
 
 69  
         // send it on to the next stage
 70  0
         processor.process(document);
 71  0
         lastDocument = null;
 72  0
         links.clear();
 73  0
     }
 74  
 
 75  
     @Override
 76  
     public void close() throws IOException {
 77  0
         flush();
 78  0
         super.close();
 79  0
     }
 80  
 
 81  
     public Class<IdentifiedLink> getInputClass() {
 82  0
         return IdentifiedLink.class;
 83  
     }
 84  
 
 85  
     public Class<Document> getOutputClass() {
 86  0
         return Document.class;
 87  
     }
 88  
 }