Coverage Report - org.galagosearch.core.parse.TrecWebParser
 
Classes in this File Line Coverage Branch Coverage Complexity
TrecWebParser
0%
0/45
0%
0/18
2.833
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 package org.galagosearch.core.parse;
 3  
 
 4  
 import java.io.BufferedReader;
 5  
 import java.io.FileNotFoundException;
 6  
 import java.io.IOException;
 7  
 
 8  
 /**
 9  
  *
 10  
  * @author trevor
 11  
  */
 12  
 public class TrecWebParser implements DocumentStreamParser {
 13  
     BufferedReader reader;
 14  
 
 15  
     /** Creates a new instance of TrecWebParser */
 16  0
     public TrecWebParser(BufferedReader reader) throws FileNotFoundException, IOException {
 17  0
         this.reader = reader;
 18  0
     }
 19  
 
 20  
     public String waitFor(String tag) throws IOException {
 21  
         String line;
 22  
 
 23  0
         while ((line = reader.readLine()) != null) {
 24  0
             if (line.startsWith(tag)) {
 25  0
                 return line;
 26  
             }
 27  
         }
 28  
 
 29  0
         return null;
 30  
     }
 31  
 
 32  
     public void close() throws IOException {
 33  0
         reader.close();
 34  0
         reader = null;
 35  0
     }
 36  
 
 37  
     public String scrubUrl(String url) {
 38  
         // remove a trailing pound sign
 39  0
         if (url.charAt(url.length() - 1) == '#') {
 40  0
             url = url.substring(0, url.length() - 1);        // make it lowercase
 41  
         }
 42  0
         url = url.toLowerCase();
 43  
 
 44  
         // remove a port number
 45  0
         url = url.replace(":80/", "/");
 46  0
         if (url.endsWith(":80")) {
 47  0
             url = url.replace(":80", "");        // remove trailing slashes
 48  
         }
 49  0
         while (url.charAt(url.length() - 1) == '/') {
 50  0
             url = url.substring(0, url.length() - 1);
 51  
         }
 52  0
         return url;
 53  
     }
 54  
 
 55  
     public String readUrl() throws IOException {
 56  0
         String url = reader.readLine();
 57  0
         int space = url.indexOf(' ');
 58  
 
 59  0
         if (space < 0) {
 60  0
             space = url.length();
 61  
         }
 62  0
         return scrubUrl(url.substring(0, space));
 63  
     }
 64  
 
 65  
     public Document nextDocument() throws IOException {
 66  0
         String line = null;
 67  
 
 68  0
         if (null == waitFor("<DOC>")) {
 69  0
             close();
 70  0
             return null;
 71  
         }
 72  
 
 73  0
         String identifier = waitFor("<DOCNO>");
 74  0
         identifier = identifier.substring(7).trim();
 75  0
         identifier = identifier.substring(0, identifier.length() - 8);
 76  0
         identifier = new String(identifier.trim());
 77  0
         waitFor("<DOCHDR>");
 78  0
         String url = readUrl();
 79  0
         waitFor("</DOCHDR>");
 80  
 
 81  0
         StringBuilder buffer = new StringBuilder(20 * 1024);
 82  
 
 83  0
         while ((line = reader.readLine()) != null) {
 84  0
             if (line.startsWith("</DOC>")) {
 85  0
                 break;
 86  
             }
 87  0
             buffer.append(line);
 88  0
             buffer.append(' ');
 89  
         }
 90  
 
 91  0
         Document result = new Document(identifier, buffer.toString());
 92  0
         result.metadata.put("url", new String(url));
 93  0
         result.metadata.put("identifier", result.identifier);
 94  
 
 95  0
         return result;
 96  
     }
 97  
 }