View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.parse;
3   
4   import java.io.BufferedReader;
5   import java.io.FileNotFoundException;
6   import java.io.IOException;
7   
8   /***
9    *
10   * @author trevor
11   */
12  public class TrecWebParser implements DocumentStreamParser {
13      BufferedReader reader;
14  
15      /*** Creates a new instance of TrecWebParser */
16      public TrecWebParser(BufferedReader reader) throws FileNotFoundException, IOException {
17          this.reader = reader;
18      }
19  
20      public String waitFor(String tag) throws IOException {
21          String line;
22  
23          while ((line = reader.readLine()) != null) {
24              if (line.startsWith(tag)) {
25                  return line;
26              }
27          }
28  
29          return null;
30      }
31  
32      public void close() throws IOException {
33          reader.close();
34          reader = null;
35      }
36  
37      public String scrubUrl(String url) {
38          // remove a trailing pound sign
39          if (url.charAt(url.length() - 1) == '#') {
40              url = url.substring(0, url.length() - 1);        // make it lowercase
41          }
42          url = url.toLowerCase();
43  
44          // remove a port number
45          url = url.replace(":80/", "/");
46          if (url.endsWith(":80")) {
47              url = url.replace(":80", "");        // remove trailing slashes
48          }
49          while (url.charAt(url.length() - 1) == '/') {
50              url = url.substring(0, url.length() - 1);
51          }
52          return url;
53      }
54  
55      public String readUrl() throws IOException {
56          String url = reader.readLine();
57          int space = url.indexOf(' ');
58  
59          if (space < 0) {
60              space = url.length();
61          }
62          return scrubUrl(url.substring(0, space));
63      }
64  
65      public Document nextDocument() throws IOException {
66          String line = null;
67  
68          if (null == waitFor("<DOC>")) {
69              close();
70              return null;
71          }
72  
73          String identifier = waitFor("<DOCNO>");
74          identifier = identifier.substring(7).trim();
75          identifier = identifier.substring(0, identifier.length() - 8);
76          identifier = new String(identifier.trim());
77          waitFor("<DOCHDR>");
78          String url = readUrl();
79          waitFor("</DOCHDR>");
80  
81          StringBuilder buffer = new StringBuilder(20 * 1024);
82  
83          while ((line = reader.readLine()) != null) {
84              if (line.startsWith("</DOC>")) {
85                  break;
86              }
87              buffer.append(line);
88              buffer.append(' ');
89          }
90  
91          Document result = new Document(identifier, buffer.toString());
92          result.metadata.put("url", new String(url));
93          result.metadata.put("identifier", result.identifier);
94  
95          return result;
96      }
97  }