View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.parse;
3   
4   import java.io.BufferedReader;
5   import java.io.FileNotFoundException;
6   import java.io.IOException;
7   
8   /***
9    *
10   * @author trevor
11   */
12  public class TrecTextParser implements DocumentStreamParser {
13      BufferedReader reader;
14  
15      /*** Creates a new instance of TrecTextParser */
16      public TrecTextParser(BufferedReader reader) throws FileNotFoundException, IOException {
17          this.reader = reader;
18      }
19  
20      public String waitFor(String tag) throws IOException {
21          String line;
22  
23          while ((line = reader.readLine()) != null) {
24              if (line.startsWith(tag)) {
25                  return line;
26              }
27          }
28  
29          return null;
30      }
31  
32      public String parseDocNumber() throws IOException {
33          String allText = waitFor("<DOCNO>");
34  
35          while (allText.contains("</DOCNO>") == false) {
36              String line = reader.readLine();
37              if (line == null) {
38                  break;
39              }
40              allText += line;
41          }
42  
43          int start = allText.indexOf("<DOCNO>") + 7;
44          int end = allText.indexOf("</DOCNO>");
45  
46          return new String(allText.substring(start, end).trim());
47      }
48  
49      public Document nextDocument() throws IOException {
50          String line;
51  
52          if (null == waitFor("<DOC>")) {
53              return null;
54          }
55          String identifier = parseDocNumber();
56          StringBuffer buffer = new StringBuffer();
57  
58          String[] startTags = {"<TEXT>", "<HEADLINE>", "<TITLE>", "<HL>", "<HEAD>",
59              "<TTL>", "<DD>", "<DATE>", "<LP>", "<LEADPARA>"
60          };
61          String[] endTags = {"</TEXT>", "</HEADLINE>", "</TITLE>", "</HL>", "</HEAD>",
62              "</TTL>", "</DD>", "</DATE>", "</LP>", "</LEADPARA>"
63          };
64  
65          int inTag = -1;
66  
67          while ((line = reader.readLine()) != null) {
68              if (line.startsWith("</DOC>")) {
69                  break;
70              }
71              if (line.startsWith("<")) {
72                  if (inTag >= 0 && line.startsWith(endTags[inTag])) {
73                      inTag = -1;
74  
75                      buffer.append(line);
76                      buffer.append(' ');
77                  } else if (inTag < 0) {
78                      for (int i = 0; i < startTags.length; i++) {
79                          if (line.startsWith(startTags[i])) {
80                              inTag = i;
81                              break;
82                          }
83                      }
84                  }
85              }
86  
87              if (inTag >= 0) {
88                  buffer.append(line);
89                  buffer.append(' ');
90              }
91          }
92  
93          return new Document(identifier, buffer.toString());
94      }
95  }