1
2 package org.galagosearch.core.parse;
3
4 import java.io.BufferedReader;
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7
8 /***
9 *
10 * @author trevor
11 */
12 public class TrecTextParser implements DocumentStreamParser {
13 BufferedReader reader;
14
15 /*** Creates a new instance of TrecTextParser */
16 public TrecTextParser(BufferedReader reader) throws FileNotFoundException, IOException {
17 this.reader = reader;
18 }
19
20 public String waitFor(String tag) throws IOException {
21 String line;
22
23 while ((line = reader.readLine()) != null) {
24 if (line.startsWith(tag)) {
25 return line;
26 }
27 }
28
29 return null;
30 }
31
32 public String parseDocNumber() throws IOException {
33 String allText = waitFor("<DOCNO>");
34
35 while (allText.contains("</DOCNO>") == false) {
36 String line = reader.readLine();
37 if (line == null) {
38 break;
39 }
40 allText += line;
41 }
42
43 int start = allText.indexOf("<DOCNO>") + 7;
44 int end = allText.indexOf("</DOCNO>");
45
46 return new String(allText.substring(start, end).trim());
47 }
48
49 public Document nextDocument() throws IOException {
50 String line;
51
52 if (null == waitFor("<DOC>")) {
53 return null;
54 }
55 String identifier = parseDocNumber();
56 StringBuffer buffer = new StringBuffer();
57
58 String[] startTags = {"<TEXT>", "<HEADLINE>", "<TITLE>", "<HL>", "<HEAD>",
59 "<TTL>", "<DD>", "<DATE>", "<LP>", "<LEADPARA>"
60 };
61 String[] endTags = {"</TEXT>", "</HEADLINE>", "</TITLE>", "</HL>", "</HEAD>",
62 "</TTL>", "</DD>", "</DATE>", "</LP>", "</LEADPARA>"
63 };
64
65 int inTag = -1;
66
67 while ((line = reader.readLine()) != null) {
68 if (line.startsWith("</DOC>")) {
69 break;
70 }
71 if (line.startsWith("<")) {
72 if (inTag >= 0 && line.startsWith(endTags[inTag])) {
73 inTag = -1;
74
75 buffer.append(line);
76 buffer.append(' ');
77 } else if (inTag < 0) {
78 for (int i = 0; i < startTags.length; i++) {
79 if (line.startsWith(startTags[i])) {
80 inTag = i;
81 break;
82 }
83 }
84 }
85 }
86
87 if (inTag >= 0) {
88 buffer.append(line);
89 buffer.append(' ');
90 }
91 }
92
93 return new Document(identifier, buffer.toString());
94 }
95 }