1
2 package org.galagosearch.core.parse;
3
4 import java.io.BufferedReader;
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7
8 /***
9 *
10 * @author trevor
11 */
12 public class TrecWebParser implements DocumentStreamParser {
13 BufferedReader reader;
14
15 /*** Creates a new instance of TrecWebParser */
16 public TrecWebParser(BufferedReader reader) throws FileNotFoundException, IOException {
17 this.reader = reader;
18 }
19
20 public String waitFor(String tag) throws IOException {
21 String line;
22
23 while ((line = reader.readLine()) != null) {
24 if (line.startsWith(tag)) {
25 return line;
26 }
27 }
28
29 return null;
30 }
31
32 public void close() throws IOException {
33 reader.close();
34 reader = null;
35 }
36
37 public String scrubUrl(String url) {
38
39 if (url.charAt(url.length() - 1) == '#') {
40 url = url.substring(0, url.length() - 1);
41 }
42 url = url.toLowerCase();
43
44
45 url = url.replace(":80/", "/");
46 if (url.endsWith(":80")) {
47 url = url.replace(":80", "");
48 }
49 while (url.charAt(url.length() - 1) == '/') {
50 url = url.substring(0, url.length() - 1);
51 }
52 return url;
53 }
54
55 public String readUrl() throws IOException {
56 String url = reader.readLine();
57 int space = url.indexOf(' ');
58
59 if (space < 0) {
60 space = url.length();
61 }
62 return scrubUrl(url.substring(0, space));
63 }
64
65 public Document nextDocument() throws IOException {
66 String line = null;
67
68 if (null == waitFor("<DOC>")) {
69 close();
70 return null;
71 }
72
73 String identifier = waitFor("<DOCNO>");
74 identifier = identifier.substring(7).trim();
75 identifier = identifier.substring(0, identifier.length() - 8);
76 identifier = new String(identifier.trim());
77 waitFor("<DOCHDR>");
78 String url = readUrl();
79 waitFor("</DOCHDR>");
80
81 StringBuilder buffer = new StringBuilder(20 * 1024);
82
83 while ((line = reader.readLine()) != null) {
84 if (line.startsWith("</DOC>")) {
85 break;
86 }
87 buffer.append(line);
88 buffer.append(' ');
89 }
90
91 Document result = new Document(identifier, buffer.toString());
92 result.metadata.put("url", new String(url));
93 result.metadata.put("identifier", result.identifier);
94
95 return result;
96 }
97 }