1
2
3 package org.galagosearch.core.parse;
4
5 import java.io.BufferedReader;
6 import java.io.File;
7 import java.io.IOException;
8 import org.galagosearch.tupleflow.Parameters;
9
10 /***
11 * Reads data from a single text file of type HTML, XML or txt.
12 *
13 * @author trevor
14 */
15 class FileParser implements DocumentStreamParser {
16 BufferedReader reader;
17 String identifier;
18
19 public FileParser(Parameters parameters, String fileName, BufferedReader bufferedReader) {
20 this.identifier = getIdentifier(parameters, fileName);
21 this.reader = bufferedReader;
22 }
23
24 public String getIdentifier(Parameters parameters, String fileName) {
25 String idType = parameters.get("identifier", "filename");
26 if (idType.equals("filename")) {
27 return fileName;
28 } else {
29 String id = stripExtensions(fileName);
30 id = new File(id).getName();
31 return id;
32 }
33 }
34
35 public String stripExtension(String name, String extension) {
36 if (name.endsWith(extension)) {
37 name = name.substring(0, name.length()-extension.length());
38 }
39 return name;
40 }
41
42 public String stripExtensions(String name) {
43 name = stripExtension(name, ".gz");
44 name = stripExtension(name, ".html");
45 name = stripExtension(name, ".xml");
46 name = stripExtension(name, ".txt");
47 return name;
48 }
49
50 public String getTitle(String text) {
51 int start = text.indexOf("<title>");
52 if (start < 0) return "";
53 int end = text.indexOf("</title>");
54 if (end < 0) return "";
55 return new String(text.substring(start + "<title>".length(), end));
56 }
57
58 public Document nextDocument() throws IOException {
59 if (reader == null) {
60 return null;
61 }
62
63 StringBuilder builder = new StringBuilder();
64 String line;
65
66 while ((line = reader.readLine()) != null) {
67 builder.append(line);
68 builder.append("\n");
69 }
70
71 Document result = new Document();
72 result.identifier = identifier;
73 result.text = builder.toString();
74 result.metadata.put("title", getTitle(result.text));
75 reader.close();
76 reader = null;
77 return result;
78 }
79 }