View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.parse;
4   
5   import java.io.BufferedReader;
6   import java.io.File;
7   import java.io.IOException;
8   import org.galagosearch.tupleflow.Parameters;
9   
10  /***
11   * Reads data from a single text file of type HTML, XML or txt.
12   * 
13   * @author trevor
14   */
15  class FileParser implements DocumentStreamParser {
16      BufferedReader reader;
17      String identifier;
18  
19      public FileParser(Parameters parameters, String fileName, BufferedReader bufferedReader) {
20          this.identifier = getIdentifier(parameters, fileName);
21          this.reader = bufferedReader;
22      }
23  
24      public String getIdentifier(Parameters parameters, String fileName) {
25          String idType = parameters.get("identifier", "filename");
26          if (idType.equals("filename")) {
27              return fileName;
28          } else {
29              String id = stripExtensions(fileName);
30              id = new File(id).getName();
31              return id;
32          }
33      }
34  
35      public String stripExtension(String name, String extension) {
36          if (name.endsWith(extension)) {
37              name = name.substring(0, name.length()-extension.length());
38          }
39          return name;
40      }
41  
42      public String stripExtensions(String name) {
43          name = stripExtension(name, ".gz");
44          name = stripExtension(name, ".html");
45          name = stripExtension(name, ".xml");
46          name = stripExtension(name, ".txt");
47          return name;
48      }
49  
50      public String getTitle(String text) {
51          int start = text.indexOf("<title>");
52          if (start < 0) return "";
53          int end = text.indexOf("</title>");
54          if (end < 0) return "";
55          return new String(text.substring(start + "<title>".length(), end));
56      }
57  
58      public Document nextDocument() throws IOException {
59          if (reader == null) {
60              return null;
61          }
62          
63          StringBuilder builder = new StringBuilder();
64          String line;
65  
66          while ((line = reader.readLine()) != null) {
67              builder.append(line);
68              builder.append("\n");
69          }
70  
71          Document result = new Document();
72          result.identifier = identifier;
73          result.text = builder.toString();
74          result.metadata.put("title", getTitle(result.text));
75          reader.close();
76          reader = null;
77          return result;
78      }
79  }