View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.parse;
4   
5   import java.io.BufferedInputStream;
6   import java.io.BufferedReader;
7   import java.io.FileInputStream;
8   import org.galagosearch.tupleflow.Counter;
9   import org.galagosearch.tupleflow.InputClass;
10  import org.galagosearch.tupleflow.OutputClass;
11  import org.galagosearch.tupleflow.StandardStep;
12  import org.galagosearch.tupleflow.execution.Verified;
13  import java.io.IOException;
14  import java.io.InputStreamReader;
15  import java.util.zip.GZIPInputStream;
16  import org.galagosearch.tupleflow.StreamCreator;
17  import org.galagosearch.tupleflow.TupleFlowParameters;
18  import org.galagosearch.core.types.DocumentSplit;
19  import org.galagosearch.tupleflow.Parameters;
20  
21  /***
22   *
23   * @author trevor
24   */
25  @Verified
26  @InputClass(className = "org.galagosearch.core.types.DocumentSplit")
27  @OutputClass(className = "org.galagosearch.core.parse.Document")
28  public class UniversalParser extends StandardStep<DocumentSplit, Document> {
29      private Counter documentCounter;
30      private Parameters parameters;
31  
32      public BufferedReader getBufferedReader(DocumentSplit split) throws IOException {
33          FileInputStream stream = StreamCreator.realInputStream(split.fileName);
34          BufferedReader reader;
35  
36          if (split.isCompressed) {
37              reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(stream)));
38          } else {
39              reader = new BufferedReader(new InputStreamReader(stream));
40          }
41          return reader;
42      }
43      
44      public BufferedInputStream getBufferedInputStream(DocumentSplit split) throws IOException {
45          FileInputStream fileStream = StreamCreator.realInputStream(split.fileName);
46          BufferedInputStream stream;
47  
48          if (split.isCompressed) {
49              stream = new BufferedInputStream(new GZIPInputStream(fileStream));
50          } else {
51              stream = new BufferedInputStream(fileStream);
52          }
53          return stream;
54      }
55      
56      public UniversalParser(TupleFlowParameters parameters) {
57          documentCounter = parameters.getCounter("Documents Parsed");
58          this.parameters = parameters.getXML();
59      }
60      
61      public void process(DocumentSplit split) throws IOException {
62          DocumentStreamParser parser;
63  
64          if (split.fileType.equals("html") ||
65              split.fileType.equals("xml") ||
66              split.fileType.equals("txt")) {
67              parser = new FileParser(parameters, split.fileName, getBufferedReader(split));
68          } else if (split.fileType.equals("arc")) {
69              parser = new ArcParser(getBufferedInputStream(split));
70          } else if (split.fileType.equals("trectext")) {
71              parser = new TrecTextParser(getBufferedReader(split));
72          } else if (split.fileType.equals("trecweb")) {
73              parser = new TrecWebParser(getBufferedReader(split));
74          } else if (split.fileType.equals("corpus")) {
75              parser = new IndexReaderSplitParser(split);
76          } else {
77              throw new IOException("Unknown fileType: " + split.fileType +
78                                    " for fileName: "  + split.fileName);
79          }
80  
81          Document document;
82          while ((document = parser.nextDocument()) != null) {
83              processor.process(document);
84              if (documentCounter != null)
85                  documentCounter.increment();
86          }
87      }
88  }