1
2
3 package org.galagosearch.core.parse;
4
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.zip.GZIPInputStream;
import org.galagosearch.core.types.DocumentSplit;
import org.galagosearch.tupleflow.Counter;
import org.galagosearch.tupleflow.InputClass;
import org.galagosearch.tupleflow.OutputClass;
import org.galagosearch.tupleflow.Parameters;
import org.galagosearch.tupleflow.StandardStep;
import org.galagosearch.tupleflow.StreamCreator;
import org.galagosearch.tupleflow.TupleFlowParameters;
import org.galagosearch.tupleflow.execution.Verified;
20
21 /***
22 *
23 * @author trevor
24 */
25 @Verified
26 @InputClass(className = "org.galagosearch.core.types.DocumentSplit")
27 @OutputClass(className = "org.galagosearch.core.parse.Document")
28 public class UniversalParser extends StandardStep<DocumentSplit, Document> {
29 private Counter documentCounter;
30 private Parameters parameters;
31
32 public BufferedReader getBufferedReader(DocumentSplit split) throws IOException {
33 FileInputStream stream = StreamCreator.realInputStream(split.fileName);
34 BufferedReader reader;
35
36 if (split.isCompressed) {
37 reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(stream)));
38 } else {
39 reader = new BufferedReader(new InputStreamReader(stream));
40 }
41 return reader;
42 }
43
44 public BufferedInputStream getBufferedInputStream(DocumentSplit split) throws IOException {
45 FileInputStream fileStream = StreamCreator.realInputStream(split.fileName);
46 BufferedInputStream stream;
47
48 if (split.isCompressed) {
49 stream = new BufferedInputStream(new GZIPInputStream(fileStream));
50 } else {
51 stream = new BufferedInputStream(fileStream);
52 }
53 return stream;
54 }
55
56 public UniversalParser(TupleFlowParameters parameters) {
57 documentCounter = parameters.getCounter("Documents Parsed");
58 this.parameters = parameters.getXML();
59 }
60
61 public void process(DocumentSplit split) throws IOException {
62 DocumentStreamParser parser;
63
64 if (split.fileType.equals("html") ||
65 split.fileType.equals("xml") ||
66 split.fileType.equals("txt")) {
67 parser = new FileParser(parameters, split.fileName, getBufferedReader(split));
68 } else if (split.fileType.equals("arc")) {
69 parser = new ArcParser(getBufferedInputStream(split));
70 } else if (split.fileType.equals("trectext")) {
71 parser = new TrecTextParser(getBufferedReader(split));
72 } else if (split.fileType.equals("trecweb")) {
73 parser = new TrecWebParser(getBufferedReader(split));
74 } else if (split.fileType.equals("corpus")) {
75 parser = new IndexReaderSplitParser(split);
76 } else {
77 throw new IOException("Unknown fileType: " + split.fileType +
78 " for fileName: " + split.fileName);
79 }
80
81 Document document;
82 while ((document = parser.nextDocument()) != null) {
83 processor.process(document);
84 if (documentCounter != null)
85 documentCounter.increment();
86 }
87 }
88 }