Coverage Report - org.galagosearch.core.parse.UniversalParser
 
Classes in this File Line Coverage Branch Coverage Complexity
UniversalParser
0%
0/31
0%
0/22
0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 
 3  
 package org.galagosearch.core.parse;
 4  
 
 5  
 import java.io.BufferedInputStream;
 6  
 import java.io.BufferedReader;
 7  
 import java.io.FileInputStream;
 8  
 import org.galagosearch.tupleflow.Counter;
 9  
 import org.galagosearch.tupleflow.InputClass;
 10  
 import org.galagosearch.tupleflow.OutputClass;
 11  
 import org.galagosearch.tupleflow.StandardStep;
 12  
 import org.galagosearch.tupleflow.execution.Verified;
 13  
 import java.io.IOException;
 14  
 import java.io.InputStreamReader;
 15  
 import java.util.zip.GZIPInputStream;
 16  
 import org.galagosearch.tupleflow.StreamCreator;
 17  
 import org.galagosearch.tupleflow.TupleFlowParameters;
 18  
 import org.galagosearch.core.types.DocumentSplit;
 19  
 import org.galagosearch.tupleflow.Parameters;
 20  
 
 21  
 /**
 22  
  *
 23  
  * @author trevor
 24  
  */
 25  
 @Verified
 26  
 @InputClass(className = "org.galagosearch.core.types.DocumentSplit")
 27  
 @OutputClass(className = "org.galagosearch.core.parse.Document")
 28  0
 public class UniversalParser extends StandardStep<DocumentSplit, Document> {
 29  
     private Counter documentCounter;
 30  
     private Parameters parameters;
 31  
 
 32  
     public BufferedReader getBufferedReader(DocumentSplit split) throws IOException {
 33  0
         FileInputStream stream = StreamCreator.realInputStream(split.fileName);
 34  
         BufferedReader reader;
 35  
 
 36  0
         if (split.isCompressed) {
 37  0
             reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(stream)));
 38  
         } else {
 39  0
             reader = new BufferedReader(new InputStreamReader(stream));
 40  
         }
 41  0
         return reader;
 42  
     }
 43  
     
 44  
     public BufferedInputStream getBufferedInputStream(DocumentSplit split) throws IOException {
 45  0
         FileInputStream fileStream = StreamCreator.realInputStream(split.fileName);
 46  
         BufferedInputStream stream;
 47  
 
 48  0
         if (split.isCompressed) {
 49  0
             stream = new BufferedInputStream(new GZIPInputStream(fileStream));
 50  
         } else {
 51  0
             stream = new BufferedInputStream(fileStream);
 52  
         }
 53  0
         return stream;
 54  
     }
 55  
     
 56  0
     public UniversalParser(TupleFlowParameters parameters) {
 57  0
         documentCounter = parameters.getCounter("Documents Parsed");
 58  0
         this.parameters = parameters.getXML();
 59  0
     }
 60  
     
 61  
     public void process(DocumentSplit split) throws IOException {
 62  
         DocumentStreamParser parser;
 63  
 
 64  0
         if (split.fileType.equals("html") ||
 65  
             split.fileType.equals("xml") ||
 66  
             split.fileType.equals("txt")) {
 67  0
             parser = new FileParser(parameters, split.fileName, getBufferedReader(split));
 68  0
         } else if (split.fileType.equals("arc")) {
 69  0
             parser = new ArcParser(getBufferedInputStream(split));
 70  0
         } else if (split.fileType.equals("trectext")) {
 71  0
             parser = new TrecTextParser(getBufferedReader(split));
 72  0
         } else if (split.fileType.equals("trecweb")) {
 73  0
             parser = new TrecWebParser(getBufferedReader(split));
 74  0
         } else if (split.fileType.equals("corpus")) {
 75  0
             parser = new IndexReaderSplitParser(split);
 76  
         } else {
 77  0
             throw new IOException("Unknown fileType: " + split.fileType +
 78  
                                   " for fileName: "  + split.fileName);
 79  
         }
 80  
 
 81  
         Document document;
 82  0
         while ((document = parser.nextDocument()) != null) {
 83  0
             processor.process(document);
 84  0
             if (documentCounter != null)
 85  0
                 documentCounter.increment();
 86  
         }
 87  0
     }
 88  
 }