Coverage Report - org.galagosearch.core.parse.DocumentSource
 
Classes in this File Line Coverage Branch Coverage Complexity
DocumentSource
0%
0/70
0%
0/46
0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 
 3  
 package org.galagosearch.core.parse;
 4  
 
 5  
 import java.io.File;
 6  
 import java.io.IOException;
 7  
 import java.util.ArrayList;
 8  
 import java.util.List;
 9  
 import org.galagosearch.core.index.VocabularyReader;
 10  
 import org.galagosearch.core.index.VocabularyReader.TermSlot;
 11  
 import org.galagosearch.core.index.IndexReader;
 12  
 import org.galagosearch.tupleflow.ExNihiloSource;
 13  
 import org.galagosearch.tupleflow.FileSource;
 14  
 import org.galagosearch.tupleflow.IncompatibleProcessorException;
 15  
 import org.galagosearch.tupleflow.Linkage;
 16  
 import org.galagosearch.tupleflow.OutputClass;
 17  
 import org.galagosearch.tupleflow.Parameters.Value;
 18  
 import org.galagosearch.tupleflow.Processor;
 19  
 import org.galagosearch.tupleflow.Step;
 20  
 import org.galagosearch.tupleflow.TupleFlowParameters;
 21  
 import org.galagosearch.tupleflow.execution.ErrorHandler;
 22  
 import org.galagosearch.tupleflow.execution.Verified;
 23  
 import org.galagosearch.core.types.DocumentSplit;
 24  
 
 25  
 /**
 26  
  * From a set of inputs, splits the input into many DocumentSplit records.
 27  
  * This will usually be in a stage by itself at the beginning of a Galago pipeline.
 28  
  * This is somewhat similar to FileSource, except that it can autodetect file formats.
 29  
  * This splitter can detect ARC, TREC, TRECWEB and corpus files.
 30  
  * 
 31  
  * @author trevor
 32  
  */
 33  
 
 34  
 @Verified
 35  
 @OutputClass(className = "org.galagosearch.core.types.DocumentSplit")
 36  
 public class DocumentSource implements ExNihiloSource<DocumentSplit> {
 37  
     public Processor processor;
 38  
     TupleFlowParameters parameters;
 39  
     
 40  0
     public DocumentSource(TupleFlowParameters parameters) {
 41  0
         this.parameters = parameters;
 42  0
     }
 43  
 
 44  
     private String getExtension(String fileName) {
 45  0
         String[] fields = fileName.split("\\.");
 46  
         
 47  
         // A filename needs to have a period to have an extension.
 48  0
         if (fields.length <= 1) {
 49  0
             return "";
 50  
         }
 51  
         
 52  
         // If the last chunk of the filename is gz, we'll ignore it.
 53  
         // The second-to-last bit is the type extension (but only if
 54  
         // there are at least three parts to the name).
 55  0
         if (fields[fields.length-1].equals("gz")) {
 56  0
             if (fields.length > 2) {
 57  0
                 return fields[fields.length-2];
 58  
             } else {
 59  0
                 return "";
 60  
             }
 61  
         }
 62  
         
 63  
         // No 'gz' extension, so just return the last part.
 64  0
         return fields[fields.length-1];
 65  
     }
 66  
 
 67  
     private void processCorpusFile(String fileName, String fileType) throws IOException {
 68  
         // If this is a big file, we'll split it into roughly 100MB pieces.
 69  0
         long fileLength = new File(fileName).length();
 70  0
         long chunkSize = 100 * 1024 * 1024;
 71  
         
 72  0
         IndexReader reader = new IndexReader(fileName);
 73  0
         VocabularyReader vocabulary = reader.getVocabulary();
 74  0
         List<TermSlot> slots = vocabulary.getSlots();
 75  0
         int pieces = Math.max(1, (int) (fileLength / chunkSize));
 76  0
         ArrayList<byte[]> keys = new ArrayList<byte[]>();
 77  
 
 78  0
         for (int i = 1; i < pieces; ++i) {
 79  0
             float fraction = (float) i / pieces;
 80  0
             int slot = (int) (fraction * slots.size());
 81  0
             keys.add(slots.get(slot).termData);
 82  
         }
 83  
 
 84  0
         for (int i = 0; i < pieces; ++i) {
 85  0
             byte[] firstKey = new byte[0];
 86  0
             byte[] lastKey = new byte[0];
 87  
 
 88  0
             if (i > 0) {
 89  0
                 firstKey = keys.get(i - 1);
 90  
             }
 91  0
             if (i < pieces - 1) {
 92  0
                 lastKey = keys.get(i);
 93  
             }
 94  0
             DocumentSplit split = new DocumentSplit(fileName, fileType, false, firstKey, lastKey);
 95  0
             processor.process(split);
 96  
         }
 97  0
     }
 98  
     
 99  
     private void processFile(String fileName) throws IOException {
 100  
         // First, try to detect what kind of file this is:
 101  0
         boolean isCompressed = fileName.endsWith(".gz");
 102  0
         String fileType = null;
 103  
         
 104  
         // We'll try to detect by extension first, so we don't have to open the file
 105  0
         String extension = getExtension(fileName);
 106  0
         if (extension.equals("corpus") ||
 107  
             extension.equals("trecweb") ||
 108  
             extension.equals("trectext") ||
 109  
             extension.equals("arc") ||
 110  
             extension.equals("txt") ||
 111  
             extension.equals("html") ||
 112  
             extension.equals("xml")) {
 113  0
             fileType = extension;
 114  
         } else {
 115  
             // Oh well, we need to autodetect the file type.
 116  0
             if (IndexReader.isIndexFile(fileName)) {
 117  0
                 fileType = "corpus";
 118  
             } else {
 119  
                 // Eventually it'd be nice to do more format detection here.
 120  0
                 throw new IOException("Couldn't determine file type for: " + fileName);
 121  
             }
 122  
         }
 123  
         
 124  0
         if (fileType.equals("corpus")) {
 125  0
             processCorpusFile(fileName, fileType);            
 126  
         } else {
 127  0
             processSplit(fileName, fileType, isCompressed);
 128  
         }
 129  0
     }
 130  
     
 131  
     private void processDirectory(File root) throws IOException {
 132  0
         for (File file : root.listFiles()) {
 133  0
             if (file.isHidden()) {
 134  0
                 continue;
 135  
             }
 136  0
             if (file.isDirectory()) {
 137  0
                 processDirectory(file);
 138  
             } else {
 139  0
                 processFile(file.getAbsolutePath());
 140  
             }
 141  
         }
 142  0
     }
 143  
     
 144  
     public void run() throws IOException {
 145  0
         if (parameters.getXML().containsKey("directory")) {
 146  0
             List<Value> directories = parameters.getXML().list("directory");
 147  
 
 148  0
             for (Value directory : directories) {
 149  0
                 File directoryFile = new File(directory.toString());
 150  0
                 processDirectory(directoryFile);
 151  0
             }
 152  0
         } else if (parameters.getXML().containsKey("filename")) {
 153  0
             List<Value> files = parameters.getXML().list("filename");
 154  
 
 155  0
             for (Value file : files) {
 156  0
                 processFile(file.toString());
 157  
             }
 158  
         }
 159  
 
 160  0
         processor.close();
 161  0
     }
 162  
 
 163  
     public void setProcessor(Step processor) throws IncompatibleProcessorException {
 164  0
         Linkage.link(this, processor);
 165  0
     }
 166  
 
 167  
     public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
 168  0
         FileSource.verify(parameters, handler);
 169  0
     }
 170  
 
 171  
     private void processSplit(String fileName, String fileType, boolean isCompressed) throws IOException {
 172  0
         DocumentSplit split = new DocumentSplit(fileName, fileType, isCompressed, new byte[0], new byte[0]);
 173  0
         processor.process(split);
 174  0
     }
 175  
 }