Coverage Report - org.galagosearch.core.parse.FileParser
 
Classes in this File Line Coverage Branch Coverage Complexity
FileParser
0%
0/36
0%
0/12
2.667
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 
 3  
 package org.galagosearch.core.parse;
 4  
 
 5  
 import java.io.BufferedReader;
 6  
 import java.io.File;
 7  
 import java.io.IOException;
 8  
 import org.galagosearch.tupleflow.Parameters;
 9  
 
 10  
 /**
 11  
  * Reads data from a single text file of type HTML, XML or txt.
 12  
  * 
 13  
  * @author trevor
 14  
  */
 15  
 class FileParser implements DocumentStreamParser {
 16  
     BufferedReader reader;
 17  
     String identifier;
 18  
 
 19  0
     public FileParser(Parameters parameters, String fileName, BufferedReader bufferedReader) {
 20  0
         this.identifier = getIdentifier(parameters, fileName);
 21  0
         this.reader = bufferedReader;
 22  0
     }
 23  
 
 24  
     public String getIdentifier(Parameters parameters, String fileName) {
 25  0
         String idType = parameters.get("identifier", "filename");
 26  0
         if (idType.equals("filename")) {
 27  0
             return fileName;
 28  
         } else {
 29  0
             String id = stripExtensions(fileName);
 30  0
             id = new File(id).getName();
 31  0
             return id;
 32  
         }
 33  
     }
 34  
 
 35  
     public String stripExtension(String name, String extension) {
 36  0
         if (name.endsWith(extension)) {
 37  0
             name = name.substring(0, name.length()-extension.length());
 38  
         }
 39  0
         return name;
 40  
     }
 41  
 
 42  
     public String stripExtensions(String name) {
 43  0
         name = stripExtension(name, ".gz");
 44  0
         name = stripExtension(name, ".html");
 45  0
         name = stripExtension(name, ".xml");
 46  0
         name = stripExtension(name, ".txt");
 47  0
         return name;
 48  
     }
 49  
 
 50  
     public String getTitle(String text) {
 51  0
         int start = text.indexOf("<title>");
 52  0
         if (start < 0) return "";
 53  0
         int end = text.indexOf("</title>");
 54  0
         if (end < 0) return "";
 55  0
         return new String(text.substring(start + "<title>".length(), end));
 56  
     }
 57  
 
 58  
     public Document nextDocument() throws IOException {
 59  0
         if (reader == null) {
 60  0
             return null;
 61  
         }
 62  
         
 63  0
         StringBuilder builder = new StringBuilder();
 64  
         String line;
 65  
 
 66  0
         while ((line = reader.readLine()) != null) {
 67  0
             builder.append(line);
 68  0
             builder.append("\n");
 69  
         }
 70  
 
 71  0
         Document result = new Document();
 72  0
         result.identifier = identifier;
 73  0
         result.text = builder.toString();
 74  0
         result.metadata.put("title", getTitle(result.text));
 75  0
         reader.close();
 76  0
         reader = null;
 77  0
         return result;
 78  
     }
 79  
 }