Coverage Report - org.galagosearch.core.parse.ArcParser
 
Classes in this File Line Coverage Branch Coverage Complexity
ArcParser
0%
0/49
0%
0/14
3
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 
 3  
 package org.galagosearch.core.parse;
 4  
 
 5  
 import java.io.BufferedInputStream;
 6  
 import java.io.FileNotFoundException;
 7  
 import java.io.IOException;
 8  
 import org.galagosearch.tupleflow.Utility;
 9  
 
 10  
 /**
 11  
  * Parses ARC files, like those produced by the Heretrix web crawler.
 12  
  * @author trevor
 13  
  */
 14  
 public class ArcParser implements DocumentStreamParser {
 15  
     BufferedInputStream stream;
 16  
 
 17  0
     public ArcParser(BufferedInputStream stream) throws FileNotFoundException, IOException {
 18  0
         this.stream = stream;
 19  0
     }
 20  
 
 21  
     private String readLine() throws IOException {
 22  0
         StringBuffer buffer = new StringBuffer();
 23  0
         boolean seenNonNewline = false;
 24  
 
 25  
         do {
 26  0
             int c = stream.read();
 27  
 
 28  0
             if (c == -1) {
 29  0
                 break;
 30  
             }
 31  0
             if (c == '\n') {
 32  0
                 if (seenNonNewline) {
 33  0
                     break;
 34  
                 } else {
 35  
                     continue;
 36  
                 }
 37  
             }
 38  
 
 39  0
             seenNonNewline = true;
 40  0
             buffer.append((char) c);
 41  0
         } while (true);
 42  
 
 43  0
         return buffer.toString();
 44  
     }
 45  
 
 46  
     public Document nextDocument() throws IOException {
 47  
         // http://www.dmoz.org/robots.txt 207.200.81.154 20070312180115 text/plain 593
 48  
 
 49  
         // read the header line
 50  0
         String header = readLine();
 51  0
         String[] fields = header.split(" ");
 52  
 
 53  0
         String url = fields[0];
 54  0
         String ip = fields[1];
 55  0
         String date = fields[2];
 56  0
         String contentType = fields[3];
 57  0
         long length = Long.parseLong(fields[4]);
 58  
 
 59  
         // read the full document text
 60  0
         byte[] data = new byte[(int) length];
 61  0
         stream.read(data);
 62  
         // get the training newline
 63  0
         stream.read();
 64  0
         String fullText = Utility.makeString(data);
 65  0
         int headerEnd = findDoubleNewline(fullText);
 66  
 
 67  
         String serverHeader;
 68  
         String documentText;
 69  
 
 70  0
         if (headerEnd == 0) {
 71  0
             documentText = fullText;
 72  0
             serverHeader = "";
 73  
         } else {
 74  0
             serverHeader = fullText.substring(0, headerEnd);
 75  0
             documentText = fullText.substring(headerEnd + 1);
 76  
         }
 77  
 
 78  0
         Document result = new Document(new String(url), documentText);
 79  0
         System.out.println(url);
 80  0
         result.metadata.put("serverHeader", serverHeader);
 81  0
         result.metadata.put("contentType", contentType);
 82  0
         result.metadata.put("ip", ip);
 83  0
         result.metadata.put("date", date);
 84  
 
 85  0
         return result;
 86  
     }
 87  
 
 88  
     private int findDoubleNewline(final String fullText) {
 89  
         // scan the full text string looking for two '\n' chars in a row
 90  0
         boolean lastNewline = false;
 91  0
         int headerEnd = 0;
 92  0
         for (int i = 0; i < fullText.length(); i++) {
 93  0
             if (fullText.charAt(i) == '\n') {
 94  0
                 if (lastNewline) {
 95  0
                     headerEnd = i;
 96  0
                     break;
 97  
                 }
 98  0
                 lastNewline = true;
 99  
             } else {
 100  0
                 lastNewline = false;
 101  
             }
 102  
         }
 103  0
         return headerEnd;
 104  
     }
 105  
 }