View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.parse;
4   
5   import java.io.BufferedInputStream;
6   import java.io.FileNotFoundException;
7   import java.io.IOException;
8   import org.galagosearch.tupleflow.Utility;
9   
10  /***
11   * Parses ARC files, like those produced by the Heretrix web crawler.
12   * @author trevor
13   */
14  public class ArcParser implements DocumentStreamParser {
15      BufferedInputStream stream;
16  
17      public ArcParser(BufferedInputStream stream) throws FileNotFoundException, IOException {
18          this.stream = stream;
19      }
20  
21      private String readLine() throws IOException {
22          StringBuffer buffer = new StringBuffer();
23          boolean seenNonNewline = false;
24  
25          do {
26              int c = stream.read();
27  
28              if (c == -1) {
29                  break;
30              }
31              if (c == '\n') {
32                  if (seenNonNewline) {
33                      break;
34                  } else {
35                      continue;
36                  }
37              }
38  
39              seenNonNewline = true;
40              buffer.append((char) c);
41          } while (true);
42  
43          return buffer.toString();
44      }
45  
46      public Document nextDocument() throws IOException {
47          // http://www.dmoz.org/robots.txt 207.200.81.154 20070312180115 text/plain 593
48  
49          // read the header line
50          String header = readLine();
51          String[] fields = header.split(" ");
52  
53          String url = fields[0];
54          String ip = fields[1];
55          String date = fields[2];
56          String contentType = fields[3];
57          long length = Long.parseLong(fields[4]);
58  
59          // read the full document text
60          byte[] data = new byte[(int) length];
61          stream.read(data);
62          // get the training newline
63          stream.read();
64          String fullText = Utility.makeString(data);
65          int headerEnd = findDoubleNewline(fullText);
66  
67          String serverHeader;
68          String documentText;
69  
70          if (headerEnd == 0) {
71              documentText = fullText;
72              serverHeader = "";
73          } else {
74              serverHeader = fullText.substring(0, headerEnd);
75              documentText = fullText.substring(headerEnd + 1);
76          }
77  
78          Document result = new Document(new String(url), documentText);
79          System.out.println(url);
80          result.metadata.put("serverHeader", serverHeader);
81          result.metadata.put("contentType", contentType);
82          result.metadata.put("ip", ip);
83          result.metadata.put("date", date);
84  
85          return result;
86      }
87  
88      private int findDoubleNewline(final String fullText) {
89          // scan the full text string looking for two '\n' chars in a row
90          boolean lastNewline = false;
91          int headerEnd = 0;
92          for (int i = 0; i < fullText.length(); i++) {
93              if (fullText.charAt(i) == '\n') {
94                  if (lastNewline) {
95                      headerEnd = i;
96                      break;
97                  }
98                  lastNewline = true;
99              } else {
100                 lastNewline = false;
101             }
102         }
103         return headerEnd;
104     }
105 }