1
2
3 package org.galagosearch.core.parse;
4
5 import java.io.BufferedInputStream;
6 import java.io.FileNotFoundException;
7 import java.io.IOException;
8 import org.galagosearch.tupleflow.Utility;
9
10 /***
11 * Parses ARC files, like those produced by the Heretrix web crawler.
12 * @author trevor
13 */
14 public class ArcParser implements DocumentStreamParser {
15 BufferedInputStream stream;
16
17 public ArcParser(BufferedInputStream stream) throws FileNotFoundException, IOException {
18 this.stream = stream;
19 }
20
21 private String readLine() throws IOException {
22 StringBuffer buffer = new StringBuffer();
23 boolean seenNonNewline = false;
24
25 do {
26 int c = stream.read();
27
28 if (c == -1) {
29 break;
30 }
31 if (c == '\n') {
32 if (seenNonNewline) {
33 break;
34 } else {
35 continue;
36 }
37 }
38
39 seenNonNewline = true;
40 buffer.append((char) c);
41 } while (true);
42
43 return buffer.toString();
44 }
45
46 public Document nextDocument() throws IOException {
47
48
49
50 String header = readLine();
51 String[] fields = header.split(" ");
52
53 String url = fields[0];
54 String ip = fields[1];
55 String date = fields[2];
56 String contentType = fields[3];
57 long length = Long.parseLong(fields[4]);
58
59
60 byte[] data = new byte[(int) length];
61 stream.read(data);
62
63 stream.read();
64 String fullText = Utility.makeString(data);
65 int headerEnd = findDoubleNewline(fullText);
66
67 String serverHeader;
68 String documentText;
69
70 if (headerEnd == 0) {
71 documentText = fullText;
72 serverHeader = "";
73 } else {
74 serverHeader = fullText.substring(0, headerEnd);
75 documentText = fullText.substring(headerEnd + 1);
76 }
77
78 Document result = new Document(new String(url), documentText);
79 System.out.println(url);
80 result.metadata.put("serverHeader", serverHeader);
81 result.metadata.put("contentType", contentType);
82 result.metadata.put("ip", ip);
83 result.metadata.put("date", date);
84
85 return result;
86 }
87
88 private int findDoubleNewline(final String fullText) {
89
90 boolean lastNewline = false;
91 int headerEnd = 0;
92 for (int i = 0; i < fullText.length(); i++) {
93 if (fullText.charAt(i) == '\n') {
94 if (lastNewline) {
95 headerEnd = i;
96 break;
97 }
98 lastNewline = true;
99 } else {
100 lastNewline = false;
101 }
102 }
103 return headerEnd;
104 }
105 }