| 1 | |
|
| 2 | |
|
| 3 | |
package org.galagosearch.core.parse; |
| 4 | |
|
| 5 | |
import java.io.BufferedInputStream; |
| 6 | |
import java.io.FileNotFoundException; |
| 7 | |
import java.io.IOException; |
| 8 | |
import org.galagosearch.tupleflow.Utility; |
| 9 | |
|
| 10 | |
|
| 11 | |
|
| 12 | |
|
| 13 | |
|
| 14 | |
public class ArcParser implements DocumentStreamParser { |
| 15 | |
BufferedInputStream stream; |
| 16 | |
|
| 17 | 0 | public ArcParser(BufferedInputStream stream) throws FileNotFoundException, IOException { |
| 18 | 0 | this.stream = stream; |
| 19 | 0 | } |
| 20 | |
|
| 21 | |
private String readLine() throws IOException { |
| 22 | 0 | StringBuffer buffer = new StringBuffer(); |
| 23 | 0 | boolean seenNonNewline = false; |
| 24 | |
|
| 25 | |
do { |
| 26 | 0 | int c = stream.read(); |
| 27 | |
|
| 28 | 0 | if (c == -1) { |
| 29 | 0 | break; |
| 30 | |
} |
| 31 | 0 | if (c == '\n') { |
| 32 | 0 | if (seenNonNewline) { |
| 33 | 0 | break; |
| 34 | |
} else { |
| 35 | |
continue; |
| 36 | |
} |
| 37 | |
} |
| 38 | |
|
| 39 | 0 | seenNonNewline = true; |
| 40 | 0 | buffer.append((char) c); |
| 41 | 0 | } while (true); |
| 42 | |
|
| 43 | 0 | return buffer.toString(); |
| 44 | |
} |
| 45 | |
|
| 46 | |
public Document nextDocument() throws IOException { |
| 47 | |
|
| 48 | |
|
| 49 | |
|
| 50 | 0 | String header = readLine(); |
| 51 | 0 | String[] fields = header.split(" "); |
| 52 | |
|
| 53 | 0 | String url = fields[0]; |
| 54 | 0 | String ip = fields[1]; |
| 55 | 0 | String date = fields[2]; |
| 56 | 0 | String contentType = fields[3]; |
| 57 | 0 | long length = Long.parseLong(fields[4]); |
| 58 | |
|
| 59 | |
|
| 60 | 0 | byte[] data = new byte[(int) length]; |
| 61 | 0 | stream.read(data); |
| 62 | |
|
| 63 | 0 | stream.read(); |
| 64 | 0 | String fullText = Utility.makeString(data); |
| 65 | 0 | int headerEnd = findDoubleNewline(fullText); |
| 66 | |
|
| 67 | |
String serverHeader; |
| 68 | |
String documentText; |
| 69 | |
|
| 70 | 0 | if (headerEnd == 0) { |
| 71 | 0 | documentText = fullText; |
| 72 | 0 | serverHeader = ""; |
| 73 | |
} else { |
| 74 | 0 | serverHeader = fullText.substring(0, headerEnd); |
| 75 | 0 | documentText = fullText.substring(headerEnd + 1); |
| 76 | |
} |
| 77 | |
|
| 78 | 0 | Document result = new Document(new String(url), documentText); |
| 79 | 0 | System.out.println(url); |
| 80 | 0 | result.metadata.put("serverHeader", serverHeader); |
| 81 | 0 | result.metadata.put("contentType", contentType); |
| 82 | 0 | result.metadata.put("ip", ip); |
| 83 | 0 | result.metadata.put("date", date); |
| 84 | |
|
| 85 | 0 | return result; |
| 86 | |
} |
| 87 | |
|
| 88 | |
private int findDoubleNewline(final String fullText) { |
| 89 | |
|
| 90 | 0 | boolean lastNewline = false; |
| 91 | 0 | int headerEnd = 0; |
| 92 | 0 | for (int i = 0; i < fullText.length(); i++) { |
| 93 | 0 | if (fullText.charAt(i) == '\n') { |
| 94 | 0 | if (lastNewline) { |
| 95 | 0 | headerEnd = i; |
| 96 | 0 | break; |
| 97 | |
} |
| 98 | 0 | lastNewline = true; |
| 99 | |
} else { |
| 100 | 0 | lastNewline = false; |
| 101 | |
} |
| 102 | |
} |
| 103 | 0 | return headerEnd; |
| 104 | |
} |
| 105 | |
} |