1
2
3 package org.galagosearch.core.index;
4
5 import java.io.BufferedOutputStream;
6 import java.io.DataOutputStream;
7 import java.io.FileNotFoundException;
8 import java.io.FileOutputStream;
9 import java.io.IOException;
10 import java.util.ArrayList;
11 import org.galagosearch.core.types.NumberedDocumentData;
12 import org.galagosearch.tupleflow.Counter;
13 import org.galagosearch.tupleflow.InputClass;
14 import org.galagosearch.tupleflow.Processor;
15 import org.galagosearch.tupleflow.TupleFlowParameters;
16 import org.galagosearch.tupleflow.Utility;
17 import org.galagosearch.tupleflow.execution.ErrorHandler;
18
19 /***
20 * Writes a list of document names to a binary file.
21 * This class assumes that a document name is a string that contains at least
22 * one hyphen ('-') followed entirely by numbers. All TREC document names
23 * follow this convention, e.g.: WTX-B01-0001.
24 *
25 * @author Trevor Strohman
26 */
27 @InputClass(className = "org.galagosearch.core.types.NumberedDocumentData")
28 public class DocumentNameWriter implements Processor<NumberedDocumentData> {
29 String lastHeader = null;
30 DataOutputStream output;
31 int lastFooterWidth = 0;
32 int lastDocument = -1;
33 ArrayList<Integer> footers;
34 Counter documentsWritten = null;
35
36 public DocumentNameWriter(TupleFlowParameters parameters) throws FileNotFoundException, IOException {
37 String filename = parameters.getXML().get("filename");
38 footers = new ArrayList<Integer>();
39 output = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
40 documentsWritten = parameters.getCounter("Documents Written");
41 }
42
43 public void flush() throws IOException {
44 if (footers.size() == 0) {
45 return;
46 }
47
48 byte[] headerBytes = Utility.makeBytes(lastHeader);
49 output.writeInt(headerBytes.length);
50 output.write(headerBytes);
51 output.writeInt(lastFooterWidth);
52 output.writeInt(footers.size());
53
54 for (int footerValue : footers) {
55 output.writeInt(footerValue);
56 }
57 }
58
59 public void process(NumberedDocumentData numberedDocumentData) throws IOException {
60 assert numberedDocumentData.number - 1 == lastDocument;
61 lastDocument = numberedDocumentData.number;
62
63 String documentName = numberedDocumentData.identifier;
64 int lastDash = documentName.lastIndexOf("-");
65
66 if (lastDash == -1) {
67 putName(documentName, 0, 0);
68 } else {
69 String header = documentName.substring(0, lastDash);
70 String footer = documentName.substring(lastDash + 1);
71
72 try {
73 int footerValue = Integer.parseInt(footer);
74 putName(header, footerValue, footer.length());
75 } catch (NumberFormatException e) {
76 putName(documentName, 0, 0);
77 }
78 }
79
80 if (documentsWritten != null) documentsWritten.increment();
81 }
82
83 public void putName(String header, int footer, int footerWidth) throws IOException {
84 if (header.equals(lastHeader) && footerWidth == lastFooterWidth) {
85 footers.add(footer);
86 } else {
87 flush();
88 lastHeader = header;
89 footers = new ArrayList<Integer>();
90 footers.add(footer);
91 lastFooterWidth = footerWidth;
92 }
93 }
94
95 public void close() throws IOException {
96 flush();
97 output.close();
98 }
99
100 public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
101 if (!parameters.getXML().containsKey("filename")) {
102 handler.addError("DocumentNameWriter requires an 'filename' parameter.");
103 return;
104 }
105 }
106 }