View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.index;
4   
5   import java.io.BufferedOutputStream;
6   import java.io.DataOutputStream;
7   import java.io.FileNotFoundException;
8   import java.io.FileOutputStream;
9   import java.io.IOException;
10  import java.util.ArrayList;
11  import org.galagosearch.core.types.NumberedDocumentData;
12  import org.galagosearch.tupleflow.Counter;
13  import org.galagosearch.tupleflow.InputClass;
14  import org.galagosearch.tupleflow.Processor;
15  import org.galagosearch.tupleflow.TupleFlowParameters;
16  import org.galagosearch.tupleflow.Utility;
17  import org.galagosearch.tupleflow.execution.ErrorHandler;
18  
19  /***
20   * Writes a list of document names to a binary file.
21   * This class assumes that a document name is a string that contains at least
22   * one hyphen ('-') followed entirely by numbers.  All TREC document names
23   * follow this convention, e.g.:  WTX-B01-0001.
24   *
25   * @author Trevor Strohman
26   */
27  @InputClass(className = "org.galagosearch.core.types.NumberedDocumentData")
28  public class DocumentNameWriter implements Processor<NumberedDocumentData> {
29      String lastHeader = null;
30      DataOutputStream output;
31      int lastFooterWidth = 0;
32      int lastDocument = -1;
33      ArrayList<Integer> footers;
34      Counter documentsWritten = null;
35  
36      public DocumentNameWriter(TupleFlowParameters parameters) throws FileNotFoundException, IOException {
37          String filename = parameters.getXML().get("filename");
38          footers = new ArrayList<Integer>();
39          output = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
40          documentsWritten = parameters.getCounter("Documents Written");
41      }
42  
43      public void flush() throws IOException {
44          if (footers.size() == 0) {
45              return;
46          }
47  
48          byte[] headerBytes = Utility.makeBytes(lastHeader);
49          output.writeInt(headerBytes.length);
50          output.write(headerBytes);
51          output.writeInt(lastFooterWidth);
52          output.writeInt(footers.size());
53  
54          for (int footerValue : footers) {
55              output.writeInt(footerValue);
56          }
57      }
58  
59      public void process(NumberedDocumentData numberedDocumentData) throws IOException {
60          assert numberedDocumentData.number - 1 == lastDocument;
61          lastDocument = numberedDocumentData.number;
62  
63          String documentName = numberedDocumentData.identifier;
64          int lastDash = documentName.lastIndexOf("-");
65  
66          if (lastDash == -1) {
67              putName(documentName, 0, 0);
68          } else {
69              String header = documentName.substring(0, lastDash);
70              String footer = documentName.substring(lastDash + 1);
71  
72              try {
73                  int footerValue = Integer.parseInt(footer);
74                  putName(header, footerValue, footer.length());
75              } catch (NumberFormatException e) {
76                  putName(documentName, 0, 0);
77              }
78          }
79  
80          if (documentsWritten != null) documentsWritten.increment();
81      }
82  
83      public void putName(String header, int footer, int footerWidth) throws IOException {
84          if (header.equals(lastHeader) && footerWidth == lastFooterWidth) {
85              footers.add(footer);
86          } else {
87              flush();
88              lastHeader = header;
89              footers = new ArrayList<Integer>();
90              footers.add(footer);
91              lastFooterWidth = footerWidth;
92          }
93      }
94  
95      public void close() throws IOException {
96          flush();
97          output.close();
98      }
99  
100     public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
101         if (!parameters.getXML().containsKey("filename")) {
102             handler.addError("DocumentNameWriter requires an 'filename' parameter.");
103             return;
104         }
105     }
106 }