View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.parse;
4   
5   import java.io.IOException;
6   import org.galagosearch.core.types.DocumentData;
7   import org.galagosearch.core.types.NumberedDocumentData;
8   import org.galagosearch.tupleflow.InputClass;
9   import org.galagosearch.tupleflow.OutputClass;
10  import org.galagosearch.tupleflow.StandardStep;
11  import org.galagosearch.tupleflow.execution.Verified;
12  
13  /***
14   * <p>Sequentially numbers document data objects.</p>
15   *
16   * <p>The point of this class is to assign small numbers to each document.  This
17   * would be simple if only one process was parsing documents, but in fact there are many
18   * of them doing the job at once.  So, we extract DocumentData records from each document,
19   * put them into a single list, and assign numbers to them.  These NumberedDocumentData
20   * records are then used to assign numbers to index postings.
21   * </p>
22   * 
23   * @author trevor
24   */
25  @Verified
26  @InputClass(className = "org.galagosearch.core.types.DocumentData")
27  @OutputClass(className = "org.galagosearch.core.types.NumberedDocumentData")
28  public class DocumentDataNumberer extends StandardStep<DocumentData, NumberedDocumentData> {
29      int number = 0;
30  
31      public void process(DocumentData data) throws IOException {
32          NumberedDocumentData numbered = new NumberedDocumentData();
33          numbered.identifier = data.identifier;
34          numbered.url = data.url;
35          numbered.textLength = data.textLength;
36          numbered.number = number;
37          ++number;
38  
39          processor.process(numbered);
40      }
41  }