1
2
3 package org.galagosearch.core.parse;
4
5 import java.io.IOException;
6 import org.galagosearch.core.types.DocumentData;
7 import org.galagosearch.core.types.NumberedDocumentData;
8 import org.galagosearch.tupleflow.InputClass;
9 import org.galagosearch.tupleflow.OutputClass;
10 import org.galagosearch.tupleflow.StandardStep;
11 import org.galagosearch.tupleflow.execution.Verified;
12
13 /***
14 * <p>Sequentially numbers document data objects.</p>
15 *
16 * <p>The point of this class is to assign small numbers to each document. This
17 * would be simple if only one process was parsing documents, but in fact there are many
18 * of them doing the job at once. So, we extract DocumentData records from each document,
19 * put them into a single list, and assign numbers to them. These NumberedDocumentData
20 * records are then used to assign numbers to index positings.
21 * </p>
22 *
23 * @author trevor
24 */
25 @Verified
26 @InputClass(className = "org.galagosearch.core.types.DocumentData")
27 @OutputClass(className = "org.galagosearch.core.types.NumberedDocumentData")
28 public class DocumentDataNumberer extends StandardStep<DocumentData, NumberedDocumentData> {
29 int number = 0;
30
31 public void process(DocumentData data) throws IOException {
32 NumberedDocumentData numbered = new NumberedDocumentData();
33 numbered.identifier = data.identifier;
34 numbered.url = data.url;
35 numbered.textLength = data.textLength;
36 numbered.number = number;
37 ++number;
38
39 processor.process(numbered);
40 }
41 }