1
2
3 package org.galagosearch.core.parse;
4
5 import java.io.IOException;
6 import org.galagosearch.core.types.DocumentData;
7 import org.galagosearch.tupleflow.InputClass;
8 import org.galagosearch.tupleflow.OutputClass;
9 import org.galagosearch.tupleflow.StandardStep;
10 import org.galagosearch.tupleflow.execution.Verified;
11
12 /***
13 * Copies a few pieces of metadata about a document (identifier, url, length) from
14 * a document object and stores them in a DocumentData tuple.
15 *
16 * @author trevor
17 */
18 @InputClass(className = "org.galagosearch.core.parse.Document")
19 @OutputClass(className = "org.galagosearch.core.types.DocumentData")
20 @Verified
21 public class DocumentDataExtractor extends StandardStep<Document, DocumentData> {
22 public void process(Document document) throws IOException {
23 DocumentData data = new DocumentData();
24 data.identifier = document.identifier;
25 data.url = "";
26 if (document.metadata.containsKey("url")) {
27 data.url = document.metadata.get("url");
28 }
29 data.textLength = document.terms.size();
30
31 processor.process(data);
32 }
33 }