View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.parse;
4   
5   import java.io.IOException;
6   import org.galagosearch.core.types.DocumentData;
7   import org.galagosearch.tupleflow.InputClass;
8   import org.galagosearch.tupleflow.OutputClass;
9   import org.galagosearch.tupleflow.StandardStep;
10  import org.galagosearch.tupleflow.execution.Verified;
11  
12  /***
13   * Copies a few pieces of metadata about a document (identifier, url, length) from
14   * a document object and stores them in a DocumentData tuple.
15   * 
16   * @author trevor
17   */
18  @InputClass(className = "org.galagosearch.core.parse.Document")
19  @OutputClass(className = "org.galagosearch.core.types.DocumentData")
20  @Verified
21  public class DocumentDataExtractor extends StandardStep<Document, DocumentData> {
22      public void process(Document document) throws IOException {
23          DocumentData data = new DocumentData();
24          data.identifier = document.identifier;
25          data.url = "";
26          if (document.metadata.containsKey("url")) {
27              data.url = document.metadata.get("url");
28          }
29          data.textLength = document.terms.size();
30  
31          processor.process(data);
32      }
33  }