View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.tools;
4   
5   import java.io.File;
6   import java.io.IOException;
7   import java.util.ArrayList;
8   import org.galagosearch.core.index.DocumentLengthsWriter;
9   import org.galagosearch.core.index.DocumentNameWriter;
10  import org.galagosearch.core.index.ExtentIndexWriter;
11  import org.galagosearch.core.index.ExtentValueIndexWriter;
12  import org.galagosearch.core.index.ManifestWriter;
13  import org.galagosearch.core.index.PositionIndexWriter;
14  import org.galagosearch.core.parse.AdditionalTextCombiner;
15  import org.galagosearch.core.parse.AnchorTextCreator;
16  import org.galagosearch.core.parse.CollectionLengthCounter;
17  import org.galagosearch.core.parse.DocumentDataExtractor;
18  import org.galagosearch.core.parse.DocumentDataNumberer;
19  import org.galagosearch.core.parse.DocumentSource;
20  import org.galagosearch.core.parse.ExtentExtractor;
21  import org.galagosearch.core.parse.ExtentsNumberer;
22  import org.galagosearch.core.parse.LinkCombiner;
23  import org.galagosearch.core.parse.LinkExtractor;
24  import org.galagosearch.core.parse.Porter2Stemmer;
25  import org.galagosearch.core.parse.PositionPostingsNumberer;
26  import org.galagosearch.core.parse.PostingsPositionExtractor;
27  import org.galagosearch.core.parse.TagTokenizer;
28  import org.galagosearch.core.parse.UniversalParser;
29  import org.galagosearch.core.types.AdditionalDocumentText;
30  import org.galagosearch.core.types.DocumentData;
31  import org.galagosearch.core.types.DocumentExtent;
32  import org.galagosearch.core.types.DocumentSplit;
33  import org.galagosearch.core.types.DocumentWordPosition;
34  import org.galagosearch.core.types.ExtractedLink;
35  import org.galagosearch.core.types.NumberWordPosition;
36  import org.galagosearch.core.types.NumberedDocumentData;
37  import org.galagosearch.core.types.NumberedExtent;
38  import org.galagosearch.core.types.NumberedValuedExtent;
39  import org.galagosearch.tupleflow.NullSource;
40  import org.galagosearch.tupleflow.Order;
41  import org.galagosearch.tupleflow.Parameters;
42  import org.galagosearch.tupleflow.StreamCombiner;
43  import org.galagosearch.tupleflow.Utility;
44  import org.galagosearch.tupleflow.execution.ConnectionAssignmentType;
45  import org.galagosearch.tupleflow.execution.ConnectionPointType;
46  import org.galagosearch.tupleflow.execution.InputStep;
47  import org.galagosearch.tupleflow.execution.Job;
48  import org.galagosearch.tupleflow.execution.MultiStep;
49  import org.galagosearch.tupleflow.execution.OutputStep;
50  import org.galagosearch.tupleflow.execution.Stage;
51  import org.galagosearch.tupleflow.execution.StageConnectionPoint;
52  import org.galagosearch.tupleflow.execution.Step;
53  import org.galagosearch.tupleflow.types.XMLFragment;
54  
55  /***
56   *
57   * @author trevor
58   */
59  public class BuildIndex {
60      String indexPath;
61      boolean stemming;
62      boolean useLinks;
63  
64      public BuildIndex() {
65          this.stemming = false;
66          this.useLinks = false;
67      }
68  
69      public BuildIndex(String indexPath) {
70          this.indexPath = indexPath;
71          this.stemming = true;
72          this.useLinks = true;
73      }
74  
75      public Stage getSplitStage(String[] inputs) throws IOException {
76          Stage stage = new Stage("inputSplit");
77          stage.add(new StageConnectionPoint(ConnectionPointType.Output, "splits",
78                                             new DocumentSplit.FileNameStartKeyOrder()));
79  
80          Parameters p = new Parameters();
81          for (String input : inputs) {
82              File inputFile = new File(input);
83              
84              if (inputFile.isFile()) {
85                  p.add("filename", input);
86              } else if (inputFile.isDirectory()) {
87                  p.add("directory", input);
88              } else {
89                  throw new IOException("Couldn't find file/directory: " + input);
90              }
91          }
92  
93          stage.add(new Step(DocumentSource.class, p));
94          stage.add(Utility.getSorter(new DocumentSplit.FileNameStartKeyOrder()));
95          stage.add(new OutputStep("splits"));
96          return stage;
97      }
98  
99      public ArrayList<Step> getExtractionSteps(
100             String outputName,
101             Class extractionClass,
102             Order sortOrder) {
103         ArrayList<Step> steps = new ArrayList<Step>();
104         steps.add(new Step(extractionClass));
105         steps.add(Utility.getSorter(sortOrder));
106         steps.add(new OutputStep(outputName));
107         return steps;
108     }
109 
110     public Stage getParsePostingsStage() {
111         Stage stage = new Stage("parsePostings");
112 
113         stage.add(new StageConnectionPoint(
114                 ConnectionPointType.Input,
115                 "splits", new DocumentSplit.FileNameStartKeyOrder()));
116         stage.add(new StageConnectionPoint(
117                 ConnectionPointType.Output,
118                 "postings", new DocumentWordPosition.DocumentWordPositionOrder()));
119         stage.add(new StageConnectionPoint(
120                 ConnectionPointType.Output,
121                 "extents", new DocumentExtent.IdentifierOrder()));
122         stage.add(new StageConnectionPoint(
123                 ConnectionPointType.Output,
124                 "documentData", new DocumentData.IdentifierOrder()));
125         if (stemming) {
126             stage.add(new StageConnectionPoint(
127                 ConnectionPointType.Output,
128                 "stemmedPostings", new DocumentWordPosition.DocumentWordPositionOrder()));
129         }
130         if (useLinks) {
131             stage.add(new StageConnectionPoint(
132                 ConnectionPointType.Input,
133                 "anchorText", new AdditionalDocumentText.IdentifierOrder()));
134         }
135 
136         stage.add(new InputStep("splits"));
137         stage.add(new Step(UniversalParser.class));
138         if (useLinks) {
139             Parameters p = new Parameters();
140             p.add("textSource", "anchorText");
141             stage.add(new Step(AdditionalTextCombiner.class, p));
142         }
143         stage.add(new Step(TagTokenizer.class));
144 
145         MultiStep multi = new MultiStep();
146         ArrayList<Step> text =
147                 getExtractionSteps("postings", PostingsPositionExtractor.class,
148                                    new DocumentWordPosition.DocumentWordPositionOrder());
149         ArrayList<Step> extents =
150                 getExtractionSteps("extents", ExtentExtractor.class,
151                                    new DocumentExtent.IdentifierOrder());
152         ArrayList<Step> documentData =
153                 getExtractionSteps("documentData", DocumentDataExtractor.class,
154                                    new DocumentData.IdentifierOrder());
155 
156         multi.groups.add(text);
157         multi.groups.add(extents);
158         multi.groups.add(documentData);
159 
160         if (stemming) {
161             ArrayList<Step> stemmedSteps = new ArrayList<Step>();
162             stemmedSteps.add(new Step(Porter2Stemmer.class));
163             stemmedSteps.add(new Step(PostingsPositionExtractor.class));
164             stemmedSteps.add(Utility.getSorter(new DocumentWordPosition.DocumentWordPositionOrder()));
165             stemmedSteps.add(new OutputStep("stemmedPostings"));
166             multi.groups.add(stemmedSteps);
167         }
168 
169         stage.add(multi);
170         return stage;
171     }
172 
173     public Stage getParseLinksStage() {
174         Stage stage = new Stage("parseLinks");
175 
176         stage.add(new StageConnectionPoint(
177                 ConnectionPointType.Input,
178                 "splits", new DocumentSplit.FileNameStartKeyOrder()));
179         stage.add(new StageConnectionPoint(
180                 ConnectionPointType.Output,
181                 "links", new ExtractedLink.DestUrlOrder()));
182         stage.add(new StageConnectionPoint(
183                 ConnectionPointType.Output,
184                 "documentUrls", new DocumentData.UrlOrder()));
185 
186         stage.add(new InputStep("splits"));
187         stage.add(new Step(UniversalParser.class));
188         stage.add(new Step(TagTokenizer.class));
189 
190         MultiStep multi = new MultiStep();
191         ArrayList<Step> links =
192                 getExtractionSteps("links", LinkExtractor.class, new ExtractedLink.DestUrlOrder());
193         ArrayList<Step> data =
194                 getExtractionSteps("documentUrls", DocumentDataExtractor.class,
195                                    new DocumentData.UrlOrder());
196 
197         multi.groups.add(links);
198         multi.groups.add(data);
199         stage.add(multi);
200 
201         return stage;
202     }
203 
204     public Stage getLinkCombineStage() {
205         Stage stage = new Stage("linkCombine");
206 
207         stage.add(new StageConnectionPoint(ConnectionPointType.Input, "documentUrls",
208                                            new DocumentData.UrlOrder()));
209         stage.add(new StageConnectionPoint(ConnectionPointType.Input, "links",
210                                            new ExtractedLink.DestUrlOrder()));
211         stage.add(new StageConnectionPoint(ConnectionPointType.Output, "anchorText",
212                                            new AdditionalDocumentText.IdentifierOrder()));
213 
214         Parameters p = new Parameters();
215         p.add("documentDatas", "documentUrls");
216         p.add("extractedLinks", "links");
217         stage.add(new Step(LinkCombiner.class, p));
218         stage.add(new Step(AnchorTextCreator.class));
219         stage.add(Utility.getSorter(new AdditionalDocumentText.IdentifierOrder()));
220         stage.add(new OutputStep("anchorText"));
221 
222         return stage;
223     }
224 
225     public Stage getCollectionLengthStage() {
226         Stage stage = new Stage("collectionLength");
227 
228         stage.add(new StageConnectionPoint(
229                   ConnectionPointType.Input, "documentData",
230                   new DocumentData.IdentifierOrder()));
231         stage.add(new StageConnectionPoint(
232                   ConnectionPointType.Output, "collectionLength",
233                   new XMLFragment.NodePathOrder()));
234 
235         stage.add(new InputStep("documentData"));
236         stage.add(new Step(CollectionLengthCounter.class));
237         stage.add(Utility.getSorter(new XMLFragment.NodePathOrder()));
238         stage.add(new OutputStep("collectionLength"));
239 
240         return stage;
241     }
242 
243     public Stage getWritePostingsStage(String stageName, String inputName, String indexName) {
244         Stage stage = new Stage(stageName);
245 
246         stage.add(new StageConnectionPoint(
247                 ConnectionPointType.Input, inputName,
248                 new NumberWordPosition.WordDocumentPositionOrder()));
249         stage.add(new InputStep(inputName));
250         Parameters p = new Parameters();
251         p.add("filename", indexPath + File.separator + "parts" + File.separator + indexName);
252         stage.add(new Step(PositionIndexWriter.class, p));
253         return stage;
254     }
255 
256     public Stage getWriteExtentsStage() {
257         Stage stage = new Stage("writeExtents");
258 
259         stage.add(new StageConnectionPoint(
260                 ConnectionPointType.Input, "numberedExtents",
261                 new NumberedExtent.ExtentNameNumberBeginOrder()));
262 
263         stage.add(new InputStep("numberedExtents"));
264         Parameters p = new Parameters();
265         p.add("filename", indexPath + File.separator + "parts" + File.separator + "extents");
266         stage.add(new Step(ExtentIndexWriter.class, p));
267         return stage;
268     }
269 
270     public Stage getWriteDatesStage() {
271         Stage stage = new Stage("writeDates");
272 
273         stage.add(new StageConnectionPoint(
274                 ConnectionPointType.Input, "numberedDateExtents",
275                 new NumberedValuedExtent.ExtentNameNumberBeginOrder()));
276         Parameters p = new Parameters();
277         p.add("filename", indexPath + File.separator + "parts" + File.separator + "dates");
278         stage.add(new Step(ExtentValueIndexWriter.class));
279 
280         return stage;
281     }
282 
283     /***
284      * Write out document count and collection length information.
285      */
286     public Stage getWriteManifestStage() {
287         Stage stage = new Stage("writeManifest");
288 
289         stage.add(new StageConnectionPoint(ConnectionPointType.Input,
290                                            "collectionLength",
291                                            new XMLFragment.NodePathOrder()));
292         stage.add(new InputStep("collectionLength"));
293         Parameters p = new Parameters();
294         p.add("filename", indexPath + File.separator + "manifest");
295         stage.add(new Step(ManifestWriter.class, p));
296         return stage;
297     }
298 
299     /***
300      * Writes document lengths to a document lengths file.
301      */
302     public Stage getWriteDocumentLengthsStage() {
303         Stage stage = new Stage("writeDocumentLengths");
304 
305         stage.add(new StageConnectionPoint(ConnectionPointType.Input,
306                 "numberedDocumentData", new NumberedDocumentData.NumberOrder()));
307         Parameters p = new Parameters();
308         p.add("filename", indexPath + File.separator + "documentLengths");
309         stage.add(new InputStep("numberedDocumentData"));
310         stage.add(new Step(DocumentLengthsWriter.class, p));
311 
312         return stage;
313     }
314 
315     /***
316      * Writes document names to a document names file.
317      */
318     public Stage getWriteDocumentNamesStage() {
319         Stage stage = new Stage("writeDocumentNames");
320 
321         stage.add(new StageConnectionPoint(ConnectionPointType.Input,
322                 "numberedDocumentData", new NumberedDocumentData.NumberOrder()));
323         Parameters p = new Parameters();
324         p.add("filename", indexPath + File.separator + "documentNames");
325         stage.add(new InputStep("numberedDocumentData"));
326         stage.add(new Step(DocumentNameWriter.class, p));
327         return stage;
328     }
329 
330     public Stage getNumberDocumentsStage() {
331         Stage stage = new Stage("numberDocuments");
332 
333         stage.add(new StageConnectionPoint(ConnectionPointType.Input, "documentData",
334                     new DocumentData.IdentifierOrder()));
335         stage.add(new StageConnectionPoint(ConnectionPointType.Output, "numberedDocumentData",
336                     new NumberedDocumentData.NumberOrder()));
337         stage.add(new InputStep("documentData"));
338         stage.add(new Step(DocumentDataNumberer.class));
339         stage.add(Utility.getSorter(new NumberedDocumentData.NumberOrder()));
340         stage.add(new OutputStep("numberedDocumentData"));
341 
342         return stage;
343     }
344 
345     public Stage getNumberPostingsStage(String stageName, String inputName, String outputName) {
346         Stage stage = new Stage(stageName);
347 
348         stage.add(new StageConnectionPoint(
349                 ConnectionPointType.Input,
350                 inputName, new DocumentWordPosition.DocumentWordPositionOrder()));
351         stage.add(new StageConnectionPoint(
352                 ConnectionPointType.Input,
353                 "numberedDocumentData", new NumberedDocumentData.NumberOrder()));
354         stage.add(new StageConnectionPoint(
355                 ConnectionPointType.Output,
356                 outputName, new NumberWordPosition.WordDocumentPositionOrder()));
357 
358         stage.add(new InputStep(inputName));
359         stage.add(new Step(PositionPostingsNumberer.class));
360         stage.add(Utility.getSorter(new NumberWordPosition.WordDocumentPositionOrder()));
361         stage.add(new OutputStep(outputName));
362 
363         return stage;
364     }
365 
366     public Stage getNumberExtentsStage() {
367         Stage stage = new Stage("numberExtents");
368 
369         stage.add(new StageConnectionPoint(
370                 ConnectionPointType.Input,
371                 "extents", new DocumentExtent.IdentifierOrder()));
372         stage.add(new StageConnectionPoint(
373                 ConnectionPointType.Input,
374                 "numberedDocumentData", new NumberedDocumentData.NumberOrder()));
375         stage.add(new StageConnectionPoint(
376                 ConnectionPointType.Output,
377                 "numberedExtents", new NumberedExtent.ExtentNameNumberBeginOrder()));
378 
379         stage.add(new InputStep("extents"));
380         stage.add(new Step(ExtentsNumberer.class));
381         stage.add(Utility.getSorter(new NumberedExtent.ExtentNameNumberBeginOrder()));
382         stage.add(new OutputStep("numberedExtents"));
383 
384         return stage;
385     }
386 
387     public Job getIndexJob(String indexDirectory, String[] indexInputs,
388                            boolean extractAnchors, boolean useStemming) throws IOException {
389         Job job = new Job();
390         this.indexPath = indexDirectory;
391         this.stemming = useStemming;
392         this.useLinks = extractAnchors;
393 
394         job.add(getSplitStage(indexInputs));
395         job.add(getParsePostingsStage());
396         job.add(getWritePostingsStage("writePostings", "numberedPostings", "postings"));
397         job.add(getWriteManifestStage());
398         job.add(getWriteExtentsStage());
399         job.add(getWriteDocumentNamesStage());
400         job.add(getWriteDocumentLengthsStage());
401         job.add(getNumberDocumentsStage());
402         job.add(getNumberPostingsStage("numberPostings", "postings", "numberedPostings"));
403         job.add(getNumberExtentsStage());
404         job.add(getCollectionLengthStage());
405 
406         job.connect("inputSplit", "parsePostings", ConnectionAssignmentType.Each);
407         job.connect("parsePostings", "numberDocuments", ConnectionAssignmentType.Combined);
408         job.connect("numberDocuments", "writeDocumentLengths", ConnectionAssignmentType.Combined);
409         job.connect("numberDocuments", "writeDocumentNames", ConnectionAssignmentType.Combined);
410         job.connect("numberDocuments", "numberPostings", ConnectionAssignmentType.Combined);
411         job.connect("numberDocuments", "numberExtents", ConnectionAssignmentType.Combined);
412         job.connect("parsePostings", "numberPostings", ConnectionAssignmentType.Each);
413         job.connect("parsePostings", "numberExtents", ConnectionAssignmentType.Each);
414         job.connect("numberExtents", "writeExtents", ConnectionAssignmentType.Combined);
415         job.connect("numberPostings", "writePostings", ConnectionAssignmentType.Combined);
416         job.connect("parsePostings", "collectionLength", ConnectionAssignmentType.Combined);
417         job.connect("collectionLength", "writeManifest", ConnectionAssignmentType.Combined);
418 
419         if (useLinks) {
420             job.add(getParseLinksStage());
421             job.add(getLinkCombineStage());
422 
423             job.connect("inputSplit", "parseLinks", ConnectionAssignmentType.Each);
424             job.connect("parseLinks", "linkCombine", ConnectionAssignmentType.Each);
425             job.connect("linkCombine", "parsePostings", ConnectionAssignmentType.Each);
426         }
427 
428         if (stemming) {
429             job.add(getNumberPostingsStage("numberStemmedPostings",
430                                            "stemmedPostings",
431                                            "numberedStemmedPostings"));
432             job.add(getWritePostingsStage("writeStemmedPostings",
433                                           "numberedStemmedPostings",
434                                           "stemmedPostings"));
435             job.connect("parsePostings", "numberStemmedPostings", ConnectionAssignmentType.Each);
436             job.connect("numberDocuments", "numberStemmedPostings", ConnectionAssignmentType.Combined);
437             job.connect("numberStemmedPostings", "writeStemmedPostings", ConnectionAssignmentType.Combined);
438         }
439 
440         return job;
441     }
442 }