1
2
3 package org.galagosearch.core.tools;
4
5 import java.io.File;
6 import java.io.IOException;
7 import java.util.ArrayList;
8 import org.galagosearch.core.index.DocumentLengthsWriter;
9 import org.galagosearch.core.index.DocumentNameWriter;
10 import org.galagosearch.core.index.ExtentIndexWriter;
11 import org.galagosearch.core.index.ExtentValueIndexWriter;
12 import org.galagosearch.core.index.ManifestWriter;
13 import org.galagosearch.core.index.PositionIndexWriter;
14 import org.galagosearch.core.parse.AdditionalTextCombiner;
15 import org.galagosearch.core.parse.AnchorTextCreator;
16 import org.galagosearch.core.parse.CollectionLengthCounter;
17 import org.galagosearch.core.parse.DocumentDataExtractor;
18 import org.galagosearch.core.parse.DocumentDataNumberer;
19 import org.galagosearch.core.parse.DocumentSource;
20 import org.galagosearch.core.parse.ExtentExtractor;
21 import org.galagosearch.core.parse.ExtentsNumberer;
22 import org.galagosearch.core.parse.LinkCombiner;
23 import org.galagosearch.core.parse.LinkExtractor;
24 import org.galagosearch.core.parse.Porter2Stemmer;
25 import org.galagosearch.core.parse.PositionPostingsNumberer;
26 import org.galagosearch.core.parse.PostingsPositionExtractor;
27 import org.galagosearch.core.parse.TagTokenizer;
28 import org.galagosearch.core.parse.UniversalParser;
29 import org.galagosearch.core.types.AdditionalDocumentText;
30 import org.galagosearch.core.types.DocumentData;
31 import org.galagosearch.core.types.DocumentExtent;
32 import org.galagosearch.core.types.DocumentSplit;
33 import org.galagosearch.core.types.DocumentWordPosition;
34 import org.galagosearch.core.types.ExtractedLink;
35 import org.galagosearch.core.types.NumberWordPosition;
36 import org.galagosearch.core.types.NumberedDocumentData;
37 import org.galagosearch.core.types.NumberedExtent;
38 import org.galagosearch.core.types.NumberedValuedExtent;
39 import org.galagosearch.tupleflow.NullSource;
40 import org.galagosearch.tupleflow.Order;
41 import org.galagosearch.tupleflow.Parameters;
42 import org.galagosearch.tupleflow.StreamCombiner;
43 import org.galagosearch.tupleflow.Utility;
44 import org.galagosearch.tupleflow.execution.ConnectionAssignmentType;
45 import org.galagosearch.tupleflow.execution.ConnectionPointType;
46 import org.galagosearch.tupleflow.execution.InputStep;
47 import org.galagosearch.tupleflow.execution.Job;
48 import org.galagosearch.tupleflow.execution.MultiStep;
49 import org.galagosearch.tupleflow.execution.OutputStep;
50 import org.galagosearch.tupleflow.execution.Stage;
51 import org.galagosearch.tupleflow.execution.StageConnectionPoint;
52 import org.galagosearch.tupleflow.execution.Step;
53 import org.galagosearch.tupleflow.types.XMLFragment;
54
55 /***
56 *
57 * @author trevor
58 */
59 public class BuildIndex {
60 String indexPath;
61 boolean stemming;
62 boolean useLinks;
63
64 public BuildIndex() {
65 this.stemming = false;
66 this.useLinks = false;
67 }
68
69 public BuildIndex(String indexPath) {
70 this.indexPath = indexPath;
71 this.stemming = true;
72 this.useLinks = true;
73 }
74
75 public Stage getSplitStage(String[] inputs) throws IOException {
76 Stage stage = new Stage("inputSplit");
77 stage.add(new StageConnectionPoint(ConnectionPointType.Output, "splits",
78 new DocumentSplit.FileNameStartKeyOrder()));
79
80 Parameters p = new Parameters();
81 for (String input : inputs) {
82 File inputFile = new File(input);
83
84 if (inputFile.isFile()) {
85 p.add("filename", input);
86 } else if (inputFile.isDirectory()) {
87 p.add("directory", input);
88 } else {
89 throw new IOException("Couldn't find file/directory: " + input);
90 }
91 }
92
93 stage.add(new Step(DocumentSource.class, p));
94 stage.add(Utility.getSorter(new DocumentSplit.FileNameStartKeyOrder()));
95 stage.add(new OutputStep("splits"));
96 return stage;
97 }
98
99 public ArrayList<Step> getExtractionSteps(
100 String outputName,
101 Class extractionClass,
102 Order sortOrder) {
103 ArrayList<Step> steps = new ArrayList<Step>();
104 steps.add(new Step(extractionClass));
105 steps.add(Utility.getSorter(sortOrder));
106 steps.add(new OutputStep(outputName));
107 return steps;
108 }
109
110 public Stage getParsePostingsStage() {
111 Stage stage = new Stage("parsePostings");
112
113 stage.add(new StageConnectionPoint(
114 ConnectionPointType.Input,
115 "splits", new DocumentSplit.FileNameStartKeyOrder()));
116 stage.add(new StageConnectionPoint(
117 ConnectionPointType.Output,
118 "postings", new DocumentWordPosition.DocumentWordPositionOrder()));
119 stage.add(new StageConnectionPoint(
120 ConnectionPointType.Output,
121 "extents", new DocumentExtent.IdentifierOrder()));
122 stage.add(new StageConnectionPoint(
123 ConnectionPointType.Output,
124 "documentData", new DocumentData.IdentifierOrder()));
125 if (stemming) {
126 stage.add(new StageConnectionPoint(
127 ConnectionPointType.Output,
128 "stemmedPostings", new DocumentWordPosition.DocumentWordPositionOrder()));
129 }
130 if (useLinks) {
131 stage.add(new StageConnectionPoint(
132 ConnectionPointType.Input,
133 "anchorText", new AdditionalDocumentText.IdentifierOrder()));
134 }
135
136 stage.add(new InputStep("splits"));
137 stage.add(new Step(UniversalParser.class));
138 if (useLinks) {
139 Parameters p = new Parameters();
140 p.add("textSource", "anchorText");
141 stage.add(new Step(AdditionalTextCombiner.class, p));
142 }
143 stage.add(new Step(TagTokenizer.class));
144
145 MultiStep multi = new MultiStep();
146 ArrayList<Step> text =
147 getExtractionSteps("postings", PostingsPositionExtractor.class,
148 new DocumentWordPosition.DocumentWordPositionOrder());
149 ArrayList<Step> extents =
150 getExtractionSteps("extents", ExtentExtractor.class,
151 new DocumentExtent.IdentifierOrder());
152 ArrayList<Step> documentData =
153 getExtractionSteps("documentData", DocumentDataExtractor.class,
154 new DocumentData.IdentifierOrder());
155
156 multi.groups.add(text);
157 multi.groups.add(extents);
158 multi.groups.add(documentData);
159
160 if (stemming) {
161 ArrayList<Step> stemmedSteps = new ArrayList<Step>();
162 stemmedSteps.add(new Step(Porter2Stemmer.class));
163 stemmedSteps.add(new Step(PostingsPositionExtractor.class));
164 stemmedSteps.add(Utility.getSorter(new DocumentWordPosition.DocumentWordPositionOrder()));
165 stemmedSteps.add(new OutputStep("stemmedPostings"));
166 multi.groups.add(stemmedSteps);
167 }
168
169 stage.add(multi);
170 return stage;
171 }
172
173 public Stage getParseLinksStage() {
174 Stage stage = new Stage("parseLinks");
175
176 stage.add(new StageConnectionPoint(
177 ConnectionPointType.Input,
178 "splits", new DocumentSplit.FileNameStartKeyOrder()));
179 stage.add(new StageConnectionPoint(
180 ConnectionPointType.Output,
181 "links", new ExtractedLink.DestUrlOrder()));
182 stage.add(new StageConnectionPoint(
183 ConnectionPointType.Output,
184 "documentUrls", new DocumentData.UrlOrder()));
185
186 stage.add(new InputStep("splits"));
187 stage.add(new Step(UniversalParser.class));
188 stage.add(new Step(TagTokenizer.class));
189
190 MultiStep multi = new MultiStep();
191 ArrayList<Step> links =
192 getExtractionSteps("links", LinkExtractor.class, new ExtractedLink.DestUrlOrder());
193 ArrayList<Step> data =
194 getExtractionSteps("documentUrls", DocumentDataExtractor.class,
195 new DocumentData.UrlOrder());
196
197 multi.groups.add(links);
198 multi.groups.add(data);
199 stage.add(multi);
200
201 return stage;
202 }
203
204 public Stage getLinkCombineStage() {
205 Stage stage = new Stage("linkCombine");
206
207 stage.add(new StageConnectionPoint(ConnectionPointType.Input, "documentUrls",
208 new DocumentData.UrlOrder()));
209 stage.add(new StageConnectionPoint(ConnectionPointType.Input, "links",
210 new ExtractedLink.DestUrlOrder()));
211 stage.add(new StageConnectionPoint(ConnectionPointType.Output, "anchorText",
212 new AdditionalDocumentText.IdentifierOrder()));
213
214 Parameters p = new Parameters();
215 p.add("documentDatas", "documentUrls");
216 p.add("extractedLinks", "links");
217 stage.add(new Step(LinkCombiner.class, p));
218 stage.add(new Step(AnchorTextCreator.class));
219 stage.add(Utility.getSorter(new AdditionalDocumentText.IdentifierOrder()));
220 stage.add(new OutputStep("anchorText"));
221
222 return stage;
223 }
224
225 public Stage getCollectionLengthStage() {
226 Stage stage = new Stage("collectionLength");
227
228 stage.add(new StageConnectionPoint(
229 ConnectionPointType.Input, "documentData",
230 new DocumentData.IdentifierOrder()));
231 stage.add(new StageConnectionPoint(
232 ConnectionPointType.Output, "collectionLength",
233 new XMLFragment.NodePathOrder()));
234
235 stage.add(new InputStep("documentData"));
236 stage.add(new Step(CollectionLengthCounter.class));
237 stage.add(Utility.getSorter(new XMLFragment.NodePathOrder()));
238 stage.add(new OutputStep("collectionLength"));
239
240 return stage;
241 }
242
243 public Stage getWritePostingsStage(String stageName, String inputName, String indexName) {
244 Stage stage = new Stage(stageName);
245
246 stage.add(new StageConnectionPoint(
247 ConnectionPointType.Input, inputName,
248 new NumberWordPosition.WordDocumentPositionOrder()));
249 stage.add(new InputStep(inputName));
250 Parameters p = new Parameters();
251 p.add("filename", indexPath + File.separator + "parts" + File.separator + indexName);
252 stage.add(new Step(PositionIndexWriter.class, p));
253 return stage;
254 }
255
256 public Stage getWriteExtentsStage() {
257 Stage stage = new Stage("writeExtents");
258
259 stage.add(new StageConnectionPoint(
260 ConnectionPointType.Input, "numberedExtents",
261 new NumberedExtent.ExtentNameNumberBeginOrder()));
262
263 stage.add(new InputStep("numberedExtents"));
264 Parameters p = new Parameters();
265 p.add("filename", indexPath + File.separator + "parts" + File.separator + "extents");
266 stage.add(new Step(ExtentIndexWriter.class, p));
267 return stage;
268 }
269
270 public Stage getWriteDatesStage() {
271 Stage stage = new Stage("writeDates");
272
273 stage.add(new StageConnectionPoint(
274 ConnectionPointType.Input, "numberedDateExtents",
275 new NumberedValuedExtent.ExtentNameNumberBeginOrder()));
276 Parameters p = new Parameters();
277 p.add("filename", indexPath + File.separator + "parts" + File.separator + "dates");
278 stage.add(new Step(ExtentValueIndexWriter.class));
279
280 return stage;
281 }
282
283 /***
284 * Write out document count and collection length information.
285 */
286 public Stage getWriteManifestStage() {
287 Stage stage = new Stage("writeManifest");
288
289 stage.add(new StageConnectionPoint(ConnectionPointType.Input,
290 "collectionLength",
291 new XMLFragment.NodePathOrder()));
292 stage.add(new InputStep("collectionLength"));
293 Parameters p = new Parameters();
294 p.add("filename", indexPath + File.separator + "manifest");
295 stage.add(new Step(ManifestWriter.class, p));
296 return stage;
297 }
298
299 /***
300 * Writes document lengths to a document lengths file.
301 */
302 public Stage getWriteDocumentLengthsStage() {
303 Stage stage = new Stage("writeDocumentLengths");
304
305 stage.add(new StageConnectionPoint(ConnectionPointType.Input,
306 "numberedDocumentData", new NumberedDocumentData.NumberOrder()));
307 Parameters p = new Parameters();
308 p.add("filename", indexPath + File.separator + "documentLengths");
309 stage.add(new InputStep("numberedDocumentData"));
310 stage.add(new Step(DocumentLengthsWriter.class, p));
311
312 return stage;
313 }
314
315 /***
316 * Writes document names to a document names file.
317 */
318 public Stage getWriteDocumentNamesStage() {
319 Stage stage = new Stage("writeDocumentNames");
320
321 stage.add(new StageConnectionPoint(ConnectionPointType.Input,
322 "numberedDocumentData", new NumberedDocumentData.NumberOrder()));
323 Parameters p = new Parameters();
324 p.add("filename", indexPath + File.separator + "documentNames");
325 stage.add(new InputStep("numberedDocumentData"));
326 stage.add(new Step(DocumentNameWriter.class, p));
327 return stage;
328 }
329
330 public Stage getNumberDocumentsStage() {
331 Stage stage = new Stage("numberDocuments");
332
333 stage.add(new StageConnectionPoint(ConnectionPointType.Input, "documentData",
334 new DocumentData.IdentifierOrder()));
335 stage.add(new StageConnectionPoint(ConnectionPointType.Output, "numberedDocumentData",
336 new NumberedDocumentData.NumberOrder()));
337 stage.add(new InputStep("documentData"));
338 stage.add(new Step(DocumentDataNumberer.class));
339 stage.add(Utility.getSorter(new NumberedDocumentData.NumberOrder()));
340 stage.add(new OutputStep("numberedDocumentData"));
341
342 return stage;
343 }
344
345 public Stage getNumberPostingsStage(String stageName, String inputName, String outputName) {
346 Stage stage = new Stage(stageName);
347
348 stage.add(new StageConnectionPoint(
349 ConnectionPointType.Input,
350 inputName, new DocumentWordPosition.DocumentWordPositionOrder()));
351 stage.add(new StageConnectionPoint(
352 ConnectionPointType.Input,
353 "numberedDocumentData", new NumberedDocumentData.NumberOrder()));
354 stage.add(new StageConnectionPoint(
355 ConnectionPointType.Output,
356 outputName, new NumberWordPosition.WordDocumentPositionOrder()));
357
358 stage.add(new InputStep(inputName));
359 stage.add(new Step(PositionPostingsNumberer.class));
360 stage.add(Utility.getSorter(new NumberWordPosition.WordDocumentPositionOrder()));
361 stage.add(new OutputStep(outputName));
362
363 return stage;
364 }
365
366 public Stage getNumberExtentsStage() {
367 Stage stage = new Stage("numberExtents");
368
369 stage.add(new StageConnectionPoint(
370 ConnectionPointType.Input,
371 "extents", new DocumentExtent.IdentifierOrder()));
372 stage.add(new StageConnectionPoint(
373 ConnectionPointType.Input,
374 "numberedDocumentData", new NumberedDocumentData.NumberOrder()));
375 stage.add(new StageConnectionPoint(
376 ConnectionPointType.Output,
377 "numberedExtents", new NumberedExtent.ExtentNameNumberBeginOrder()));
378
379 stage.add(new InputStep("extents"));
380 stage.add(new Step(ExtentsNumberer.class));
381 stage.add(Utility.getSorter(new NumberedExtent.ExtentNameNumberBeginOrder()));
382 stage.add(new OutputStep("numberedExtents"));
383
384 return stage;
385 }
386
387 public Job getIndexJob(String indexDirectory, String[] indexInputs,
388 boolean extractAnchors, boolean useStemming) throws IOException {
389 Job job = new Job();
390 this.indexPath = indexDirectory;
391 this.stemming = useStemming;
392 this.useLinks = extractAnchors;
393
394 job.add(getSplitStage(indexInputs));
395 job.add(getParsePostingsStage());
396 job.add(getWritePostingsStage("writePostings", "numberedPostings", "postings"));
397 job.add(getWriteManifestStage());
398 job.add(getWriteExtentsStage());
399 job.add(getWriteDocumentNamesStage());
400 job.add(getWriteDocumentLengthsStage());
401 job.add(getNumberDocumentsStage());
402 job.add(getNumberPostingsStage("numberPostings", "postings", "numberedPostings"));
403 job.add(getNumberExtentsStage());
404 job.add(getCollectionLengthStage());
405
406 job.connect("inputSplit", "parsePostings", ConnectionAssignmentType.Each);
407 job.connect("parsePostings", "numberDocuments", ConnectionAssignmentType.Combined);
408 job.connect("numberDocuments", "writeDocumentLengths", ConnectionAssignmentType.Combined);
409 job.connect("numberDocuments", "writeDocumentNames", ConnectionAssignmentType.Combined);
410 job.connect("numberDocuments", "numberPostings", ConnectionAssignmentType.Combined);
411 job.connect("numberDocuments", "numberExtents", ConnectionAssignmentType.Combined);
412 job.connect("parsePostings", "numberPostings", ConnectionAssignmentType.Each);
413 job.connect("parsePostings", "numberExtents", ConnectionAssignmentType.Each);
414 job.connect("numberExtents", "writeExtents", ConnectionAssignmentType.Combined);
415 job.connect("numberPostings", "writePostings", ConnectionAssignmentType.Combined);
416 job.connect("parsePostings", "collectionLength", ConnectionAssignmentType.Combined);
417 job.connect("collectionLength", "writeManifest", ConnectionAssignmentType.Combined);
418
419 if (useLinks) {
420 job.add(getParseLinksStage());
421 job.add(getLinkCombineStage());
422
423 job.connect("inputSplit", "parseLinks", ConnectionAssignmentType.Each);
424 job.connect("parseLinks", "linkCombine", ConnectionAssignmentType.Each);
425 job.connect("linkCombine", "parsePostings", ConnectionAssignmentType.Each);
426 }
427
428 if (stemming) {
429 job.add(getNumberPostingsStage("numberStemmedPostings",
430 "stemmedPostings",
431 "numberedStemmedPostings"));
432 job.add(getWritePostingsStage("writeStemmedPostings",
433 "numberedStemmedPostings",
434 "stemmedPostings"));
435 job.connect("parsePostings", "numberStemmedPostings", ConnectionAssignmentType.Each);
436 job.connect("numberDocuments", "numberStemmedPostings", ConnectionAssignmentType.Combined);
437 job.connect("numberStemmedPostings", "writeStemmedPostings", ConnectionAssignmentType.Combined);
438 }
439
440 return job;
441 }
442 }