Coverage Report - org.galagosearch.core.tools.BuildIndex
 
Classes in this File Line Coverage Branch Coverage Complexity
BuildIndex
97%
194/201
89%
16/18
0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 
 3  
 package org.galagosearch.core.tools;
 4  
 
 5  
 import java.io.File;
 6  
 import java.io.IOException;
 7  
 import java.util.ArrayList;
 8  
 import org.galagosearch.core.index.DocumentLengthsWriter;
 9  
 import org.galagosearch.core.index.DocumentNameWriter;
 10  
 import org.galagosearch.core.index.ExtentIndexWriter;
 11  
 import org.galagosearch.core.index.ExtentValueIndexWriter;
 12  
 import org.galagosearch.core.index.ManifestWriter;
 13  
 import org.galagosearch.core.index.PositionIndexWriter;
 14  
 import org.galagosearch.core.parse.AdditionalTextCombiner;
 15  
 import org.galagosearch.core.parse.AnchorTextCreator;
 16  
 import org.galagosearch.core.parse.CollectionLengthCounter;
 17  
 import org.galagosearch.core.parse.DocumentDataExtractor;
 18  
 import org.galagosearch.core.parse.DocumentDataNumberer;
 19  
 import org.galagosearch.core.parse.DocumentSource;
 20  
 import org.galagosearch.core.parse.ExtentExtractor;
 21  
 import org.galagosearch.core.parse.ExtentsNumberer;
 22  
 import org.galagosearch.core.parse.LinkCombiner;
 23  
 import org.galagosearch.core.parse.LinkExtractor;
 24  
 import org.galagosearch.core.parse.Porter2Stemmer;
 25  
 import org.galagosearch.core.parse.PositionPostingsNumberer;
 26  
 import org.galagosearch.core.parse.PostingsPositionExtractor;
 27  
 import org.galagosearch.core.parse.TagTokenizer;
 28  
 import org.galagosearch.core.parse.UniversalParser;
 29  
 import org.galagosearch.core.types.AdditionalDocumentText;
 30  
 import org.galagosearch.core.types.DocumentData;
 31  
 import org.galagosearch.core.types.DocumentExtent;
 32  
 import org.galagosearch.core.types.DocumentSplit;
 33  
 import org.galagosearch.core.types.DocumentWordPosition;
 34  
 import org.galagosearch.core.types.ExtractedLink;
 35  
 import org.galagosearch.core.types.NumberWordPosition;
 36  
 import org.galagosearch.core.types.NumberedDocumentData;
 37  
 import org.galagosearch.core.types.NumberedExtent;
 38  
 import org.galagosearch.core.types.NumberedValuedExtent;
 39  
 import org.galagosearch.tupleflow.NullSource;
 40  
 import org.galagosearch.tupleflow.Order;
 41  
 import org.galagosearch.tupleflow.Parameters;
 42  
 import org.galagosearch.tupleflow.StreamCombiner;
 43  
 import org.galagosearch.tupleflow.Utility;
 44  
 import org.galagosearch.tupleflow.execution.ConnectionAssignmentType;
 45  
 import org.galagosearch.tupleflow.execution.ConnectionPointType;
 46  
 import org.galagosearch.tupleflow.execution.InputStep;
 47  
 import org.galagosearch.tupleflow.execution.Job;
 48  
 import org.galagosearch.tupleflow.execution.MultiStep;
 49  
 import org.galagosearch.tupleflow.execution.OutputStep;
 50  
 import org.galagosearch.tupleflow.execution.Stage;
 51  
 import org.galagosearch.tupleflow.execution.StageConnectionPoint;
 52  
 import org.galagosearch.tupleflow.execution.Step;
 53  
 import org.galagosearch.tupleflow.types.XMLFragment;
 54  
 
 55  
 /**
 56  
  *
 57  
  * @author trevor
 58  
  */
 59  
 public class BuildIndex {
 60  
     String indexPath;
 61  
     boolean stemming;
 62  
     boolean useLinks;
 63  
 
 64  52
     public BuildIndex() {
 65  52
         this.stemming = false;
 66  52
         this.useLinks = false;
 67  52
     }
 68  
 
 69  0
     public BuildIndex(String indexPath) {
 70  0
         this.indexPath = indexPath;
 71  0
         this.stemming = true;
 72  0
         this.useLinks = true;
 73  0
     }
 74  
 
 75  
     public Stage getSplitStage(String[] inputs) throws IOException {
 76  20
         Stage stage = new Stage("inputSplit");
 77  20
         stage.add(new StageConnectionPoint(ConnectionPointType.Output, "splits",
 78  
                                            new DocumentSplit.FileNameStartKeyOrder()));
 79  
 
 80  20
         Parameters p = new Parameters();
 81  40
         for (String input : inputs) {
 82  20
             File inputFile = new File(input);
 83  
             
 84  20
             if (inputFile.isFile()) {
 85  0
                 p.add("filename", input);
 86  20
             } else if (inputFile.isDirectory()) {
 87  20
                 p.add("directory", input);
 88  
             } else {
 89  0
                 throw new IOException("Couldn't find file/directory: " + input);
 90  
             }
 91  
         }
 92  
 
 93  20
         stage.add(new Step(DocumentSource.class, p));
 94  20
         stage.add(Utility.getSorter(new DocumentSplit.FileNameStartKeyOrder()));
 95  20
         stage.add(new OutputStep("splits"));
 96  20
         return stage;
 97  
     }
 98  
 
 99  
     public ArrayList<Step> getExtractionSteps(
 100  
             String outputName,
 101  
             Class extractionClass,
 102  
             Order sortOrder) {
 103  76
         ArrayList<Step> steps = new ArrayList<Step>();
 104  76
         steps.add(new Step(extractionClass));
 105  76
         steps.add(Utility.getSorter(sortOrder));
 106  76
         steps.add(new OutputStep(outputName));
 107  76
         return steps;
 108  
     }
 109  
 
 110  
     public Stage getParsePostingsStage() {
 111  20
         Stage stage = new Stage("parsePostings");
 112  
 
 113  20
         stage.add(new StageConnectionPoint(
 114  
                 ConnectionPointType.Input,
 115  
                 "splits", new DocumentSplit.FileNameStartKeyOrder()));
 116  20
         stage.add(new StageConnectionPoint(
 117  
                 ConnectionPointType.Output,
 118  
                 "postings", new DocumentWordPosition.DocumentWordPositionOrder()));
 119  20
         stage.add(new StageConnectionPoint(
 120  
                 ConnectionPointType.Output,
 121  
                 "extents", new DocumentExtent.IdentifierOrder()));
 122  20
         stage.add(new StageConnectionPoint(
 123  
                 ConnectionPointType.Output,
 124  
                 "documentData", new DocumentData.IdentifierOrder()));
 125  20
         if (stemming) {
 126  8
             stage.add(new StageConnectionPoint(
 127  
                 ConnectionPointType.Output,
 128  
                 "stemmedPostings", new DocumentWordPosition.DocumentWordPositionOrder()));
 129  
         }
 130  20
         if (useLinks) {
 131  8
             stage.add(new StageConnectionPoint(
 132  
                 ConnectionPointType.Input,
 133  
                 "anchorText", new AdditionalDocumentText.IdentifierOrder()));
 134  
         }
 135  
 
 136  20
         stage.add(new InputStep("splits"));
 137  20
         stage.add(new Step(UniversalParser.class));
 138  20
         if (useLinks) {
 139  8
             Parameters p = new Parameters();
 140  8
             p.add("textSource", "anchorText");
 141  8
             stage.add(new Step(AdditionalTextCombiner.class, p));
 142  
         }
 143  20
         stage.add(new Step(TagTokenizer.class));
 144  
 
 145  20
         MultiStep multi = new MultiStep();
 146  20
         ArrayList<Step> text =
 147  
                 getExtractionSteps("postings", PostingsPositionExtractor.class,
 148  
                                    new DocumentWordPosition.DocumentWordPositionOrder());
 149  20
         ArrayList<Step> extents =
 150  
                 getExtractionSteps("extents", ExtentExtractor.class,
 151  
                                    new DocumentExtent.IdentifierOrder());
 152  20
         ArrayList<Step> documentData =
 153  
                 getExtractionSteps("documentData", DocumentDataExtractor.class,
 154  
                                    new DocumentData.IdentifierOrder());
 155  
 
 156  20
         multi.groups.add(text);
 157  20
         multi.groups.add(extents);
 158  20
         multi.groups.add(documentData);
 159  
 
 160  20
         if (stemming) {
 161  8
             ArrayList<Step> stemmedSteps = new ArrayList<Step>();
 162  8
             stemmedSteps.add(new Step(Porter2Stemmer.class));
 163  8
             stemmedSteps.add(new Step(PostingsPositionExtractor.class));
 164  8
             stemmedSteps.add(Utility.getSorter(new DocumentWordPosition.DocumentWordPositionOrder()));
 165  8
             stemmedSteps.add(new OutputStep("stemmedPostings"));
 166  8
             multi.groups.add(stemmedSteps);
 167  
         }
 168  
 
 169  20
         stage.add(multi);
 170  20
         return stage;
 171  
     }
 172  
 
 173  
     public Stage getParseLinksStage() {
 174  8
         Stage stage = new Stage("parseLinks");
 175  
 
 176  8
         stage.add(new StageConnectionPoint(
 177  
                 ConnectionPointType.Input,
 178  
                 "splits", new DocumentSplit.FileNameStartKeyOrder()));
 179  8
         stage.add(new StageConnectionPoint(
 180  
                 ConnectionPointType.Output,
 181  
                 "links", new ExtractedLink.DestUrlOrder()));
 182  8
         stage.add(new StageConnectionPoint(
 183  
                 ConnectionPointType.Output,
 184  
                 "documentUrls", new DocumentData.UrlOrder()));
 185  
 
 186  8
         stage.add(new InputStep("splits"));
 187  8
         stage.add(new Step(UniversalParser.class));
 188  8
         stage.add(new Step(TagTokenizer.class));
 189  
 
 190  8
         MultiStep multi = new MultiStep();
 191  8
         ArrayList<Step> links =
 192  
                 getExtractionSteps("links", LinkExtractor.class, new ExtractedLink.DestUrlOrder());
 193  8
         ArrayList<Step> data =
 194  
                 getExtractionSteps("documentUrls", DocumentDataExtractor.class,
 195  
                                    new DocumentData.UrlOrder());
 196  
 
 197  8
         multi.groups.add(links);
 198  8
         multi.groups.add(data);
 199  8
         stage.add(multi);
 200  
 
 201  8
         return stage;
 202  
     }
 203  
 
 204  
     public Stage getLinkCombineStage() {
 205  12
         Stage stage = new Stage("linkCombine");
 206  
 
 207  12
         stage.add(new StageConnectionPoint(ConnectionPointType.Input, "documentUrls",
 208  
                                            new DocumentData.UrlOrder()));
 209  12
         stage.add(new StageConnectionPoint(ConnectionPointType.Input, "links",
 210  
                                            new ExtractedLink.DestUrlOrder()));
 211  12
         stage.add(new StageConnectionPoint(ConnectionPointType.Output, "anchorText",
 212  
                                            new AdditionalDocumentText.IdentifierOrder()));
 213  
 
 214  12
         Parameters p = new Parameters();
 215  12
         p.add("documentDatas", "documentUrls");
 216  12
         p.add("extractedLinks", "links");
 217  12
         stage.add(new Step(LinkCombiner.class, p));
 218  12
         stage.add(new Step(AnchorTextCreator.class));
 219  12
         stage.add(Utility.getSorter(new AdditionalDocumentText.IdentifierOrder()));
 220  12
         stage.add(new OutputStep("anchorText"));
 221  
 
 222  12
         return stage;
 223  
     }
 224  
 
 225  
     public Stage getCollectionLengthStage() {
 226  20
         Stage stage = new Stage("collectionLength");
 227  
 
 228  20
         stage.add(new StageConnectionPoint(
 229  
                   ConnectionPointType.Input, "documentData",
 230  
                   new DocumentData.IdentifierOrder()));
 231  20
         stage.add(new StageConnectionPoint(
 232  
                   ConnectionPointType.Output, "collectionLength",
 233  
                   new XMLFragment.NodePathOrder()));
 234  
 
 235  20
         stage.add(new InputStep("documentData"));
 236  20
         stage.add(new Step(CollectionLengthCounter.class));
 237  20
         stage.add(Utility.getSorter(new XMLFragment.NodePathOrder()));
 238  20
         stage.add(new OutputStep("collectionLength"));
 239  
 
 240  20
         return stage;
 241  
     }
 242  
 
 243  
     public Stage getWritePostingsStage(String stageName, String inputName, String indexName) {
 244  28
         Stage stage = new Stage(stageName);
 245  
 
 246  28
         stage.add(new StageConnectionPoint(
 247  
                 ConnectionPointType.Input, inputName,
 248  
                 new NumberWordPosition.WordDocumentPositionOrder()));
 249  28
         stage.add(new InputStep(inputName));
 250  28
         Parameters p = new Parameters();
 251  28
         p.add("filename", indexPath + File.separator + "parts" + File.separator + indexName);
 252  28
         stage.add(new Step(PositionIndexWriter.class, p));
 253  28
         return stage;
 254  
     }
 255  
 
 256  
     public Stage getWriteExtentsStage() {
 257  20
         Stage stage = new Stage("writeExtents");
 258  
 
 259  20
         stage.add(new StageConnectionPoint(
 260  
                 ConnectionPointType.Input, "numberedExtents",
 261  
                 new NumberedExtent.ExtentNameNumberBeginOrder()));
 262  
 
 263  20
         stage.add(new InputStep("numberedExtents"));
 264  20
         Parameters p = new Parameters();
 265  20
         p.add("filename", indexPath + File.separator + "parts" + File.separator + "extents");
 266  20
         stage.add(new Step(ExtentIndexWriter.class, p));
 267  20
         return stage;
 268  
     }
 269  
 
 270  
     public Stage getWriteDatesStage() {
 271  4
         Stage stage = new Stage("writeDates");
 272  
 
 273  4
         stage.add(new StageConnectionPoint(
 274  
                 ConnectionPointType.Input, "numberedDateExtents",
 275  
                 new NumberedValuedExtent.ExtentNameNumberBeginOrder()));
 276  4
         Parameters p = new Parameters();
 277  4
         p.add("filename", indexPath + File.separator + "parts" + File.separator + "dates");
 278  4
         stage.add(new Step(ExtentValueIndexWriter.class));
 279  
 
 280  4
         return stage;
 281  
     }
 282  
 
 283  
     /**
 284  
      * Write out document count and collection length information.
 285  
      */
 286  
     public Stage getWriteManifestStage() {
 287  20
         Stage stage = new Stage("writeManifest");
 288  
 
 289  20
         stage.add(new StageConnectionPoint(ConnectionPointType.Input,
 290  
                                            "collectionLength",
 291  
                                            new XMLFragment.NodePathOrder()));
 292  20
         stage.add(new InputStep("collectionLength"));
 293  20
         Parameters p = new Parameters();
 294  20
         p.add("filename", indexPath + File.separator + "manifest");
 295  20
         stage.add(new Step(ManifestWriter.class, p));
 296  20
         return stage;
 297  
     }
 298  
 
 299  
     /**
 300  
      * Writes document lengths to a document lengths file.
 301  
      */
 302  
     public Stage getWriteDocumentLengthsStage() {
 303  20
         Stage stage = new Stage("writeDocumentLengths");
 304  
 
 305  20
         stage.add(new StageConnectionPoint(ConnectionPointType.Input,
 306  
                 "numberedDocumentData", new NumberedDocumentData.NumberOrder()));
 307  20
         Parameters p = new Parameters();
 308  20
         p.add("filename", indexPath + File.separator + "documentLengths");
 309  20
         stage.add(new InputStep("numberedDocumentData"));
 310  20
         stage.add(new Step(DocumentLengthsWriter.class, p));
 311  
 
 312  20
         return stage;
 313  
     }
 314  
 
 315  
     /**
 316  
      * Writes document names to a document names file.
 317  
      */
 318  
     public Stage getWriteDocumentNamesStage() {
 319  16
         Stage stage = new Stage("writeDocumentNames");
 320  
 
 321  16
         stage.add(new StageConnectionPoint(ConnectionPointType.Input,
 322  
                 "numberedDocumentData", new NumberedDocumentData.NumberOrder()));
 323  16
         Parameters p = new Parameters();
 324  16
         p.add("filename", indexPath + File.separator + "documentNames");
 325  16
         stage.add(new InputStep("numberedDocumentData"));
 326  16
         stage.add(new Step(DocumentNameWriter.class, p));
 327  16
         return stage;
 328  
     }
 329  
 
 330  
     public Stage getNumberDocumentsStage() {
 331  16
         Stage stage = new Stage("numberDocuments");
 332  
 
 333  16
         stage.add(new StageConnectionPoint(ConnectionPointType.Input, "documentData",
 334  
                     new DocumentData.IdentifierOrder()));
 335  16
         stage.add(new StageConnectionPoint(ConnectionPointType.Output, "numberedDocumentData",
 336  
                     new NumberedDocumentData.NumberOrder()));
 337  16
         stage.add(new InputStep("documentData"));
 338  16
         stage.add(new Step(DocumentDataNumberer.class));
 339  16
         stage.add(Utility.getSorter(new NumberedDocumentData.NumberOrder()));
 340  16
         stage.add(new OutputStep("numberedDocumentData"));
 341  
 
 342  16
         return stage;
 343  
     }
 344  
 
 345  
     public Stage getNumberPostingsStage(String stageName, String inputName, String outputName) {
 346  24
         Stage stage = new Stage(stageName);
 347  
 
 348  24
         stage.add(new StageConnectionPoint(
 349  
                 ConnectionPointType.Input,
 350  
                 inputName, new DocumentWordPosition.DocumentWordPositionOrder()));
 351  24
         stage.add(new StageConnectionPoint(
 352  
                 ConnectionPointType.Input,
 353  
                 "numberedDocumentData", new NumberedDocumentData.NumberOrder()));
 354  24
         stage.add(new StageConnectionPoint(
 355  
                 ConnectionPointType.Output,
 356  
                 outputName, new NumberWordPosition.WordDocumentPositionOrder()));
 357  
 
 358  24
         stage.add(new InputStep(inputName));
 359  24
         stage.add(new Step(PositionPostingsNumberer.class));
 360  24
         stage.add(Utility.getSorter(new NumberWordPosition.WordDocumentPositionOrder()));
 361  24
         stage.add(new OutputStep(outputName));
 362  
 
 363  24
         return stage;
 364  
     }
 365  
 
 366  
     public Stage getNumberExtentsStage() {
 367  16
         Stage stage = new Stage("numberExtents");
 368  
 
 369  16
         stage.add(new StageConnectionPoint(
 370  
                 ConnectionPointType.Input,
 371  
                 "extents", new DocumentExtent.IdentifierOrder()));
 372  16
         stage.add(new StageConnectionPoint(
 373  
                 ConnectionPointType.Input,
 374  
                 "numberedDocumentData", new NumberedDocumentData.NumberOrder()));
 375  16
         stage.add(new StageConnectionPoint(
 376  
                 ConnectionPointType.Output,
 377  
                 "numberedExtents", new NumberedExtent.ExtentNameNumberBeginOrder()));
 378  
 
 379  16
         stage.add(new InputStep("extents"));
 380  16
         stage.add(new Step(ExtentsNumberer.class));
 381  16
         stage.add(Utility.getSorter(new NumberedExtent.ExtentNameNumberBeginOrder()));
 382  16
         stage.add(new OutputStep("numberedExtents"));
 383  
 
 384  16
         return stage;
 385  
     }
 386  
 
 387  
     public Job getIndexJob(String indexDirectory, String[] indexInputs,
 388  
                            boolean extractAnchors, boolean useStemming) throws IOException {
 389  16
         Job job = new Job();
 390  16
         this.indexPath = indexDirectory;
 391  16
         this.stemming = useStemming;
 392  16
         this.useLinks = extractAnchors;
 393  
 
 394  16
         job.add(getSplitStage(indexInputs));
 395  16
         job.add(getParsePostingsStage());
 396  16
         job.add(getWritePostingsStage("writePostings", "numberedPostings", "postings"));
 397  16
         job.add(getWriteManifestStage());
 398  16
         job.add(getWriteExtentsStage());
 399  16
         job.add(getWriteDocumentNamesStage());
 400  16
         job.add(getWriteDocumentLengthsStage());
 401  16
         job.add(getNumberDocumentsStage());
 402  16
         job.add(getNumberPostingsStage("numberPostings", "postings", "numberedPostings"));
 403  16
         job.add(getNumberExtentsStage());
 404  16
         job.add(getCollectionLengthStage());
 405  
 
 406  16
         job.connect("inputSplit", "parsePostings", ConnectionAssignmentType.Each);
 407  16
         job.connect("parsePostings", "numberDocuments", ConnectionAssignmentType.Combined);
 408  16
         job.connect("numberDocuments", "writeDocumentLengths", ConnectionAssignmentType.Combined);
 409  16
         job.connect("numberDocuments", "writeDocumentNames", ConnectionAssignmentType.Combined);
 410  16
         job.connect("numberDocuments", "numberPostings", ConnectionAssignmentType.Combined);
 411  16
         job.connect("numberDocuments", "numberExtents", ConnectionAssignmentType.Combined);
 412  16
         job.connect("parsePostings", "numberPostings", ConnectionAssignmentType.Each);
 413  16
         job.connect("parsePostings", "numberExtents", ConnectionAssignmentType.Each);
 414  16
         job.connect("numberExtents", "writeExtents", ConnectionAssignmentType.Combined);
 415  16
         job.connect("numberPostings", "writePostings", ConnectionAssignmentType.Combined);
 416  16
         job.connect("parsePostings", "collectionLength", ConnectionAssignmentType.Combined);
 417  16
         job.connect("collectionLength", "writeManifest", ConnectionAssignmentType.Combined);
 418  
 
 419  16
         if (useLinks) {
 420  8
             job.add(getParseLinksStage());
 421  8
             job.add(getLinkCombineStage());
 422  
 
 423  8
             job.connect("inputSplit", "parseLinks", ConnectionAssignmentType.Each);
 424  8
             job.connect("parseLinks", "linkCombine", ConnectionAssignmentType.Each);
 425  8
             job.connect("linkCombine", "parsePostings", ConnectionAssignmentType.Each);
 426  
         }
 427  
 
 428  16
         if (stemming) {
 429  8
             job.add(getNumberPostingsStage("numberStemmedPostings",
 430  
                                            "stemmedPostings",
 431  
                                            "numberedStemmedPostings"));
 432  8
             job.add(getWritePostingsStage("writeStemmedPostings",
 433  
                                           "numberedStemmedPostings",
 434  
                                           "stemmedPostings"));
 435  8
             job.connect("parsePostings", "numberStemmedPostings", ConnectionAssignmentType.Each);
 436  8
             job.connect("numberDocuments", "numberStemmedPostings", ConnectionAssignmentType.Combined);
 437  8
             job.connect("numberStemmedPostings", "writeStemmedPostings", ConnectionAssignmentType.Combined);
 438  
         }
 439  
 
 440  16
         return job;
 441  
     }
 442  
 }