Coverage Report - org.galagosearch.core.tools.App
 
Classes in this File Line Coverage Branch Coverage Complexity
App
10%
27/263
5%
4/76
0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 
 3  
 package org.galagosearch.core.tools;
 4  
 
 5  
 import java.io.File;
 6  
 import java.io.IOException;
 7  
 import java.util.ArrayList;
 8  
 import java.util.Map.Entry;
 9  
 import java.util.concurrent.ExecutionException;
 10  
 import org.galagosearch.core.index.StructuredIndex;
 11  
 import org.galagosearch.core.index.StructuredIndexPartReader;
 12  
 import org.galagosearch.core.parse.Document;
 13  
 import org.galagosearch.core.parse.DocumentIndexReader;
 14  
 import org.galagosearch.core.parse.DocumentIndexWriter;
 15  
 import org.galagosearch.core.parse.DocumentSource;
 16  
 import org.galagosearch.core.parse.DocumentToKeyValuePair;
 17  
 import org.galagosearch.core.parse.KeyValuePairToDocument;
 18  
 import org.galagosearch.core.parse.UniversalParser;
 19  
 import org.galagosearch.core.index.IndexReader;
 20  
 import org.galagosearch.core.retrieval.Retrieval;
 21  
 import org.galagosearch.core.retrieval.structured.IndexIterator;
 22  
 import org.galagosearch.core.store.DocumentIndexStore;
 23  
 import org.galagosearch.core.store.DocumentStore;
 24  
 import org.galagosearch.core.store.NullStore;
 25  
 import org.galagosearch.tupleflow.Parameters;
 26  
 import org.galagosearch.tupleflow.execution.ConnectionPointType;
 27  
 import org.galagosearch.tupleflow.execution.InputStep;
 28  
 import org.galagosearch.tupleflow.execution.Job;
 29  
 import org.galagosearch.tupleflow.execution.OutputStep;
 30  
 import org.galagosearch.tupleflow.execution.Stage;
 31  
 import org.galagosearch.tupleflow.execution.StageConnectionPoint;
 32  
 import org.galagosearch.tupleflow.execution.Step;
 33  
 import org.galagosearch.core.types.KeyValuePair;
 34  
 import org.galagosearch.tupleflow.FileOrderedReader;
 35  
 import org.galagosearch.tupleflow.Utility;
 36  
 import org.galagosearch.tupleflow.execution.ConnectionAssignmentType;
 37  
 import org.galagosearch.tupleflow.execution.ErrorStore;
 38  
 import org.galagosearch.tupleflow.execution.JobExecutor;
 39  
 import org.mortbay.jetty.Server;
 40  
 
 41  
 /**
 42  
  *
 43  
  * @author trevor
 44  
  */
 45  0
 public class App {
 46  
     private static void commandHelpBatchSearch() {
 47  0
         System.out.println("galago batch-search <args>");
 48  0
         System.out.println();
 49  0
         System.out.println("  Runs a batch of queries against an index and produces TREC-formatted");
 50  0
         System.out.println("  output.  The output can be used with retrieval evaluation tools like");
 51  0
         System.out.println("  galago eval (org.galagosearch.core.eval).");
 52  0
         System.out.println();
 53  0
         System.out.println("  Sample invocation:");
 54  0
         System.out.println("     galago batch-search --index=/tmp/myindex --count=200 /tmp/queries");
 55  0
         System.out.println();
 56  0
         System.out.println("  Args:");
 57  0
         System.out.println("     --index=path_to_your_index");
 58  0
         System.out.println("     --count : Number of results to return for each query, default=1000");
 59  0
         System.out.println();
 60  0
         System.out.println("  Query file format:");
 61  0
         System.out.println("    The query file is an XML file containing a set of queries.  Each query");
 62  0
         System.out.println("    has text tag, which contains the text of the query, and a number tag, ");
 63  0
         System.out.println("    which uniquely identifies the query in the output.");
 64  0
         System.out.println();
 65  0
         System.out.println("  Example query file:");
 66  0
         System.out.println("  <parameters>");
 67  0
         System.out.println("     <query>");
 68  0
         System.out.println("        <number>CACM-408</number>");
 69  0
         System.out.println("        <text>#combine(my query)</text>");
 70  0
         System.out.println("     </query>");
 71  0
         System.out.println("     <query>");
 72  0
         System.out.println("        <number>WIKI-410</number>");
 73  0
         System.out.println("        <text>#combine(another query)</text>");
 74  0
         System.out.println("     </query>");
 75  0
         System.out.println("  </parameters>");
 76  0
     }
 77  
 
 78  
     private static void commandHelpBuild() {
 79  0
         System.out.println("galago build [flags] <index> (<input>)+");
 80  0
         System.out.println();
 81  0
         System.out.println("  Builds a Galago StructuredIndex with TupleFlow, using one thread ");
 82  0
         System.out.println("  for each CPU core on your computer.  While some debugging output ");
 83  0
         System.out.println("  will be displayed on the screen, most of the status information will");
 84  0
         System.out.println("  appear on a web page.  A URL should appear in the command output ");
 85  0
         System.out.println("  that will direct you to the status page.");
 86  0
         System.out.println();
 87  
 
 88  0
         System.out.println("<input>:  Can be either a file or directory, and as many can be");
 89  0
         System.out.println("          specified as you like.  Galago can read html, xml, txt, ");
 90  0
         System.out.println("          arc (Heritrix), trectext, trecweb and corpus files.");
 91  0
         System.out.println("          Files may be gzip compressed (.gz).");
 92  0
         System.out.println("<index>:  The directory path of the index to produce.");
 93  0
         System.out.println();
 94  0
         System.out.println("Flags:");
 95  0
         System.out.println("  --links={true|false}:    Selects whether to collect anchor text ");
 96  0
         System.out.println("                           [default=false]");
 97  0
         System.out.println("  --stemming={true|false}: Selects whether to build stemmed inverted ");
 98  0
         System.out.println("                           lists in addition to non-stemmed ones.");
 99  0
         System.out.println("                           [default=true]");
 100  0
     }
 101  
 
 102  
     private static void handleBuild(String[] args) throws Exception {
 103  
         // handle --links and --stemming flags
 104  0
         ArrayList<String> documentFiles = new ArrayList<String>();
 105  0
         ArrayList<String> flags = new ArrayList<String>();
 106  0
         for (String arg : Utility.subarray(args, 2)) {
 107  0
             if (arg.startsWith("--")) {
 108  0
                 flags.add(arg);
 109  
             } else {
 110  0
                 documentFiles.add(arg);
 111  
             }
 112  
         }
 113  
 
 114  0
         Parameters p = new Parameters(flags.toArray(new String[0]));
 115  0
         boolean useLinks = p.get("links", false);
 116  0
         boolean stemming = p.get("stemming", true);
 117  0
         String[] docs = documentFiles.toArray(new String[0]);
 118  
 
 119  0
         BuildIndex build = new BuildIndex();
 120  0
         Job job = build.getIndexJob(args[1], docs, useLinks, stemming);
 121  0
         ErrorStore store = new ErrorStore();
 122  0
         JobExecutor.runLocally(job, store);
 123  0
         if (store.hasStatements()) {
 124  0
             System.out.println(store.toString());
 125  
         }
 126  0
     }
 127  
 
 128  
     private static void handleDoc(String[] args) throws IOException {
 129  0
         String indexPath = args[1];
 130  0
         String identifier = args[2];
 131  0
         DocumentIndexReader reader = new DocumentIndexReader(indexPath);
 132  0
         Document document = reader.getDocument(identifier);
 133  0
         System.out.println(document.text);
 134  0
     }
 135  
 
 136  
     private static void handleDumpIndex(String[] args) throws IOException {
 137  0
         StructuredIndexPartReader reader = StructuredIndex.openIndexPart(args[1]);
 138  0
         IndexIterator iterator = reader.getIterator();
 139  
         do {
 140  0
             System.out.println(iterator.getRecordString());
 141  0
         } while (iterator.nextRecord());
 142  0
     }
 143  
 
 144  
     private static void handleDumpCorpus(String[] args) throws IOException {
 145  0
         DocumentIndexReader reader = new DocumentIndexReader(args[1]);
 146  0
         DocumentIndexReader.Iterator iterator = reader.getIterator();
 147  0
         while (!iterator.isDone()) {
 148  0
             System.out.println("#IDENTIFIER: " + iterator.getKey());
 149  0
             Document document = iterator.getDocument();
 150  0
             System.out.println("#METADATA");
 151  0
             for (Entry<String, String> entry : document.metadata.entrySet()) {
 152  0
                 System.out.println(entry.getKey() + "," + entry.getValue());
 153  
             }
 154  0
             System.out.println("#TEXT");
 155  0
             System.out.println(document.text);
 156  0
             iterator.nextDocument();
 157  0
         }
 158  0
     }
 159  
 
 160  
     private static void handleDumpConnection(String[] args) throws IOException {
 161  0
         FileOrderedReader reader = new FileOrderedReader(args[1]);
 162  
         Object o;
 163  0
         while ((o = reader.read()) != null) {
 164  0
             System.out.println(o);
 165  
         }
 166  0
     }
 167  
 
 168  
     private static void handleDumpKeys(String[] args) throws IOException {
 169  0
         IndexReader reader = new IndexReader(args[1]);
 170  0
         IndexReader.Iterator iterator = reader.getIterator();
 171  0
         while (!iterator.isDone()) {
 172  0
             System.out.println(iterator.getKey());
 173  0
             iterator.getValueString();
 174  0
             iterator.nextKey();
 175  
         }
 176  0
     }
 177  
 
 178  
     private static void handleMakeCorpus(String[] args) throws Exception {
 179  0
         Job job = getDocumentConverter(args[1], Utility.subarray(args, 2));
 180  0
         ErrorStore store = new ErrorStore();
 181  0
         JobExecutor.runLocally(job, store);
 182  0
         if (store.hasStatements()) {
 183  0
             System.out.println(store.toString());
 184  
         }
 185  0
     }
 186  
 
 187  
     private static void handleBatchSearch(String[] args) throws Exception {
 188  0
         BatchSearch.main(Utility.subarray(args, 1));
 189  0
     }
 190  
 
 191  
     private static void handleSearch(String[] args) throws Exception, IOException {
 192  0
         String indexPath = args[1];
 193  
 
 194  0
         Retrieval retrieval = Retrieval.instance(indexPath);
 195  0
         DocumentStore store = null;
 196  0
         if (args.length > 2) {
 197  0
             ArrayList<DocumentIndexReader> readers = new ArrayList<DocumentIndexReader>();
 198  0
             for (int i = 2; i < args.length; ++i) {
 199  0
                 readers.add(new DocumentIndexReader(args[i]));
 200  
             }
 201  0
             store = new DocumentIndexStore(readers);
 202  0
         } else {
 203  0
             store = new NullStore();
 204  
         }
 205  0
         Search search = new Search(retrieval, store);
 206  0
         int port = Utility.getFreePort();
 207  0
         Server server = new Server(port);
 208  0
         server.addHandler(new SearchWebHandler(search));
 209  0
         server.start();
 210  0
         System.out.println("Server: http://localhost:" + port);
 211  0
     }
 212  
 
 213  
     public static void handleEval(String[] args) throws IOException {
 214  0
         org.galagosearch.core.eval.Main.main(args);
 215  0
     }
 216  
 
 217  
     public static Job getDocumentConverter(String outputCorpus, String[] inputs) throws IOException {
 218  4
         Job job = new Job();
 219  
 
 220  4
         Stage stage = new Stage("split");
 221  4
         stage.add(new StageConnectionPoint(ConnectionPointType.Output, "docs",
 222  
                 new KeyValuePair.KeyOrder()));
 223  4
         Parameters p = new Parameters();
 224  8
         for (String input : inputs) {
 225  4
             File inputFile = new File(input);
 226  
 
 227  4
             if (inputFile.isFile()) {
 228  0
                 p.add("filename", input);
 229  4
             } else if (inputFile.isDirectory()) {
 230  4
                 p.add("directory", input);
 231  
             } else {
 232  0
                 throw new IOException("Couldn't find file/directory: " + input);
 233  
             }
 234  
         }
 235  
 
 236  4
         stage.add(new Step(DocumentSource.class, p));
 237  4
         p = new Parameters();
 238  4
         p.add("identifier", "stripped");
 239  4
         stage.add(new Step(UniversalParser.class, p));
 240  4
         stage.add(new Step(DocumentToKeyValuePair.class));
 241  4
         stage.add(Utility.getSorter(new KeyValuePair.KeyOrder()));
 242  4
         stage.add(new OutputStep("docs"));
 243  4
         job.add(stage);
 244  
 
 245  4
         stage = new Stage("docwrite");
 246  4
         stage.add(new StageConnectionPoint(ConnectionPointType.Input, "docs",
 247  
                 new KeyValuePair.KeyOrder()));
 248  4
         stage.add(new InputStep("docs"));
 249  4
         stage.add(new Step(KeyValuePairToDocument.class));
 250  4
         p = new Parameters();
 251  4
         p.add("filename", outputCorpus);
 252  4
         stage.add(new Step(DocumentIndexWriter.class, p));
 253  
 
 254  4
         job.add(stage);
 255  4
         job.connect("split", "docwrite", ConnectionAssignmentType.Combined);
 256  4
         return job;
 257  
     }
 258  
 
 259  
     public static void usage() {
 260  0
         System.out.println("Type 'galago help <command>' to get more help about any command,");
 261  0
         System.out.println("   or 'galago help all' to see all the documentation at once.");
 262  0
         System.out.println();
 263  
         
 264  0
         System.out.println("Popular commands:");
 265  0
         System.out.println("   build");
 266  0
         System.out.println("   search");
 267  0
         System.out.println("   batch-search");
 268  0
         System.out.println();
 269  
 
 270  0
         System.out.println("All commands:");
 271  0
         System.out.println("   batch-search");
 272  0
         System.out.println("   build");
 273  0
         System.out.println("   doc");
 274  0
         System.out.println("   dump-connection");
 275  0
         System.out.println("   dump-corpus");
 276  0
         System.out.println("   dump-index");
 277  0
         System.out.println("   dump-keys");
 278  0
         System.out.println("   eval");
 279  0
         System.out.println("   make-corpus");
 280  0
         System.out.println("   search");
 281  0
     }
 282  
 
 283  
     public static void commandHelp(String command) throws IOException {
 284  0
         if (command.equals("batch-search")) {
 285  0
             commandHelpBatchSearch();
 286  0
         } else if (command.equals("build")) {
 287  0
             commandHelpBuild();
 288  0
         } else if (command.equals("doc")) {
 289  0
             System.out.println("galago doc <corpus> <identifier>");
 290  0
             System.out.println();
 291  0
             System.out.println("  Prints the full text of the document named by <identifier>.");
 292  0
             System.out.println("  The document is retrieved from a Corpus file named <corpus>.");
 293  0
         } else if (command.equals("dump-connection")) {
 294  0
             System.out.println("galago dump-connection <connection-file>");
 295  0
             System.out.println();
 296  0
             System.out.println("  Dumps tuples from a Galago TupleFlow connection file in ");
 297  0
             System.out.println("  CSV format.  This can be useful for debugging strange problems ");
 298  0
             System.out.println("  in a TupleFlow execution.");
 299  0
         } else if (command.equals("dump-corpus")) {
 300  0
             System.out.println("galago dump-corpus <corpus>");
 301  0
             System.out.println();
 302  0
             System.out.println("  Dumps all documents from a corpus file to stdout.");
 303  0
         } else if (command.equals("dump-index")) {
 304  0
             System.out.println("galago dump-index <index-part>");
 305  0
             System.out.println();
 306  0
             System.out.println("  Dumps inverted list data from any index file in a StructuredIndex");
 307  0
             System.out.println("  (That is, any index that has a readerClass that's a subclass of ");
 308  0
             System.out.println("  StructuredIndexPartReader).  Output is in CSV format.");
 309  0
         } else if (command.equals("dump-keys")) {
 310  0
             System.out.println("galago dump-keys <indexwriter-file>");
 311  0
             System.out.println();
 312  0
             System.out.println("  Dumps all keys from any file created by IndexWriter.  This includes");
 313  0
             System.out.println("  corpus files and all index files built by Galago.");
 314  0
         } else if (command.equals("eval")) {
 315  0
             org.galagosearch.core.eval.Main.main(new String[] {});
 316  0
         } else if (command.equals("make-corpus")) {
 317  0
             System.out.println("galago make-corpus <corpus> (<input>)+");
 318  0
             System.out.println();
 319  0
             System.out.println("  Copies documents from input files into a corpus file.  A corpus");
 320  0
             System.out.println("  file is required to use any of the document lookup features in ");
 321  0
             System.out.println("  Galago, like printing snippets of search results.");
 322  0
             System.out.println();
 323  0
             System.out.println("<input>:  Can be either a file or directory, and as many can be");
 324  0
             System.out.println("          specified as you like.  Galago can read html, xml, txt, ");
 325  0
             System.out.println("          arc (Heritrix), trectext, trecweb and corpus files.");
 326  0
             System.out.println("          Files may be gzip compressed (.gz).");
 327  0
         } else if (command.equals("search")) {
 328  0
             System.out.println("galago search <index> <corpus>");
 329  0
             System.out.println();
 330  0
             System.out.println("  Starts a web interface for searching an index interactively.");
 331  0
             System.out.println("  The URL to use in your web browser will appear in the command ");
 332  0
             System.out.println("  output.  Cancel the process (Control-C) to quit.");
 333  0
         } else if (command.equals("all")) {
 334  0
             String[] commands = { "batch-search", "build", "doc", "dump-connection", "dump-corpus",
 335  
                                   "dump-index", "dump-keys", "eval", "make-corpus", "search" };
 336  0
             for (String c : commands) {
 337  0
                 commandHelp(c);
 338  0
                 System.out.println();
 339  
             }
 340  0
         } else {
 341  0
             usage();
 342  
         }
 343  0
     }
 344  
 
 345  
     public static void main(String[] args) throws IOException, InterruptedException, ExecutionException, Exception {
 346  0
         if (args.length < 1) {
 347  0
             usage();
 348  0
             return;
 349  
         }
 350  
 
 351  0
         String command = args[0];
 352  
 
 353  0
         if (command.equals("help") && args.length > 1) {
 354  0
             commandHelp(args[1]);
 355  0
         } else if (command.equals("batch-search")) {
 356  0
             handleBatchSearch(args);
 357  0
         } else if (command.equals("build")) {
 358  0
             handleBuild(args);
 359  0
         } else if (command.equals("doc")) {
 360  0
             handleDoc(args);
 361  0
         } else if (command.equals("dump-connection")) {
 362  0
             handleDumpConnection(args);
 363  0
         } else if (command.equals("dump-corpus")) {
 364  0
             handleDumpCorpus(args);
 365  0
         } else if (command.equals("dump-index")) {
 366  0
             handleDumpIndex(args);
 367  0
         } else if (command.equals("dump-keys")) {
 368  0
             handleDumpKeys(args);
 369  0
         } else if (command.equals("make-corpus")) {
 370  0
             handleMakeCorpus(args);
 371  0
         } else if (command.equals("search")) {
 372  0
             handleSearch(args);
 373  
         } else {
 374  0
             usage();
 375  
         }
 376  0
     }
 377  
 }