View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.tools;
4   
5   import java.io.File;
6   import java.io.IOException;
7   import java.util.ArrayList;
8   import java.util.Map.Entry;
9   import java.util.concurrent.ExecutionException;
10  import org.galagosearch.core.index.StructuredIndex;
11  import org.galagosearch.core.index.StructuredIndexPartReader;
12  import org.galagosearch.core.parse.Document;
13  import org.galagosearch.core.parse.DocumentIndexReader;
14  import org.galagosearch.core.parse.DocumentIndexWriter;
15  import org.galagosearch.core.parse.DocumentSource;
16  import org.galagosearch.core.parse.DocumentToKeyValuePair;
17  import org.galagosearch.core.parse.KeyValuePairToDocument;
18  import org.galagosearch.core.parse.UniversalParser;
19  import org.galagosearch.core.index.IndexReader;
20  import org.galagosearch.core.retrieval.Retrieval;
21  import org.galagosearch.core.retrieval.structured.IndexIterator;
22  import org.galagosearch.core.store.DocumentIndexStore;
23  import org.galagosearch.core.store.DocumentStore;
24  import org.galagosearch.core.store.NullStore;
25  import org.galagosearch.tupleflow.Parameters;
26  import org.galagosearch.tupleflow.execution.ConnectionPointType;
27  import org.galagosearch.tupleflow.execution.InputStep;
28  import org.galagosearch.tupleflow.execution.Job;
29  import org.galagosearch.tupleflow.execution.OutputStep;
30  import org.galagosearch.tupleflow.execution.Stage;
31  import org.galagosearch.tupleflow.execution.StageConnectionPoint;
32  import org.galagosearch.tupleflow.execution.Step;
33  import org.galagosearch.core.types.KeyValuePair;
34  import org.galagosearch.tupleflow.FileOrderedReader;
35  import org.galagosearch.tupleflow.Utility;
36  import org.galagosearch.tupleflow.execution.ConnectionAssignmentType;
37  import org.galagosearch.tupleflow.execution.ErrorStore;
38  import org.galagosearch.tupleflow.execution.JobExecutor;
39  import org.mortbay.jetty.Server;
40  
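/**
 * Command-line entry point for the galago tool.  The first argument names a
 * command (see {@link #usage()} for the list) and the remaining arguments
 * are passed on to the code that implements that command.
 *
 * @author trevor
 */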
45  public class App {
46      private static void commandHelpBatchSearch() {
47          System.out.println("galago batch-search <args>");
48          System.out.println();
49          System.out.println("  Runs a batch of queries against an index and produces TREC-formatted");
50          System.out.println("  output.  The output can be used with retrieval evaluation tools like");
51          System.out.println("  galago eval (org.galagosearch.core.eval).");
52          System.out.println();
53          System.out.println("  Sample invocation:");
54          System.out.println("     galago batch-search --index=/tmp/myindex --count=200 /tmp/queries");
55          System.out.println();
56          System.out.println("  Args:");
57          System.out.println("     --index=path_to_your_index");
58          System.out.println("     --count : Number of results to return for each query, default=1000");
59          System.out.println();
60          System.out.println("  Query file format:");
61          System.out.println("    The query file is an XML file containing a set of queries.  Each query");
62          System.out.println("    has text tag, which contains the text of the query, and a number tag, ");
63          System.out.println("    which uniquely identifies the query in the output.");
64          System.out.println();
65          System.out.println("  Example query file:");
66          System.out.println("  <parameters>");
67          System.out.println("     <query>");
68          System.out.println("        <number>CACM-408</number>");
69          System.out.println("        <text>#combine(my query)</text>");
70          System.out.println("     </query>");
71          System.out.println("     <query>");
72          System.out.println("        <number>WIKI-410</number>");
73          System.out.println("        <text>#combine(another query)</text>");
74          System.out.println("     </query>");
75          System.out.println("  </parameters>");
76      }
77  
78      private static void commandHelpBuild() {
79          System.out.println("galago build [flags] <index> (<input>)+");
80          System.out.println();
81          System.out.println("  Builds a Galago StructuredIndex with TupleFlow, using one thread ");
82          System.out.println("  for each CPU core on your computer.  While some debugging output ");
83          System.out.println("  will be displayed on the screen, most of the status information will");
84          System.out.println("  appear on a web page.  A URL should appear in the command output ");
85          System.out.println("  that will direct you to the status page.");
86          System.out.println();
87  
88          System.out.println("<input>:  Can be either a file or directory, and as many can be");
89          System.out.println("          specified as you like.  Galago can read html, xml, txt, ");
90          System.out.println("          arc (Heritrix), trectext, trecweb and corpus files.");
91          System.out.println("          Files may be gzip compressed (.gz).");
92          System.out.println("<index>:  The directory path of the index to produce.");
93          System.out.println();
94          System.out.println("Flags:");
95          System.out.println("  --links={true|false}:    Selects whether to collect anchor text ");
96          System.out.println("                           [default=false]");
97          System.out.println("  --stemming={true|false}: Selects whether to build stemmed inverted ");
98          System.out.println("                           lists in addition to non-stemmed ones.");
99          System.out.println("                           [default=true]");
100     }
101 
102     private static void handleBuild(String[] args) throws Exception {
103         // handle --links and --stemming flags
104         ArrayList<String> documentFiles = new ArrayList<String>();
105         ArrayList<String> flags = new ArrayList<String>();
106         for (String arg : Utility.subarray(args, 2)) {
107             if (arg.startsWith("--")) {
108                 flags.add(arg);
109             } else {
110                 documentFiles.add(arg);
111             }
112         }
113 
114         Parameters p = new Parameters(flags.toArray(new String[0]));
115         boolean useLinks = p.get("links", false);
116         boolean stemming = p.get("stemming", true);
117         String[] docs = documentFiles.toArray(new String[0]);
118 
119         BuildIndex build = new BuildIndex();
120         Job job = build.getIndexJob(args[1], docs, useLinks, stemming);
121         ErrorStore store = new ErrorStore();
122         JobExecutor.runLocally(job, store);
123         if (store.hasStatements()) {
124             System.out.println(store.toString());
125         }
126     }
127 
128     private static void handleDoc(String[] args) throws IOException {
129         String indexPath = args[1];
130         String identifier = args[2];
131         DocumentIndexReader reader = new DocumentIndexReader(indexPath);
132         Document document = reader.getDocument(identifier);
133         System.out.println(document.text);
134     }
135 
136     private static void handleDumpIndex(String[] args) throws IOException {
137         StructuredIndexPartReader reader = StructuredIndex.openIndexPart(args[1]);
138         IndexIterator iterator = reader.getIterator();
139         do {
140             System.out.println(iterator.getRecordString());
141         } while (iterator.nextRecord());
142     }
143 
144     private static void handleDumpCorpus(String[] args) throws IOException {
145         DocumentIndexReader reader = new DocumentIndexReader(args[1]);
146         DocumentIndexReader.Iterator iterator = reader.getIterator();
147         while (!iterator.isDone()) {
148             System.out.println("#IDENTIFIER: " + iterator.getKey());
149             Document document = iterator.getDocument();
150             System.out.println("#METADATA");
151             for (Entry<String, String> entry : document.metadata.entrySet()) {
152                 System.out.println(entry.getKey() + "," + entry.getValue());
153             }
154             System.out.println("#TEXT");
155             System.out.println(document.text);
156             iterator.nextDocument();
157         }
158     }
159 
160     private static void handleDumpConnection(String[] args) throws IOException {
161         FileOrderedReader reader = new FileOrderedReader(args[1]);
162         Object o;
163         while ((o = reader.read()) != null) {
164             System.out.println(o);
165         }
166     }
167 
168     private static void handleDumpKeys(String[] args) throws IOException {
169         IndexReader reader = new IndexReader(args[1]);
170         IndexReader.Iterator iterator = reader.getIterator();
171         while (!iterator.isDone()) {
172             System.out.println(iterator.getKey());
173             iterator.getValueString();
174             iterator.nextKey();
175         }
176     }
177 
178     private static void handleMakeCorpus(String[] args) throws Exception {
179         Job job = getDocumentConverter(args[1], Utility.subarray(args, 2));
180         ErrorStore store = new ErrorStore();
181         JobExecutor.runLocally(job, store);
182         if (store.hasStatements()) {
183             System.out.println(store.toString());
184         }
185     }
186 
187     private static void handleBatchSearch(String[] args) throws Exception {
188         BatchSearch.main(Utility.subarray(args, 1));
189     }
190 
191     private static void handleSearch(String[] args) throws Exception, IOException {
192         String indexPath = args[1];
193 
194         Retrieval retrieval = Retrieval.instance(indexPath);
195         DocumentStore store = null;
196         if (args.length > 2) {
197             ArrayList<DocumentIndexReader> readers = new ArrayList<DocumentIndexReader>();
198             for (int i = 2; i < args.length; ++i) {
199                 readers.add(new DocumentIndexReader(args[i]));
200             }
201             store = new DocumentIndexStore(readers);
202         } else {
203             store = new NullStore();
204         }
205         Search search = new Search(retrieval, store);
206         int port = Utility.getFreePort();
207         Server server = new Server(port);
208         server.addHandler(new SearchWebHandler(search));
209         server.start();
210         System.out.println("Server: http://localhost:" + port);
211     }
212 
213     public static void handleEval(String[] args) throws IOException {
214         org.galagosearch.core.eval.Main.main(args);
215     }
216 
217     public static Job getDocumentConverter(String outputCorpus, String[] inputs) throws IOException {
218         Job job = new Job();
219 
220         Stage stage = new Stage("split");
221         stage.add(new StageConnectionPoint(ConnectionPointType.Output, "docs",
222                 new KeyValuePair.KeyOrder()));
223         Parameters p = new Parameters();
224         for (String input : inputs) {
225             File inputFile = new File(input);
226 
227             if (inputFile.isFile()) {
228                 p.add("filename", input);
229             } else if (inputFile.isDirectory()) {
230                 p.add("directory", input);
231             } else {
232                 throw new IOException("Couldn't find file/directory: " + input);
233             }
234         }
235 
236         stage.add(new Step(DocumentSource.class, p));
237         p = new Parameters();
238         p.add("identifier", "stripped");
239         stage.add(new Step(UniversalParser.class, p));
240         stage.add(new Step(DocumentToKeyValuePair.class));
241         stage.add(Utility.getSorter(new KeyValuePair.KeyOrder()));
242         stage.add(new OutputStep("docs"));
243         job.add(stage);
244 
245         stage = new Stage("docwrite");
246         stage.add(new StageConnectionPoint(ConnectionPointType.Input, "docs",
247                 new KeyValuePair.KeyOrder()));
248         stage.add(new InputStep("docs"));
249         stage.add(new Step(KeyValuePairToDocument.class));
250         p = new Parameters();
251         p.add("filename", outputCorpus);
252         stage.add(new Step(DocumentIndexWriter.class, p));
253 
254         job.add(stage);
255         job.connect("split", "docwrite", ConnectionAssignmentType.Combined);
256         return job;
257     }
258 
259     public static void usage() {
260         System.out.println("Type 'galago help <command>' to get more help about any command,");
261         System.out.println("   or 'galago help all' to see all the documentation at once.");
262         System.out.println();
263         
264         System.out.println("Popular commands:");
265         System.out.println("   build");
266         System.out.println("   search");
267         System.out.println("   batch-search");
268         System.out.println();
269 
270         System.out.println("All commands:");
271         System.out.println("   batch-search");
272         System.out.println("   build");
273         System.out.println("   doc");
274         System.out.println("   dump-connection");
275         System.out.println("   dump-corpus");
276         System.out.println("   dump-index");
277         System.out.println("   dump-keys");
278         System.out.println("   eval");
279         System.out.println("   make-corpus");
280         System.out.println("   search");
281     }
282 
283     public static void commandHelp(String command) throws IOException {
284         if (command.equals("batch-search")) {
285             commandHelpBatchSearch();
286         } else if (command.equals("build")) {
287             commandHelpBuild();
288         } else if (command.equals("doc")) {
289             System.out.println("galago doc <corpus> <identifier>");
290             System.out.println();
291             System.out.println("  Prints the full text of the document named by <identifier>.");
292             System.out.println("  The document is retrieved from a Corpus file named <corpus>.");
293         } else if (command.equals("dump-connection")) {
294             System.out.println("galago dump-connection <connection-file>");
295             System.out.println();
296             System.out.println("  Dumps tuples from a Galago TupleFlow connection file in ");
297             System.out.println("  CSV format.  This can be useful for debugging strange problems ");
298             System.out.println("  in a TupleFlow execution.");
299         } else if (command.equals("dump-corpus")) {
300             System.out.println("galago dump-corpus <corpus>");
301             System.out.println();
302             System.out.println("  Dumps all documents from a corpus file to stdout.");
303         } else if (command.equals("dump-index")) {
304             System.out.println("galago dump-index <index-part>");
305             System.out.println();
306             System.out.println("  Dumps inverted list data from any index file in a StructuredIndex");
307             System.out.println("  (That is, any index that has a readerClass that's a subclass of ");
308             System.out.println("  StructuredIndexPartReader).  Output is in CSV format.");
309         } else if (command.equals("dump-keys")) {
310             System.out.println("galago dump-keys <indexwriter-file>");
311             System.out.println();
312             System.out.println("  Dumps all keys from any file created by IndexWriter.  This includes");
313             System.out.println("  corpus files and all index files built by Galago.");
314         } else if (command.equals("eval")) {
315             org.galagosearch.core.eval.Main.main(new String[] {});
316         } else if (command.equals("make-corpus")) {
317             System.out.println("galago make-corpus <corpus> (<input>)+");
318             System.out.println();
319             System.out.println("  Copies documents from input files into a corpus file.  A corpus");
320             System.out.println("  file is required to use any of the document lookup features in ");
321             System.out.println("  Galago, like printing snippets of search results.");
322             System.out.println();
323             System.out.println("<input>:  Can be either a file or directory, and as many can be");
324             System.out.println("          specified as you like.  Galago can read html, xml, txt, ");
325             System.out.println("          arc (Heritrix), trectext, trecweb and corpus files.");
326             System.out.println("          Files may be gzip compressed (.gz).");
327         } else if (command.equals("search")) {
328             System.out.println("galago search <index> <corpus>");
329             System.out.println();
330             System.out.println("  Starts a web interface for searching an index interactively.");
331             System.out.println("  The URL to use in your web browser will appear in the command ");
332             System.out.println("  output.  Cancel the process (Control-C) to quit.");
333         } else if (command.equals("all")) {
334             String[] commands = { "batch-search", "build", "doc", "dump-connection", "dump-corpus",
335                                   "dump-index", "dump-keys", "eval", "make-corpus", "search" };
336             for (String c : commands) {
337                 commandHelp(c);
338                 System.out.println();
339             }
340         } else {
341             usage();
342         }
343     }
344 
345     public static void main(String[] args) throws IOException, InterruptedException, ExecutionException, Exception {
346         if (args.length < 1) {
347             usage();
348             return;
349         }
350 
351         String command = args[0];
352 
353         if (command.equals("help") && args.length > 1) {
354             commandHelp(args[1]);
355         } else if (command.equals("batch-search")) {
356             handleBatchSearch(args);
357         } else if (command.equals("build")) {
358             handleBuild(args);
359         } else if (command.equals("doc")) {
360             handleDoc(args);
361         } else if (command.equals("dump-connection")) {
362             handleDumpConnection(args);
363         } else if (command.equals("dump-corpus")) {
364             handleDumpCorpus(args);
365         } else if (command.equals("dump-index")) {
366             handleDumpIndex(args);
367         } else if (command.equals("dump-keys")) {
368             handleDumpKeys(args);
369         } else if (command.equals("make-corpus")) {
370             handleMakeCorpus(args);
371         } else if (command.equals("search")) {
372             handleSearch(args);
373         } else {
374             usage();
375         }
376     }
377 }