1
2
3 package org.galagosearch.core.tools;
4
5 import java.io.File;
6 import java.io.IOException;
7 import java.util.ArrayList;
8 import java.util.Map.Entry;
9 import java.util.concurrent.ExecutionException;
10 import org.galagosearch.core.index.StructuredIndex;
11 import org.galagosearch.core.index.StructuredIndexPartReader;
12 import org.galagosearch.core.parse.Document;
13 import org.galagosearch.core.parse.DocumentIndexReader;
14 import org.galagosearch.core.parse.DocumentIndexWriter;
15 import org.galagosearch.core.parse.DocumentSource;
16 import org.galagosearch.core.parse.DocumentToKeyValuePair;
17 import org.galagosearch.core.parse.KeyValuePairToDocument;
18 import org.galagosearch.core.parse.UniversalParser;
19 import org.galagosearch.core.index.IndexReader;
20 import org.galagosearch.core.retrieval.Retrieval;
21 import org.galagosearch.core.retrieval.structured.IndexIterator;
22 import org.galagosearch.core.store.DocumentIndexStore;
23 import org.galagosearch.core.store.DocumentStore;
24 import org.galagosearch.core.store.NullStore;
25 import org.galagosearch.tupleflow.Parameters;
26 import org.galagosearch.tupleflow.execution.ConnectionPointType;
27 import org.galagosearch.tupleflow.execution.InputStep;
28 import org.galagosearch.tupleflow.execution.Job;
29 import org.galagosearch.tupleflow.execution.OutputStep;
30 import org.galagosearch.tupleflow.execution.Stage;
31 import org.galagosearch.tupleflow.execution.StageConnectionPoint;
32 import org.galagosearch.tupleflow.execution.Step;
33 import org.galagosearch.core.types.KeyValuePair;
34 import org.galagosearch.tupleflow.FileOrderedReader;
35 import org.galagosearch.tupleflow.Utility;
36 import org.galagosearch.tupleflow.execution.ConnectionAssignmentType;
37 import org.galagosearch.tupleflow.execution.ErrorStore;
38 import org.galagosearch.tupleflow.execution.JobExecutor;
39 import org.mortbay.jetty.Server;
40
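/**
 * Command-line front end for Galago. The first argument names a sub-command
 * (build, search, batch-search, doc, dump-*, eval, make-corpus or help) and
 * the remaining arguments are handed to that sub-command.
 *
 * @author trevor
 */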
45 public class App {
46 private static void commandHelpBatchSearch() {
47 System.out.println("galago batch-search <args>");
48 System.out.println();
49 System.out.println(" Runs a batch of queries against an index and produces TREC-formatted");
50 System.out.println(" output. The output can be used with retrieval evaluation tools like");
51 System.out.println(" galago eval (org.galagosearch.core.eval).");
52 System.out.println();
53 System.out.println(" Sample invocation:");
54 System.out.println(" galago batch-search --index=/tmp/myindex --count=200 /tmp/queries");
55 System.out.println();
56 System.out.println(" Args:");
57 System.out.println(" --index=path_to_your_index");
58 System.out.println(" --count : Number of results to return for each query, default=1000");
59 System.out.println();
60 System.out.println(" Query file format:");
61 System.out.println(" The query file is an XML file containing a set of queries. Each query");
62 System.out.println(" has text tag, which contains the text of the query, and a number tag, ");
63 System.out.println(" which uniquely identifies the query in the output.");
64 System.out.println();
65 System.out.println(" Example query file:");
66 System.out.println(" <parameters>");
67 System.out.println(" <query>");
68 System.out.println(" <number>CACM-408</number>");
69 System.out.println(" <text>#combine(my query)</text>");
70 System.out.println(" </query>");
71 System.out.println(" <query>");
72 System.out.println(" <number>WIKI-410</number>");
73 System.out.println(" <text>#combine(another query)</text>");
74 System.out.println(" </query>");
75 System.out.println(" </parameters>");
76 }
77
78 private static void commandHelpBuild() {
79 System.out.println("galago build [flags] <index> (<input>)+");
80 System.out.println();
81 System.out.println(" Builds a Galago StructuredIndex with TupleFlow, using one thread ");
82 System.out.println(" for each CPU core on your computer. While some debugging output ");
83 System.out.println(" will be displayed on the screen, most of the status information will");
84 System.out.println(" appear on a web page. A URL should appear in the command output ");
85 System.out.println(" that will direct you to the status page.");
86 System.out.println();
87
88 System.out.println("<input>: Can be either a file or directory, and as many can be");
89 System.out.println(" specified as you like. Galago can read html, xml, txt, ");
90 System.out.println(" arc (Heritrix), trectext, trecweb and corpus files.");
91 System.out.println(" Files may be gzip compressed (.gz).");
92 System.out.println("<index>: The directory path of the index to produce.");
93 System.out.println();
94 System.out.println("Flags:");
95 System.out.println(" --links={true|false}: Selects whether to collect anchor text ");
96 System.out.println(" [default=false]");
97 System.out.println(" --stemming={true|false}: Selects whether to build stemmed inverted ");
98 System.out.println(" lists in addition to non-stemmed ones.");
99 System.out.println(" [default=true]");
100 }
101
102 private static void handleBuild(String[] args) throws Exception {
103
104 ArrayList<String> documentFiles = new ArrayList<String>();
105 ArrayList<String> flags = new ArrayList<String>();
106 for (String arg : Utility.subarray(args, 2)) {
107 if (arg.startsWith("--")) {
108 flags.add(arg);
109 } else {
110 documentFiles.add(arg);
111 }
112 }
113
114 Parameters p = new Parameters(flags.toArray(new String[0]));
115 boolean useLinks = p.get("links", false);
116 boolean stemming = p.get("stemming", true);
117 String[] docs = documentFiles.toArray(new String[0]);
118
119 BuildIndex build = new BuildIndex();
120 Job job = build.getIndexJob(args[1], docs, useLinks, stemming);
121 ErrorStore store = new ErrorStore();
122 JobExecutor.runLocally(job, store);
123 if (store.hasStatements()) {
124 System.out.println(store.toString());
125 }
126 }
127
128 private static void handleDoc(String[] args) throws IOException {
129 String indexPath = args[1];
130 String identifier = args[2];
131 DocumentIndexReader reader = new DocumentIndexReader(indexPath);
132 Document document = reader.getDocument(identifier);
133 System.out.println(document.text);
134 }
135
136 private static void handleDumpIndex(String[] args) throws IOException {
137 StructuredIndexPartReader reader = StructuredIndex.openIndexPart(args[1]);
138 IndexIterator iterator = reader.getIterator();
139 do {
140 System.out.println(iterator.getRecordString());
141 } while (iterator.nextRecord());
142 }
143
144 private static void handleDumpCorpus(String[] args) throws IOException {
145 DocumentIndexReader reader = new DocumentIndexReader(args[1]);
146 DocumentIndexReader.Iterator iterator = reader.getIterator();
147 while (!iterator.isDone()) {
148 System.out.println("#IDENTIFIER: " + iterator.getKey());
149 Document document = iterator.getDocument();
150 System.out.println("#METADATA");
151 for (Entry<String, String> entry : document.metadata.entrySet()) {
152 System.out.println(entry.getKey() + "," + entry.getValue());
153 }
154 System.out.println("#TEXT");
155 System.out.println(document.text);
156 iterator.nextDocument();
157 }
158 }
159
160 private static void handleDumpConnection(String[] args) throws IOException {
161 FileOrderedReader reader = new FileOrderedReader(args[1]);
162 Object o;
163 while ((o = reader.read()) != null) {
164 System.out.println(o);
165 }
166 }
167
168 private static void handleDumpKeys(String[] args) throws IOException {
169 IndexReader reader = new IndexReader(args[1]);
170 IndexReader.Iterator iterator = reader.getIterator();
171 while (!iterator.isDone()) {
172 System.out.println(iterator.getKey());
173 iterator.getValueString();
174 iterator.nextKey();
175 }
176 }
177
178 private static void handleMakeCorpus(String[] args) throws Exception {
179 Job job = getDocumentConverter(args[1], Utility.subarray(args, 2));
180 ErrorStore store = new ErrorStore();
181 JobExecutor.runLocally(job, store);
182 if (store.hasStatements()) {
183 System.out.println(store.toString());
184 }
185 }
186
187 private static void handleBatchSearch(String[] args) throws Exception {
188 BatchSearch.main(Utility.subarray(args, 1));
189 }
190
191 private static void handleSearch(String[] args) throws Exception, IOException {
192 String indexPath = args[1];
193
194 Retrieval retrieval = Retrieval.instance(indexPath);
195 DocumentStore store = null;
196 if (args.length > 2) {
197 ArrayList<DocumentIndexReader> readers = new ArrayList<DocumentIndexReader>();
198 for (int i = 2; i < args.length; ++i) {
199 readers.add(new DocumentIndexReader(args[i]));
200 }
201 store = new DocumentIndexStore(readers);
202 } else {
203 store = new NullStore();
204 }
205 Search search = new Search(retrieval, store);
206 int port = Utility.getFreePort();
207 Server server = new Server(port);
208 server.addHandler(new SearchWebHandler(search));
209 server.start();
210 System.out.println("Server: http://localhost:" + port);
211 }
212
213 public static void handleEval(String[] args) throws IOException {
214 org.galagosearch.core.eval.Main.main(args);
215 }
216
217 public static Job getDocumentConverter(String outputCorpus, String[] inputs) throws IOException {
218 Job job = new Job();
219
220 Stage stage = new Stage("split");
221 stage.add(new StageConnectionPoint(ConnectionPointType.Output, "docs",
222 new KeyValuePair.KeyOrder()));
223 Parameters p = new Parameters();
224 for (String input : inputs) {
225 File inputFile = new File(input);
226
227 if (inputFile.isFile()) {
228 p.add("filename", input);
229 } else if (inputFile.isDirectory()) {
230 p.add("directory", input);
231 } else {
232 throw new IOException("Couldn't find file/directory: " + input);
233 }
234 }
235
236 stage.add(new Step(DocumentSource.class, p));
237 p = new Parameters();
238 p.add("identifier", "stripped");
239 stage.add(new Step(UniversalParser.class, p));
240 stage.add(new Step(DocumentToKeyValuePair.class));
241 stage.add(Utility.getSorter(new KeyValuePair.KeyOrder()));
242 stage.add(new OutputStep("docs"));
243 job.add(stage);
244
245 stage = new Stage("docwrite");
246 stage.add(new StageConnectionPoint(ConnectionPointType.Input, "docs",
247 new KeyValuePair.KeyOrder()));
248 stage.add(new InputStep("docs"));
249 stage.add(new Step(KeyValuePairToDocument.class));
250 p = new Parameters();
251 p.add("filename", outputCorpus);
252 stage.add(new Step(DocumentIndexWriter.class, p));
253
254 job.add(stage);
255 job.connect("split", "docwrite", ConnectionAssignmentType.Combined);
256 return job;
257 }
258
259 public static void usage() {
260 System.out.println("Type 'galago help <command>' to get more help about any command,");
261 System.out.println(" or 'galago help all' to see all the documentation at once.");
262 System.out.println();
263
264 System.out.println("Popular commands:");
265 System.out.println(" build");
266 System.out.println(" search");
267 System.out.println(" batch-search");
268 System.out.println();
269
270 System.out.println("All commands:");
271 System.out.println(" batch-search");
272 System.out.println(" build");
273 System.out.println(" doc");
274 System.out.println(" dump-connection");
275 System.out.println(" dump-corpus");
276 System.out.println(" dump-index");
277 System.out.println(" dump-keys");
278 System.out.println(" eval");
279 System.out.println(" make-corpus");
280 System.out.println(" search");
281 }
282
283 public static void commandHelp(String command) throws IOException {
284 if (command.equals("batch-search")) {
285 commandHelpBatchSearch();
286 } else if (command.equals("build")) {
287 commandHelpBuild();
288 } else if (command.equals("doc")) {
289 System.out.println("galago doc <corpus> <identifier>");
290 System.out.println();
291 System.out.println(" Prints the full text of the document named by <identifier>.");
292 System.out.println(" The document is retrieved from a Corpus file named <corpus>.");
293 } else if (command.equals("dump-connection")) {
294 System.out.println("galago dump-connection <connection-file>");
295 System.out.println();
296 System.out.println(" Dumps tuples from a Galago TupleFlow connection file in ");
297 System.out.println(" CSV format. This can be useful for debugging strange problems ");
298 System.out.println(" in a TupleFlow execution.");
299 } else if (command.equals("dump-corpus")) {
300 System.out.println("galago dump-corpus <corpus>");
301 System.out.println();
302 System.out.println(" Dumps all documents from a corpus file to stdout.");
303 } else if (command.equals("dump-index")) {
304 System.out.println("galago dump-index <index-part>");
305 System.out.println();
306 System.out.println(" Dumps inverted list data from any index file in a StructuredIndex");
307 System.out.println(" (That is, any index that has a readerClass that's a subclass of ");
308 System.out.println(" StructuredIndexPartReader). Output is in CSV format.");
309 } else if (command.equals("dump-keys")) {
310 System.out.println("galago dump-keys <indexwriter-file>");
311 System.out.println();
312 System.out.println(" Dumps all keys from any file created by IndexWriter. This includes");
313 System.out.println(" corpus files and all index files built by Galago.");
314 } else if (command.equals("eval")) {
315 org.galagosearch.core.eval.Main.main(new String[] {});
316 } else if (command.equals("make-corpus")) {
317 System.out.println("galago make-corpus <corpus> (<input>)+");
318 System.out.println();
319 System.out.println(" Copies documents from input files into a corpus file. A corpus");
320 System.out.println(" file is required to use any of the document lookup features in ");
321 System.out.println(" Galago, like printing snippets of search results.");
322 System.out.println();
323 System.out.println("<input>: Can be either a file or directory, and as many can be");
324 System.out.println(" specified as you like. Galago can read html, xml, txt, ");
325 System.out.println(" arc (Heritrix), trectext, trecweb and corpus files.");
326 System.out.println(" Files may be gzip compressed (.gz).");
327 } else if (command.equals("search")) {
328 System.out.println("galago search <index> <corpus>");
329 System.out.println();
330 System.out.println(" Starts a web interface for searching an index interactively.");
331 System.out.println(" The URL to use in your web browser will appear in the command ");
332 System.out.println(" output. Cancel the process (Control-C) to quit.");
333 } else if (command.equals("all")) {
334 String[] commands = { "batch-search", "build", "doc", "dump-connection", "dump-corpus",
335 "dump-index", "dump-keys", "eval", "make-corpus", "search" };
336 for (String c : commands) {
337 commandHelp(c);
338 System.out.println();
339 }
340 } else {
341 usage();
342 }
343 }
344
345 public static void main(String[] args) throws IOException, InterruptedException, ExecutionException, Exception {
346 if (args.length < 1) {
347 usage();
348 return;
349 }
350
351 String command = args[0];
352
353 if (command.equals("help") && args.length > 1) {
354 commandHelp(args[1]);
355 } else if (command.equals("batch-search")) {
356 handleBatchSearch(args);
357 } else if (command.equals("build")) {
358 handleBuild(args);
359 } else if (command.equals("doc")) {
360 handleDoc(args);
361 } else if (command.equals("dump-connection")) {
362 handleDumpConnection(args);
363 } else if (command.equals("dump-corpus")) {
364 handleDumpCorpus(args);
365 } else if (command.equals("dump-index")) {
366 handleDumpIndex(args);
367 } else if (command.equals("dump-keys")) {
368 handleDumpKeys(args);
369 } else if (command.equals("make-corpus")) {
370 handleMakeCorpus(args);
371 } else if (command.equals("search")) {
372 handleSearch(args);
373 } else {
374 usage();
375 }
376 }
377 }