| 1 | |
|
| 2 | |
|
| 3 | |
package org.galagosearch.core.tools; |
| 4 | |
|
| 5 | |
import java.io.File; |
| 6 | |
import java.io.IOException; |
| 7 | |
import java.util.ArrayList; |
| 8 | |
import java.util.Map.Entry; |
| 9 | |
import java.util.concurrent.ExecutionException; |
| 10 | |
import org.galagosearch.core.index.StructuredIndex; |
| 11 | |
import org.galagosearch.core.index.StructuredIndexPartReader; |
| 12 | |
import org.galagosearch.core.parse.Document; |
| 13 | |
import org.galagosearch.core.parse.DocumentIndexReader; |
| 14 | |
import org.galagosearch.core.parse.DocumentIndexWriter; |
| 15 | |
import org.galagosearch.core.parse.DocumentSource; |
| 16 | |
import org.galagosearch.core.parse.DocumentToKeyValuePair; |
| 17 | |
import org.galagosearch.core.parse.KeyValuePairToDocument; |
| 18 | |
import org.galagosearch.core.parse.UniversalParser; |
| 19 | |
import org.galagosearch.core.index.IndexReader; |
| 20 | |
import org.galagosearch.core.retrieval.Retrieval; |
| 21 | |
import org.galagosearch.core.retrieval.structured.IndexIterator; |
| 22 | |
import org.galagosearch.core.store.DocumentIndexStore; |
| 23 | |
import org.galagosearch.core.store.DocumentStore; |
| 24 | |
import org.galagosearch.core.store.NullStore; |
| 25 | |
import org.galagosearch.tupleflow.Parameters; |
| 26 | |
import org.galagosearch.tupleflow.execution.ConnectionPointType; |
| 27 | |
import org.galagosearch.tupleflow.execution.InputStep; |
| 28 | |
import org.galagosearch.tupleflow.execution.Job; |
| 29 | |
import org.galagosearch.tupleflow.execution.OutputStep; |
| 30 | |
import org.galagosearch.tupleflow.execution.Stage; |
| 31 | |
import org.galagosearch.tupleflow.execution.StageConnectionPoint; |
| 32 | |
import org.galagosearch.tupleflow.execution.Step; |
| 33 | |
import org.galagosearch.core.types.KeyValuePair; |
| 34 | |
import org.galagosearch.tupleflow.FileOrderedReader; |
| 35 | |
import org.galagosearch.tupleflow.Utility; |
| 36 | |
import org.galagosearch.tupleflow.execution.ConnectionAssignmentType; |
| 37 | |
import org.galagosearch.tupleflow.execution.ErrorStore; |
| 38 | |
import org.galagosearch.tupleflow.execution.JobExecutor; |
| 39 | |
import org.mortbay.jetty.Server; |
| 40 | |
|
| 41 | |
|
| 42 | |
|
| 43 | |
|
| 44 | |
|
| 45 | 0 | public class App { |
| 46 | |
private static void commandHelpBatchSearch() { |
| 47 | 0 | System.out.println("galago batch-search <args>"); |
| 48 | 0 | System.out.println(); |
| 49 | 0 | System.out.println(" Runs a batch of queries against an index and produces TREC-formatted"); |
| 50 | 0 | System.out.println(" output. The output can be used with retrieval evaluation tools like"); |
| 51 | 0 | System.out.println(" galago eval (org.galagosearch.core.eval)."); |
| 52 | 0 | System.out.println(); |
| 53 | 0 | System.out.println(" Sample invocation:"); |
| 54 | 0 | System.out.println(" galago batch-search --index=/tmp/myindex --count=200 /tmp/queries"); |
| 55 | 0 | System.out.println(); |
| 56 | 0 | System.out.println(" Args:"); |
| 57 | 0 | System.out.println(" --index=path_to_your_index"); |
| 58 | 0 | System.out.println(" --count : Number of results to return for each query, default=1000"); |
| 59 | 0 | System.out.println(); |
| 60 | 0 | System.out.println(" Query file format:"); |
| 61 | 0 | System.out.println(" The query file is an XML file containing a set of queries. Each query"); |
| 62 | 0 | System.out.println(" has text tag, which contains the text of the query, and a number tag, "); |
| 63 | 0 | System.out.println(" which uniquely identifies the query in the output."); |
| 64 | 0 | System.out.println(); |
| 65 | 0 | System.out.println(" Example query file:"); |
| 66 | 0 | System.out.println(" <parameters>"); |
| 67 | 0 | System.out.println(" <query>"); |
| 68 | 0 | System.out.println(" <number>CACM-408</number>"); |
| 69 | 0 | System.out.println(" <text>#combine(my query)</text>"); |
| 70 | 0 | System.out.println(" </query>"); |
| 71 | 0 | System.out.println(" <query>"); |
| 72 | 0 | System.out.println(" <number>WIKI-410</number>"); |
| 73 | 0 | System.out.println(" <text>#combine(another query)</text>"); |
| 74 | 0 | System.out.println(" </query>"); |
| 75 | 0 | System.out.println(" </parameters>"); |
| 76 | 0 | } |
| 77 | |
|
| 78 | |
private static void commandHelpBuild() { |
| 79 | 0 | System.out.println("galago build [flags] <index> (<input>)+"); |
| 80 | 0 | System.out.println(); |
| 81 | 0 | System.out.println(" Builds a Galago StructuredIndex with TupleFlow, using one thread "); |
| 82 | 0 | System.out.println(" for each CPU core on your computer. While some debugging output "); |
| 83 | 0 | System.out.println(" will be displayed on the screen, most of the status information will"); |
| 84 | 0 | System.out.println(" appear on a web page. A URL should appear in the command output "); |
| 85 | 0 | System.out.println(" that will direct you to the status page."); |
| 86 | 0 | System.out.println(); |
| 87 | |
|
| 88 | 0 | System.out.println("<input>: Can be either a file or directory, and as many can be"); |
| 89 | 0 | System.out.println(" specified as you like. Galago can read html, xml, txt, "); |
| 90 | 0 | System.out.println(" arc (Heritrix), trectext, trecweb and corpus files."); |
| 91 | 0 | System.out.println(" Files may be gzip compressed (.gz)."); |
| 92 | 0 | System.out.println("<index>: The directory path of the index to produce."); |
| 93 | 0 | System.out.println(); |
| 94 | 0 | System.out.println("Flags:"); |
| 95 | 0 | System.out.println(" --links={true|false}: Selects whether to collect anchor text "); |
| 96 | 0 | System.out.println(" [default=false]"); |
| 97 | 0 | System.out.println(" --stemming={true|false}: Selects whether to build stemmed inverted "); |
| 98 | 0 | System.out.println(" lists in addition to non-stemmed ones."); |
| 99 | 0 | System.out.println(" [default=true]"); |
| 100 | 0 | } |
| 101 | |
|
| 102 | |
private static void handleBuild(String[] args) throws Exception { |
| 103 | |
|
| 104 | 0 | ArrayList<String> documentFiles = new ArrayList<String>(); |
| 105 | 0 | ArrayList<String> flags = new ArrayList<String>(); |
| 106 | 0 | for (String arg : Utility.subarray(args, 2)) { |
| 107 | 0 | if (arg.startsWith("--")) { |
| 108 | 0 | flags.add(arg); |
| 109 | |
} else { |
| 110 | 0 | documentFiles.add(arg); |
| 111 | |
} |
| 112 | |
} |
| 113 | |
|
| 114 | 0 | Parameters p = new Parameters(flags.toArray(new String[0])); |
| 115 | 0 | boolean useLinks = p.get("links", false); |
| 116 | 0 | boolean stemming = p.get("stemming", true); |
| 117 | 0 | String[] docs = documentFiles.toArray(new String[0]); |
| 118 | |
|
| 119 | 0 | BuildIndex build = new BuildIndex(); |
| 120 | 0 | Job job = build.getIndexJob(args[1], docs, useLinks, stemming); |
| 121 | 0 | ErrorStore store = new ErrorStore(); |
| 122 | 0 | JobExecutor.runLocally(job, store); |
| 123 | 0 | if (store.hasStatements()) { |
| 124 | 0 | System.out.println(store.toString()); |
| 125 | |
} |
| 126 | 0 | } |
| 127 | |
|
| 128 | |
private static void handleDoc(String[] args) throws IOException { |
| 129 | 0 | String indexPath = args[1]; |
| 130 | 0 | String identifier = args[2]; |
| 131 | 0 | DocumentIndexReader reader = new DocumentIndexReader(indexPath); |
| 132 | 0 | Document document = reader.getDocument(identifier); |
| 133 | 0 | System.out.println(document.text); |
| 134 | 0 | } |
| 135 | |
|
| 136 | |
private static void handleDumpIndex(String[] args) throws IOException { |
| 137 | 0 | StructuredIndexPartReader reader = StructuredIndex.openIndexPart(args[1]); |
| 138 | 0 | IndexIterator iterator = reader.getIterator(); |
| 139 | |
do { |
| 140 | 0 | System.out.println(iterator.getRecordString()); |
| 141 | 0 | } while (iterator.nextRecord()); |
| 142 | 0 | } |
| 143 | |
|
| 144 | |
private static void handleDumpCorpus(String[] args) throws IOException { |
| 145 | 0 | DocumentIndexReader reader = new DocumentIndexReader(args[1]); |
| 146 | 0 | DocumentIndexReader.Iterator iterator = reader.getIterator(); |
| 147 | 0 | while (!iterator.isDone()) { |
| 148 | 0 | System.out.println("#IDENTIFIER: " + iterator.getKey()); |
| 149 | 0 | Document document = iterator.getDocument(); |
| 150 | 0 | System.out.println("#METADATA"); |
| 151 | 0 | for (Entry<String, String> entry : document.metadata.entrySet()) { |
| 152 | 0 | System.out.println(entry.getKey() + "," + entry.getValue()); |
| 153 | |
} |
| 154 | 0 | System.out.println("#TEXT"); |
| 155 | 0 | System.out.println(document.text); |
| 156 | 0 | iterator.nextDocument(); |
| 157 | 0 | } |
| 158 | 0 | } |
| 159 | |
|
| 160 | |
private static void handleDumpConnection(String[] args) throws IOException { |
| 161 | 0 | FileOrderedReader reader = new FileOrderedReader(args[1]); |
| 162 | |
Object o; |
| 163 | 0 | while ((o = reader.read()) != null) { |
| 164 | 0 | System.out.println(o); |
| 165 | |
} |
| 166 | 0 | } |
| 167 | |
|
| 168 | |
private static void handleDumpKeys(String[] args) throws IOException { |
| 169 | 0 | IndexReader reader = new IndexReader(args[1]); |
| 170 | 0 | IndexReader.Iterator iterator = reader.getIterator(); |
| 171 | 0 | while (!iterator.isDone()) { |
| 172 | 0 | System.out.println(iterator.getKey()); |
| 173 | 0 | iterator.getValueString(); |
| 174 | 0 | iterator.nextKey(); |
| 175 | |
} |
| 176 | 0 | } |
| 177 | |
|
| 178 | |
private static void handleMakeCorpus(String[] args) throws Exception { |
| 179 | 0 | Job job = getDocumentConverter(args[1], Utility.subarray(args, 2)); |
| 180 | 0 | ErrorStore store = new ErrorStore(); |
| 181 | 0 | JobExecutor.runLocally(job, store); |
| 182 | 0 | if (store.hasStatements()) { |
| 183 | 0 | System.out.println(store.toString()); |
| 184 | |
} |
| 185 | 0 | } |
| 186 | |
|
| 187 | |
private static void handleBatchSearch(String[] args) throws Exception { |
| 188 | 0 | BatchSearch.main(Utility.subarray(args, 1)); |
| 189 | 0 | } |
| 190 | |
|
| 191 | |
private static void handleSearch(String[] args) throws Exception, IOException { |
| 192 | 0 | String indexPath = args[1]; |
| 193 | |
|
| 194 | 0 | Retrieval retrieval = Retrieval.instance(indexPath); |
| 195 | 0 | DocumentStore store = null; |
| 196 | 0 | if (args.length > 2) { |
| 197 | 0 | ArrayList<DocumentIndexReader> readers = new ArrayList<DocumentIndexReader>(); |
| 198 | 0 | for (int i = 2; i < args.length; ++i) { |
| 199 | 0 | readers.add(new DocumentIndexReader(args[i])); |
| 200 | |
} |
| 201 | 0 | store = new DocumentIndexStore(readers); |
| 202 | 0 | } else { |
| 203 | 0 | store = new NullStore(); |
| 204 | |
} |
| 205 | 0 | Search search = new Search(retrieval, store); |
| 206 | 0 | int port = Utility.getFreePort(); |
| 207 | 0 | Server server = new Server(port); |
| 208 | 0 | server.addHandler(new SearchWebHandler(search)); |
| 209 | 0 | server.start(); |
| 210 | 0 | System.out.println("Server: http://localhost:" + port); |
| 211 | 0 | } |
| 212 | |
|
| 213 | |
public static void handleEval(String[] args) throws IOException { |
| 214 | 0 | org.galagosearch.core.eval.Main.main(args); |
| 215 | 0 | } |
| 216 | |
|
| 217 | |
public static Job getDocumentConverter(String outputCorpus, String[] inputs) throws IOException { |
| 218 | 4 | Job job = new Job(); |
| 219 | |
|
| 220 | 4 | Stage stage = new Stage("split"); |
| 221 | 4 | stage.add(new StageConnectionPoint(ConnectionPointType.Output, "docs", |
| 222 | |
new KeyValuePair.KeyOrder())); |
| 223 | 4 | Parameters p = new Parameters(); |
| 224 | 8 | for (String input : inputs) { |
| 225 | 4 | File inputFile = new File(input); |
| 226 | |
|
| 227 | 4 | if (inputFile.isFile()) { |
| 228 | 0 | p.add("filename", input); |
| 229 | 4 | } else if (inputFile.isDirectory()) { |
| 230 | 4 | p.add("directory", input); |
| 231 | |
} else { |
| 232 | 0 | throw new IOException("Couldn't find file/directory: " + input); |
| 233 | |
} |
| 234 | |
} |
| 235 | |
|
| 236 | 4 | stage.add(new Step(DocumentSource.class, p)); |
| 237 | 4 | p = new Parameters(); |
| 238 | 4 | p.add("identifier", "stripped"); |
| 239 | 4 | stage.add(new Step(UniversalParser.class, p)); |
| 240 | 4 | stage.add(new Step(DocumentToKeyValuePair.class)); |
| 241 | 4 | stage.add(Utility.getSorter(new KeyValuePair.KeyOrder())); |
| 242 | 4 | stage.add(new OutputStep("docs")); |
| 243 | 4 | job.add(stage); |
| 244 | |
|
| 245 | 4 | stage = new Stage("docwrite"); |
| 246 | 4 | stage.add(new StageConnectionPoint(ConnectionPointType.Input, "docs", |
| 247 | |
new KeyValuePair.KeyOrder())); |
| 248 | 4 | stage.add(new InputStep("docs")); |
| 249 | 4 | stage.add(new Step(KeyValuePairToDocument.class)); |
| 250 | 4 | p = new Parameters(); |
| 251 | 4 | p.add("filename", outputCorpus); |
| 252 | 4 | stage.add(new Step(DocumentIndexWriter.class, p)); |
| 253 | |
|
| 254 | 4 | job.add(stage); |
| 255 | 4 | job.connect("split", "docwrite", ConnectionAssignmentType.Combined); |
| 256 | 4 | return job; |
| 257 | |
} |
| 258 | |
|
| 259 | |
public static void usage() { |
| 260 | 0 | System.out.println("Type 'galago help <command>' to get more help about any command,"); |
| 261 | 0 | System.out.println(" or 'galago help all' to see all the documentation at once."); |
| 262 | 0 | System.out.println(); |
| 263 | |
|
| 264 | 0 | System.out.println("Popular commands:"); |
| 265 | 0 | System.out.println(" build"); |
| 266 | 0 | System.out.println(" search"); |
| 267 | 0 | System.out.println(" batch-search"); |
| 268 | 0 | System.out.println(); |
| 269 | |
|
| 270 | 0 | System.out.println("All commands:"); |
| 271 | 0 | System.out.println(" batch-search"); |
| 272 | 0 | System.out.println(" build"); |
| 273 | 0 | System.out.println(" doc"); |
| 274 | 0 | System.out.println(" dump-connection"); |
| 275 | 0 | System.out.println(" dump-corpus"); |
| 276 | 0 | System.out.println(" dump-index"); |
| 277 | 0 | System.out.println(" dump-keys"); |
| 278 | 0 | System.out.println(" eval"); |
| 279 | 0 | System.out.println(" make-corpus"); |
| 280 | 0 | System.out.println(" search"); |
| 281 | 0 | } |
| 282 | |
|
| 283 | |
public static void commandHelp(String command) throws IOException { |
| 284 | 0 | if (command.equals("batch-search")) { |
| 285 | 0 | commandHelpBatchSearch(); |
| 286 | 0 | } else if (command.equals("build")) { |
| 287 | 0 | commandHelpBuild(); |
| 288 | 0 | } else if (command.equals("doc")) { |
| 289 | 0 | System.out.println("galago doc <corpus> <identifier>"); |
| 290 | 0 | System.out.println(); |
| 291 | 0 | System.out.println(" Prints the full text of the document named by <identifier>."); |
| 292 | 0 | System.out.println(" The document is retrieved from a Corpus file named <corpus>."); |
| 293 | 0 | } else if (command.equals("dump-connection")) { |
| 294 | 0 | System.out.println("galago dump-connection <connection-file>"); |
| 295 | 0 | System.out.println(); |
| 296 | 0 | System.out.println(" Dumps tuples from a Galago TupleFlow connection file in "); |
| 297 | 0 | System.out.println(" CSV format. This can be useful for debugging strange problems "); |
| 298 | 0 | System.out.println(" in a TupleFlow execution."); |
| 299 | 0 | } else if (command.equals("dump-corpus")) { |
| 300 | 0 | System.out.println("galago dump-corpus <corpus>"); |
| 301 | 0 | System.out.println(); |
| 302 | 0 | System.out.println(" Dumps all documents from a corpus file to stdout."); |
| 303 | 0 | } else if (command.equals("dump-index")) { |
| 304 | 0 | System.out.println("galago dump-index <index-part>"); |
| 305 | 0 | System.out.println(); |
| 306 | 0 | System.out.println(" Dumps inverted list data from any index file in a StructuredIndex"); |
| 307 | 0 | System.out.println(" (That is, any index that has a readerClass that's a subclass of "); |
| 308 | 0 | System.out.println(" StructuredIndexPartReader). Output is in CSV format."); |
| 309 | 0 | } else if (command.equals("dump-keys")) { |
| 310 | 0 | System.out.println("galago dump-keys <indexwriter-file>"); |
| 311 | 0 | System.out.println(); |
| 312 | 0 | System.out.println(" Dumps all keys from any file created by IndexWriter. This includes"); |
| 313 | 0 | System.out.println(" corpus files and all index files built by Galago."); |
| 314 | 0 | } else if (command.equals("eval")) { |
| 315 | 0 | org.galagosearch.core.eval.Main.main(new String[] {}); |
| 316 | 0 | } else if (command.equals("make-corpus")) { |
| 317 | 0 | System.out.println("galago make-corpus <corpus> (<input>)+"); |
| 318 | 0 | System.out.println(); |
| 319 | 0 | System.out.println(" Copies documents from input files into a corpus file. A corpus"); |
| 320 | 0 | System.out.println(" file is required to use any of the document lookup features in "); |
| 321 | 0 | System.out.println(" Galago, like printing snippets of search results."); |
| 322 | 0 | System.out.println(); |
| 323 | 0 | System.out.println("<input>: Can be either a file or directory, and as many can be"); |
| 324 | 0 | System.out.println(" specified as you like. Galago can read html, xml, txt, "); |
| 325 | 0 | System.out.println(" arc (Heritrix), trectext, trecweb and corpus files."); |
| 326 | 0 | System.out.println(" Files may be gzip compressed (.gz)."); |
| 327 | 0 | } else if (command.equals("search")) { |
| 328 | 0 | System.out.println("galago search <index> <corpus>"); |
| 329 | 0 | System.out.println(); |
| 330 | 0 | System.out.println(" Starts a web interface for searching an index interactively."); |
| 331 | 0 | System.out.println(" The URL to use in your web browser will appear in the command "); |
| 332 | 0 | System.out.println(" output. Cancel the process (Control-C) to quit."); |
| 333 | 0 | } else if (command.equals("all")) { |
| 334 | 0 | String[] commands = { "batch-search", "build", "doc", "dump-connection", "dump-corpus", |
| 335 | |
"dump-index", "dump-keys", "eval", "make-corpus", "search" }; |
| 336 | 0 | for (String c : commands) { |
| 337 | 0 | commandHelp(c); |
| 338 | 0 | System.out.println(); |
| 339 | |
} |
| 340 | 0 | } else { |
| 341 | 0 | usage(); |
| 342 | |
} |
| 343 | 0 | } |
| 344 | |
|
| 345 | |
public static void main(String[] args) throws IOException, InterruptedException, ExecutionException, Exception { |
| 346 | 0 | if (args.length < 1) { |
| 347 | 0 | usage(); |
| 348 | 0 | return; |
| 349 | |
} |
| 350 | |
|
| 351 | 0 | String command = args[0]; |
| 352 | |
|
| 353 | 0 | if (command.equals("help") && args.length > 1) { |
| 354 | 0 | commandHelp(args[1]); |
| 355 | 0 | } else if (command.equals("batch-search")) { |
| 356 | 0 | handleBatchSearch(args); |
| 357 | 0 | } else if (command.equals("build")) { |
| 358 | 0 | handleBuild(args); |
| 359 | 0 | } else if (command.equals("doc")) { |
| 360 | 0 | handleDoc(args); |
| 361 | 0 | } else if (command.equals("dump-connection")) { |
| 362 | 0 | handleDumpConnection(args); |
| 363 | 0 | } else if (command.equals("dump-corpus")) { |
| 364 | 0 | handleDumpCorpus(args); |
| 365 | 0 | } else if (command.equals("dump-index")) { |
| 366 | 0 | handleDumpIndex(args); |
| 367 | 0 | } else if (command.equals("dump-keys")) { |
| 368 | 0 | handleDumpKeys(args); |
| 369 | 0 | } else if (command.equals("make-corpus")) { |
| 370 | 0 | handleMakeCorpus(args); |
| 371 | 0 | } else if (command.equals("search")) { |
| 372 | 0 | handleSearch(args); |
| 373 | |
} else { |
| 374 | 0 | usage(); |
| 375 | |
} |
| 376 | 0 | } |
| 377 | |
} |