Coverage Report - org.galagosearch.core.eval.Main
 
Classes in this File Line Coverage Branch Coverage Complexity
Main
0%
0/191
0%
0/58
0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 
 3  
 package org.galagosearch.core.eval;
 4  
 
 5  
 import java.io.BufferedReader;
 6  
 import java.io.FileNotFoundException;
 7  
 import java.io.FileReader;
 8  
 import java.io.IOException;
 9  
 import java.util.ArrayList;
 10  
 import java.util.Map;
 11  
 import java.util.TreeMap;
 12  
 import org.galagosearch.core.eval.RetrievalEvaluator.Document;
 13  
 import org.galagosearch.core.eval.RetrievalEvaluator.Judgment;
 14  
 
 15  
 /**
 16  
  *
 17  
  * @author trevor
 18  
  */
 19  0
 public class Main {
 20  
     /**
 21  
      * Loads a TREC judgments file.
 22  
      *
 23  
      * @param filename The filename of the judgments file to load.
 24  
      * @return Maps from query numbers to lists of judgments for each query.
 25  
      */
 26  
     public static TreeMap<String, ArrayList<Judgment>> loadJudgments(String filename) throws IOException, FileNotFoundException {
 27  
         // open file
 28  0
         BufferedReader in = new BufferedReader(new FileReader(filename));
 29  0
         String line = null;
 30  0
         TreeMap<String, ArrayList<Judgment>> judgments = new TreeMap<String, ArrayList<Judgment>>();
 31  0
         String recentQuery = null;
 32  0
         ArrayList<Judgment> recentJudgments = null;
 33  
 
 34  0
         while ((line = in.readLine()) != null) {
 35  0
             int[] columns = splits(line, 4);
 36  
 
 37  0
             String number = line.substring(columns[0], columns[1]);
 38  0
             String unused = line.substring(columns[2], columns[3]);
 39  0
             String docno = line.substring(columns[4], columns[5]);
 40  0
             String judgment = line.substring(columns[6]);
 41  
 
 42  0
             Judgment j = new Judgment(docno, Integer.valueOf(judgment));
 43  
 
 44  0
             if (recentQuery == null || !recentQuery.equals(number)) {
 45  0
                 if (!judgments.containsKey(number)) {
 46  0
                     judgments.put(number, new ArrayList<Judgment>());
 47  
                 }
 48  
 
 49  0
                 recentJudgments = judgments.get(number);
 50  0
                 recentQuery = number;
 51  
             }
 52  
 
 53  0
             recentJudgments.add(j);
 54  0
         }
 55  
 
 56  0
         in.close();
 57  0
         return judgments;
 58  
     }
 59  
 
 60  
     private static int[] splits(String s, int columns) {
 61  0
         int[] result = new int[2 * columns];
 62  0
         boolean lastWs = true;
 63  0
         int column = 0;
 64  0
         result[0] = 0;
 65  
 
 66  0
         for (int i = 0; i < s.length() && column < columns; i++) {
 67  0
             char c = s.charAt(i);
 68  0
             boolean isWs = (c == ' ') || (c == '\t');
 69  
 
 70  0
             if (!isWs && lastWs) {
 71  0
                 result[2 * column] = i;
 72  0
             } else if (isWs && !lastWs) {
 73  0
                 result[2 * column + 1] = i;
 74  0
                 column++;
 75  
             }
 76  
 
 77  0
             lastWs = isWs;
 78  
         }
 79  
 
 80  0
         return result;
 81  
     }
 82  
 
 83  
     /**
 84  
      * Reads in a TREC ranking file.
 85  
      *
 86  
      * @param filename The filename of the ranking file.
 87  
      * @return A map from query numbers to document ranking lists.
 88  
      */
 89  
     public static TreeMap<String, ArrayList<Document>> loadRanking(String filename) throws IOException, FileNotFoundException {
 90  
         // open file
 91  0
         BufferedReader in = new BufferedReader(new FileReader(filename), 256 * 1024);
 92  0
         String line = null;
 93  0
         TreeMap<String, ArrayList<Document>> ranking = new TreeMap<String, ArrayList<Document>>();
 94  0
         ArrayList<Document> recentRanking = null;
 95  0
         String recentQuery = null;
 96  
 
 97  0
         while ((line = in.readLine()) != null) {
 98  0
             int[] splits = splits(line, 6);
 99  
 
 100  
             // 1 Q0 WSJ880711-0086 39 -3.05948 Exp
 101  
 
 102  0
             String number = line.substring(splits[0], splits[1]);
 103  0
             String unused = line.substring(splits[2], splits[3]);
 104  0
             String docno = line.substring(splits[4], splits[5]);
 105  0
             String rank = line.substring(splits[6], splits[7]);
 106  0
             String score = line.substring(splits[8], splits[9]);
 107  0
             String runtag = line.substring(splits[10]);
 108  
 
 109  0
             Document document = new Document(docno, Integer.valueOf(rank), Double.valueOf(score));
 110  
 
 111  0
             if (recentQuery == null || !recentQuery.equals(number)) {
 112  0
                 if (!ranking.containsKey(number)) {
 113  0
                     ranking.put(number, new ArrayList<Document>());
 114  
                 }
 115  
 
 116  0
                 recentQuery = number;
 117  0
                 recentRanking = ranking.get(number);
 118  
             }
 119  
 
 120  0
             recentRanking.add(document);
 121  0
         }
 122  
 
 123  0
         in.close();
 124  0
         return ranking;
 125  
     }
 126  
 
 127  
     /**
 128  
      * Creates a SetRetrievalEvaluator from data from loadRanking and loadJudgments.
 129  
      */
 130  
     public static SetRetrievalEvaluator create(TreeMap<String, ArrayList<Document>> allRankings, TreeMap<String, ArrayList<Judgment>> allJudgments) {
 131  0
         TreeMap<String, RetrievalEvaluator> evaluators = new TreeMap<String, RetrievalEvaluator>();
 132  
 
 133  0
         for (String query : allRankings.keySet()) {
 134  0
             ArrayList<Judgment> judgments = allJudgments.get(query);
 135  0
             ArrayList<Document> ranking = allRankings.get(query);
 136  
 
 137  0
             if (judgments == null || ranking == null) {
 138  0
                 continue;
 139  
             }
 140  
 
 141  0
             RetrievalEvaluator evaluator = new RetrievalEvaluator(query, ranking, judgments);
 142  0
             evaluators.put(query, evaluator);
 143  0
         }
 144  
 
 145  0
         return new SetRetrievalEvaluator(evaluators.values());
 146  
     }
 147  
 
 148  
     /**
 149  
      * When run as a standalone application, this returns output 
 150  
      * very similar to that of trec_eval.  The first argument is 
 151  
      * the ranking file, and the second argument is the judgments
 152  
      * file, both in standard TREC format.
 153  
      */
 154  
     public static void singleEvaluation(SetRetrievalEvaluator setEvaluator) {
 155  0
         String formatString = "%2$-16s%1$3s ";
 156  
 
 157  
         // print trec_eval relational-style output
 158  0
         for (RetrievalEvaluator evaluator : setEvaluator.getEvaluators()) {
 159  0
             String query = evaluator.queryName();
 160  
 
 161  
             // counts
 162  0
             System.out.format(formatString + "%3$d\n", query, "num_ret", evaluator.
 163  
                               retrievedDocuments().size());
 164  0
             System.out.format(formatString + "%3$d\n", query, "num_rel", evaluator.relevantDocuments().
 165  
                               size());
 166  0
             System.out.format(formatString + "%3$d\n", query, "num_rel_ret", evaluator.
 167  
                               relevantRetrievedDocuments().size());
 168  
 
 169  
             // aggregate measures
 170  0
             System.out.format(formatString + "%3$6.4f\n", query, "map", evaluator.averagePrecision());
 171  0
             System.out.format(formatString + "%3$6.4f\n", query, "ndcg", evaluator.
 172  
                               normalizedDiscountedCumulativeGain());
 173  0
             System.out.format(formatString + "%3$6.4f\n", query, "ndcg15", evaluator.
 174  
                               normalizedDiscountedCumulativeGain(15));
 175  0
             System.out.format(formatString + "%3$6.4f\n", query, "R-prec", evaluator.rPrecision());
 176  0
             System.out.format(formatString + "%3$6.4f\n", query, "bpref",
 177  
                               evaluator.binaryPreference());
 178  0
             System.out.format(formatString + "%3$6.4f\n", query, "recip_rank", evaluator.
 179  
                               reciprocalRank());
 180  
 
 181  
             // precision at fixed points
 182  0
             int[] fixedPoints = {5, 10, 15, 20, 30, 100, 200, 500, 1000};
 183  
 
 184  0
             for (int i = 0; i < fixedPoints.length; i++) {
 185  0
                 int point = fixedPoints[i];
 186  0
                 System.out.format(formatString + "%3$6.4f\n", query, "P" + point, evaluator.
 187  
                                   precision(fixedPoints[i]));
 188  
             }
 189  0
         }
 190  
 
 191  
         // print summary data
 192  0
         System.out.format(formatString + "%3$d\n", "all", "num_ret", setEvaluator.numberRetrieved());
 193  0
         System.out.format(formatString + "%3$d\n", "all", "num_rel", setEvaluator.numberRelevant());
 194  0
         System.out.format(formatString + "%3$d\n", "all", "num_rel_ret", setEvaluator.
 195  
                           numberRelevantRetrieved());
 196  
 
 197  0
         System.out.format(formatString + "%3$6.4f\n", "all", "map", setEvaluator.
 198  
                           meanAveragePrecision());
 199  0
         System.out.format(formatString + "%3$6.4f\n", "all", "ndcg", setEvaluator.
 200  
                           meanNormalizedDiscountedCumulativeGain());
 201  0
         System.out.format(formatString + "%3$6.4f\n", "all", "ndcg15", setEvaluator.
 202  
                           meanNormalizedDiscountedCumulativeGain(15));
 203  0
         System.out.format(formatString + "%3$6.4f\n", "all", "R-prec", setEvaluator.meanRPrecision());
 204  0
         System.out.format(formatString + "%3$6.4f\n", "all", "bpref", setEvaluator.
 205  
                           meanBinaryPreference());
 206  0
         System.out.format(formatString + "%3$6.4f\n", "all", "recip_rank", setEvaluator.
 207  
                           meanReciprocalRank());
 208  
 
 209  
         // precision at fixed points
 210  0
         int[] fixedPoints = {5, 10, 15, 20, 30, 100, 200, 500, 1000};
 211  
 
 212  0
         for (int i = 0; i < fixedPoints.length; i++) {
 213  0
             int point = fixedPoints[i];
 214  0
             System.out.format(formatString + "%3$6.4f\n", "all", "P" + point, setEvaluator.
 215  
                               meanPrecision(fixedPoints[i]));
 216  
         }
 217  0
     }
 218  
 
 219  
     /**
 220  
      * Compares two ranked lists with statistical tests on most major metrics.
 221  
      */
 222  
     public static void comparisonEvaluation(SetRetrievalEvaluator baseline, SetRetrievalEvaluator treatment, boolean useRandomized) {
 223  0
         String[] metrics = {"averagePrecision", "ndcg", "ndcg15", "bpref", "P10", "P20"};
 224  0
         String formatString = "%1$-20s%2$-20s%3$6.4f\n";
 225  0
         String integerFormatString = "%1$-20s%2$-20s%3$d\n";
 226  
 
 227  0
         for (String metric : metrics) {
 228  0
             Map<String, Double> baselineMetric = baseline.evaluateAll(metric);
 229  0
             Map<String, Double> treatmentMetric = treatment.evaluateAll(metric);
 230  
 
 231  0
             SetRetrievalComparator comparator = new SetRetrievalComparator(baselineMetric,
 232  
                                                                            treatmentMetric);
 233  
 
 234  0
             System.out.format(formatString, metric, "baseline", comparator.meanBaselineMetric());
 235  0
             System.out.format(formatString, metric, "treatment", comparator.meanTreatmentMetric());
 236  
 
 237  0
             System.out.format(integerFormatString, metric, "basebetter", comparator.
 238  
                               countBaselineBetter());
 239  0
             System.out.format(integerFormatString, metric, "treatbetter", comparator.
 240  
                               countTreatmentBetter());
 241  0
             System.out.format(integerFormatString, metric, "equal", comparator.countEqual());
 242  
 
 243  0
             System.out.format(formatString, metric, "ttest", comparator.pairedTTest());
 244  0
             System.out.format(formatString, metric, "signtest", comparator.signTest());
 245  0
             if (useRandomized) {
 246  0
                 System.out.format(formatString, metric, "randomized", comparator.
 247  
                                                  randomizedTest());
 248  
             }
 249  0
             System.out.format(formatString, metric, "h-ttest-0.05", comparator.supportedHypothesis(
 250  
                               "ttest", 0.05));
 251  0
             System.out.format(formatString, metric, "h-signtest-0.05", comparator.
 252  
                               supportedHypothesis("sign", 0.05));
 253  0
             if (useRandomized) {
 254  0
                 System.out.format(formatString, metric, "h-randomized-0.05",
 255  
                                                  comparator.supportedHypothesis("randomized", 0.05));
 256  
             }
 257  0
             System.out.format(formatString, metric, "h-ttest-0.01", comparator.supportedHypothesis(
 258  
                               "ttest", 0.01));
 259  0
             System.out.format(formatString, metric, "h-signtest-0.01", comparator.
 260  
                               supportedHypothesis("sign", 0.01));
 261  0
             if (useRandomized) {
 262  0
                 System.out.format(formatString, metric, "h-randomized-0.01",
 263  
                                                  comparator.supportedHypothesis("randomized", 0.01));
 264  
             }
 265  
         }
 266  0
     }
 267  
 
 268  
     public static void usage() {
 269  0
         System.err.println("galago eval <args>: ");
 270  0
         System.err.println(
 271  
                 "   There are two ways to use this program.  First, you can evaluate a single ranking: ");
 272  0
         System.err.println("      java -jar ireval.jar TREC-Ranking-File TREC-Judgments-File");
 273  0
         System.err.println("   or, you can use it to compare two rankings with statistical tests: ");
 274  0
         System.err.println(
 275  
                 "      java -jar ireval.jar TREC-Baseline-Ranking-File TREC-Improved-Ranking-File TREC-Judgments-File");
 276  0
         System.err.println("   you can also include randomized tests (these take a bit longer): ");
 277  0
         System.err.println(
 278  
                 "      java -jar ireval.jar TREC-Baseline-Ranking-File TREC-Treatment-Ranking-File TREC-Judgments-File randomized");
 279  0
         System.err.println();
 280  0
         System.err.println("Single evaluation:");
 281  0
         System.err.println(
 282  
                 "   The first column is the query number, or 'all' for a mean of the metric over all queries.");
 283  0
         System.err.println(
 284  
                 "   The second column is the metric, which is one of:                                        ");
 285  0
         System.err.println(
 286  
                 "       num_ret        Number of retrieved documents                                         ");
 287  0
         System.err.println(
 288  
                 "       num_rel        Number of relevant documents listed in the judgments file             ");
 289  0
         System.err.println(
 290  
                 "       num_rel_ret    Number of relevant retrieved documents                                ");
 291  0
         System.err.println(
 292  
                 "       map            Mean average precision                                                ");
 293  0
         System.err.println(
 294  
                 "       bpref          Bpref (binary preference)                                             ");
 295  0
         System.err.println(
 296  
                 "       ndcg           Normalized Discounted Cumulative Gain, computed over all documents    ");
 297  0
         System.err.println(
 298  
                 "       ndcg15         Normalized Discounted Cumulative Gain, 15 document cutoff             ");
 299  0
         System.err.println(
 300  
                 "       Pn             Precision, n document cutoff                                          ");
 301  0
         System.err.println(
 302  
                 "       R-prec         R-Precision                                                           ");
 303  0
         System.err.println(
 304  
                 "       recip_rank     Reciprocal Rank (precision at first relevant document)                ");
 305  0
         System.err.println(
 306  
                 "   The third column is the metric value.                                                    ");
 307  0
         System.err.println();
 308  0
         System.err.println("Compared evaluation: ");
 309  0
         System.err.println("   The first column is the metric (e.g. averagePrecision, ndcg, etc.)");
 310  0
         System.err.println(
 311  
                 "   The second column is the test/formula used:                                               ");
 312  0
         System.err.println(
 313  
                 "       baseline       The baseline mean (mean of the metric over all baseline queries)       ");
 314  0
         System.err.println(
 315  
                 "       treatment      The \'improved\' mean (mean of the metric over all treatment queries)  ");
 316  0
         System.err.println(
 317  
                 "       basebetter     Number of queries where the baseline outperforms the treatment.        ");
 318  0
         System.err.println(
 319  
                 "       treatbetter    Number of queries where the treatment outperforms the baseline.        ");
 320  0
         System.err.println(
 321  
                 "       equal          Number of queries where the treatment and baseline perform identically.");
 322  0
         System.err.println("       ttest          P-value of a paired t-test.");
 323  0
         System.err.println(
 324  
                 "       signtest       P-value of the Fisher sign test.                                       ");
 325  0
         System.err.println(
 326  
                 "       randomized      P-value of a randomized test.                                          ");
 327  
 
 328  0
         System.err.println(
 329  
                 "   The second column also includes difference tests.  In these tests, the null hypothesis is ");
 330  0
         System.err.println(
 331  
                 "     that the mean of the treatment is at least k times the mean of the baseline.  We run the");
 332  0
         System.err.println(
 333  
                 "     same tests as before, but we artificially improve the baseline values by a factor of k. ");
 334  
 
 335  0
         System.err.println(
 336  
                 "       h-ttest-0.5    Largest value of k such that the ttest has a p-value of less than 0.5. ");
 337  0
         System.err.println(
 338  
                 "       h-signtest-0.5 Largest value of k such that the sign test has a p-value of less than 0.5. ");
 339  0
         System.err.println(
 340  
                 "       h-randomized-0.5 Largest value of k such that the randomized test has a p-value of less than 0.5. ");
 341  
 
 342  0
         System.err.println(
 343  
                 "       h-ttest-0.1    Largest value of k such that the ttest has a p-value of less than 0.1. ");
 344  0
         System.err.println(
 345  
                 "       h-signtest-0.1 Largest value of k such that the sign test has a p-value of less than 0.1. ");
 346  0
         System.err.println(
 347  
                 "       h-randomized-0.1 Largest value of k such that the randomized test has a p-value of less than 0.1. ");
 348  0
         System.err.println();
 349  
 
 350  0
         System.err.println("  The third column is the value of the test.");
 351  
 
 352  0
         System.exit(-1);
 353  0
     }
 354  
 
 355  
     public static void main(String[] args) throws IOException {
 356  
         try {
 357  0
             if (args.length >= 3) {
 358  0
                 TreeMap<String, ArrayList<Document>> baselineRanking = loadRanking(args[0]);
 359  0
                 TreeMap<String, ArrayList<Document>> treatmentRanking = loadRanking(args[1]);
 360  0
                 TreeMap<String, ArrayList<Judgment>> judgments = loadJudgments(args[2]);
 361  
 
 362  0
                 SetRetrievalEvaluator baseline = create(baselineRanking, judgments);
 363  0
                 SetRetrievalEvaluator treatment = create(treatmentRanking, judgments);
 364  
 
 365  0
                 comparisonEvaluation(baseline, treatment, args.length >= 4);
 366  0
             } else if (args.length == 2) {
 367  0
                 TreeMap<String, ArrayList<Document>> ranking = loadRanking(args[0]);
 368  0
                 TreeMap<String, ArrayList<Judgment>> judgments = loadJudgments(args[1]);
 369  
 
 370  0
                 SetRetrievalEvaluator setEvaluator = create(ranking, judgments);
 371  0
                 singleEvaluation(setEvaluator);
 372  0
             } else {
 373  0
                 usage();
 374  
             }
 375  0
         } catch (Exception e) {
 376  0
             e.printStackTrace();
 377  0
             usage();
 378  0
         }
 379  0
     }
 380  
 }