View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.eval;
4   
5   import java.io.BufferedReader;
6   import java.io.FileNotFoundException;
7   import java.io.FileReader;
8   import java.io.IOException;
9   import java.util.ArrayList;
10  import java.util.Map;
11  import java.util.TreeMap;
12  import org.galagosearch.core.eval.RetrievalEvaluator.Document;
13  import org.galagosearch.core.eval.RetrievalEvaluator.Judgment;
14  
15  /***
16   *
17   * @author trevor
18   */
19  public class Main {
20      /***
21       * Loads a TREC judgments file.
22       *
23       * @param filename The filename of the judgments file to load.
24       * @return Maps from query numbers to lists of judgments for each query.
25       */
26      public static TreeMap<String, ArrayList<Judgment>> loadJudgments(String filename) throws IOException, FileNotFoundException {
27          // open file
28          BufferedReader in = new BufferedReader(new FileReader(filename));
29          String line = null;
30          TreeMap<String, ArrayList<Judgment>> judgments = new TreeMap<String, ArrayList<Judgment>>();
31          String recentQuery = null;
32          ArrayList<Judgment> recentJudgments = null;
33  
34          while ((line = in.readLine()) != null) {
35              int[] columns = splits(line, 4);
36  
37              String number = line.substring(columns[0], columns[1]);
38              String unused = line.substring(columns[2], columns[3]);
39              String docno = line.substring(columns[4], columns[5]);
40              String judgment = line.substring(columns[6]);
41  
42              Judgment j = new Judgment(docno, Integer.valueOf(judgment));
43  
44              if (recentQuery == null || !recentQuery.equals(number)) {
45                  if (!judgments.containsKey(number)) {
46                      judgments.put(number, new ArrayList<Judgment>());
47                  }
48  
49                  recentJudgments = judgments.get(number);
50                  recentQuery = number;
51              }
52  
53              recentJudgments.add(j);
54          }
55  
56          in.close();
57          return judgments;
58      }
59  
60      private static int[] splits(String s, int columns) {
61          int[] result = new int[2 * columns];
62          boolean lastWs = true;
63          int column = 0;
64          result[0] = 0;
65  
66          for (int i = 0; i < s.length() && column < columns; i++) {
67              char c = s.charAt(i);
68              boolean isWs = (c == ' ') || (c == '\t');
69  
70              if (!isWs && lastWs) {
71                  result[2 * column] = i;
72              } else if (isWs && !lastWs) {
73                  result[2 * column + 1] = i;
74                  column++;
75              }
76  
77              lastWs = isWs;
78          }
79  
80          return result;
81      }
82  
83      /***
84       * Reads in a TREC ranking file.
85       *
86       * @param filename The filename of the ranking file.
87       * @return A map from query numbers to document ranking lists.
88       */
89      public static TreeMap<String, ArrayList<Document>> loadRanking(String filename) throws IOException, FileNotFoundException {
90          // open file
91          BufferedReader in = new BufferedReader(new FileReader(filename), 256 * 1024);
92          String line = null;
93          TreeMap<String, ArrayList<Document>> ranking = new TreeMap<String, ArrayList<Document>>();
94          ArrayList<Document> recentRanking = null;
95          String recentQuery = null;
96  
97          while ((line = in.readLine()) != null) {
98              int[] splits = splits(line, 6);
99  
100             // 1 Q0 WSJ880711-0086 39 -3.05948 Exp
101 
102             String number = line.substring(splits[0], splits[1]);
103             String unused = line.substring(splits[2], splits[3]);
104             String docno = line.substring(splits[4], splits[5]);
105             String rank = line.substring(splits[6], splits[7]);
106             String score = line.substring(splits[8], splits[9]);
107             String runtag = line.substring(splits[10]);
108 
109             Document document = new Document(docno, Integer.valueOf(rank), Double.valueOf(score));
110 
111             if (recentQuery == null || !recentQuery.equals(number)) {
112                 if (!ranking.containsKey(number)) {
113                     ranking.put(number, new ArrayList<Document>());
114                 }
115 
116                 recentQuery = number;
117                 recentRanking = ranking.get(number);
118             }
119 
120             recentRanking.add(document);
121         }
122 
123         in.close();
124         return ranking;
125     }
126 
127     /***
128      * Creates a SetRetrievalEvaluator from data from loadRanking and loadJudgments.
129      */
130     public static SetRetrievalEvaluator create(TreeMap<String, ArrayList<Document>> allRankings, TreeMap<String, ArrayList<Judgment>> allJudgments) {
131         TreeMap<String, RetrievalEvaluator> evaluators = new TreeMap<String, RetrievalEvaluator>();
132 
133         for (String query : allRankings.keySet()) {
134             ArrayList<Judgment> judgments = allJudgments.get(query);
135             ArrayList<Document> ranking = allRankings.get(query);
136 
137             if (judgments == null || ranking == null) {
138                 continue;
139             }
140 
141             RetrievalEvaluator evaluator = new RetrievalEvaluator(query, ranking, judgments);
142             evaluators.put(query, evaluator);
143         }
144 
145         return new SetRetrievalEvaluator(evaluators.values());
146     }
147 
148     /***
149      * When run as a standalone application, this returns output 
150      * very similar to that of trec_eval.  The first argument is 
151      * the ranking file, and the second argument is the judgments
152      * file, both in standard TREC format.
153      */
154     public static void singleEvaluation(SetRetrievalEvaluator setEvaluator) {
155         String formatString = "%2$-16s%1$3s ";
156 
157         // print trec_eval relational-style output
158         for (RetrievalEvaluator evaluator : setEvaluator.getEvaluators()) {
159             String query = evaluator.queryName();
160 
161             // counts
162             System.out.format(formatString + "%3$d\n", query, "num_ret", evaluator.
163                               retrievedDocuments().size());
164             System.out.format(formatString + "%3$d\n", query, "num_rel", evaluator.relevantDocuments().
165                               size());
166             System.out.format(formatString + "%3$d\n", query, "num_rel_ret", evaluator.
167                               relevantRetrievedDocuments().size());
168 
169             // aggregate measures
170             System.out.format(formatString + "%3$6.4f\n", query, "map", evaluator.averagePrecision());
171             System.out.format(formatString + "%3$6.4f\n", query, "ndcg", evaluator.
172                               normalizedDiscountedCumulativeGain());
173             System.out.format(formatString + "%3$6.4f\n", query, "ndcg15", evaluator.
174                               normalizedDiscountedCumulativeGain(15));
175             System.out.format(formatString + "%3$6.4f\n", query, "R-prec", evaluator.rPrecision());
176             System.out.format(formatString + "%3$6.4f\n", query, "bpref",
177                               evaluator.binaryPreference());
178             System.out.format(formatString + "%3$6.4f\n", query, "recip_rank", evaluator.
179                               reciprocalRank());
180 
181             // precision at fixed points
182             int[] fixedPoints = {5, 10, 15, 20, 30, 100, 200, 500, 1000};
183 
184             for (int i = 0; i < fixedPoints.length; i++) {
185                 int point = fixedPoints[i];
186                 System.out.format(formatString + "%3$6.4f\n", query, "P" + point, evaluator.
187                                   precision(fixedPoints[i]));
188             }
189         }
190 
191         // print summary data
192         System.out.format(formatString + "%3$d\n", "all", "num_ret", setEvaluator.numberRetrieved());
193         System.out.format(formatString + "%3$d\n", "all", "num_rel", setEvaluator.numberRelevant());
194         System.out.format(formatString + "%3$d\n", "all", "num_rel_ret", setEvaluator.
195                           numberRelevantRetrieved());
196 
197         System.out.format(formatString + "%3$6.4f\n", "all", "map", setEvaluator.
198                           meanAveragePrecision());
199         System.out.format(formatString + "%3$6.4f\n", "all", "ndcg", setEvaluator.
200                           meanNormalizedDiscountedCumulativeGain());
201         System.out.format(formatString + "%3$6.4f\n", "all", "ndcg15", setEvaluator.
202                           meanNormalizedDiscountedCumulativeGain(15));
203         System.out.format(formatString + "%3$6.4f\n", "all", "R-prec", setEvaluator.meanRPrecision());
204         System.out.format(formatString + "%3$6.4f\n", "all", "bpref", setEvaluator.
205                           meanBinaryPreference());
206         System.out.format(formatString + "%3$6.4f\n", "all", "recip_rank", setEvaluator.
207                           meanReciprocalRank());
208 
209         // precision at fixed points
210         int[] fixedPoints = {5, 10, 15, 20, 30, 100, 200, 500, 1000};
211 
212         for (int i = 0; i < fixedPoints.length; i++) {
213             int point = fixedPoints[i];
214             System.out.format(formatString + "%3$6.4f\n", "all", "P" + point, setEvaluator.
215                               meanPrecision(fixedPoints[i]));
216         }
217     }
218 
219     /***
220      * Compares two ranked lists with statistical tests on most major metrics.
221      */
222     public static void comparisonEvaluation(SetRetrievalEvaluator baseline, SetRetrievalEvaluator treatment, boolean useRandomized) {
223         String[] metrics = {"averagePrecision", "ndcg", "ndcg15", "bpref", "P10", "P20"};
224         String formatString = "%1$-20s%2$-20s%3$6.4f\n";
225         String integerFormatString = "%1$-20s%2$-20s%3$d\n";
226 
227         for (String metric : metrics) {
228             Map<String, Double> baselineMetric = baseline.evaluateAll(metric);
229             Map<String, Double> treatmentMetric = treatment.evaluateAll(metric);
230 
231             SetRetrievalComparator comparator = new SetRetrievalComparator(baselineMetric,
232                                                                            treatmentMetric);
233 
234             System.out.format(formatString, metric, "baseline", comparator.meanBaselineMetric());
235             System.out.format(formatString, metric, "treatment", comparator.meanTreatmentMetric());
236 
237             System.out.format(integerFormatString, metric, "basebetter", comparator.
238                               countBaselineBetter());
239             System.out.format(integerFormatString, metric, "treatbetter", comparator.
240                               countTreatmentBetter());
241             System.out.format(integerFormatString, metric, "equal", comparator.countEqual());
242 
243             System.out.format(formatString, metric, "ttest", comparator.pairedTTest());
244             System.out.format(formatString, metric, "signtest", comparator.signTest());
245             if (useRandomized) {
246                 System.out.format(formatString, metric, "randomized", comparator.
247                                                  randomizedTest());
248             }
249             System.out.format(formatString, metric, "h-ttest-0.05", comparator.supportedHypothesis(
250                               "ttest", 0.05));
251             System.out.format(formatString, metric, "h-signtest-0.05", comparator.
252                               supportedHypothesis("sign", 0.05));
253             if (useRandomized) {
254                 System.out.format(formatString, metric, "h-randomized-0.05",
255                                                  comparator.supportedHypothesis("randomized", 0.05));
256             }
257             System.out.format(formatString, metric, "h-ttest-0.01", comparator.supportedHypothesis(
258                               "ttest", 0.01));
259             System.out.format(formatString, metric, "h-signtest-0.01", comparator.
260                               supportedHypothesis("sign", 0.01));
261             if (useRandomized) {
262                 System.out.format(formatString, metric, "h-randomized-0.01",
263                                                  comparator.supportedHypothesis("randomized", 0.01));
264             }
265         }
266     }
267 
268     public static void usage() {
269         System.err.println("galago eval <args>: ");
270         System.err.println(
271                 "   There are two ways to use this program.  First, you can evaluate a single ranking: ");
272         System.err.println("      java -jar ireval.jar TREC-Ranking-File TREC-Judgments-File");
273         System.err.println("   or, you can use it to compare two rankings with statistical tests: ");
274         System.err.println(
275                 "      java -jar ireval.jar TREC-Baseline-Ranking-File TREC-Improved-Ranking-File TREC-Judgments-File");
276         System.err.println("   you can also include randomized tests (these take a bit longer): ");
277         System.err.println(
278                 "      java -jar ireval.jar TREC-Baseline-Ranking-File TREC-Treatment-Ranking-File TREC-Judgments-File randomized");
279         System.err.println();
280         System.err.println("Single evaluation:");
281         System.err.println(
282                 "   The first column is the query number, or 'all' for a mean of the metric over all queries.");
283         System.err.println(
284                 "   The second column is the metric, which is one of:                                        ");
285         System.err.println(
286                 "       num_ret        Number of retrieved documents                                         ");
287         System.err.println(
288                 "       num_rel        Number of relevant documents listed in the judgments file             ");
289         System.err.println(
290                 "       num_rel_ret    Number of relevant retrieved documents                                ");
291         System.err.println(
292                 "       map            Mean average precision                                                ");
293         System.err.println(
294                 "       bpref          Bpref (binary preference)                                             ");
295         System.err.println(
296                 "       ndcg           Normalized Discounted Cumulative Gain, computed over all documents    ");
297         System.err.println(
298                 "       ndcg15         Normalized Discounted Cumulative Gain, 15 document cutoff             ");
299         System.err.println(
300                 "       Pn             Precision, n document cutoff                                          ");
301         System.err.println(
302                 "       R-prec         R-Precision                                                           ");
303         System.err.println(
304                 "       recip_rank     Reciprocal Rank (precision at first relevant document)                ");
305         System.err.println(
306                 "   The third column is the metric value.                                                    ");
307         System.err.println();
308         System.err.println("Compared evaluation: ");
309         System.err.println("   The first column is the metric (e.g. averagePrecision, ndcg, etc.)");
310         System.err.println(
311                 "   The second column is the test/formula used:                                               ");
312         System.err.println(
313                 "       baseline       The baseline mean (mean of the metric over all baseline queries)       ");
314         System.err.println(
315                 "       treatment      The \'improved\' mean (mean of the metric over all treatment queries)  ");
316         System.err.println(
317                 "       basebetter     Number of queries where the baseline outperforms the treatment.        ");
318         System.err.println(
319                 "       treatbetter    Number of queries where the treatment outperforms the baseline.        ");
320         System.err.println(
321                 "       equal          Number of queries where the treatment and baseline perform identically.");
322         System.err.println("       ttest          P-value of a paired t-test.");
323         System.err.println(
324                 "       signtest       P-value of the Fisher sign test.                                       ");
325         System.err.println(
326                 "       randomized      P-value of a randomized test.                                          ");
327 
328         System.err.println(
329                 "   The second column also includes difference tests.  In these tests, the null hypothesis is ");
330         System.err.println(
331                 "     that the mean of the treatment is at least k times the mean of the baseline.  We run the");
332         System.err.println(
333                 "     same tests as before, but we artificially improve the baseline values by a factor of k. ");
334 
335         System.err.println(
336                 "       h-ttest-0.5    Largest value of k such that the ttest has a p-value of less than 0.5. ");
337         System.err.println(
338                 "       h-signtest-0.5 Largest value of k such that the sign test has a p-value of less than 0.5. ");
339         System.err.println(
340                 "       h-randomized-0.5 Largest value of k such that the randomized test has a p-value of less than 0.5. ");
341 
342         System.err.println(
343                 "       h-ttest-0.1    Largest value of k such that the ttest has a p-value of less than 0.1. ");
344         System.err.println(
345                 "       h-signtest-0.1 Largest value of k such that the sign test has a p-value of less than 0.1. ");
346         System.err.println(
347                 "       h-randomized-0.1 Largest value of k such that the randomized test has a p-value of less than 0.1. ");
348         System.err.println();
349 
350         System.err.println("  The third column is the value of the test.");
351 
352         System.exit(-1);
353     }
354 
355     public static void main(String[] args) throws IOException {
356         try {
357             if (args.length >= 3) {
358                 TreeMap<String, ArrayList<Document>> baselineRanking = loadRanking(args[0]);
359                 TreeMap<String, ArrayList<Document>> treatmentRanking = loadRanking(args[1]);
360                 TreeMap<String, ArrayList<Judgment>> judgments = loadJudgments(args[2]);
361 
362                 SetRetrievalEvaluator baseline = create(baselineRanking, judgments);
363                 SetRetrievalEvaluator treatment = create(treatmentRanking, judgments);
364 
365                 comparisonEvaluation(baseline, treatment, args.length >= 4);
366             } else if (args.length == 2) {
367                 TreeMap<String, ArrayList<Document>> ranking = loadRanking(args[0]);
368                 TreeMap<String, ArrayList<Judgment>> judgments = loadJudgments(args[1]);
369 
370                 SetRetrievalEvaluator setEvaluator = create(ranking, judgments);
371                 singleEvaluation(setEvaluator);
372             } else {
373                 usage();
374             }
375         } catch (Exception e) {
376             e.printStackTrace();
377             usage();
378         }
379     }
380 }