1
2
3 package org.galagosearch.core.eval;
4
5 import java.io.BufferedReader;
6 import java.io.FileNotFoundException;
7 import java.io.FileReader;
8 import java.io.IOException;
9 import java.util.ArrayList;
10 import java.util.Map;
11 import java.util.TreeMap;
12 import org.galagosearch.core.eval.RetrievalEvaluator.Document;
13 import org.galagosearch.core.eval.RetrievalEvaluator.Judgment;
14
15 /***
16 *
17 * @author trevor
18 */
19 public class Main {
20 /***
21 * Loads a TREC judgments file.
22 *
23 * @param filename The filename of the judgments file to load.
24 * @return Maps from query numbers to lists of judgments for each query.
25 */
26 public static TreeMap<String, ArrayList<Judgment>> loadJudgments(String filename) throws IOException, FileNotFoundException {
27
28 BufferedReader in = new BufferedReader(new FileReader(filename));
29 String line = null;
30 TreeMap<String, ArrayList<Judgment>> judgments = new TreeMap<String, ArrayList<Judgment>>();
31 String recentQuery = null;
32 ArrayList<Judgment> recentJudgments = null;
33
34 while ((line = in.readLine()) != null) {
35 int[] columns = splits(line, 4);
36
37 String number = line.substring(columns[0], columns[1]);
38 String unused = line.substring(columns[2], columns[3]);
39 String docno = line.substring(columns[4], columns[5]);
40 String judgment = line.substring(columns[6]);
41
42 Judgment j = new Judgment(docno, Integer.valueOf(judgment));
43
44 if (recentQuery == null || !recentQuery.equals(number)) {
45 if (!judgments.containsKey(number)) {
46 judgments.put(number, new ArrayList<Judgment>());
47 }
48
49 recentJudgments = judgments.get(number);
50 recentQuery = number;
51 }
52
53 recentJudgments.add(j);
54 }
55
56 in.close();
57 return judgments;
58 }
59
60 private static int[] splits(String s, int columns) {
61 int[] result = new int[2 * columns];
62 boolean lastWs = true;
63 int column = 0;
64 result[0] = 0;
65
66 for (int i = 0; i < s.length() && column < columns; i++) {
67 char c = s.charAt(i);
68 boolean isWs = (c == ' ') || (c == '\t');
69
70 if (!isWs && lastWs) {
71 result[2 * column] = i;
72 } else if (isWs && !lastWs) {
73 result[2 * column + 1] = i;
74 column++;
75 }
76
77 lastWs = isWs;
78 }
79
80 return result;
81 }
82
83 /***
84 * Reads in a TREC ranking file.
85 *
86 * @param filename The filename of the ranking file.
87 * @return A map from query numbers to document ranking lists.
88 */
89 public static TreeMap<String, ArrayList<Document>> loadRanking(String filename) throws IOException, FileNotFoundException {
90
91 BufferedReader in = new BufferedReader(new FileReader(filename), 256 * 1024);
92 String line = null;
93 TreeMap<String, ArrayList<Document>> ranking = new TreeMap<String, ArrayList<Document>>();
94 ArrayList<Document> recentRanking = null;
95 String recentQuery = null;
96
97 while ((line = in.readLine()) != null) {
98 int[] splits = splits(line, 6);
99
100
101
102 String number = line.substring(splits[0], splits[1]);
103 String unused = line.substring(splits[2], splits[3]);
104 String docno = line.substring(splits[4], splits[5]);
105 String rank = line.substring(splits[6], splits[7]);
106 String score = line.substring(splits[8], splits[9]);
107 String runtag = line.substring(splits[10]);
108
109 Document document = new Document(docno, Integer.valueOf(rank), Double.valueOf(score));
110
111 if (recentQuery == null || !recentQuery.equals(number)) {
112 if (!ranking.containsKey(number)) {
113 ranking.put(number, new ArrayList<Document>());
114 }
115
116 recentQuery = number;
117 recentRanking = ranking.get(number);
118 }
119
120 recentRanking.add(document);
121 }
122
123 in.close();
124 return ranking;
125 }
126
127 /***
128 * Creates a SetRetrievalEvaluator from data from loadRanking and loadJudgments.
129 */
130 public static SetRetrievalEvaluator create(TreeMap<String, ArrayList<Document>> allRankings, TreeMap<String, ArrayList<Judgment>> allJudgments) {
131 TreeMap<String, RetrievalEvaluator> evaluators = new TreeMap<String, RetrievalEvaluator>();
132
133 for (String query : allRankings.keySet()) {
134 ArrayList<Judgment> judgments = allJudgments.get(query);
135 ArrayList<Document> ranking = allRankings.get(query);
136
137 if (judgments == null || ranking == null) {
138 continue;
139 }
140
141 RetrievalEvaluator evaluator = new RetrievalEvaluator(query, ranking, judgments);
142 evaluators.put(query, evaluator);
143 }
144
145 return new SetRetrievalEvaluator(evaluators.values());
146 }
147
148 /***
149 * When run as a standalone application, this returns output
150 * very similar to that of trec_eval. The first argument is
151 * the ranking file, and the second argument is the judgments
152 * file, both in standard TREC format.
153 */
154 public static void singleEvaluation(SetRetrievalEvaluator setEvaluator) {
155 String formatString = "%2$-16s%1$3s ";
156
157
158 for (RetrievalEvaluator evaluator : setEvaluator.getEvaluators()) {
159 String query = evaluator.queryName();
160
161
162 System.out.format(formatString + "%3$d\n", query, "num_ret", evaluator.
163 retrievedDocuments().size());
164 System.out.format(formatString + "%3$d\n", query, "num_rel", evaluator.relevantDocuments().
165 size());
166 System.out.format(formatString + "%3$d\n", query, "num_rel_ret", evaluator.
167 relevantRetrievedDocuments().size());
168
169
170 System.out.format(formatString + "%3$6.4f\n", query, "map", evaluator.averagePrecision());
171 System.out.format(formatString + "%3$6.4f\n", query, "ndcg", evaluator.
172 normalizedDiscountedCumulativeGain());
173 System.out.format(formatString + "%3$6.4f\n", query, "ndcg15", evaluator.
174 normalizedDiscountedCumulativeGain(15));
175 System.out.format(formatString + "%3$6.4f\n", query, "R-prec", evaluator.rPrecision());
176 System.out.format(formatString + "%3$6.4f\n", query, "bpref",
177 evaluator.binaryPreference());
178 System.out.format(formatString + "%3$6.4f\n", query, "recip_rank", evaluator.
179 reciprocalRank());
180
181
182 int[] fixedPoints = {5, 10, 15, 20, 30, 100, 200, 500, 1000};
183
184 for (int i = 0; i < fixedPoints.length; i++) {
185 int point = fixedPoints[i];
186 System.out.format(formatString + "%3$6.4f\n", query, "P" + point, evaluator.
187 precision(fixedPoints[i]));
188 }
189 }
190
191
192 System.out.format(formatString + "%3$d\n", "all", "num_ret", setEvaluator.numberRetrieved());
193 System.out.format(formatString + "%3$d\n", "all", "num_rel", setEvaluator.numberRelevant());
194 System.out.format(formatString + "%3$d\n", "all", "num_rel_ret", setEvaluator.
195 numberRelevantRetrieved());
196
197 System.out.format(formatString + "%3$6.4f\n", "all", "map", setEvaluator.
198 meanAveragePrecision());
199 System.out.format(formatString + "%3$6.4f\n", "all", "ndcg", setEvaluator.
200 meanNormalizedDiscountedCumulativeGain());
201 System.out.format(formatString + "%3$6.4f\n", "all", "ndcg15", setEvaluator.
202 meanNormalizedDiscountedCumulativeGain(15));
203 System.out.format(formatString + "%3$6.4f\n", "all", "R-prec", setEvaluator.meanRPrecision());
204 System.out.format(formatString + "%3$6.4f\n", "all", "bpref", setEvaluator.
205 meanBinaryPreference());
206 System.out.format(formatString + "%3$6.4f\n", "all", "recip_rank", setEvaluator.
207 meanReciprocalRank());
208
209
210 int[] fixedPoints = {5, 10, 15, 20, 30, 100, 200, 500, 1000};
211
212 for (int i = 0; i < fixedPoints.length; i++) {
213 int point = fixedPoints[i];
214 System.out.format(formatString + "%3$6.4f\n", "all", "P" + point, setEvaluator.
215 meanPrecision(fixedPoints[i]));
216 }
217 }
218
219 /***
220 * Compares two ranked lists with statistical tests on most major metrics.
221 */
222 public static void comparisonEvaluation(SetRetrievalEvaluator baseline, SetRetrievalEvaluator treatment, boolean useRandomized) {
223 String[] metrics = {"averagePrecision", "ndcg", "ndcg15", "bpref", "P10", "P20"};
224 String formatString = "%1$-20s%2$-20s%3$6.4f\n";
225 String integerFormatString = "%1$-20s%2$-20s%3$d\n";
226
227 for (String metric : metrics) {
228 Map<String, Double> baselineMetric = baseline.evaluateAll(metric);
229 Map<String, Double> treatmentMetric = treatment.evaluateAll(metric);
230
231 SetRetrievalComparator comparator = new SetRetrievalComparator(baselineMetric,
232 treatmentMetric);
233
234 System.out.format(formatString, metric, "baseline", comparator.meanBaselineMetric());
235 System.out.format(formatString, metric, "treatment", comparator.meanTreatmentMetric());
236
237 System.out.format(integerFormatString, metric, "basebetter", comparator.
238 countBaselineBetter());
239 System.out.format(integerFormatString, metric, "treatbetter", comparator.
240 countTreatmentBetter());
241 System.out.format(integerFormatString, metric, "equal", comparator.countEqual());
242
243 System.out.format(formatString, metric, "ttest", comparator.pairedTTest());
244 System.out.format(formatString, metric, "signtest", comparator.signTest());
245 if (useRandomized) {
246 System.out.format(formatString, metric, "randomized", comparator.
247 randomizedTest());
248 }
249 System.out.format(formatString, metric, "h-ttest-0.05", comparator.supportedHypothesis(
250 "ttest", 0.05));
251 System.out.format(formatString, metric, "h-signtest-0.05", comparator.
252 supportedHypothesis("sign", 0.05));
253 if (useRandomized) {
254 System.out.format(formatString, metric, "h-randomized-0.05",
255 comparator.supportedHypothesis("randomized", 0.05));
256 }
257 System.out.format(formatString, metric, "h-ttest-0.01", comparator.supportedHypothesis(
258 "ttest", 0.01));
259 System.out.format(formatString, metric, "h-signtest-0.01", comparator.
260 supportedHypothesis("sign", 0.01));
261 if (useRandomized) {
262 System.out.format(formatString, metric, "h-randomized-0.01",
263 comparator.supportedHypothesis("randomized", 0.01));
264 }
265 }
266 }
267
268 public static void usage() {
269 System.err.println("galago eval <args>: ");
270 System.err.println(
271 " There are two ways to use this program. First, you can evaluate a single ranking: ");
272 System.err.println(" java -jar ireval.jar TREC-Ranking-File TREC-Judgments-File");
273 System.err.println(" or, you can use it to compare two rankings with statistical tests: ");
274 System.err.println(
275 " java -jar ireval.jar TREC-Baseline-Ranking-File TREC-Improved-Ranking-File TREC-Judgments-File");
276 System.err.println(" you can also include randomized tests (these take a bit longer): ");
277 System.err.println(
278 " java -jar ireval.jar TREC-Baseline-Ranking-File TREC-Treatment-Ranking-File TREC-Judgments-File randomized");
279 System.err.println();
280 System.err.println("Single evaluation:");
281 System.err.println(
282 " The first column is the query number, or 'all' for a mean of the metric over all queries.");
283 System.err.println(
284 " The second column is the metric, which is one of: ");
285 System.err.println(
286 " num_ret Number of retrieved documents ");
287 System.err.println(
288 " num_rel Number of relevant documents listed in the judgments file ");
289 System.err.println(
290 " num_rel_ret Number of relevant retrieved documents ");
291 System.err.println(
292 " map Mean average precision ");
293 System.err.println(
294 " bpref Bpref (binary preference) ");
295 System.err.println(
296 " ndcg Normalized Discounted Cumulative Gain, computed over all documents ");
297 System.err.println(
298 " ndcg15 Normalized Discounted Cumulative Gain, 15 document cutoff ");
299 System.err.println(
300 " Pn Precision, n document cutoff ");
301 System.err.println(
302 " R-prec R-Precision ");
303 System.err.println(
304 " recip_rank Reciprocal Rank (precision at first relevant document) ");
305 System.err.println(
306 " The third column is the metric value. ");
307 System.err.println();
308 System.err.println("Compared evaluation: ");
309 System.err.println(" The first column is the metric (e.g. averagePrecision, ndcg, etc.)");
310 System.err.println(
311 " The second column is the test/formula used: ");
312 System.err.println(
313 " baseline The baseline mean (mean of the metric over all baseline queries) ");
314 System.err.println(
315 " treatment The \'improved\' mean (mean of the metric over all treatment queries) ");
316 System.err.println(
317 " basebetter Number of queries where the baseline outperforms the treatment. ");
318 System.err.println(
319 " treatbetter Number of queries where the treatment outperforms the baseline. ");
320 System.err.println(
321 " equal Number of queries where the treatment and baseline perform identically.");
322 System.err.println(" ttest P-value of a paired t-test.");
323 System.err.println(
324 " signtest P-value of the Fisher sign test. ");
325 System.err.println(
326 " randomized P-value of a randomized test. ");
327
328 System.err.println(
329 " The second column also includes difference tests. In these tests, the null hypothesis is ");
330 System.err.println(
331 " that the mean of the treatment is at least k times the mean of the baseline. We run the");
332 System.err.println(
333 " same tests as before, but we artificially improve the baseline values by a factor of k. ");
334
335 System.err.println(
336 " h-ttest-0.5 Largest value of k such that the ttest has a p-value of less than 0.5. ");
337 System.err.println(
338 " h-signtest-0.5 Largest value of k such that the sign test has a p-value of less than 0.5. ");
339 System.err.println(
340 " h-randomized-0.5 Largest value of k such that the randomized test has a p-value of less than 0.5. ");
341
342 System.err.println(
343 " h-ttest-0.1 Largest value of k such that the ttest has a p-value of less than 0.1. ");
344 System.err.println(
345 " h-signtest-0.1 Largest value of k such that the sign test has a p-value of less than 0.1. ");
346 System.err.println(
347 " h-randomized-0.1 Largest value of k such that the randomized test has a p-value of less than 0.1. ");
348 System.err.println();
349
350 System.err.println(" The third column is the value of the test.");
351
352 System.exit(-1);
353 }
354
355 public static void main(String[] args) throws IOException {
356 try {
357 if (args.length >= 3) {
358 TreeMap<String, ArrayList<Document>> baselineRanking = loadRanking(args[0]);
359 TreeMap<String, ArrayList<Document>> treatmentRanking = loadRanking(args[1]);
360 TreeMap<String, ArrayList<Judgment>> judgments = loadJudgments(args[2]);
361
362 SetRetrievalEvaluator baseline = create(baselineRanking, judgments);
363 SetRetrievalEvaluator treatment = create(treatmentRanking, judgments);
364
365 comparisonEvaluation(baseline, treatment, args.length >= 4);
366 } else if (args.length == 2) {
367 TreeMap<String, ArrayList<Document>> ranking = loadRanking(args[0]);
368 TreeMap<String, ArrayList<Judgment>> judgments = loadJudgments(args[1]);
369
370 SetRetrievalEvaluator setEvaluator = create(ranking, judgments);
371 singleEvaluation(setEvaluator);
372 } else {
373 usage();
374 }
375 } catch (Exception e) {
376 e.printStackTrace();
377 usage();
378 }
379 }
380 }