1 """
2
3 @author: Eleftherios Avramidis
4 """
5
6 from multirankeddataset import MultiRankedDataset
7 import logging
8 import sys
9 from ranking import Ranking
10 from evaluation.ranking.segment import kendall_tau, kendall_tau_prob
11 from evaluation.ranking.set import *
12
SET_METRIC_FUNCTIONS = [kendall_tau_set,
                        kendall_tau_set_no_ties,
                        mrr,
                        avg_ndgc_err,
                        best_predicted_vs_human,
                        avg_predicted_ranked
                        ]
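
# Each function in SET_METRIC_FUNCTIONS is expected to accept two parallel lists
# of Ranking objects (predicted and human) and to return a dict of named
# statistics; see get_metrics_scores below, which calls them exactly this way.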


class Scoring(MultiRankedDataset):  # base class assumed from the module's import
    """
    Wraps a set of multiply ranked parallel sentences and provides evaluation
    scores and rank correlation metrics over them.
    """

    def __init__(self, *params, **kwargs):
        self.invert_ranks = kwargs.setdefault("invert_ranks", False)
        super(Scoring, self).__init__(*params)

    def get_systems_scoring_from_segment_ranks(self, rank_attribute_name):
        """
        Provides a performance score for every system: the percentage of sentences
        in which the system was ranked at least as high as every other system.
        @param rank_attribute_name: the name of the target sentence attribute which contains
            the rank value that we compare upon. A smaller rank means a better system.
        @type rank_attribute_name: string
        @return: a map of system names to their performance percentage
        @rtype: dict
        """
        systems_performance = {}
        for parallelsentence in self.parallelsentences:
            rank_per_system = {}

            for target in parallelsentence.get_translations():
                system = target.get_attribute("system")
                rank = int(float(target.get_attribute(rank_attribute_name)))
                rank_per_system[system] = rank

            for system in rank_per_system:
                if rank_per_system[system] == min(rank_per_system.values()):
                    try:
                        systems_performance[system] += 1
                    except KeyError:
                        systems_performance[system] = 1

        for system in systems_performance:
            systems_performance[system] = 1.00 * systems_performance[system] / len(self.parallelsentences)
        return systems_performance
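
    # Illustration (toy numbers): over 2 sentences, if system A gets ranks [1, 2]
    # and system B ranks [1, 1], A shares the best rank once and B twice, so the
    # method returns {'A': 0.5, 'B': 1.0}.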

    def get_spearman_correlation(self, rank_name_1, rank_name_2):  # name assumed
        """
        Calculates the system-level Spearman rank correlation between two sentence-level
        attributes, e.g. the human and the estimated rank of each parallel sentence.
        @param rank_name_1: the name of the target sentence attribute containing the first rank value
        @type rank_name_1: string
        @param rank_name_2: the name of the target sentence attribute containing the second rank value
        @type rank_name_2: string
        @return: the Spearman correlation rho and its p-value
        """
        from scipy.stats import spearmanr
        systems_evaluation_1 = self.get_systems_scoring_from_segment_ranks(rank_name_1)
        systems_evaluation_2 = self.get_systems_scoring_from_segment_ranks(rank_name_2)

        # align the two score vectors by system name before correlating
        rank_evaluation_1 = []
        rank_evaluation_2 = []
        for (system, rank_1) in systems_evaluation_1.items():
            rank_evaluation_1.append(rank_1)
            rank_2 = systems_evaluation_2[system]
            rank_evaluation_2.append(rank_2)

        return spearmanr(rank_evaluation_1, rank_evaluation_2)

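    # Illustration: spearmanr([0.5, 0.2, 0.9], [0.4, 0.1, 0.8]) gives rho = 1.0,
    # p = 0.0, since the two score vectors order the systems identically.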

    def get_metrics_scores(self, predicted_rank_name, original_rank_name, **kwargs):  # name assumed
        """
        Calculates all set-level metrics registered in SET_METRIC_FUNCTIONS over the
        predicted and the human rankings of the dataset.
        @param predicted_rank_name: the name of the attribute containing the predicted rank
        @type predicted_rank_name: str
        @param original_rank_name: the name of the attribute containing the human rank
        @type original_rank_name: str
        @param filter_ref: don't include reference translations when they appear in the pairs
        @type filter_ref: boolean
        @return: the values of all set-level metrics, keyed by metric name
        @rtype: dict
        """
        filter_ref = kwargs.setdefault("filter_ref", True)
        suffix = kwargs.setdefault("suffix", "")
        prefix = kwargs.setdefault("prefix", "")
        kwargs["invert_ranks"] = self.invert_ranks

        predicted_rank_vectors = []
        original_rank_vectors = []

        for parallelsentence in self.parallelsentences:
            if filter_ref:
                # filter out candidates whose "system" attribute matches "_ref"
                # (reference translations)
                predicted_rank_vector = parallelsentence.get_filtered_target_attribute_values(predicted_rank_name, "system", "_ref")
                original_rank_vector = parallelsentence.get_filtered_target_attribute_values(original_rank_name, "system", "_ref")
            else:
                predicted_rank_vector = parallelsentence.get_target_attribute_values(predicted_rank_name)
                original_rank_vector = parallelsentence.get_target_attribute_values(original_rank_name)
            predicted_ranking = Ranking(predicted_rank_vector)
            original_ranking = Ranking(original_rank_vector)
            if self.invert_ranks:
                predicted_ranking = predicted_ranking.inverse()
                original_ranking = original_ranking.inverse()
            predicted_rank_vectors.append(predicted_ranking)
            original_rank_vectors.append(original_ranking)

        stats = {}
        for callback in SET_METRIC_FUNCTIONS:
            current_stats = callback(predicted_rank_vectors, original_rank_vectors)
            stats.update(current_stats)

        stats = dict([("{}-{}{}".format(prefix, key, suffix), value) for key, value in stats.iteritems()])
        logging.debug(stats)
        return stats

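    # Illustration of the key decoration above: with prefix="set_" and
    # suffix="-de", a metric key 'tau' is emitted as 'set_-tau-de' (note the
    # literal dash in the "{}-{}{}" format string).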

    def get_kendall_tau_vector(self, rank_name_1, rank_name_2):
        from scipy.stats import kendalltau

        taus = []
        pis = []
        for parallelsentence in self.parallelsentences:
            rank_vector_1 = parallelsentence.get_target_attribute_values(rank_name_1)
            rank_vector_2 = parallelsentence.get_target_attribute_values(rank_name_2)

            tau, pi = kendalltau(rank_vector_1, rank_vector_2)
            taus.append(tau)
            pis.append(pi)
        return taus, pis
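
    # Illustration: kendalltau([1, 2, 3], [1, 3, 2]) yields tau = (2 - 1) / 3,
    # i.e. about 0.33: two concordant pairs against one discordant pair.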

    def get_kendall_tau_avg(self, rank_name_1, rank_name_2):  # name assumed
        taus, pis = self.get_kendall_tau_vector(rank_name_1, rank_name_2)
        avg = sum(taus) / len(taus)
        pi = min(pis)
        return avg, pi
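
    # Illustration: segment taus [0.4, 0.6] with p-values [0.04, 0.20]
    # return (0.5, 0.04): the average tau and the smallest p-value.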

    def get_kendall_tau_frequencies(self, rank_name_1, rank_name_2):  # name assumed
        taus = self.get_kendall_tau_vector(rank_name_1, rank_name_2)[0]
        frequency = {}
        for tau in taus:
            try:
                frequency[tau] += 1
            except KeyError:
                # first occurrence of this tau value
                frequency[tau] = 1
        return frequency
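
    # Illustration: segment taus [0.5, 0.5, -1.0] produce {0.5: 2, -1.0: 1}.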

    def selectbest_accuracy(self, estimated_rank_name, original_rank_name):  # name assumed
        """
        Measures how often a candidate predicted as best (rank 1) is indeed ranked
        best by the human: counted once for the first such candidate only ("first
        effort") and once for any candidate sharing the best predicted rank ("any effort").
        """
        truepositive_withfirsteffort = 0.00
        truepositive_withanyeffort = 0.00
        pairwise_comparisons = 0.00
        for parallelsentence in self.parallelsentences:
            estimated_rank_vector = parallelsentence.get_target_attribute_values(estimated_rank_name)
            original_rank_vector = parallelsentence.get_target_attribute_values(original_rank_name)

            true_positive = 0.00
            false_positive = 0.00
            alreadyfoundfirst = False

            for i in range(len(estimated_rank_vector)):
                if int(estimated_rank_vector[i]) == 1:
                    pairwise_comparisons += 1
                    if int(original_rank_vector[i]) == 1:
                        true_positive += 1
                        if not alreadyfoundfirst:
                            truepositive_withfirsteffort += 1
                    else:
                        false_positive += 1
                    alreadyfoundfirst = True
            if true_positive > 0:
                truepositive_withanyeffort += 1

        accuracy_firsteffort = truepositive_withfirsteffort / len(self.parallelsentences)
        accuracy_anyeffort = truepositive_withanyeffort / len(self.parallelsentences)
        return (accuracy_firsteffort, accuracy_anyeffort)
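
    # Illustration: estimated ranks [1, 1, 2] vs human ranks [2, 1, 1]: the first
    # predicted-best candidate is human rank 2 (no first-effort point), but the
    # second one is human rank 1, so the sentence still scores an any-effort point.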

    def normalize_rank_list(self, rank_list):  # name assumed
        """
        Normalizes a rank list so that it doesn't contain gaps, e.g. [1, 3, 3, 4]
        will be converted to [1, 2, 2, 3].
        """
        # minimal sketch: the original body was not recovered; re-rank each value
        # by its position among the sorted distinct values
        distinct_ranks = sorted(set(rank_list))
        return [distinct_ranks.index(rank) + 1 for rank in rank_list]

    def get_best_predicted_vs_human(self, predicted_rank_name, original_rank_name):  # name assumed
        """
        For each sentence, finds the candidate(s) with the best predicted rank and
        looks up the human rank they actually received. Returns the distribution of
        these human ranks as percentages over all sentences.
        """
        actual_values_of_best_predicted = {}

        # human ranks may run in the opposite direction; flip the sign so that
        # smaller always means better
        if self.invert_ranks:
            inv = -1.00
        else:
            inv = 1.00

        for parallelsentence in self.parallelsentences:
            predicted_rank_vector = parallelsentence.get_filtered_target_attribute_values(predicted_rank_name, "system", "_ref")
            original_rank_vector = parallelsentence.get_filtered_target_attribute_values(original_rank_name, "system", "_ref")

            predicted_rank_vector = [float(v) for v in predicted_rank_vector]
            original_rank_vector = [inv * float(v) for v in original_rank_vector]

            if not predicted_rank_vector:
                continue
            best_predicted_rank = min(predicted_rank_vector)
            original_rank_order = sorted(original_rank_vector)

            original_ranks = []
            for original_rank, predicted_rank in zip(original_rank_vector, predicted_rank_vector):
                if predicted_rank == best_predicted_rank:
                    # 1-indexed position of this candidate among the sorted human ranks
                    corrected_original_rank = original_rank_order.index(original_rank) + 1
                    original_ranks.append(corrected_original_rank)

            # if several candidates tie on the best predicted rank, count the worst of them
            selected_original_rank = max(original_ranks)
            a = actual_values_of_best_predicted.setdefault(selected_original_rank, 0)
            actual_values_of_best_predicted[selected_original_rank] = a + 1

        n = len(self.parallelsentences)
        percentages = {}
        for rank, counts in actual_values_of_best_predicted.iteritems():
            percentages[rank] = round(100.00 * counts / n, 2)
        return percentages
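
    # Illustration: if over 100 sentences the best-predicted candidate received
    # human rank 1 in 60 cases and rank 2 in 40, the result is {1: 60.0, 2: 40.0}.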

    def mrr(self, predicted_rank_name, original_rank_name):
        """
        Mean reciprocal rank: the average, over all sentences, of one divided by
        the predicted rank of the candidate that the human ranked best.
        """
        from numpy import average

        if self.invert_ranks:
            inv = -1.00
        else:
            inv = 1.00

        reciprocal_ranks = []

        for parallelsentence in self.parallelsentences:
            predicted_rank_vector = parallelsentence.get_filtered_target_attribute_values(predicted_rank_name, "system", "_ref")
            original_rank_vector = parallelsentence.get_filtered_target_attribute_values(original_rank_name, "system", "_ref")

            predicted_rank_vector = [float(v) for v in predicted_rank_vector]
            original_rank_vector = [inv * float(v) for v in original_rank_vector]

            if not predicted_rank_vector:
                continue
            best_original_rank = min(original_rank_vector)
            predicted_rank_order = sorted(predicted_rank_vector)

            predicted_ranks = []
            for original_rank, predicted_rank in zip(original_rank_vector, predicted_rank_vector):
                # locate the candidate(s) the human ranked best and take the
                # 1-indexed position of each in the sorted predicted ranks
                if original_rank == best_original_rank:
                    corrected_predicted_rank = predicted_rank_order.index(predicted_rank) + 1
                    predicted_ranks.append(corrected_predicted_rank)

            # if several candidates tie on the best human rank, count the one
            # predicted worst
            selected_predicted_rank = max(predicted_ranks)
            reciprocal_ranks.append(1.00 / selected_predicted_rank)

        return average(reciprocal_ranks)
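
    # Illustration: if the human-best candidate is predicted at ranks 1, 2 and 4
    # over three sentences, MRR = (1/1 + 1/2 + 1/4) / 3, i.e. about 0.58.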

    def avg_predicted_ranked(self, predicted_rank_name, original_rank_name):  # name assumed
        """
        Provides the predicted rank of the system that the humans ranked best,
        averaged over all segments. Tied predictions are penalized.
        """
        from numpy import average
        corresponding_ranks = []

        if self.invert_ranks:
            inv = -1.00
        else:
            inv = 1.00

        for parallelsentence in self.parallelsentences:
            predicted_rank_vector = parallelsentence.get_filtered_target_attribute_values(predicted_rank_name, "system", "_ref")
            original_rank_vector = parallelsentence.get_filtered_target_attribute_values(original_rank_name, "system", "_ref")

            predicted_rank_vector = [float(v) for v in predicted_rank_vector]
            original_rank_vector = [inv * float(v) for v in original_rank_vector]

            best_original_rank = min(original_rank_vector)
            predicted_rank_order = sorted(predicted_rank_vector)

            current_corresponding_ranks = []
            for original_rank, predicted_rank in zip(original_rank_vector, predicted_rank_vector):
                if original_rank == best_original_rank:
                    # 0-indexed position of the prediction among the sorted predicted ranks
                    predicted_rank_normalized = predicted_rank_order.index(predicted_rank)
                    # penalize ties: add the number of candidates sharing this predicted rank
                    penalized_rank = predicted_rank_normalized + predicted_rank_vector.count(predicted_rank)
                    current_corresponding_ranks.append(penalized_rank)

            corresponding_ranks.append(average(current_corresponding_ranks))

        first_ranked = average(corresponding_ranks)
        return first_ranked
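
    # Illustration: with predicted ranks [1, 1, 2], a human-best candidate
    # predicted at the tied rank 1 sits at sorted position 0 and carries a tie
    # penalty of 2, giving a penalized rank of 2.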

    def get_kendall_tau(self, predicted_rank_name, original_rank_name, **kwargs):
        """
        Calculates the average Kendall tau of the predicted vs. the human ranking
        according to WMT12 (Birch et al., 2012).
        @param predicted_rank_name: the name of the attribute containing the predicted rank
        @type predicted_rank_name: str
        @param original_rank_name: the name of the attribute containing the human rank
        @type original_rank_name: str
        @param filter_ref: don't include reference translations when they appear in the pairs
        @type filter_ref: boolean
        @param exclude_ties: don't include human ties in the calculation, even if correctly predicted
        @type exclude_ties: boolean
        @return: a dictionary of tau statistics, including the overall tau and the probability
            of the null hypothesis that the two rankings are independent
        @rtype: dict
        """
        import numpy as np

        filter_ref = kwargs.setdefault("filter_ref", True)
        suffix = kwargs.setdefault("suffix", "")
        prefix = kwargs.setdefault("prefix", "")
        kwargs["invert_ranks"] = self.invert_ranks

        segtaus = []
        segprobs = []

        concordant = 0
        discordant = 0
        valid_pairs = 0
        original_ties_overall = 0
        predicted_ties_overall = 0
        pairs_overall = 0
        sentences_with_ties = 0

        for parallelsentence in self.parallelsentences:
            if filter_ref:
                predicted_rank_vector = parallelsentence.get_filtered_target_attribute_values(predicted_rank_name, "system", "_ref")
                original_rank_vector = parallelsentence.get_filtered_target_attribute_values(original_rank_name, "system", "_ref")
            else:
                predicted_rank_vector = parallelsentence.get_target_attribute_values(predicted_rank_name)
                original_rank_vector = parallelsentence.get_target_attribute_values(original_rank_name)
            segtau, segprob, concordant_count, discordant_count, all_pairs_count, original_ties, predicted_ties, pairs = kendall_tau(predicted_rank_vector, original_rank_vector, **kwargs)
            if segtau and segprob:
                segtaus.append(segtau)
                segprobs.append(segprob)

            concordant += concordant_count
            discordant += discordant_count
            valid_pairs += all_pairs_count

            original_ties_overall += original_ties
            predicted_ties_overall += predicted_ties
            if predicted_ties > 0:
                sentences_with_ties += 1
            pairs_overall += pairs

        # overall tau over all concatenated pairs, WMT12-style
        tau = 1.00 * (concordant - discordant) / (concordant + discordant)
        prob = kendall_tau_prob(tau, valid_pairs)

        avg_seg_tau = np.average(segtaus)
        avg_seg_prob = np.product(segprobs)

        predicted_ties_avg = 100.00 * predicted_ties_overall / pairs_overall
        sentence_ties_avg = 100.00 * sentences_with_ties / len(self.parallelsentences)

        stats = {'tau': tau,
                 'prob': prob,
                 'avg_seg_tau': avg_seg_tau,
                 'avg_seg_prob': avg_seg_prob,
                 'concordant': concordant,
                 'discordant': discordant,
                 'valid_pairs': valid_pairs,
                 'all_pairs': pairs_overall,
                 'original_ties': original_ties_overall,
                 'predicted_ties': predicted_ties_overall,
                 'predicted_ties_per': predicted_ties_avg,
                 'sentence_ties': sentences_with_ties,
                 'sentence_ties_per': sentence_ties_avg
                 }

        stats = dict([("{}{}{}".format(prefix, key, suffix), value) for key, value in stats.iteritems()])

        return stats
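
    # Illustration of the overall tau above: concordant = 7, discordant = 3
    # gives tau = (7 - 3) / (7 + 3) = 0.4.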

    def get_kendall_tau_b(self, predicted_rank_name, original_rank_name):  # name assumed
        """
        Calculates Kendall tau-b of the predicted vs. the human ranking according to
        Knight (1966) [scipy implementation], which takes ties into account.
        @param predicted_rank_name: the name of the attribute containing the predicted rank
        @type predicted_rank_name: str
        @param original_rank_name: the name of the attribute containing the human rank
        @type original_rank_name: str
        @return: the average segment-level Kendall tau score and the joint probability
            for the null hypothesis of the two rankings being independent
        @rtype: tuple(float, float)
        """
        from scipy.stats import kendalltau
        from numpy import isnan

        segment_tau = 0.00
        segment_pi = 1.00
        for parallelsentence in self.parallelsentences:
            predicted_rank_vector = parallelsentence.get_target_attribute_values(predicted_rank_name)
            original_rank_vector = parallelsentence.get_target_attribute_values(original_rank_name)

            logging.debug("[{0}]".format(" , ".join(predicted_rank_vector)))
            logging.debug("[{0}]".format(" , ".join(original_rank_vector)))

            try:
                tau, pi = kendalltau(original_rank_vector, predicted_rank_vector)
            except TypeError:
                # older scipy versions return only the tau value
                tau = kendalltau(original_rank_vector, predicted_rank_vector)
                pi = 1.00
                sys.stderr.write("==============\nScipy gave an erroneous tau = {}\n==============".format(tau))
            # scipy returns nan for degenerate vectors (e.g. all items tied); neutralize those
            if isnan(tau) or isnan(pi):
                tau = 0.00
                pi = 1.00

            segment_tau += tau
            segment_pi *= pi
            logging.debug(tau)

        avg_tau = 1.00 * segment_tau / len(self.parallelsentences)

        logging.debug("Avg tau: segment_tau / len(self.parallelsentences) \n= {0} / {1} \n= {2}".format(segment_tau, len(self.parallelsentences), avg_tau))
        return avg_tau, segment_pi
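
    # Illustration: kendalltau([1, 2, 2], [1, 2, 2]) returns tau = 1.0 (the tau-b
    # correction handles the tie), while a constant vector such as [1, 1, 1]
    # yields nan and is neutralized to tau = 0.0 above.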


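# Hedged usage sketch (assumes the surrounding project supplies the parsed
# parallel sentences that MultiRankedDataset is built from; the attribute
# names below are illustrative):
#
#   scoring = Scoring(parallelsentences)
#   print scoring.get_kendall_tau("predicted_rank", "rank")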
if __name__ == '__main__':
    pass