
Source Code for Module evaluation.ranking.set

'''
This module allows for the calculation of the basic rank metrics that evaluate
on the set level (i.e. aggregating over many ranking lists, one per segment)

Created on 18 Dec 2012

@author: Eleftherios Avramidis
'''

import segment
from numpy import average
import numpy as np

def kendall_tau_set_no_ties(predicted_rank_vectors, original_rank_vectors, **kwargs):
    """
    Wrapper around L{kendall_tau_set} that disables the penalization of predicted
    ties and marks the resulting metric names with the suffix "-nt".
    """
    kwargs["penalize_predicted_ties"] = False
    result = kendall_tau_set(predicted_rank_vectors, original_rank_vectors, **kwargs)
    newresult = {}
    for key, value in result.iteritems():
        newkey = key.replace("tau", "tau-nt")
        newresult[newkey] = value
    return newresult

def kendall_tau_set(predicted_rank_vectors, original_rank_vectors, **kwargs):
    """
    This is the refined calculation of the set-level Kendall tau of predicted vs. human
    ranking, following WMT12 (Birch et al., 2012). It returns both the set-level Kendall
    tau and the average segment-level Kendall tau.
    @param predicted_rank_vectors: a list of rankings (lists of integers) representing the predicted ranks, one ranking for each segment
    @type predicted_rank_vectors: [Ranking, ..]
    @param original_rank_vectors: a list of rankings (lists of integers) representing the human ranks, one ranking for each segment
    @type original_rank_vectors: [Ranking, ..]
    @return: overall Kendall tau score,
        - average segment-level Kendall tau score,
        - the probability of the null hypothesis that the two rankings are independent,
        - the count of concordant pairs,
        - the count of discordant pairs,
        - the count of pairs used for calculating tau (excluding "invalid" pairs),
        - the count of original ties,
        - the count of predicted ties,
        - the count of all pairs
    @rtype: {string:float, string:float, string:int, string:int, string:int, string:int, string:int, string:int}
    """
    segtaus = []
    segprobs = []

    concordant = 0
    discordant = 0
    valid_pairs = 0
    original_ties_overall = 0
    predicted_ties_overall = 0
    pairs_overall = 0
    sentences_with_ties = 0

    for predicted_rank_vector, original_rank_vector in zip(predicted_rank_vectors, original_rank_vectors):

        segtau, segprob, concordant_count, discordant_count, all_pairs_count, original_ties, predicted_ties, pairs = segment.kendall_tau(predicted_rank_vector, original_rank_vector, **kwargs)

        if segtau and segprob:
            segtaus.append(segtau)
            segprobs.append(segprob)

        concordant += concordant_count
        discordant += discordant_count
        valid_pairs += all_pairs_count

        original_ties_overall += original_ties
        predicted_ties_overall += predicted_ties
        if predicted_ties > 0:
            sentences_with_ties += 1
        pairs_overall += pairs

    tau = 1.00 * (concordant - discordant) / (concordant + discordant)
    prob = segment.kendall_tau_prob(tau, valid_pairs)

    avg_seg_tau = np.average(segtaus)
    avg_seg_prob = np.product(segprobs)

    #percentages are computed over the whole set (all pairs / all segments)
    predicted_ties_avg = 100.00 * predicted_ties_overall / pairs_overall
    sentence_ties_avg = 100.00 * sentences_with_ties / len(predicted_rank_vectors)

    stats = {'tau': tau,
             'tau_prob': prob,
             'tau_avg_seg': avg_seg_tau,
             'tau_avg_seg_prob': avg_seg_prob,
             'tau_concordant': concordant,
             'tau_discordant': discordant,
             'tau_valid_pairs': valid_pairs,
             'tau_all_pairs': pairs_overall,
             'tau_original_ties': original_ties_overall,
             'tau_predicted_ties': predicted_ties_overall,
             'tau_predicted_ties_per': predicted_ties_avg,
             'tau_sentence_ties': sentences_with_ties,
             'tau_sentence_ties_per': sentence_ties_avg
             }

    return stats

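# Illustration (not part of the original module): a minimal, self-contained sketch
# of how the set-level tau above pools concordant/discordant pair counts over all
# segments before dividing, using made-up counts. Because the counts are pooled,
# longer segments weigh more than in the averaged segment-level tau.
def _example_set_level_tau():
    # hypothetical (concordant, discordant) pair counts for three segments
    segment_pair_counts = [(5, 1), (4, 2), (6, 0)]
    concordant = sum(c for c, _ in segment_pair_counts)
    discordant = sum(d for _, d in segment_pair_counts)
    # (15 - 3) / (15 + 3) = 0.666...
    return 1.00 * (concordant - discordant) / (concordant + discordant)
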
def mrr(predicted_rank_vectors, original_rank_vectors, **kwargs):
    """
    Calculation of the mean reciprocal rank based on Radev et al. (2002)
    @param predicted_rank_vectors: a list of rankings (lists of integers) representing the predicted ranks, one ranking for each segment
    @type predicted_rank_vectors: [Ranking, ..]
    @param original_rank_vectors: a list of rankings (lists of integers) representing the human ranks, one ranking for each segment
    @type original_rank_vectors: [Ranking, ..]
    @return: mean reciprocal rank
    @rtype: {string: float}
    """
    reciprocal_ranks = []

    for predicted_rank_vector, original_rank_vector in zip(predicted_rank_vectors, original_rank_vectors):
        reciprocal_rank = segment.reciprocal_rank(predicted_rank_vector, original_rank_vector)
        reciprocal_ranks.append(reciprocal_rank)

    return {'mrr': average(reciprocal_ranks)}

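# Illustration (not part of the original module): a self-contained sketch of the
# reciprocal-rank averaging performed by mrr(). If the system's top choice was
# placed 1st, 2nd and 4th by the humans in three segments, the mean reciprocal
# rank is (1/1 + 1/2 + 1/4) / 3 ~= 0.583.
def _example_mrr_arithmetic():
    # hypothetical human ranks of the system's top choice, one per segment
    human_ranks_of_best_guess = [1, 2, 4]
    reciprocal_ranks = [1.0 / rank for rank in human_ranks_of_best_guess]
    return average(reciprocal_ranks)
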
def best_predicted_vs_human(predicted_rank_vectors, original_rank_vectors, **kwargs):
    """
    For each sentence, the item selected as best by our system may have been ranked
    lower by the humans. This statistic counts how many times the item predicted as
    best has fallen into each of the human ranks. This is useful for plotting.
    @param predicted_rank_vectors: a list of rankings (lists of integers) representing the predicted ranks, one ranking for each segment
    @type predicted_rank_vectors: [Ranking, ..]
    @param original_rank_vectors: a list of rankings (lists of integers) representing the human ranks, one ranking for each segment
    @type original_rank_vectors: [Ranking, ..]
    @return: a dictionary with percentages for each human rank
    @rtype: {string: float}
    """
    actual_values_of_best_predicted = {}
    for predicted_rank_vector, original_rank_vector in zip(predicted_rank_vectors, original_rank_vectors):

        #make sure vectors are normalized
        predicted_rank_vector = predicted_rank_vector.normalize()
        original_rank_vector = original_rank_vector.normalize()
        if not predicted_rank_vector:
            continue
        best_predicted_rank = min(predicted_rank_vector)

        original_ranks = []
        for original_rank, predicted_rank in zip(original_rank_vector, predicted_rank_vector):
            if predicted_rank == best_predicted_rank:
                original_ranks.append(original_rank)

        #if the best rank was given to many items, take the worst human rank among them
        selected_original_rank = max(original_ranks)
        a = actual_values_of_best_predicted.setdefault(selected_original_rank, 0)
        actual_values_of_best_predicted[selected_original_rank] = a + 1

    n = len(predicted_rank_vectors)
    percentages = {}
    total = 0
    #gather everything into a dictionary of percentages
    for rank, counts in actual_values_of_best_predicted.iteritems():
        percentages["bph_" + str(rank)] = round(100.00 * counts / n, 2)
        total += counts
    return percentages

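# Illustration (not part of the original module): a self-contained sketch of the
# counting that best_predicted_vs_human() performs. Given hypothetical human ranks
# of the system's top pick in four segments, tally them and turn the tallies into
# percentages keyed as "bph_<rank>", e.g. {'bph_1': 50.0, 'bph_2': 25.0, 'bph_3': 25.0}.
def _example_best_predicted_counts():
    # hypothetical human ranks of the system's top pick, one per segment
    human_ranks_of_best_pick = [1, 1, 2, 3]
    counts = {}
    for rank in human_ranks_of_best_pick:
        counts[rank] = counts.get(rank, 0) + 1
    n = len(human_ranks_of_best_pick)
    return dict(("bph_" + str(rank), round(100.00 * count / n, 2))
                for rank, count in counts.items())
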
def avg_predicted_ranked(predicted_rank_vectors, original_rank_vectors, **kwargs):
    """
    Provides the average human rank of the item chosen by the system as best
    @param predicted_rank_vectors: a list of rankings (lists of integers) representing the predicted ranks, one ranking for each segment
    @type predicted_rank_vectors: [Ranking, ..]
    @param original_rank_vectors: a list of rankings (lists of integers) representing the human ranks, one ranking for each segment
    @type original_rank_vectors: [Ranking, ..]
    @return: a dictionary with the name of the metric and its value
    @rtype: {string: float}
    """
    original_ranks = []

    for predicted_rank_vector, original_rank_vector in zip(predicted_rank_vectors, original_rank_vectors):

        #make sure vectors are normalized
        predicted_rank_vector = predicted_rank_vector.normalize(ties='ceiling')
        original_rank_vector = original_rank_vector.normalize(ties='ceiling')

        best_predicted_rank = min(predicted_rank_vector)
        mapped_original_ranks = []

        for original_rank, predicted_rank in zip(original_rank_vector, predicted_rank_vector):
            if predicted_rank == best_predicted_rank:
                mapped_original_ranks.append(original_rank)

        #in case of ties get the worst one
        original_ranks.append(max(mapped_original_ranks))

    return {'avg_predicted_ranked': average(original_ranks)}

def avg_ndgc_err(predicted_rank_vectors, original_rank_vectors, **kwargs):
    """
    Returns the normalized Discounted Cumulative Gain and the Expected Reciprocal Rank, both averaged over the number of sentences
    @param predicted_rank_vectors: a list of rankings (lists of integers) representing the predicted ranks, one ranking for each segment
    @type predicted_rank_vectors: [Ranking, ..]
    @param original_rank_vectors: a list of rankings (lists of integers) representing the human ranks, one ranking for each segment
    @type original_rank_vectors: [Ranking, ..]
    @keyword k: cut-off passed to the segment-level L{ndgc_err} function
    @type k: int
    @return: a dictionary with the name of each metric and the respective result
    @rtype: {string: float}
    """
    ndgc_list = []
    err_list = []
    for predicted_rank_vector, original_rank_vector in zip(predicted_rank_vectors, original_rank_vectors):
        k = kwargs.setdefault('k', len(predicted_rank_vector))
        ndgc, err = segment.ndgc_err(predicted_rank_vector, original_rank_vector, k)
        ndgc_list.append(ndgc)
        err_list.append(err)
    avg_ndgc = average(ndgc_list)
    avg_err = average(err_list)
    return {'ndgc': avg_ndgc, 'err': avg_err}

def allmetrics(predicted_rank_vectors, original_rank_vectors, **kwargs):
    """
    Convenience function that runs all set-level metrics of this module and merges
    their results into a single dictionary.
    """
    stats = {}
    functions = [kendall_tau_set, mrr, best_predicted_vs_human, avg_predicted_ranked, avg_ndgc_err]
    for function in functions:
        stats.update(function(predicted_rank_vectors, original_rank_vectors, **kwargs))

    return stats

#if __name__ == '__main__':
#    from sentence.ranking import Ranking
#    a = Ranking([1,2,3,4])
#    b = Ranking([2,3,1,4])
#
#    c = [a,b,a,b,a,a]
#    d = [b,a,a,b,a,]
#
#    print avg_ndgc_err(c,d)
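# Illustration (not part of the original module): a hedged end-to-end usage sketch
# mirroring the commented-out example above. It assumes the package's
# sentence.ranking.Ranking class is importable; running the module directly prints
# all set-level metrics for a toy pair of rank lists.
if __name__ == '__main__':
    from sentence.ranking import Ranking
    predicted = [Ranking([1, 2, 3, 4]), Ranking([2, 3, 1, 4])]
    original = [Ranking([2, 3, 1, 4]), Ranking([1, 2, 3, 4])]
    print allmetrics(predicted, original)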