Package evaluation :: Package ranking :: Module segment
[hide private]
[frames] | [no frames]

Source Code for Module evaluation.ranking.segment

  1  ''' 
  2  This module allows for the calculation of the basic rank metrics that evaluate 
  3  on a segment level (i.e. one ranking list at a time) 
  4   
  5  Created on Nov 25, 2012 
  6   
  7  @author: Eleftherios Avramidis 
  8  ''' 
  9   
 10  from math import log 
 11  import logging 
 12  from collections import namedtuple 
 13  from sentence.ranking import Ranking 
 14   
 15  """"""""" 
 16  Kendall tau 
 17  """"""""" 
 18   
def kendall_tau_prob(tau, pairs):
    """
    Calculation of Kendall tau hypothesis testing, following the
    normal-approximation used by scipy.stats.kendalltau.
    @param tau: already calculated tau coefficient
    @type tau: float
    @param pairs: count of pairs
    @type pairs: int
    @return: the probability for the null hypothesis of X and Y being independent
    @rtype: float
    """
    import numpy as np
    from scipy import special
    try:
        # variance of the tau estimator under the null hypothesis
        svar = (4.0 * pairs + 10.0) / (9.0 * pairs * (pairs - 1))
    except ZeroDivisionError:
        # fewer than 2 pairs: fall back to unit variance
        svar = 1
    try:
        z = tau / np.sqrt(svar)
    except (ZeroDivisionError, TypeError):
        # non-numeric tau (e.g. None): keep the original fallback of z = 1
        z = 1
    # two-sided tail probability of the standard normal; 1.4142136 ~ sqrt(2)
    return special.erfc(np.abs(z) / 1.4142136)
40 41
def kendall_tau(predicted_rank_vector, original_rank_vector, **kwargs):
    """
    Segment-level Kendall tau of a predicted vs. a human ranking, refined
    as in WMT12 (Birch et al., 2012): human no-ranks (-1) are always skipped
    and, by default, human ties are excluded while predicted ties count as
    discordant.
    @param predicted_rank_vector: the predicted ranks, one per system output
    @type predicted_rank_vector: L{sentence.ranking.Ranking} -- presumably; verify against callers
    @param original_rank_vector: the corresponding human ranks
    @type original_rank_vector: L{sentence.ranking.Ranking}
    @kwarg ties: way of handling ties, passed to L{sentence.ranking.Ranking} normalization
    @type ties: string
    @kwarg exclude_ties: skip pairs tied in the human ranking (default True)
    @kwarg penalize_predicted_ties: count false predicted ties as discordant (default True)
    @kwarg invert_ranks: negate the human ranks before comparing (default False)
    @return: named tuple with the tau score, its null-hypothesis probability,
        concordant/discordant counts, counted pairs, tie counts and total pairs
    @rtype: namedtuple(float, float, int, int, int, int, int, int)
    """
    import itertools
    ties_handling = kwargs.setdefault('ties','ceiling')

    # bring both rankings into a comparable, normalized form
    predicted_rank_vector = predicted_rank_vector.normalize(ties=ties_handling)
    original_rank_vector = original_rank_vector.normalize(ties=ties_handling)

    logging.debug("\n* Segment tau *")
    logging.debug("predicted vector: {}".format(predicted_rank_vector))
    logging.debug("original vector : {}".format(original_rank_vector))

    # default WMT implementation excludes ties from the human (original) ranks
    exclude_ties = kwargs.setdefault("exclude_ties", True)
    # whether a false predicted tie counts as discordant
    penalize_predicted_ties = kwargs.setdefault("penalize_predicted_ties", True)
    logging.debug("exclude_ties: {}".format(exclude_ties))

    inv = -1.00 if kwargs.setdefault("invert_ranks", False) else 1.00

    # every unordered pair of ranks, built in parallel for the two rankings
    predicted_pairs = [(float(a), float(b))
                       for a, b in itertools.combinations(predicted_rank_vector, 2)]
    original_pairs = [(inv * float(a), inv * float(b))
                      for a, b in itertools.combinations(original_rank_vector, 2)]

    concordant_count = 0
    discordant_count = 0
    original_ties = 0
    predicted_ties = 0
    pairs = 0

    for original_pair, predicted_pair in zip(original_pairs, predicted_pairs):
        orig_a, orig_b = original_pair
        pred_a, pred_b = predicted_pair

        logging.debug("%s\t%s", original_pair, predicted_pair)

        # bookkeeping over *all* pairs, before any filtering
        pairs += 1
        if orig_a == orig_b:
            original_ties += 1
        if pred_a == pred_b:
            predicted_ties += 1

        # references / human no-ranks are marked with -1 and never counted
        if orig_a == -1 or orig_b == -1:
            continue
        # human ties are skipped when requested (the WMT default)
        if orig_a == orig_b and exclude_ties:
            continue

        # -1, 0 or +1 depending on the direction of each pair
        orig_direction = (orig_a > orig_b) - (orig_a < orig_b)
        pred_direction = (pred_a > pred_b) - (pred_a < pred_b)

        if orig_direction == pred_direction:
            # includes the double-tie case, reachable only when ties are kept
            concordant_count += 1
            logging.debug("\t\tCON")
        elif pred_direction == 0 and not penalize_predicted_ties:
            # a false predicted tie is forgiven when requested
            pass
        else:
            discordant_count += 1
            logging.debug("\t\tDIS")

    all_pairs_count = concordant_count + discordant_count

    logging.debug("original_ties = %d, predicted_ties = %d", original_ties, predicted_ties)
    logging.debug("conc = %d, disc= %d", concordant_count, discordant_count)

    if all_pairs_count == 0:
        # nothing comparable: tau is undefined
        tau = None
        prob = None
    else:
        tau = 1.00 * (concordant_count - discordant_count) / all_pairs_count
        logging.debug("tau = {0} - {1} / {0} + {1}".format(concordant_count, discordant_count))
        logging.debug("tau = {0} / {1}".format(concordant_count - discordant_count, all_pairs_count))
        prob = kendall_tau_prob(tau, all_pairs_count)

    logging.debug("tau = {}, prob = {}\n".format(tau, prob))

    # wrap results in a named tuple
    Result = namedtuple('Result', ['tau', 'prob', 'concordant_count', 'discordant_count',
                                   'all_pairs_count', 'original_ties', 'predicted_ties', 'pairs'])
    return Result(tau, prob, concordant_count, discordant_count, all_pairs_count,
                  original_ties, predicted_ties, pairs)
153 154 155 """"""""" 156 DCG 157 """"""""" 158
def _calculate_gains(predicted_rank_vector, original_rank_vector, verbose=True):
    """
    Compute the DCG gain contributed by each predicted rank position.
    @param predicted_rank_vector: list of integers representing the predicted ranks
    @type predicted_rank_vector: [int, ...]
    @param original_rank_vector: list of integers containing the original ranks
    @type original_rank_vector: [int, ...]
    @param verbose: kept for backward compatibility; currently unused
    @type verbose: bool
    @return: a list of gains, relevant to the DCG calculation
    @rtype: [float, ...]
    """
    size = len(original_rank_vector)
    denominator = 2**size

    # relevance of each item is inversely proportional to its original rank
    relevance = [size - rank + 1 for rank in original_rank_vector]

    gains = [0] * size

    for position in range(size):
        predicted = predicted_rank_vector[position]
        gain_value = (2**relevance[position] - 1.0) / denominator
        # the predicted rank decides where this item's gain is placed
        gains[predicted - 1] = gain_value

        logging.debug("j={}\nr[j]={}\nl[j]={}\n".format(position, predicted, relevance[position]))
        logging.debug("gains[{}] = ".format(predicted - 1))
        logging.debug("\t(2**l[j]-1.0) / 2**n =")
        logging.debug("\t(2**{}-1.0) / 2**{}=".format(relevance[position], size))
        logging.debug("\t{} / {} =".format((2**relevance[position] - 1.0), denominator))
        logging.debug("{}".format(gain_value))
        logging.debug("gains = {}".format(gains))

    # NOTE(review): entries default to 0, so this only fires on negative gains
    assert min(gains) >= 0, 'Not all ranks present'
    return gains
193 194
def idcg(gains, k):
    """
    Calculate the Ideal Discounted Cumulative Gain for the given vector of
    ranking gains, i.e. the DCG obtained when the gains are ordered best-first.
    The input list is left untouched: the previous implementation sorted the
    caller's list in place, which silently reordered it for the caller.
    @param gains: a list of gains, as produced for the DCG calculation
    @type gains: [float, ...]
    @param k: the DCG cut-off
    @type k: int
    @return: the calculated Ideal Discounted Cumulative Gain
    @rtype: float
    """
    # ideal ordering: highest gains first (copied, so the caller's list is not mutated)
    ideal_order = sorted(gains, reverse=True)
    ideal_dcg = sum(g / log(j + 2) for j, g in enumerate(ideal_order[:k]))
    return ideal_dcg
211 212
def ndgc_err(predicted_rank_vector, original_rank_vector, k=None):
    """
    Sentence-level normalized Discounted Cumulative Gain and Expected
    Reciprocal Rank, following
    U{DCG<http://en.wikipedia.org/wiki/Discounted_cumulative_gain#Cumulative_Gain>},
    U{ERR<http://research.yahoo.com/files/err.pdf>} and the
    U{Yahoo Learning to Rank challenge<http://learningtorankchallenge.yahoo.com/evaluate.py.txt>}
    reference implementation.
    @param predicted_rank_vector: ranking holding the predicted ranks
    @type predicted_rank_vector: L{sentence.ranking.Ranking}
    @param original_rank_vector: ranking holding the original (human) ranks
    @type original_rank_vector: L{sentence.ranking.Ranking}
    @param k: cut-off for the gain accumulation; defaults to the ranking length
    @type k: int
    @return: a tuple containing the values for the two metrics
    @rtype: tuple(float, float)
    """
    predicted = predicted_rank_vector.normalize(ties='ceiling').integers()
    original = original_rank_vector.normalize(ties='ceiling').integers()
    size = len(original)

    # if the caller doesn't specify k, use the full ranking length
    if not k:
        k = size

    # both vectors must describe the same number of items
    assert len(predicted) == size, 'Expected {} ranks, but got {}.'.format(size, len(predicted))
    gains = _calculate_gains(predicted, original)

    # Expected Reciprocal Rank: probability mass left after each position
    err = 0.0
    remaining = 1.0
    for position, gain in enumerate(gains):
        err += remaining * gain / (position + 1.0)
        remaining *= 1 - gain

    # Discounted Cumulative Gain at cut-off k
    dcg = sum(gain / log(position + 2) for position, gain in enumerate(gains[:k]))

    # normalize against the ideal ordering; a zero ideal means a perfect 1.0
    ideal = idcg(gains, k)
    ndcg = dcg / ideal if ideal else 1.0

    return ndcg, err
262 263 264 265 """ 266 Reciprocal rank 267 """ 268
def reciprocal_rank(predicted_rank_vector, original_rank_vector, **kwargs):
    """
    First Answer Reciprocal Rank following Radev et al. (2002): the inverse
    of the predicted rank given to the item that the human ranking placed first.
    @param predicted_rank_vector: ranking holding the predicted ranks
    @type predicted_rank_vector: L{sentence.ranking.Ranking}
    @param original_rank_vector: ranking holding the original (human) ranks
    @type original_rank_vector: L{sentence.ranking.Ranking}
    @kwarg ties: way of handling ties, passed to L{sentence.ranking.Ranking} normalization
    @type ties: string
    @return: the reciprocal rank value
    @rtype: float
    """
    ties_handling = kwargs.setdefault('ties','ceiling')

    predicted = predicted_rank_vector.normalize(ties=ties_handling)
    original = original_rank_vector.normalize(ties=ties_handling)

    # locate every item sharing the best (lowest) human rank
    top_rank = min(original)
    top_indexes = original.indexes(top_rank)

    # among those, take our best (lowest) predicted rank, resolving human ties
    best_predicted = min(predicted[index] for index in top_indexes)

    return 1.00 / best_predicted
296 297 298 299 #if __name__ == "__main__": 300 # predicted_rank_vector = Ranking([1,3,2.2,"0.1"]) 301 # print predicted_rank_vector.normalize() 302 # original_rank_vector = Ranking([1,2,3,4]) 303 ## dcg, err = ndgc_err(predicted_rank_vector, original_rank_vector, 4) 304 ## print "ERR = {}\n nDCG_p = {}".format(dcg, err) 305 # 306 # print ndgc_err(predicted_rank_vector, original_rank_vector) 307