sentence.pairwiseparallelsentenceset

1 #!/usr/bin/python 2 # -*- coding: utf-8 -*- 3 ''' 4 Created on Jul 12, 2011 5 6 @author: jogin, Eleftherios Avramidis 7 ''' 8 from pairwiseparallelsentence import PairwiseParallelSentence 9 from copy import deepcopy 10 from parallelsentence import ParallelSentence 11 import logging 12 from os import sys 13 14

class PairwiseParallelSentenceSet(object):
    """
    A set of pairwise parallel sentences, all originating from the same source sentence,
    in order to facilitate pairwise comparisons etc.
    Works as a wrapper over a dictionary, where pairs are indexed based on the system names
    of the 2 target translations.

    This base class defines no constructor; every method reads C{self.pps_dict}, which
    has to be populated by the subclass.
    """

    def get_parallelsentences(self):
        """
        @return: a list of the values stored in the underlying dictionary
        @rtype: list of L{sentence.pairwiseparallelsentence.PairwiseParallelSentence} instances
        """
        # list() guarantees the documented list type even on interpreters
        # where dict.values() is only a view
        return list(self.pps_dict.values())

    def get_system_names(self):
        """
        @return: all the system pairs that are mapped to pairwise sentences
        @rtype: list of tuples of two strings each
        """
        # a real list lets callers modify the dictionary while looping over the names
        return list(self.pps_dict.keys())

    def length(self):
        """
        @return: the number of items returned by L{get_parallelsentences}
        @rtype: int
        """
        return len(self.get_parallelsentences())

37 38

class AnalyticPairwiseParallelSentenceSet(PairwiseParallelSentenceSet):
    """
    A set of pairwise parallel sentences, all originating from the same source sentence,
    where more than one comparisons per system-pair are allowed
    @ivar pps_dict: a dict that stores all the pairwise parallelsentences mapped to a tuple of strings
    containing the system names for the respective translations
    @type pps_dict: {(str, str): [L{PairwiseParallelSentence}, ...]}
    @ivar rank_name: the name of the attribute that holds the rank value
    @type rank_name: str
    """

    def __init__(self, pairwise_parallelsentences=None, rank_name="rank", **kwargs):
        """
        @param pairwise_parallelsentences: a list of pairwise parallel sentences
        @type pairwise_parallelsentences: [L{sentence.pairwiseparallelsentence.PairwiseParallelSentence}, ...]
        @param rank_name: the name of the attribute that holds the rank value
        @type rank_name: str
        @param kwargs: accepted for compatibility with existing callers and ignored
        """
        self.pps_dict = {}
        # rank_name is a named parameter, so it can never arrive through kwargs
        self.rank_name = rank_name

        # group the judgments per system pair, keeping the order they were given in
        for ps in pairwise_parallelsentences or ():
            self.pps_dict.setdefault(ps.get_system_names(), []).append(ps)

    def get_parallelsentences(self):
        """
        @return: all the pairwise parallel sentences of all system pairs, in one flat list
        @rtype: [L{sentence.pairwiseparallelsentence.PairwiseParallelSentence}, ...]
        """
        all_parallelsentences = []
        for judgments in self.pps_dict.values():
            all_parallelsentences.extend(judgments)
        return all_parallelsentences

    def remove_ties(self):
        """
        It removes the pairwise sentences whose rank attribute, converted to int, equals zero.
        A system pair that loses all its judgments stays in the dictionary with an empty list.
        @return: the number of ties filtered
        @rtype: int
        @todo: test
        """
        reformed_dict = {}
        removed_ties = 0
        for system_names, judgments in self.pps_dict.items():
            survivors = [ps for ps in judgments
                         if int(ps.get_attribute(self.rank_name)) != 0]
            removed_ties += len(judgments) - len(survivors)
            reformed_dict[system_names] = survivors

        self.pps_dict = reformed_dict
        return removed_ties

    def restrict_ranks(self, restrict_ranks):
        """
        Keep only the pairwise sentences for which at least one of the target rank values
        is contained in the given ranks. System pairs left without any sentence are removed.
        @param restrict_ranks: the rank values to keep; each of them is converted to float
        @type restrict_ranks: iterable of numbers or numeric strings
        """
        allowed_ranks = set(float(r) for r in restrict_ranks)

        # build a fresh dictionary instead of deleting keys from the one being
        # iterated, which fails when keys() is a view and not a list
        restricted_dict = {}
        for system_names, judgments in self.pps_dict.items():
            ps_restricted = []
            for ps in judgments:
                target_ranks = set(float(r) for r in ps.get_target_attribute_values(self.rank_name))
                if not target_ranks.isdisjoint(allowed_ranks):
                    ps_restricted.append(ps)
            if ps_restricted:
                restricted_dict[system_names] = ps_restricted
        self.pps_dict = restricted_dict

    def get_pairwise_parallelsentences(self, system_names, directed=False):
        """
        Provides the pairwise parallel sentences, whose target sentences provide output by the two given systems
        @param system_names: pair of translation system names
        @type system_names: tuple of strings
        @param directed: whether the order of the systems in the tuple is important, or not
        @type directed: boolean
        @return: the pairwise parallel sentences that contain the outputs of the two given systems,
        or None (after a message on stderr) if the pair is not found
        @rtype: [L{sentence.pairwiseparallelsentence.PairwiseParallelSentence}, ...]
        """
        try:
            return self.pps_dict[system_names]
        except (KeyError, TypeError):
            pass

        if not directed:
            try:
                system_names_reversed = (system_names[1], system_names[0])
                judgments = self.pps_dict[system_names_reversed]
            except (KeyError, IndexError, TypeError):
                pass
            else:
                # the stored value is a list, so every sentence has to be reversed on its own;
                # assumes PairwiseParallelSentence provides get_reversed() -- TODO confirm
                return [ps.get_reversed() for ps in judgments]

        # stderr keeps the diagnostic out of the program output
        sys.stderr.write("At least one of system names is missing.\n")
        return None

    def get_filtered_pairwise_parallelsentence_set(self, threshold=1.00):
        """
        @param threshold: see L{get_filtered_pairwise_parallelsentences}
        @type threshold: float
        @return: a compact set that contains the filtered pairwise parallel sentences
        and uses the same rank name as this set
        @rtype: L{CompactPairwiseParallelSentenceSet}
        """
        return CompactPairwiseParallelSentenceSet(self.get_filtered_pairwise_parallelsentences(threshold),
                                                  rank_name=self.rank_name)

    def get_filtered_pairwise_parallelsentences(self, threshold=1.00):
        """
        For every system pair keep one judgment, only if the agreement among the
        overlapping judgments reaches the threshold
        @param threshold: the minimum share of the judgments that must agree on the most popular rank
        @type threshold: float
        @return: at most one pairwise parallel sentence per system pair
        @rtype: [L{sentence.pairwiseparallelsentence.PairwiseParallelSentence}, ...]
        """
        filtered_pairwise_parallelsentences = []
        for system_names in self.get_system_names():
            overlapping_judgments = self.get_pairwise_parallelsentences(system_names)
            agreed = self._filter_agreement(threshold, overlapping_judgments, system_names)
            if agreed:
                filtered_pairwise_parallelsentences.append(agreed)
        return filtered_pairwise_parallelsentences

    def _filter_agreement(self, threshold=1.00, pairwise_parallelsentences=None, system_names=()):
        """
        Choose the judgment that represents a list of overlapping judgments
        @param threshold: the minimum share of the judgments that must carry the most popular rank
        @type threshold: float
        @param pairwise_parallelsentences: the overlapping judgments of one system pair
        @type pairwise_parallelsentences: [L{PairwiseParallelSentence}, ...]
        @param system_names: not used by the computation
        @return: the only judgment if there is just one; otherwise the first judgment that
        has the most popular rank, or None if the list is empty or the agreement is below the threshold
        @rtype: L{PairwiseParallelSentence}
        """
        # remove_ties() may leave a system pair without any judgment
        if not pairwise_parallelsentences:
            return None
        if len(pairwise_parallelsentences) == 1:
            return pairwise_parallelsentences[0]

        rank_vector = [ps.get_rank() for ps in pairwise_parallelsentences]
        total = float(len(rank_vector))
        # compared as (share, rank) tuples: among equally frequent ranks the greater rank value wins
        share, most_popular_rank = max((rank_vector.count(rank) / total, rank)
                                       for rank in set(rank_vector))
        if share < threshold:
            return None

        # return the first pairwise sentence that appears to have this rank
        for ps in pairwise_parallelsentences:
            if ps.get_rank() == most_popular_rank:
                return ps
        return None

    def get_compact_pairwise_parallelsentences(self):
        """
        Alias of L{get_merged_pairwise_parallelsentences}
        @rtype: [L{sentence.pairwiseparallelsentence.PairwiseParallelSentence}, ...]
        """
        return self.get_merged_pairwise_parallelsentences()

    def get_merged_pairwise_parallelsentences(self):
        """
        Merge many overlapping judgments over translations originating from the same source sentence
        @return: pairwise parallel sentences, containing only the merged output rank
        @rtype: [L{sentence.pairwiseparallelsentence.PairwiseParallelSentence}, ...]
        """
        merged_pairwise_parallelsentences = []
        for system_names in self.get_system_names():
            overlapping_judgments = self.get_pairwise_parallelsentences(system_names)
            # a pair emptied by remove_ties() has nothing to merge
            if not overlapping_judgments:
                continue
            merged_pairwise_parallelsentences.append(self._merge_judgments(overlapping_judgments, system_names))
        return merged_pairwise_parallelsentences

    def get_compact_pairwise_parallelsentence_set(self):
        """
        @return: a compact set that contains the merged pairwise parallel sentences
        and uses the same rank name as this set
        @rtype: L{CompactPairwiseParallelSentenceSet}
        """
        return CompactPairwiseParallelSentenceSet(self.get_compact_pairwise_parallelsentences(),
                                                  rank_name=self.rank_name)

    def _merge_judgments(self, pairwise_parallelsentences=None, system_names=()):
        """
        Merge many overlapping judgements over translations produced by the same system pair
        originating from the same source sentence, into only one judgment.
        The merged rank is the weighted sum of the ranks (not their average); source, translations,
        reference and the remaining attributes are taken from the first judgment.
        @param pairwise_parallelsentences: the overlapping judgments, at least one
        @type pairwise_parallelsentences: [L{PairwiseParallelSentence}, ...]
        @param system_names: the system pair assigned to the merged sentence
        @type system_names: tuple of strings
        @return: a pairwise parallel sentences
        @rtype: L{PairwiseParallelSentence}
        @raise IndexError: if there are no judgments to merge
        """
        if not pairwise_parallelsentences:
            raise IndexError("cannot merge an empty list of judgments")

        rank = sum(float(ps.get_rank()) * self._merge_weight(ps) for ps in pairwise_parallelsentences)

        first = pairwise_parallelsentences[0]
        # copy, so that the attributes of the first judgment are not overwritten
        attributes = deepcopy(first.attributes)
        attributes[self.rank_name] = rank
        return PairwiseParallelSentence(first.get_source(), first.get_translations(), system_names,
                                        first.get_reference(), attributes, self.rank_name)

    def _merge_weight(self, ps):
        """
        @param ps: the judgment to be weighted
        @return: the weight of the given judgment when ranks are merged; always 1 here
        @rtype: int
        """
        return 1

200 201 202 203 204

class CompactPairwiseParallelSentenceSet(PairwiseParallelSentenceSet):
    """
    A compact set of pairwise parallel sentences, all originating from the same source sentence,
    where only one comparison per system-pair is allowed
    @ivar rank_name: the name of the rank value
    @type rank_name: str
    @ivar pps_dict: a dictionary of pairwise parallel sentences
    @type pps_dict: {(str, str): L{PairwiseParallelSentence}}
    """

    def __init__(self, pairwise_parallelsentences, rank_name="rank"):
        """
        @param pairwise_parallelsentences: a list of pairwise parallel sentences; if two of them
        have the same system names, the later one replaces the earlier one
        @type pairwise_parallelsentences: [L{PairwiseParallelSentence}, ...]
        @param rank_name: the name of the rank value
        @type rank_name: str
        """
        self.rank_name = rank_name
        self.pps_dict = dict((ps.get_system_names(), ps) for ps in pairwise_parallelsentences)

    def remove_ties(self):
        """
        It removes the pairwise sentences whose rank attribute, converted to int, equals zero
        @return: the number of ties filtered
        @rtype: int
        """
        reformed_dict = {}
        ties = 0
        for system_names, ps in self.pps_dict.items():
            if int(ps.get_attribute(self.rank_name)) != 0:
                reformed_dict[system_names] = ps
            else:
                ties += 1
        self.pps_dict = reformed_dict

        # reported through logging, so that library use does not write on stdout
        logging.info("filtered %d ties", ties)
        return ties

    def get_multiranked_sentence(self, critical_attribute=None, new_rank_name=None, del_orig_class_att=True):
        """
        It reconstructs a single parallel sentence object with a gathered discrete
        ranking out of the pairwise comparisons that exist in the pairwise parallel sentence instances
        @param critical_attribute: the attribute the pairwise rank is read from;
        if empty, the value of get_rank() is used
        @type critical_attribute: str
        @param new_rank_name: the attribute name the gathered rank is written to on every translation;
        defaults to the rank name of this set
        @type new_rank_name: str
        @param del_orig_class_att: not used; the rank name of this set is always removed
        from the sentence attributes of the result
        @return: a parallel sentence
        @rtype: L{ParallelSentence}
        @raise IndexError: if the set is empty
        """
        if not new_rank_name:
            new_rank_name = self.rank_name

        rank_per_system = {}
        translations_per_system = {}

        # first iterate and make a sum of the rank per system name
        for (system_a, system_b), parallelsentence in self.pps_dict.items():
            if not critical_attribute:
                rank = int(parallelsentence.get_rank())
            else:
                rank = int(parallelsentence.get_attribute(critical_attribute))

            # the rank value adds up only on the first system's rank
            # NOTE(review): a system that appears only in second position never gets
            # a sum and is therefore missing from the result -- confirm that the set
            # is expected to contain both directions of every pair
            rank_per_system[system_a] = rank_per_system.get(system_a, 0) + rank

            # also gather the translations per system name, in order to have easy access later
            translations = parallelsentence.get_translations()
            translations_per_system[system_b] = translations[1]
            translations_per_system[system_a] = translations[0]

        return self._build_multiranked_sentence(rank_per_system, translations_per_system,
                                                new_rank_name, self.rank_name)

    def get_multiranked_sentence_with_probfilter(self, attribute1="", attribute2="", critical_attribute="rank_soft_predicted", new_rank_name=None, threshold=0.1000):
        """
        It reconstructs a single parallel sentence object with a gathered discrete
        ranking out of the pairwise comparisons that exist in the pairwise parallel sentence instances.
        A comparison counts only if its probability is further than the threshold away from 0.5.
        As long as some second-position system has no counted comparison, the threshold is lowered
        by 5% and the sums are computed again from scratch; below 0.002 the threshold becomes 0
        and one last pass takes place.
        @param attribute1: the name of the attribute that holds the probability, which is summed
        on the second system of every pair
        @type attribute1: str
        @param attribute2: not used
        @param critical_attribute: not used
        @param new_rank_name: the attribute name the gathered rank is written to on every translation;
        defaults to the rank name of this set
        @type new_rank_name: str
        @param threshold: the initial minimum distance of a probability from 0.5
        @type threshold: float
        @return: a parallel sentence
        @rtype: L{ParallelSentence}
        @raise IndexError: if the set is empty
        """
        if not new_rank_name:
            new_rank_name = self.rank_name

        while True:
            logging.debug("threshold: %s", threshold)
            # the sums are rebuilt on every pass; carrying them over would count the
            # comparisons accepted by an earlier pass again on each later pass
            rank_per_system, translations_per_system = self._sum_soft_ranks(attribute1, threshold)

            fullrank = all(system_b in rank_per_system for (_, system_b) in self.pps_dict)
            # a threshold of zero cannot be relaxed any further
            if fullrank or threshold == 0:
                break

            logging.debug("didn't fill in one rank")
            # float division: with an integer threshold, threshold/20 may be 0 and never decrease
            threshold = threshold - threshold / 20.0
            if threshold < 0.002:
                # run one last time, with threshold 0
                threshold = 0

        return self._build_multiranked_sentence(rank_per_system, translations_per_system,
                                                new_rank_name, new_rank_name)

    def get_multiranked_sentence_with_soft_ranks(self, attribute1="", attribute2="", critical_attribute="rank_soft_predicted", new_rank_name=None):
        """
        It reconstructs a single parallel sentence object with a gathered discrete
        ranking out of the pairwise comparisons that exist in the pairwise parallel sentence instances,
        by summing the probability of every comparison on the second system of the pair
        @param attribute1: the name of the attribute that holds the probability
        @type attribute1: str
        @param attribute2: not used
        @param critical_attribute: not used
        @param new_rank_name: the attribute name the gathered rank is written to on every translation;
        defaults to the rank name of this set
        @type new_rank_name: str
        @return: a parallel sentence
        @rtype: L{ParallelSentence}
        @raise IndexError: if the set is empty
        """
        if not new_rank_name:
            new_rank_name = self.rank_name

        rank_per_system, translations_per_system = self._sum_soft_ranks(attribute1)
        return self._build_multiranked_sentence(rank_per_system, translations_per_system,
                                                new_rank_name, new_rank_name)

    def get_pairwise_parallelsentence(self, system_names, directed=False):
        """
        Provides the pairwise parallel sentence, whose target sentences provide output by the two given systems
        @param system_names: pair of translation system names
        @type system_names: tuple of strings
        @param directed: whether the order of the systems in the tuple is important, or not.
        If not, a sentence stored under the reversed pair is returned as it is, without being reversed
        @type directed: boolean
        @return: the pairwise parallel sentence that contains the outputs of the two given systems,
        or None (after a message on stderr) if the pair is not found
        @rtype: L{PairwiseParallelSentence}
        """
        try:
            return self.pps_dict[system_names]
        except (KeyError, TypeError):
            pass

        if not directed:
            try:
                return self.pps_dict[(system_names[1], system_names[0])]
            except (KeyError, IndexError, TypeError):
                pass

        sys.stderr.write("At least one of system names is missing.\n")
        return None

    def _sum_soft_ranks(self, attribute, threshold=None):
        """
        Sum the probability of every comparison on the second system of its pair
        @param attribute: the name of the attribute that holds the probability
        @type attribute: str
        @param threshold: if given, only probabilities further than this from 0.5 are summed
        @type threshold: float
        @return: the sum per system name and the translation per system name
        @rtype: tuple of two dicts
        """
        rank_per_system = {}
        translations_per_system = {}
        for (system_a, system_b), parallelsentence in self.pps_dict.items():
            prob_neg = float(parallelsentence.get_attribute(attribute))
            # the value adds up on the second system, only if it is "sure" enough
            if threshold is None or abs(prob_neg - 0.5) > threshold:
                rank_per_system[system_b] = rank_per_system.get(system_b, 0) + prob_neg

            # also gather the translations per system name, in order to have easy access later
            translations = parallelsentence.get_translations()
            translations_per_system[system_b] = translations[1]
            translations_per_system[system_a] = translations[0]
        return rank_per_system, translations_per_system

    def _build_multiranked_sentence(self, rank_per_system, translations_per_system, new_rank_name, stale_attribute):
        """
        Turn the sums per system into consecutive ranks and wrap the ranked translations in one parallel sentence
        @param rank_per_system: the sum per system name
        @type rank_per_system: dict
        @param translations_per_system: the translation per system name
        @type translations_per_system: dict
        @param new_rank_name: the attribute name the rank is written to on every translation
        @type new_rank_name: str
        @param stale_attribute: the sentence attribute to be dropped from the result, if it exists
        @type stale_attribute: str
        @return: a parallel sentence
        @rtype: L{ParallelSentence}
        @raise IndexError: if the set is empty
        """
        # iterate through the system outputs, sorted by their sum (lowest first)
        systems = sorted(rank_per_system, key=rank_per_system.get)
        logging.debug("systems sorted by rank: %s", systems)

        translations_new_rank = []
        position = 0
        prev_rank = None
        for system in systems:
            # systems with equal sums share a rank; the rank increases only if there is no tie
            if rank_per_system[system] != prev_rank:
                position += 1
            prev_rank = rank_per_system[system]
            # copy, so that the translation kept by the pairwise sentence is not modified
            translation = deepcopy(translations_per_system[system])
            translation.add_attribute(new_rank_name, str(position))
            translations_new_rank.append(translation)

        # get the values of the first sentence as template
        template = self._get_template_sentence()
        source = deepcopy(template.get_source())
        reference = deepcopy(template.get_reference())
        attributes = deepcopy(template.get_attributes())
        try:
            del attributes[stale_attribute]
        except KeyError:
            # nothing to remove
            pass

        return ParallelSentence(source, translations_new_rank, reference, attributes)

    def _get_template_sentence(self):
        """
        @return: the first pairwise parallel sentence in the iteration order of the dictionary
        @rtype: L{PairwiseParallelSentence}
        @raise IndexError: if the set is empty
        """
        for parallelsentence in self.pps_dict.values():
            return parallelsentence
        raise IndexError("the set contains no pairwise parallel sentences")

487

Source Code for Module sentence.pairwiseparallelsentenceset