sentence.parallelsentence

"""
A parallel sentence that contains a source sentence,
a number of target sentences, a reference and some attributes.
@ivar src: the source sentence
@type src: SimpleSentence
@ivar tgt: a list of target sentences / translations
@type tgt: [SimpleSentence, ...]
@ivar ref: a reference translation
@type ref: SimpleSentence
"""

def __init__(self, source, translations, reference = None, attributes = {}, rank_name = "rank", **kwargs):
    """
    Constructor
    @param source: the source text of the parallel sentence
    @type source: SimpleSentence
    @param translations: a list of given translations
    @type translations: list(SimpleSentence)
    @param reference: the reference translation
    @type reference: SimpleSentence
    @param attributes: the attributes that describe the parallel sentence
    @type attributes: dict {String name, String value}
    @param rank_name: name of the attribute that holds the rank
    @type rank_name: string
    @keyword sort_translations: whether translations should be sorted
    based on their "system" attribute (default: False)
    @type sort_translations: boolean
    @keyword langsrc: source language; takes precedence over attributes["langsrc"]
    @keyword langtgt: target language; takes precedence over attributes["langtgt"]
    """
    self.src = source
    self.tgt = translations
    self.ref = reference
    # The attributes are copied, so neither the caller's dict nor the
    # shared default dict is ever modified through this object.
    self.attributes = deepcopy(attributes)
    self.rank_name = rank_name
    if kwargs.setdefault("sort_translations", False):
        self.tgt = sorted(translations, key=lambda t: t.get_attribute("system"))

    # A language may come either from the keyword arguments or from the
    # attributes. The keyword is checked first, so a language that is given
    # only as a keyword is accepted instead of being reported as missing.
    for language_key in ("langsrc", "langtgt"):
        if language_key in kwargs:
            self.attributes[language_key] = kwargs[language_key]
        elif language_key not in self.attributes:
            sys.exit('Source or target language not specified in parallelsentence: [{}]'.format(self.__str__()))

52

def __str__(self):
    """
    Provide a printable form of the parallel sentence
    @return: the textual form of the list of the string forms of the
    serialized sentences
    @rtype: string
    """
    # __str__ has to return a string; returning the bare list makes
    # str(obj) raise a TypeError. The list layout of the text is kept.
    return str([s.__str__() for s in self.serialize()])

55

def __lt__(self, other):
    """
    Order parallel sentences by their compact id
    @return: whether the compact id of this sentence is smaller than
    the compact id of the other one
    """
    own_id = self.get_compact_id()
    other_id = other.get_compact_id()
    return own_id < other_id

58

def __gt__(self, other):
    """
    Order parallel sentences by their compact id
    @return: whether the compact id of this sentence is greater than
    the compact id of the other one
    """
    own_id = self.get_compact_id()
    other_id = other.get_compact_id()
    return own_id > other_id

61

def __eq__(self, other):
    """
    Compare two parallel sentences
    @param other: the object to compare with
    @return: the result of comparing source, translations, reference and
    attributes of both objects; NotImplemented if the other object does
    not provide these four members
    """
    # Objects of a foreign type cannot be compared member by member;
    # NotImplemented lets Python fall back to its default handling.
    for member in ("src", "tgt", "ref", "attributes"):
        if not hasattr(other, member):
            return NotImplemented

    return (
        self.src == other.src and
        self.tgt == other.tgt and
        self.ref == other.ref and
        self.attributes == other.attributes)

74 75 76

def get_rank(self):
    """
    Provide the rank value of the parallel sentence, i.e. the value of
    the attribute whose name is stored in rank_name
    @return: the rank value
    """
    rank_key = self.rank_name
    return self.attributes[rank_key]

84

def get_ranking(self):
    """
    Collect the ranks of the included target translations, in the order
    of the translations, into a ranking list
    @return: the ranking list
    @rtype: Ranking
    """
    ranks = []
    for translation in self.tgt:
        ranks.append(translation.get_rank())
    return Ranking(ranks)

93

def get_attributes(self):
    """
    Provide all attributes; the internal dictionary itself is handed
    out, not a copy
    @return: the parallel sentence attributes dictionary
    @rtype: dict
    """
    own_attributes = self.attributes
    return own_attributes

101

def get_attribute_names (self):
    """
    Provide all attribute names
    @return: the keys of the attributes dictionary
    """
    names = self.attributes.keys()
    return names

109

def get_attribute(self, name):
    """
    Provide the value of a particular attribute
    @param name: the name of the requested attribute
    @return: the value of the attribute with the specified name
    """
    value = self.attributes[name]
    return value

117

def get_target_attribute_values(self, attribute_name, sub=None):
    """
    Ask every target sentence for one attribute
    @param attribute_name: the name of the requested attribute
    @param sub: passed on unchanged as second argument of the
    get_attribute call of every target sentence
    @return: the values, in the order of the target sentences
    @rtype: list
    """
    collected = []
    for translation in self.tgt:
        collected.append(translation.get_attribute(attribute_name, sub))
    return collected

122

def get_filtered_target_attribute_values(self, attribute_name, filter_attribute_name, filter_attribute_value):
    """
    Ask the target sentences for one attribute, leaving out the target
    sentences whose filter attribute equals the given value
    @param attribute_name: the name of the requested attribute
    @param filter_attribute_name: the name of the attribute to filter on
    @param filter_attribute_value: target sentences having this value
    for the filter attribute are skipped
    @return: the values of the remaining target sentences, in order
    @rtype: list
    """
    collected = []
    for translation in self.tgt:
        if translation.get_attribute(filter_attribute_name) == filter_attribute_value:
            continue
        collected.append(translation.get_attribute(attribute_name))
    return collected

126

def add_attributes(self, attributes):
    """
    Add the given attributes to the attributes of the parallel sentence;
    existing attributes with the same name are overwritten
    @param attributes: the attributes to be added
    """
    own_attributes = self.attributes
    own_attributes.update(attributes)

129

def set_langsrc (self, langsrc):
    """
    Store the source language in the attribute "langsrc"
    @param langsrc: the source language
    """
    self.attributes.update({"langsrc": langsrc})

132

def set_langtgt (self, langtgt):
    """
    Store the target language in the attribute "langtgt"
    @param langtgt: the target language
    """
    self.attributes.update({"langtgt": langtgt})

135

def set_id (self, id):
    """
    Store the given id, converted to a string, in the attribute "id"
    @param id: the new id of the parallel sentence
    """
    id_text = str(id)
    self.attributes["id"] = id_text

138

def get_compact_id(self):
    """
    Provide an id that combines the attributes "testset" and "id"
    @return: "testset:id", or only the value of the attribute "id" if
    the attribute "testset" is not available
    @raise KeyError: if the attribute "id" is missing
    """
    try:
        return "%s:%s" % (self.attributes["testset"], self.attributes["id"])
    except KeyError:
        # only a missing attribute triggers the fallback; everything
        # else (e.g. KeyboardInterrupt) is no longer swallowed
        return self.attributes["id"]

145

def get_tuple_id(self):
    """
    Provide an id that combines the attributes "testset" and "id"
    @return: the tuple (testset, id), or only the value of the attribute
    "id" if the attribute "testset" is not available
    @raise KeyError: if the attribute "id" is missing
    """
    try:
        return (self.attributes["testset"], self.attributes["id"])
    except KeyError:
        # NOTE(review): the fallback is the plain id value and not a
        # one-element tuple; kept as it is, because callers may rely on it
        return self.attributes["id"]

152

def get_compact_judgment_id(self):
    """
    Provide an id that combines the attributes "testset" and
    "judgement_id"
    @return: "testset:judgement_id", or only the value of the attribute
    "judgement_id" if the attribute "testset" is not available
    @raise KeyError: if the attribute "judgement_id" is missing
    """
    try:
        return "%s:%s" % (self.attributes["testset"], self.attributes["judgement_id"])
    except KeyError:
        # only a missing attribute triggers the fallback
        return self.attributes["judgement_id"]

159

def get_judgment_id(self):
    """
    Provide the value of the attribute "judgement_id"
    @return: the judgment id
    """
    judgment_id = self.attributes["judgement_id"]
    return judgment_id

162

def has_judgment_id(self):
    """
    Check whether the attribute "judgement_id" exists
    @return: True if the attribute exists, otherwise False
    @rtype: boolean
    """
    # the "in" operator replaces the deprecated dict.has_key()
    return "judgement_id" in self.attributes

165

def add_judgment_id(self, value):
    """
    Store the given value, converted to a string, in the attribute
    "judgement_id"
    @param value: the judgment id
    """
    value_text = str(value)
    self.attributes["judgement_id"] = value_text

168

def get_source(self):
    """
    Provide the source sentence
    @return: the source sentence stored in this parallel sentence
    """
    source_sentence = self.src
    return source_sentence

171

def set_source(self,src):
    """
    Replace the source sentence
    @param src: the new source sentence
    """
    setattr(self, "src", src)

174 175

def get_translations(self):
    """
    Provide the target sentences; the internal list itself is handed
    out, not a copy
    @return: the list of target sentences
    """
    target_sentences = self.tgt
    return target_sentences

178

def set_translations(self, tgt):
    """
    Replace the target sentences
    @param tgt: the new list of target sentences
    """
    setattr(self, "tgt", tgt)

181

def get_reference(self):
    """
    Provide the reference translation
    @return: the reference stored in this parallel sentence
    """
    reference_sentence = self.ref
    return reference_sentence

184

def set_reference(self,ref):
    """
    Replace the reference translation
    @param ref: the new reference translation
    """
    setattr(self, "ref", ref)

187

def get_nested_attribute_names(self):
    """
    Provide the names of the attributes of the parallel sentence along
    with the prefixed attribute names of the nested sentences: "src_"
    for the source sentence and "tgt-N_" for the N-th target sentence,
    counting from 1. The reference is not considered.
    @return: the list of attribute names
    @rtype: [string, ...]
    """
    attribute_names = list(self.attributes.keys())

    # The names are added with one extend() per sentence. Collecting the
    # results of append() calls and extending with them afterwards would
    # also put one None per attribute into the list.
    attribute_names.extend("src_{}".format(att) for att in self.src.get_attributes())

    for position, tgtitem in enumerate(self.tgt, 1):
        attribute_names.extend("tgt-{}_{}".format(position, att) for att in tgtitem.get_attributes())
    return attribute_names

201

def get_nested_attributes(self):
    """
    Function that gathers all the features of the nested sentences
    to the parallel sentence level, by prefixing their names accordingly:
    "src" for the source sentence, "tgt-N" for the N-th target sentence
    (counting from 1) and "ref" for the reference. The attributes of the
    object itself stay untouched, the result is built on a copy.
    @return: the attributes of the parallel sentence together with the
    prefixed attributes of the nested sentences
    @rtype: dict
    """
    new_attributes = deepcopy(self.attributes)
    new_attributes.update(self._prefix(self.src.get_attributes(), "src"))

    for position, tgtitem in enumerate(self.tgt, 1):
        new_attributes.update(self._prefix(tgtitem.get_attributes(), "tgt-%d" % position))

    # The reference is optional. Only the lookup of its attributes is
    # allowed to fail, e.g. when no reference is set (None).
    try:
        reference_attributes = self.ref.get_attributes()
    except AttributeError:
        reference_attributes = None
    if reference_attributes is not None:
        new_attributes.update(self._prefix(reference_attributes, "ref"))
    return new_attributes

222 223

def recover_attributes(self):
    """
    Moves the attributes back to the nested sentences: attributes named
    "src_*" go to the source sentence, "ref_*" to the reference and
    "tgt-N_*" to the N-th target sentence (counting from 1). The prefix
    is stripped and the moved attribute is deleted from the parallel
    sentence. "ref_*" attributes stay where they are if there is no
    reference.
    """
    # iterate over a snapshot of the names, since entries get deleted
    for attribute_name in list(self.attributes.keys()):
        # names without underscore (or starting with one) are not nested
        if attribute_name.find('_') <= 0:
            continue
        attribute_value = self.attributes[attribute_name]

        src_attribute = re.match(r"src_(.*)", attribute_name)
        if src_attribute:
            self.src.add_attribute(src_attribute.group(1), attribute_value)
            del self.attributes[attribute_name]
            continue

        ref_attribute = re.match(r"ref_(.*)", attribute_name)
        if ref_attribute:
            # reference attributes belong to the reference,
            # not to the source sentence
            if self.ref is not None:
                self.ref.add_attribute(ref_attribute.group(1), attribute_value)
                del self.attributes[attribute_name]
            continue

        # at least one digit is required, so that int() cannot fail
        tgt_attribute = re.match(r"tgt-([0-9]+)_(.*)", attribute_name)
        if tgt_attribute:
            index = int(tgt_attribute.group(1)) - 1
            new_attribute_name = tgt_attribute.group(2)
            self.tgt[index].add_attribute(new_attribute_name, attribute_value)
            del self.attributes[attribute_name]

250 251

def serialize(self):
    """
    Put the nested sentences into one flat list: the source sentence
    first, followed by the target sentences. The reference is not part
    of the list.
    @return: a new list with the source and the target sentences
    @rtype: list
    """
    sentences = [self.src]
    for translation in self.tgt:
        sentences.append(translation)
    return sentences

257 258

def _prefix(self, listitems, prefix):
    """
    Copy a dictionary, putting the prefix and an underscore in front of
    every key
    @param listitems: the dictionary whose keys get prefixed
    @param prefix: the text to put in front of every key
    @return: a new dictionary with the prefixed keys, in the iteration
    order of the given one
    @rtype: OrderedDict
    """
    return OrderedDict(
        (prefix + "_" + key, listitems[key]) for key in listitems.keys()
    )

265 266

def merge_parallelsentence(self, ps, attribute_replacements = {}, **kwargs):
    """
    Augment the parallelsentence with another parallelsentence.
    Merges attributes of source, target and reference sentences. Target
    sentences are matched on their "system" attribute; incoming target
    sentences whose system doesn't exist can be added.
    @param ps: Object of ParallelSentence() with one source sentence and more target sentences
    @type ps: sentence.parallelsentence.ParallelSentence
    @param attribute_replacements: maps names of incoming attributes to
    the names they should be stored under; also passed on to the merge
    of the nested sentences
    @type attribute_replacements: dict
    @keyword add_missing: If translation outputs are missing from the first file but exist in the second, add them (default: True)
    @type add_missing: boolean
    """
    add_missing = kwargs.setdefault("add_missing", True)

    # Merge attributes on the ParallelSentence level and do the
    # replacements. The renaming happens on fresh dictionaries, so the
    # attributes of ps are neither modified nor changed while being
    # iterated.
    kept_attributes = {}
    renamed_attributes = {}
    for name, value in ps.get_attributes().items():
        if name in attribute_replacements:
            renamed_attributes[attribute_replacements[name]] = value
        else:
            kept_attributes[name] = value
    # a renamed attribute wins over an incoming one of the same name
    kept_attributes.update(renamed_attributes)
    self.attributes.update(kept_attributes)

    # merge source sentence
    self.src.merge_simplesentence(ps.get_source(), attribute_replacements)

    # merge reference translation, if both sides have one
    incoming_reference = ps.get_reference()
    if self.ref is not None and incoming_reference is not None:
        self.ref.merge_simplesentence(incoming_reference, attribute_replacements)

    # Loop over the incoming target sentences. Merge those with the same
    # system attribute and append those missing.
    for incoming_target in ps.get_translations():
        system = incoming_target.get_attribute("system")
        merged = False
        for own_target in self.tgt:
            if own_target.attributes["system"] == system:
                own_target.merge_simplesentence(incoming_target, attribute_replacements)
                merged = True
        if not merged and add_missing:
            sys.stderr.write("Warning: Target sentence was missing. Adding...\n")
            self.tgt.append(incoming_target)

313 314

def get_pairwise_parallelsentences(self, replacement = True, **kwargs):
    """
    Create all available pairs of target sentences from this
    ParallelSentence object.

    @param replacement: If enabled, every pair of target sentences is
    created in both orders
    @type replacement: boolean
    @keyword include_references: Include the reference as a translation from system "_ref", with a rank lower than all others
    @type include_references: boolean
    @keyword filter_unassigned: If enabled, it filters out target sentences with rank = "-1", which means no value was assigned.
    It should not be turned on for test-sets
    @type filter_unassigned: boolean
    @keyword restrict_ranks: accepted, but currently not applied by this function
    @type restrict_ranks: [int, ...]
    @keyword invert_ranks: passed on to the pairwise parallel sentences
    @keyword rank_name: name of the rank attribute passed on to the
    pairwise parallel sentences (default: the rank name of this object)
    @keyword rankless: if enabled, no rank is assigned to the reference;
    also passed on to the pairwise parallel sentences
    @type rankless: boolean

    @return: the pairs created from this object
    @rtype: a list of PairwiseParallelSentence() objects
    """
    from pairwiseparallelsentence import PairwiseParallelSentence

    include_references = kwargs.setdefault("include_references", False)
    invert_ranks = kwargs.setdefault("invert_ranks", [])
    rank_name = kwargs.setdefault("rank_name", self.rank_name)
    rankless = kwargs.setdefault("rankless", False)

    # Always work on a separate list: the reference that may be appended
    # below must not end up in the target sentences of this object.
    # NOTE(review): the filtering and the reference rank use
    # self.rank_name, not the rank_name keyword - confirm this is intended
    if kwargs.setdefault('filter_unassigned', False):
        translations = [t for t in self.get_translations() if t.get_attribute(self.rank_name) != "-1"]
    else:
        translations = list(self.get_translations())

    # this is used in case we want to include references in the pairwising
    # references are added as translations by system named _ref
    # only single references supported at the moment
    if include_references:
        if "_ref" not in self.get_target_attribute_values("system"):
            reference = self.get_reference()
            reference.add_attribute("system", "_ref")
            if not rankless:
                # get a rank value lower than all the existing ones and assign it to references
                min_rank = min([float(t.get_attribute(self.rank_name)) for t in translations]) - 1
                reference.add_attribute(self.rank_name, str(int(min_rank)))
            translations.append(reference)

    # every target sentence is paired with all the ones seen before it
    pairs = []
    seen = []
    for targetA in translations:
        system_nameA = targetA.get_attribute('system')
        for system_nameB, targetB in seen:
            pairs.append(((targetA, targetB), (system_nameA, system_nameB)))
            if replacement:
                pairs.append(((targetB, targetA), (system_nameB, system_nameA)))
        seen.append((system_nameA, targetA))

    pps_list = [PairwiseParallelSentence(self.get_source(),
                                         target_pair,
                                         system_pair,
                                         self.ref,
                                         self.attributes,
                                         rank_name,
                                         invert_ranks = invert_ranks,
                                         rankless = rankless
                                         )
                for target_pair, system_pair in pairs
                ]
    return pps_list

394 395

def import_indexed_parallelsentence(self, parallelsentence, target_attribute_names, keep_attributes_general=[], keep_attributes_source=[], keep_attributes_target=[]):
    """
    Import attributes of the target sentences of another parallel
    sentence into the target sentences of this one, matching them on
    their "system" attribute. All other attributes are reduced to the
    given ones and the reference is removed.
    @param parallelsentence: the parallel sentence to import from
    @param target_attribute_names: names of the target sentence attributes to import
    @param keep_attributes_general: names of the parallel sentence attributes to keep
    @param keep_attributes_source: names of the source sentence attributes to keep
    @param keep_attributes_target: names of the target sentence attributes to keep
    @raise KeyError: if the incoming parallel sentence has no target
    sentence for a system of this one
    """
    targets = self.get_translations()
    incoming_translations = {}
    for incoming in parallelsentence.get_translations():
        incoming_translations[incoming.get_attribute("system")] = incoming

    new_targets = []
    self.src.keep_only_attributes(keep_attributes_source)

    for target in targets:
        system_id = target.get_attribute("system")
        matched_incoming = incoming_translations[system_id]
        # Reduce the attributes once, before importing. Doing it for
        # every imported attribute would wipe the ones imported before.
        target.keep_only_attributes(keep_attributes_target)
        for attribute_name in target_attribute_names:
            target.add_attribute(attribute_name, matched_incoming.get_attribute(attribute_name))
        new_targets.append(target)

    self.tgt = new_targets

    # iterate over a snapshot of the names, since entries get deleted
    for name in list(self.attributes.keys()):
        if name not in keep_attributes_general:
            del self.attributes[name]
    self.ref = None

429

def import_missing_parallelsentence(self, target_attribute_names, keep_attributes_general=[], keep_attributes_source=[], keep_attributes_target=[]):
    """
    Counterpart of the indexed import for the case that there is nothing
    to import from: every requested attribute of every target sentence
    is set to the value '1'. All other attributes are reduced to the
    given ones and the reference is removed.
    @param target_attribute_names: names of the target sentence attributes to set
    @param keep_attributes_general: names of the parallel sentence attributes to keep
    @param keep_attributes_source: names of the source sentence attributes to keep
    @param keep_attributes_target: names of the target sentence attributes to keep
    """
    targets = self.get_translations()

    new_targets = []
    self.src.keep_only_attributes(keep_attributes_source)

    for target in targets:
        # Reduce the attributes once, before setting the new ones. Doing
        # it for every attribute would wipe the ones set before.
        target.keep_only_attributes(keep_attributes_target)
        for attribute_name in target_attribute_names:
            # only for rank
            target.add_attribute(attribute_name, '1')
        new_targets.append(target)

    self.tgt = new_targets

    # iterate over a snapshot of the names, since entries get deleted
    for name in list(self.attributes.keys()):
        if name not in keep_attributes_general:
            del self.attributes[name]
    self.ref = None

454 455

def remove_ties(self):
    """
    Function that modifies the current parallel sentence by removing the target translations that create ties.
    Only the first translation for each rank is kept, where "first"
    refers to the order of the target translations before the call.
    Afterwards the remaining translations are ordered by ascending rank.
    """
    # Sort on the numeric rank only: the sort is stable, so tied
    # translations keep their original order, ranks are not ordered as
    # text ("10" before "2") and the translations themselves never have
    # to be compared with each other.
    ranked_translations = sorted(self.tgt, key=lambda tgt: int(tgt.get_rank()))

    prev_rank = None
    remaining_translations = []
    for translation in ranked_translations:
        rank = int(translation.get_rank())
        if prev_rank != rank:
            remaining_translations.append(translation)
        prev_rank = rank
    self.tgt = remaining_translations

Source Code for Module sentence.parallelsentence