1 """
2 @author: Eleftherios Avramidis
3 """
4
5 from collections import OrderedDict
6 from copy import deepcopy
7 import re
8 import sys
9 from ranking import Ranking
10
12 """
13 A parallel sentence, that contains a source sentence,
14 a number of target sentences, a reference and some attributes
15 @ivar src: the source sentence
16 @type src: SimpleSentence
17 @ivar tgt: a list of target sentences / translations
18 @type tgt: [SimpleSentence, ...]
19 @ivar ref: a reference translation
20 @type ref: SimpleSentence
21 """
22
23
24 - def __init__(self, source, translations, reference = None, attributes = {}, rank_name = "rank", **kwargs):
25 """
26 Constructor
27 @type source SimpleSentence
28 @param source The source text of the parallel sentence
29 @type translations list ( SimpleSentence )
30 @param translations A list of given translations
31 @type reference SimpleSentence
32 @param reference The desired translation provided by the system
33 @type attributes dict { String name , String value }
34 @param the attributes that describe the parallel sentence
35 @keyword sort_translations: Whether translations should be sorted based on the system name
36 @type sort_translations: boolean
37 """
38 self.src = source
39 self.tgt = translations
40 self.ref = reference
41 self.attributes = deepcopy (attributes)
42 self.rank_name = rank_name
43 if kwargs.setdefault("sort_translations", False):
44 self.tgt = sorted(translations, key=lambda t: t.get_attribute("system"))
45
46
47 try:
48 self.attributes["langsrc"] = kwargs.setdefault("langsrc", self.attributes["langsrc"])
49 self.attributes["langtgt"] = kwargs.setdefault("langtgt", self.attributes["langtgt"])
50 except KeyError:
51 sys.exit('Source or target language not specified in parallelsentence: [{}]'.format(self.__str__()))
52
55
58
61
63
64 print self.src == other.src
65 print self.tgt == other.tgt
66 print self.attributes == other.attributes
67
68
69 return (
70 self.src == other.src and
71 self.tgt == other.tgt and
72 self.ref == other.ref and
73 self.attributes == other.attributes)
74
75
76
78 """
79 provide the rank value of the parallel sentence
80 return: the rank value
81 rtype: string
82 """
83 return self.attributes[self.rank_name]
84
86 """
87 returns a ranking list, containing the ranks of the included
88 target translations
89 @return: the ranking list
90 @rtype: Ranking
91 """
92 return Ranking([s.get_rank() for s in self.tgt])
93
95 """
96 provide all attributes
97 @return: the parallel sentence attributes dictionary
98 @rtype: dict([(string,string), ...])
99 """
100 return self.attributes
101
103 """
104 provide all attribute names
105 @return: a set with the names of the attributes
106 @rtype: set([string, ...])
107 """
108 return self.attributes.keys()
109
111 """
112 provide the value of a particular attribute
113 @return: the value of the attribute with the specified name
114 @rtype: string
115 """
116 return self.attributes[name]
117
119
120 attribute_values = [target.get_attribute(attribute_name, sub) for target in self.tgt]
121 return attribute_values
122
124 attribute_values = [target.get_attribute(attribute_name) for target in self.tgt if target.get_attribute(filter_attribute_name) != filter_attribute_value]
125 return attribute_values
126
129
132
135
138
145
152
154 try:
155 return "%s:%s" % (self.attributes["testset"], self.attributes["judgement_id"])
156 except:
157
158 return self.attributes["judgement_id"]
159
162
164 return self.attributes.has_key("judgement_id")
165
168
171
174
175
178
181
184
187
189 attribute_names = []
190 attribute_names.extend(self.attributes.keys())
191
192 source_attribute_names = [attribute_names.append("src_{}".format(att)) for att in self.src.get_attributes()]
193 attribute_names.extend(source_attribute_names)
194
195 i=0
196 for tgtitem in self.tgt:
197 i += 1
198 target_attribute_names = [attribute_names.append("tgt-{}_{}".format(i,att)) for att in tgtitem.get_attributes()]
199 attribute_names.extend(target_attribute_names)
200 return attribute_names
201
203 """
204 function that gathers all the features of the nested sentences
205 to the parallel sentence object, by prefixing their names accordingly
206 """
207
208 new_attributes = deepcopy (self.attributes)
209 new_attributes.update( self._prefix(self.src.get_attributes(), "src") )
210 i=0
211 for tgtitem in self.tgt:
212 i += 1
213 prefixeditems = self._prefix( tgtitem.get_attributes(), "tgt-%d" % i )
214
215 new_attributes.update( prefixeditems )
216
217 try:
218 new_attributes.update( self._prefix( self.ref.get_attributes(), "ref" ) )
219 except:
220 pass
221 return new_attributes
222
223
225 """
226 Moves the attributes back to the nested sentences
227
228 """
229
230 for attribute_name in self.attributes.keys():
231 attribute_value = self.attributes[attribute_name]
232 if (attribute_name.find('_') > 0) :
233
234 src_attribute = re.match("src_(.*)", attribute_name)
235 if src_attribute:
236 self.src.add_attribute(src_attribute.group(1), attribute_value)
237 del self.attributes[attribute_name]
238
239 ref_attribute = re.match("ref_(.*)", attribute_name)
240 if ref_attribute:
241 self.src.add_attribute(ref_attribute.group(1), attribute_value)
242 del self.attributes[attribute_name]
243
244 tgt_attribute = re.match("tgt-([0-9]*)_(.*)", attribute_name)
245 if tgt_attribute:
246 index = int(tgt_attribute.group(1)) - 1
247 new_attribute_name = tgt_attribute.group(2)
248 self.tgt[index].add_attribute(new_attribute_name, attribute_value)
249 del self.attributes[attribute_name]
250
251
253 list = []
254 list.append(self.src)
255 list.extend(self.tgt)
256 return list
257
258
259 - def _prefix(self, listitems, prefix):
260 newlistitems = OrderedDict()
261 for item_key in listitems.keys():
262 new_item_key = "_".join([prefix, item_key])
263 newlistitems[new_item_key] = listitems[item_key]
264 return newlistitems
265
266
268 """
269 Augment the parallel sentence with another parallel sentence.
270 Merges attributes of source, target and reference sentences and adds target sentences whose system doesn't exist.
271 Also merges the attributes of target sentences that have a common system.
272 @param ps: Object of ParallelSentence() with one source sentence and more target sentences
273 @type ps: sentence.parallelsentence.ParallelSentence
274 @param add_missing: If translation outputs are missing from the first file but exist in the second, add them (default: True)
275 @type add_missing: boolean
276 """
277
278 add_missing = kwargs.setdefault("add_missing", True)
279
280
281 incoming_attributes = ps.get_attributes()
282 for incoming_attribute in incoming_attributes:
283 if incoming_attribute in attribute_replacements:
284 new_key = attribute_replacements[incoming_attribute]
285 new_value = incoming_attributes[incoming_attribute]
286 incoming_attributes[new_key] = new_value
287 del(incoming_attributes[incoming_attribute])
288
289 self.attributes.update(incoming_attributes)
290
291
292 self.src.merge_simplesentence(ps.get_source(), attribute_replacements)
293
294
295 try:
296 self.ref.merge_simplesentence(ps.get_reference(), attribute_replacements)
297 except:
298 pass
299
300
301
302 for tgtPS in ps.get_translations():
303 system = tgtPS.get_attribute("system")
304 merged = False
305 for i in range(len(self.tgt)):
306 if self.tgt[i].attributes["system"] == system:
307 self.tgt[i].merge_simplesentence(tgtPS, attribute_replacements)
308 merged = True
309 if not merged and add_missing:
310
311 sys.stderr.write("Warning: Target sentence was missing. Adding...\n")
312 self.tgt.append(tgtPS)
313
314
316 """
317 Create a set of all available parallel sentence pairs (in tgt) from one ParallelSentence object.
318 @param ps: Object of ParallelSentence() with one source sentence and more target sentences
319 @type ps: sentence.parallelsentence.ParallelSentence
320
321 kwargs:
322 @param replacement: If enabled, creates pairs with all possible combinations with replacement
323 @type replacement: boolean
324 @param include_references: Include references as system translations from system "_ref" and lowest rank
325 @type include_references: boolean
326 @param filter_unassigned: If enabled, it filters out pairs with rank = "-1", which means no value was assigned
327 It should not be turned on for test-sets
328 @type filter_unassigned: boolean
329 @param restrict_ranks: Filter pairs to keep only for the ones that include the given ranks. Don't filter if list empty. Before
330 using this, make sure that the ranks are normalized
331 @type restrict_ranks: [int, ...]
332
333 @return p: set of parallel sentence pairs from one PS object
334 @type p: a list of PairwiseParallelSentence() objects
335
336 """
337 from pairwiseparallelsentence import PairwiseParallelSentence
338
339 replacement = kwargs.setdefault("replacement", replacement)
340 include_references = kwargs.setdefault("include_references", False)
341 restrict_ranks = kwargs.setdefault("restrict_ranks", [])
342 invert_ranks = kwargs.setdefault("invert_ranks", [])
343 rank_name = kwargs.setdefault("rank_name", self.rank_name)
344 rankless = kwargs.setdefault("rankless", False)
345
346 systems = []
347 targets = []
348 systems_list = []
349 targets_list = []
350
351 translations = self.get_translations()
352 if kwargs.setdefault('filter_unassigned', False):
353 translations = [t for t in self.get_translations() if t.get_attribute(self.rank_name) != "-1"]
354
355
356
357
358 if include_references:
359 if "_ref" not in self.get_target_attribute_values("system"):
360 reference = self.get_reference()
361 reference.add_attribute("system", "_ref")
362 if not rankless:
363
364 min_rank = min([float(t.get_attribute(self.rank_name)) for t in translations]) - 1
365 reference.add_attribute(self.rank_name, str(int(min_rank)))
366 translations.append(reference)
367
368
369 for targetA in translations:
370 system_nameA = targetA.get_attribute('system')
371 for system_nameB in systems_list:
372 systems.append((system_nameA, system_nameB))
373 if replacement:
374 systems.append((system_nameB, system_nameA))
375 for targetB in targets_list:
376 targets.append((targetA, targetB))
377 if replacement:
378 targets.append((targetB, targetA))
379 systems_list.append(system_nameA)
380 targets_list.append(targetA)
381
382 pps_list = [PairwiseParallelSentence(self.get_source(),
383 targets[i],
384 systems[i],
385 self.ref,
386 self.attributes,
387 rank_name,
388 invert_ranks = invert_ranks,
389 rankless = rankless
390 ) \
391 for i in range(len(systems))
392 ]
393 return pps_list
394
395
396 - def import_indexed_parallelsentence(self, parallelsentence, target_attribute_names, keep_attributes_general=[], keep_attributes_source=[], keep_attributes_target=[]):
397 """
398 """
399 targets = self.get_translations()
400
401 incoming_targets = parallelsentence.get_translations()
402 incoming_translations = dict([(tgt.get_attribute("system"), tgt) for tgt in incoming_targets])
403
404
405
406
407
408
409
410
411 new_targets = []
412 self.src.keep_only_attributes(keep_attributes_source)
413
414 for target in targets:
415 system_id = target.get_attribute("system")
416 matched_incoming = incoming_translations[system_id]
417 for attribute_name in target_attribute_names:
418 value = matched_incoming.get_attribute(attribute_name)
419 target.keep_only_attributes(keep_attributes_target)
420 target.add_attribute(attribute_name, value)
421 new_targets.append(target)
422
423 self.tgt = new_targets
424
425 for name in self.attributes.keys():
426 if name not in keep_attributes_general:
427 del(self.attributes[name])
428 self.ref = None
429
431 targets = self.get_translations()
432
433
434
435
436 new_targets = []
437 self.src.keep_only_attributes(keep_attributes_source)
438
439 for target in targets:
440 system_id = target.get_attribute("system")
441 for attribute_name in target_attribute_names:
442
443 value = '1'
444 target.keep_only_attributes(keep_attributes_target)
445 target.add_attribute(attribute_name, value)
446 new_targets.append(target)
447
448 self.tgt = new_targets
449
450 for name in self.attributes.keys():
451 if name not in keep_attributes_general:
452 del(self.attributes[name])
453 self.ref = None
454
455
457 """
458 Function that modifies the current parallel sentence by removing the target translations that create ties.
459 Only first translation for each rank is kept
460 """
461 translation_per_rank = [(tgt.get_rank(), tgt) for tgt in self.tgt]
462 prev_rank = None
463 remaining_translations = []
464 for system, translation in sorted(translation_per_rank):
465 rank = int(translation.get_rank())
466 if prev_rank != rank:
467 remaining_translations.append(translation)
468 prev_rank = rank
469 self.tgt = remaining_translations
470