1
2
3 '''
4 Created on Jul 12, 2011
5
6 @author: jogin, Eleftherios Avramidis
7 '''
8 from pairwiseparallelsentence import PairwiseParallelSentence
9 from copy import deepcopy
10 from parallelsentence import ParallelSentence
11 import logging
12 from os import sys
13
14
16 """
17 A set of pairwise parallel sentences, all originating from the same source sentence, in order to facilitate pairwise comparisons etc.
18 Works as a wrapper over a dictionary, where pairs are indexed based on the system names of the 2 target translations.
19 """
21 """
22 @return: a list of the parallel sentences contained in the structure
23 @rtype: pairwise_parallelsentences: list of L{sentence.pairwiseparallelsentence.PairwiseParallelSentence} instances
24 """
25 return self.pps_dict.values()
26
28 """
29 @return: all the system pairs that are mapped to pairwise sentences
30 @rtype: list of tuples of two strings each
31 """
32 return self.pps_dict.keys()
33
34
37
38
40 """
41 A set of pairwise parallel sentences, all originating from the same source sentence, where more than one comparison per system pair is allowed
42 @ivar pps_dict: a dict that stores all the pairwise parallelsentences mapped to a tuple of strings containing the system names for the respective translations
43 @type pps_dict: {(str, str): [L{PairwiseParallelSentence}, ...]}
44 """
def __init__(self, pairwise_parallelsentences=None, rank_name="rank", **kwargs):
    """
    Store the given pairwise parallel sentences, grouped by system-name pair.

    Every sentence is appended to the list kept under the key returned by its
    C{get_system_names()}, so several judgments for one pair are all retained
    in their original order.

    @param pairwise_parallelsentences: a list of pairwise parallel sentences;
        omitting it (or passing None) creates an empty set
    @type pairwise_parallelsentences: [L{sentence.pairwiseparallelsentence.PairwiseParallelSentence}, ...]
    @param rank_name: the name of the attribute that holds the rank value
    @type rank_name: str
    @param kwargs: accepted for call compatibility; not used
    """
    # None replaces the former shared mutable default ([]).
    if pairwise_parallelsentences is None:
        pairwise_parallelsentences = []

    self.pps_dict = {}
    # rank_name is a named parameter, so it can never arrive inside kwargs;
    # looking it up there was a no-op.
    self.rank_name = rank_name

    for ps in pairwise_parallelsentences:
        self.pps_dict.setdefault(ps.get_system_names(), []).append(ps)
59
61 all_parallelsentences = []
62 for parallelsentencelist in self.pps_dict.values():
63 all_parallelsentences.extend(parallelsentencelist)
64 return all_parallelsentences
65
66
67
69 """
70 Removes the pairwise sentences that represent a tie, i.e. those whose rank attribute equals 0
71 @return: the number of ties filtered
72 @rtype: int
73 @todo: test
74 """
75 reformed_dict = {}
76 removed_ties = 0
77 for system_names in self.pps_dict:
78 reformed_dict[system_names] = [ps for ps in self.pps_dict[system_names] if int(ps.get_attribute(self.rank_name)) != 0]
79 removed_ties += len(self.pps_dict[system_names]) - len(reformed_dict[system_names])
80
81 self.pps_dict = reformed_dict
82 return removed_ties
83
84
86 pass
87 restrict_ranks = set([float(r) for r in restrict_ranks])
88 print
89 print restrict_ranks
90 for system_names in self.pps_dict.keys():
91 ps_restricted = []
92 for ps in self.pps_dict[system_names]:
93 target_ranks = set([float(r) for r in ps.get_target_attribute_values(self.rank_name)])
94 print target_ranks,
95 if not target_ranks.isdisjoint(restrict_ranks):
96 ps_restricted.append(ps)
97 print "ok"
98 else:
99 print
100 if ps_restricted:
101 self.pps_dict[system_names] = ps_restricted
102 else:
103 del self.pps_dict[system_names]
104
105
106
107
109 """
110 Provides the pairwise parallel sentences, whose target sentences provide output by the two given systems
111 @param system_names: pair of translation system names
112 @type system_names: tuple of strings
113 @param directed: whether the order of the systems in the tuple is important, or not
114 @type directed: boolean
115 @return: the pairwise parallel sentence that contains the outputs of the two given systems
116 @rtype: [L{sentence.pairwiseparallelsentence.PairwiseParallelSentence}, ...]
117 """
118 try:
119 return self.pps_dict[system_names]
120 except:
121 if not directed:
122 try:
123 system_names_reversed = (system_names[1], system_names[0])
124 return self.pps_dict[system_names_reversed].get_reversed()
125 except:
126 print "At least one of system names is missing."
127 else:
128 print "At least one of system names is missing."
129
130
133
134
136 filtered_pairwise_parallelsentences = []
137 for system_names in self.get_system_names():
138 overlapping_judgments = self.get_pairwise_parallelsentences(system_names)
139 filtered_pairwise_parallelsentence = self._filter_agreement(threshold, overlapping_judgments, system_names)
140 if filtered_pairwise_parallelsentence:
141 filtered_pairwise_parallelsentences.append(filtered_pairwise_parallelsentence)
142 return filtered_pairwise_parallelsentences
143
def _filter_agreement(self, threshold=1.00, pairwise_parallelsentences=None, system_names=()):
    """
    Reduce several overlapping judgments to one, if the judges agree enough.

    A single judgment is returned unchanged. Otherwise the most frequent rank
    is determined (equal frequencies are resolved in favour of the larger rank
    value) and, if the share of judgments carrying that rank reaches the
    threshold, the first judgment with that rank is returned.

    @param threshold: minimum fraction (0..1) of the judgments that must share
        the most frequent rank
    @type threshold: float
    @param pairwise_parallelsentences: the overlapping judgments; each must
        provide C{get_rank()}
    @type pairwise_parallelsentences: [L{PairwiseParallelSentence}, ...]
    @param system_names: not used by this method; kept for call compatibility
    @type system_names: tuple of strings
    @return: the chosen judgment, or None when there are no judgments or the
        agreement is below the threshold
    @rtype: L{PairwiseParallelSentence} or None
    """
    # Nothing to agree on; previously this crashed with an IndexError.
    if not pairwise_parallelsentences:
        return None
    if len(pairwise_parallelsentences) == 1:
        return pairwise_parallelsentences[0]

    ranks = [ps.get_rank() for ps in pairwise_parallelsentences]
    total = float(len(ranks))
    # max over (share, rank) tuples picks the same entry as the last element
    # of the sorted distribution, without sorting everything.
    share, winning_rank = max((ranks.count(rank) / total, rank) for rank in set(ranks))
    if share < threshold:
        return None
    return pairwise_parallelsentences[ranks.index(winning_rank)]
158
159
162
164 """
165 Merge many overlapping judgments over translations originating from the same source sentence
166 @return: pairwise parallel sentences, containing only the merged output rank
167 @rtype: [L{sentence.pairwiseparallelsentence.PairwiseParallelSentence}, ...]
168 """
169 merged_pairwise_parallelsentences = []
170 for system_names in self.get_system_names():
171 overlapping_judgments = self.get_pairwise_parallelsentences(system_names)
172 merged_pairwise_parallelsentence = self._merge_judgments(overlapping_judgments, system_names)
173 merged_pairwise_parallelsentences.append(merged_pairwise_parallelsentence)
174 return merged_pairwise_parallelsentences
175
178
180 """
181 Merge many overlapping judgements over translations produced by the same system pair
182 originating from the same source sentence, into only one judgment
183 @return: a pairwise parallel sentences
184 @rtype: L{PairwiseParallelSentence}
185 """
186 rank = sum([float(ps.get_rank()) * self._merge_weight(ps) for ps in pairwise_parallelsentences])
187
188 attributes = deepcopy(pairwise_parallelsentences[0].attributes)
189 attributes[self.rank_name] = rank
190 source = pairwise_parallelsentences[0].get_source()
191 translations = pairwise_parallelsentences[0].get_translations()
192 reference = pairwise_parallelsentences[0].get_reference()
193 new_ps = PairwiseParallelSentence(source, translations, system_names, reference, attributes, self.rank_name)
194 return new_ps
195
196
197
200
201
202
203
204
206 """
207 A compact set of pairwise parallel sentences, all originating from the same source sentence,
208 where only one comparison per system-pair is allowed
209 @ivar rank_name: the name of the rank value
210 @type rank_name: str
211 @ivar pps_dict: a dictionary of pairwise parallel sentences
212 @type pps_dict: {(str, str): L{PairwiseParallelSentence}}
213 """
214
def __init__(self, pairwise_parallelsentences, rank_name="rank"):
    """
    Index the given pairwise parallel sentences by their system-name pair.

    Exactly one sentence is kept per pair: if several sentences report the
    same C{get_system_names()}, the last one in the list wins.

    @param pairwise_parallelsentences: a list of pairwise parallel sentences
    @type pairwise_parallelsentences: [L{PairwiseParallelSentence}, ...]
    @param rank_name: the name of the rank value
    @type rank_name: str
    """
    self.rank_name = rank_name
    # Built directly as a dict; no intermediate list of tuples is needed.
    self.pps_dict = {ps.get_system_names(): ps for ps in pairwise_parallelsentences}
223
224
226 """
227 Removes the pairwise sentences that represent a tie, i.e. those whose rank attribute equals 0
228 @return: the number of ties filtered
229 @rtype: int
230 """
231 reformed_dict = {}
232 ties = 0
233 for system_names in self.pps_dict:
234 ps = self.pps_dict[system_names]
235 if int(ps.get_attribute(self.rank_name)) != 0:
236 reformed_dict[system_names] = ps
237 else:
238 ties += 1
239 self.pps_dict = reformed_dict
240
241 print "filtered %d ties" % ties
242 return ties
243
245 """
246 It reconstructs a single parallel sentence object with a gathered discrete [1-9]
247 ranking out of the pairwise comparisons that exist in the pairwise parallel sentence instances
248 @return: a parallel sentence
249 @rtype: L{ParallelSentence}
250 """
251 rank_per_system = {}
252 translations_per_system = {}
253
254 if not new_rank_name:
255 new_rank_name = self.rank_name
256
257
258
259
260 for (system_a, system_b), parallelsentence in self.pps_dict.iteritems():
261
262
263 if not critical_attribute:
264 rank = int(parallelsentence.get_rank())
265 else:
266 rank = int(parallelsentence.get_attribute(critical_attribute))
267
268
269
270
271 rank_per_system[system_a] = rank_per_system.setdefault(system_a, 0) + rank
272
273
274
275 translations_per_system[system_b] = parallelsentence.get_translations()[1]
276 translations_per_system[system_a] = parallelsentence.get_translations()[0]
277
278
279
280 i = 0
281 prev_rank = None
282 translations_new_rank = []
283
284
285
286 systems = sorted(rank_per_system, key=lambda system: rank_per_system[system])
287 for system in systems:
288
289 if rank_per_system[system] != prev_rank:
290 i += 1
291
292
293
294 prev_rank = rank_per_system[system]
295 translation = deepcopy(translations_per_system[system])
296 translation.add_attribute(new_rank_name, str(i))
297 translations_new_rank.append(translation)
298
299
300 source = deepcopy(self.pps_dict.values()[0].get_source())
301 reference = deepcopy(self.pps_dict.values()[0].get_reference())
302 attributes = deepcopy(self.pps_dict.values()[0].get_attributes())
303
304
305 try:
306 del(attributes[self.rank_name])
307 except:
308 pass
309
310 return ParallelSentence(source, translations_new_rank, reference, attributes)
311
312
314 """
315 It reconstructs a single parallel sentence object with a gathered discrete [1-9]
316 ranking out of the pairwise comparisons that exist in the pairwise parallel sentence instances
317 @return: a parallel sentence
318 @rtype: L{ParallelSentence}
319 """
320 rank_per_system = {}
321 translations_per_system = {}
322
323 if not new_rank_name:
324 new_rank_name = self.rank_name
325
326 fullrank = False
327
328 while not fullrank:
329
330 for (system_a, system_b), parallelsentence in self.pps_dict.iteritems():
331 logging.debug("threshold: {}".format(threshold))
332
333
334 prob_neg = float(parallelsentence.get_attribute(attribute1))
335
336
337
338 if abs(prob_neg-0.5) > threshold:
339 try:
340 rank_per_system[system_b] += prob_neg
341 except KeyError:
342 rank_per_system[system_b] = prob_neg
343
344 translations_per_system[system_b] = parallelsentence.get_translations()[1]
345 translations_per_system[system_a] = parallelsentence.get_translations()[0]
346
347
348 fullrank = True
349 for system_a, system_b in self.get_system_names():
350 if system_b not in rank_per_system:
351 logging.debug("didn't fill in one rank")
352 fullrank = False
353 threshold = threshold - threshold/20
354 break
355
356
357 if threshold < 0.002:
358 threshold = 0
359 if threshold == 0:
360 fullrank = True
361
362
363 i = 0
364 prev_rank = None
365 translations_new_rank = []
366
367
368
369 systems = sorted(rank_per_system, key=lambda system: rank_per_system[system])
370 print systems
371 for system in systems:
372
373 if rank_per_system[system] != prev_rank:
374 i += 1
375
376
377
378 prev_rank = rank_per_system[system]
379 translation = deepcopy(translations_per_system[system])
380 translation.add_attribute(new_rank_name, str(i))
381 translations_new_rank.append(translation)
382
383
384 source = deepcopy(self.pps_dict.values()[0].get_source())
385 reference = deepcopy(self.pps_dict.values()[0].get_reference())
386 attributes = deepcopy(self.pps_dict.values()[0].get_attributes())
387 try:
388 del(attributes[new_rank_name])
389 except:
390 pass
391
392 return ParallelSentence(source, translations_new_rank, reference, attributes)
393
395 """
396 It reconstructs a single parallel sentence object with a gathered discrete [1-9]
397 ranking out of the pairwise comparisons that exist in the pairwise parallel sentence instances
398 @return: a parallel sentence
399 @rtype: L{ParallelSentence}
400 """
401 rank_per_system = {}
402 translations_per_system = {}
403
404 if not new_rank_name:
405 new_rank_name = self.rank_name
406
407
408 for (system_a, system_b), parallelsentence in self.pps_dict.iteritems():
409
410
411 prob_neg = float(parallelsentence.get_attribute(attribute1))
412
413
414
415
416
417 try:
418 rank_per_system[system_b] += prob_neg
419 except KeyError:
420 rank_per_system[system_b] = prob_neg
421
422
423
424
425
426
427 translations_per_system[system_b] = parallelsentence.get_translations()[1]
428 translations_per_system[system_a] = parallelsentence.get_translations()[0]
429
430
431
432 i = 0
433 prev_rank = None
434 translations_new_rank = []
435
436
437
438 systems = sorted(rank_per_system, key=lambda system: rank_per_system[system])
439 print systems
440 for system in systems:
441
442 if rank_per_system[system] != prev_rank:
443 i += 1
444
445
446
447 prev_rank = rank_per_system[system]
448 translation = deepcopy(translations_per_system[system])
449 translation.add_attribute(new_rank_name, str(i))
450 translations_new_rank.append(translation)
451
452
453 source = deepcopy(self.pps_dict.values()[0].get_source())
454 reference = deepcopy(self.pps_dict.values()[0].get_reference())
455 attributes = deepcopy(self.pps_dict.values()[0].get_attributes())
456 try:
457 del(attributes[new_rank_name])
458 except:
459 pass
460
461 return ParallelSentence(source, translations_new_rank, reference, attributes)
462
463
464
466 """
467 Provides the pairwise parallel sentence, whose target sentences provide output by the two given systems
468 @param system_names: pair of translation system names
469 @type system_names: tuple of strings
470 @param directed: whether the order of the systems in the tuple is important, or not
471 @type directed: boolean
472 @return: the pairwise parallel sentence that contains the outputs of the two given systems
473 @rtype: L{PairwiseParallelSentence}
474 """
475
476 try:
477 return self.pps_dict[system_names]
478 except:
479 if not directed:
480 try:
481 system_names_reversed = (system_names[1], system_names[0])
482 return self.pps_dict[system_names_reversed]
483 except:
484 sys.stderr.write("At least one of system names is missing.\n")
485 else:
486 sys.stderr.write("At least one of system names is missing.\n")
487