1 '''
2 Created on May 12, 2011
3
4 @author: Eleftherios Avramidis
5 '''
6
7
8 from dataset import DataSet
9 from pairwiseparallelsentenceset import AnalyticPairwiseParallelSentenceSet, CompactPairwiseParallelSentenceSet
10 from pairwiseparallelsentence import PairwiseParallelSentence
11 import sys
12 from collections import OrderedDict
13
14
'''
Abstract class that defines the data container which stores the entire dataset of parallel sentences. Internally, the data has been re-structured
so that every multiple ranking judgment (e.g. 1-5) is split into pairwise comparisons (1,2; 1,3; ...).
Every set of pairwise comparisons is mapped to the sentence id of the original source sentence.
This allows for direct access to the pairwise elements of each sentence.
@ivar pairwise_parallelsentence_sets: A dictionary which keeps the pairwise sentences per (original) sentence id
@type pairwise_parallelsentence_sets: {str: L{CompactPairwiseParallelSentenceSet} or L{AnalyticPairwiseParallelSentenceSet}}
'''
24
26 return self.pairwise_parallelsentence_sets.values()
27
33
34
36 return self.pairwise_parallelsentence_sets.keys()
37
39 return self.pairwise_parallelsentence_sets[sentence_id]
40
42 return self.pairwise_parallelsentence_sets
43
45 """
46 It removes the ties from the current data set
47 @return: the number of ties removed (helpful for testing)
48 @rtype: int
49 """
50 removed_ties = 0
51 for myset in self.pairwise_parallelsentence_sets.values():
52 removed_ties += myset.remove_ties()
53
54 self.pairwise_parallelsentence_sets = OrderedDict([(id, ps) for (id, ps) in self.pairwise_parallelsentence_sets.iteritems() if ps.length() > 0])
55 return removed_ties
56
57
58
59
"""
Abstract class for pairwise datasets whose internal structure allows them to be reconstructed back to multi-class sets.
It presupposes that there are unique pairs per entry (indexable either by judgment_id or sentence_id).
"""
65
67 multirank_parallelsentences = []
68 for sentence_id in self.pairwise_parallelsentence_sets:
69 pairwise_parallelsentence_set = self.pairwise_parallelsentence_sets[sentence_id]
70 multirank_parallelsentence = pairwise_parallelsentence_set.get_multiranked_sentence()
71 multirank_parallelsentences.append(multirank_parallelsentence)
72
73
74
75
76 return DataSet(multirank_parallelsentences)
77
79
80 sort_attribute = kwargs.setdefault("sort_attribute", None)
81 multirank_parallelsentences = []
82 for sentence_id in self.pairwise_parallelsentence_sets:
83 pairwise_parallelsentence_set = self.pairwise_parallelsentence_sets[sentence_id]
84 multirank_parallelsentence = pairwise_parallelsentence_set.get_multiranked_sentence(critical_attribute, new_rank_name, False)
85 multirank_parallelsentences.append(multirank_parallelsentence)
86 if sort_attribute:
87 multirank_parallelsentences = sorted(multirank_parallelsentences, key=lambda ps: int(ps.get_attribute(sort_attribute)))
88 else:
89
90
91
92 pass
93 return DataSet(multirank_parallelsentences)
94
96 '''
97 Reconstructs the original data set, with only one sentence per entry.
98 @return: Simple dataset that contains the simplified parallel sentences
99 @rtype: L{DataSet}
100 '''
101 sort_attribute = kwargs.setdefault("sort_attribute", None)
102 multirank_parallelsentences = []
103 for sentence_id in self.pairwise_parallelsentence_sets:
104 pairwise_parallelsentence_set = self.pairwise_parallelsentence_sets[sentence_id]
105 multirank_parallelsentence = pairwise_parallelsentence_set.get_multiranked_sentence_with_soft_ranks(attribute1, attribute2, critical_attribute, new_rank_name)
106 multirank_parallelsentences.append(multirank_parallelsentence)
107 if sort_attribute:
108 multirank_parallelsentences = sorted(multirank_parallelsentences, key=lambda ps: int(ps.get_attribute(sort_attribute)))
109 else:
110
111
112
113 pass
114 return DataSet(multirank_parallelsentences)
115
116
117
"""
A data set that contains pairwise comparisons organized by human judgment, i.e. there is a separate entry for each judgment,
even if there is more than one judgment per sentence.
@ivar replacement: Defines whether pairs are done in all combinations without replacement (replacement=False) or with replacement (replacement=True)
@type replacement: boolean
@ivar include_references: Defines whether references need to be included in pairs, as sentences from system "_ref".
Do not enable this for test-sets, as reverting this is not yet supported
@type include_references: boolean
"""
128
129
"""
Fill the data set with one pairwise set per judgment id of the given plain data set.
@param plain_dataset: the simple dataset to be converted or wrapped to an analytic one. Casting of an already pairwise simple set is supported, see the cast keyword
@type plain_dataset: L{DataSet}
@keyword replacement: Defines whether pairs are done in all combinations without replacement (replacement=False) or with replacement (replacement=True)
@type replacement: boolean
@keyword include_references: Defines whether references need to be included in pairs, as sentences from system "_ref".
Do not enable this for test-sets, as reverting this is not yet supported
@type include_references: boolean
@keyword cast: Cast (reload) an existing pairwise set of simple DataSet as RawPairwiseDataset. No pairwise conversions are done then.
NOTE(review): previously documented as boolean, but the value itself is handed to _cast -- confirm the expected type
"""
self.pairwise_parallelsentence_sets = OrderedDict()

cast = kwargs.setdefault("cast", None)
self.include_references = kwargs.setdefault("include_references", False)
self.replacement = kwargs.setdefault("replacement", False)

if cast:
    # Reload an already pairwise data set; plain_dataset is not converted
    self._cast(cast)
else:
    # First pass: collect the pairwise sentences under their judgment id.
    # A judgment id seen again replaces the earlier entry.
    pairwise_parallelsentences_per_jid = OrderedDict()
    for parallelsentence in plain_dataset.get_parallelsentences():
        judgment_id = parallelsentence.get_compact_judgment_id()
        pairwise_parallelsentences_per_jid[judgment_id] = parallelsentence.get_pairwise_parallelsentences(
            replacement=self.replacement,
            include_references=self.include_references
        )

    # Second pass: wrap every collection into a compact pairwise set.
    # items() works on both Python 2 and Python 3, unlike iteritems().
    for judgment_id, pairwise_parallelsentences in pairwise_parallelsentences_per_jid.items():
        self.pairwise_parallelsentence_sets[judgment_id] = CompactPairwiseParallelSentenceSet(pairwise_parallelsentences)
163
164
182
183
184
"""
A data set that contains pairwise comparisons organized by sentence id, i.e. if a sentence has multiple human judgments,
these will be grouped together under the sentence id.
@ivar replacement: Defines whether pairs are done in all combinations without replacement (replacement=False) or with replacement (replacement=True)
@type replacement: boolean
@ivar include_references: Defines whether references need to be included in pairs, as sentences from system "_ref".
Do not enable this for test-sets, as reverting this is not yet supported
@type include_references: boolean
@ivar invert_ranks: Whether ranks should be considered the other way round (highest value=best rank)
@type invert_ranks: boolean
"""
197
198
"""
Fill the data set with one pairwise set per sentence id of the given plain data set.
@param plain_dataset: the simple dataset to be converted to an analytic one.
@type plain_dataset: L{DataSet}
@keyword replacement: Defines whether pairs are done in all combinations without replacement (replacement=False) or with replacement (replacement=True)
@type replacement: boolean
@keyword include_references: Defines whether references need to be included in pairs, as sentences from system "_ref".
Do not enable this for test-sets, as reverting this is not yet supported
@type include_references: boolean
@keyword restrict_ranks: Filter pairs to keep only the ones that include the given ranks. Don't filter if the list is empty. Before
using this, make sure that the ranks are normalized
@type restrict_ranks: [int, ...]
@keyword invert_ranks: Whether ranks should be considered the other way round (highest value=best rank)
@type invert_ranks: boolean
@keyword rank_name: name of the rank attribute (default "rank"); passed on to the pairwise conversion and to every pairwise set
@keyword filter_unassigned: passed on unchanged to the pairwise conversion of each sentence (default False)
@keyword rankless: passed on unchanged to the pairwise conversion of each sentence (default False)
"""
self.pairwise_parallelsentence_sets = OrderedDict()

self.include_references = kwargs.setdefault("include_references", False)
self.replacement = kwargs.setdefault("replacement", False)
self.filter_unassigned = kwargs.setdefault("filter_unassigned", False)
self.restrict_ranks = kwargs.setdefault("restrict_ranks", [])
self.rank_name = kwargs.setdefault("rank_name", "rank")
self.invert_ranks = kwargs.setdefault("invert_ranks", False)
self.rankless = kwargs.setdefault("rankless", False)

# First pass: gather the pairwise sentences of all judgments that share
# a sentence id into one list per id
pairwise_parallelsentences_per_sid = OrderedDict()
for parallelsentence in plain_dataset.get_parallelsentences():
    sentence_id = parallelsentence.get_compact_id()
    pairwise_parallelsentences_per_sid.setdefault(sentence_id, []).extend(
        parallelsentence.get_pairwise_parallelsentences(
            replacement=self.replacement,
            include_references=self.include_references,
            filter_unassigned=self.filter_unassigned,
            invert_ranks=self.invert_ranks,
            rank_name=self.rank_name,
            rankless=self.rankless
        )
    )

# Second pass: wrap every list into an analytic pairwise set.
# items() works on both Python 2 and Python 3, unlike iteritems().
for sentence_id, pairwise_parallelsentences in pairwise_parallelsentences_per_sid.items():
    pairwise_set = AnalyticPairwiseParallelSentenceSet(
        pairwise_parallelsentences,
        rank_name=self.rank_name
    )
    self.pairwise_parallelsentence_sets[sentence_id] = pairwise_set
    # An empty restrict_ranks list means that no filtering is requested
    if self.restrict_ranks:
        pairwise_set.restrict_ranks(self.restrict_ranks)
249
250
260
261
262
263
265 """
266 A data set that contains pairwise comparisons merged by sentence id, i.e. if a sentence has multiple human judgments,
267 these will be grouped together under the sentence id, and the overlapping pairwise judgments will be merged according
268 to soft or hard rank recomposition
269 """
274
275
276
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369