1
2
3
4 """
5
6 @author: Eleftherios Avramidis
7 """
8
9 import sys
10 import re
11 from compiler.ast import Raise
12 from collections import OrderedDict
13
15 """
16 A wrapper over a list of parallelsentences. It offers convenience functions for features and properties that
17 apply to the entire set of parallelsentences altogether
18 @ivar parallelsentences: a list of the contained parallel sentence instances
19 @type parallelsentences: [L{ParallelSentence}, ...]
20 @ivar attribute_names: (optional) keeps track of the attributes that can be found in the contained parallel sentences
21 @type attribute_names: [str, ...]
22 @ivar attribute_names_found: remembers if the attribute names have been set
23 @type attribute_names_found: boolean
24 """
25
26 - def __init__(self, content = [], attributes_list = [], annotations = []):
27 """
28 @param parallelsentence_list: the parallelsentences to be wrapped in the dataset
29 @type parallelsentence_list: [L{ParallelSentence}, ...]
30 @param attributes_list: if the names of the attributes for the parallelsentences are known, they can
31 be given here, in order to avoid extra processing. Otherwise they will be computed when needed.
32 @type [str, ...]
33 @param annotations: Not implemented
34 @type list
35 """
36
37 if isinstance(content, DataSet) or issubclass(content.__class__, DataSet):
38 self.parallelsentences = content.parallelsentences
39 self.annotations = content.annotations
40 self.attribute_names = content.attribute_names
41 self.attribute_names_found = content.attribute_names_found
42
43 else:
44
45 self.parallelsentences = content
46 self.annotations = annotations
47 if attributes_list:
48 self.attribute_names = attributes_list
49 self.attribute_names_found = True
50 else:
51 self.attribute_names_found = False
52 self.attribute_names = []
53 self.ensure_judgment_ids()
54
68
69
        # Accessor for the wrapped list of parallel sentences (the def line of
        # this method is not visible in this extract; presumably
        # get_parallelsentences()).
        return self.parallelsentences
72
73
        """
        Group the contained parallel sentences by sentence id.
        @return: a dictionary with lists of parallel sentences for each sentence id
        @rtype: dict(String, list(sentence.parallelsentence.ParallelSentence))
        """
        ps_sid = {}
        for parallelsentence in self.parallelsentences:
            # The compact id identifies a sentence across multiple occurrences.
            sentence_id = parallelsentence.get_compact_id()
            # FIXME: dict.has_key() is Python-2-only; "sentence_id not in ps_sid"
            # (or collections.defaultdict(list)) would be portable.
            if not ps_sid.has_key(sentence_id):
                ps_sid[sentence_id] = [parallelsentence]
            else:
                ps_sid[sentence_id].append(parallelsentence)
        return ps_sid
89
90
        """
        Parallel sentences often come with multiple occurrences, where a judgment id is unique.
        This function returns a dictionary of all the parallel sentences mapped to their respective judgment id.
        If a judgment id is missing, it gets assigned the incremental value showing the order of the entry in the set.
        @return: A dictionary of all the parallel sentences mapped to their respective judgment id.
        @rtype: dict
        """
        ps_jid = {}
        j = 0
        for parallelsentence in self.parallelsentences:
            try:
                judgement_id = parallelsentence.get_attribute("judgment_id")
            except AttributeError:
                # No judgment id on this sentence: fall back to its running
                # position. NOTE(review): the counter advances only on this
                # fallback path, so generated ids may collide with explicit
                # ones and silently overwrite entries in ps_jid — confirm.
                judgement_id = str(j)
                j += 1
            ps_jid[judgement_id] = parallelsentence
        return ps_jid
112
113
        # Accessor for the annotations list (method header not visible in this
        # extract; presumably get_annotations()).
        return self.annotations
116
        # Lazily compute and cache the attribute names on first request;
        # subsequent calls return the cached list.
        if not self.attribute_names_found:
            self.attribute_names = self._retrieve_attribute_names()
            self.attribute_names_found = True
        return self.attribute_names
122
127
133
139
        # Collect the set of observed values for every requested discrete
        # attribute. NOTE(review): discrete_attribute_names is a parameter of
        # the method signature, which is not visible in this extract.
        attvalues = {}
        for parallelsentence in self.parallelsentences:
            # Merge nested (source/target) and sentence-level attributes;
            # sentence-level values win on key clashes.
            allattributes = {}
            allattributes.update(parallelsentence.get_nested_attributes())
            allattributes.update(parallelsentence.attributes)
            for attname in discrete_attribute_names:
                if attname in allattributes:
                    attvalue = allattributes[attname]
                    try:
                        attvalues[attname].add(attvalue)
                    # FIXME: bare except; should be "except KeyError" (first
                    # value seen for this attribute name creates the set).
                    except:
                        attvalues[attname] = set([attvalue])
        return attvalues
154
        """
        Convenience function that checks whether the user-requested attributes (possibly
        via the config file) exist in the current dataset's list. If not, raise an error
        to warn of a possible typo or so.
        @param desired_attributes: attributes that need to participate in the ML process
        @type desired_attributes: [str, ...]
        @param meta_attributes: attributes that need not participate in the ML process (meta)
        @type meta_attributes: [str, ...]
        """
        attribute_names = self.get_all_attribute_names()
        # FIXME: list.extend() returns None, so set(...) raises TypeError here;
        # this should be set(desired_attributes + meta_attributes). As written
        # it also mutates the caller's desired_attributes list in place.
        asked_attributes = set(desired_attributes.extend(meta_attributes))
        for asked_attribute in asked_attributes:
            if asked_attribute not in attribute_names:
                sys.stderr.write("Requested feature %s probably not available\n" % asked_attribute)
                raise KeyError
171
        """
        Appends a given data set to the end of the current dataset in place.
        @param add_dataset: dataset to be appended
        @type add_dataset: L{DataSet}
        """
        self.parallelsentences.extend(add_dataset.get_parallelsentences())
        # Union the attribute names of both datasets so the cached name list
        # stays valid. NOTE(review): this calls get_attribute_names() while
        # other code in this class uses get_all_attribute_names() — confirm
        # the two are equivalent.
        existing_attribute_names = set(self.get_attribute_names())
        new_attribute_names = set(add_dataset.get_attribute_names())
        merged_attribute_names = existing_attribute_names.union(new_attribute_names)
        self.attribute_names = list(merged_attribute_names)
183
184
185 - def merge_dataset(self, dataset_for_merging_with, attribute_replacements = {}, merging_attributes = ["id"], merge_strict = False, **kwargs):
186 """
187 It takes a dataset which contains the same parallelsentences, but with different attributes.
188 Incoming parallel sentences are matched with the existing parallel sentences based on the "merging attribute".
189 Incoming attributes can be renamed, so that they don't replace existing attributes.
190 @param dataset_for_merging_with: the data set whose contents are to be merged with the current data set
191 @type dataset_for_merging_with: DataSet
192 @param attribute_replacements: listing the attribute renamings that need to take place to the incoming attributes, before the are merged
193 @type attribute_replacements: list of tuples
194 @param merging_attributes: the names of the attributes that signify that two parallelsentences are the same, though with possibly different attributes
195 @type merging_attributes: list of strings
196 """
197 incoming_parallelsentences_indexed = OrderedDict()
198 incoming_parallelsentences = dataset_for_merging_with.get_parallelsentences()
199
200
201 for incoming_ps in incoming_parallelsentences:
202 key = tuple([incoming_ps.get_attribute(att) for att in merging_attributes])
203 incoming_parallelsentences_indexed[key] = incoming_ps
204
205
206 for i in range(len(self.parallelsentences)):
207 if self.parallelsentences[i]:
208 key = tuple([self.parallelsentences[i].get_attribute(att) for att in merging_attributes])
209 try:
210 incoming_ps = incoming_parallelsentences_indexed[key]
211 self.parallelsentences[i].merge_parallelsentence(incoming_ps, attribute_replacements, **kwargs)
212 except KeyError:
213 sys.stderr.write( "Didn't find key while merging sentence %s \n" % key )
214 if merge_strict:
215 self.parallelsentences[i] = None
216 pass
217
218
219
        """
        Merge the current dataset in place with another symmetrical dataset of the same size and the same original content, but
        possibly with different attributes per parallel sentence.
        @param dataset_for_merging_with: the symmetrical dataset with the same order of parallel sentences
        @type dataset_for_merging_with: L{DataSet}
        @param attribute_replacements: a dict of attribute replacements that need to take place, before merging occurs
        @type attribute_replacements: {str: str, ...}
        """
        # NOTE(review): confirm_attribute and attribute_replacements are
        # parameters of the method signature, which is not visible in this
        # extract.
        incoming_parallelsentences = dataset_for_merging_with.get_parallelsentences()
        if len(self.parallelsentences) != len(incoming_parallelsentences):
            raise IndexError("Error, datasets not symmetrical")
        if confirm_attribute != "":
            # Optionally verify both datasets agree on an identifier attribute
            # before merging position-by-position.
            vector1 = [ps.get_attribute(confirm_attribute) for ps in self.get_parallelsentences()]
            vector2 = [ps.get_attribute(confirm_attribute) for ps in dataset_for_merging_with.get_parallelsentences()]
            if vector1 != vector2:
                raise IndexError("Error, datasets not symmetrical, concerning the identifier attribute {}".format(confirm_attribute))

        # Sentences are aligned purely by position; merge attributes pairwise.
        for i in range(len(self.parallelsentences)):
            incoming_ps = incoming_parallelsentences[i]
            self.parallelsentences[i].merge_parallelsentence(incoming_ps, attribute_replacements)
241
242
244
245 new_parallelsentences = []
246 incoming_parallelsentences = dict([(p.get_attribute("judgement_id"), p) for p in dataset.get_parallelsentences()])
247
248 for existing_parallelsentence in self.parallelsentences:
249 jid = existing_parallelsentence.get_attribute("judgement_id")
250 try:
251 incoming_parallelsentence = incoming_parallelsentences[jid]
252 existing_parallelsentence.import_indexed_parallelsentence(incoming_parallelsentence, target_attribute_names, keep_attributes_general, keep_attributes_source, keep_attributes_target)
253 except:
254 sys.stderr.write("Warning: could not get a sentence for judgement_id={}".format(jid))
255
256 new_parallelsentences.append(existing_parallelsentence)
257 self.parallelsentences = new_parallelsentences
258
259
        # Copy the reference translation of each incoming sentence onto the
        # positionally corresponding sentence of this dataset; both datasets
        # must be the same size.
        incoming_parallelsentences = dataset_for_merging_with.get_parallelsentences()
        if len(self.parallelsentences) != len(incoming_parallelsentences):
            raise IndexError("Error, datasets not symmetrical")
        for i in range(len(self.parallelsentences)):
            self.parallelsentences[i].ref = incoming_parallelsentences[i].ref
266
267
270
271
274
275
        # Write one source string per line to the given file, creating a
        # temporary one if no filename was provided; returns the filename.
        import tempfile
        if not filename:
            # FIXME: tempfile.mkstemp() returns an (fd, path) tuple, not a
            # file object, so file.name and file.write below fail on this
            # branch; tempfile.NamedTemporaryFile(delete=False) is probably
            # what was intended.
            file = tempfile.mkstemp(text=True)
            filename = file.name
        else:
            file = open(filename, 'w')
        for source in self.get_singlesource_strings():
            file.write(source)
            file.write('\n')
        file.close()
        return filename
288
        # Placeholder: to be provided by a subclass or a later implementation.
        raise NotImplementedError
291
297
        # Overwrite each source string, pairing positionally with the given
        # list of strings (zip silently ignores extra items on either side).
        for string, ps in zip(strings, self.parallelsentences):
            ps.src.string = string
301
        # Overwrite the target (translation) strings; expects one list of
        # strings per parallel sentence, one entry per translation.
        for stringlist, ps in zip(strings, self.parallelsentences):
            for string, tgt in zip(stringlist, ps.tgt):
                tgt.string = string
306
307
308
309
310
        """
        Modifies the current dataset in place by removing ranking ties.
        """
        # Delegates tie removal to each contained parallel sentence.
        for ps in self.parallelsentences:
            ps.remove_ties()
317
318
        # Number of contained parallel sentences (method header not visible in
        # this extract).
        return len(self.parallelsentences)

        # First n parallel sentences (method header not visible).
        return self.parallelsentences[:n]

        # Last n parallel sentences (method header not visible).
        return self.parallelsentences[-1 * n:]

        # Split the dataset into two DataSets at ratio * size.
        # NOTE(review): the slices [:size-2] and [size-1:] drop the element at
        # index size-2 and shift the split point; [:size] / [size:] would be
        # the usual exhaustive partition — confirm intent.
        size = int(round(ratio * len(self.parallelsentences)))
        return DataSet(self.parallelsentences[:size-2]), DataSet(self.parallelsentences[size-1:])
331
        # Attach per-sentence attribute dicts, consuming att_vector in reverse
        # order. NOTE(review): att_vector, target and item come from the
        # method signature, which is not visible in this extract; reverse()
        # mutates the caller's list in place.
        att_vector.reverse()

        for ps, atts in zip(self.parallelsentences, att_vector):
            # All attribute values are stringified before being attached.
            # NOTE(review): dict.iteritems() is Python-2-only.
            atts = OrderedDict([(k, str(v)) for k,v in atts.iteritems()])
            if target == "ps":
                ps.add_attributes(atts)
            elif target == "tgt":
                ps.tgt[item].add_attributes(atts)
            elif target == "src":
                ps.src.add_attributes(atts)
344
345
        # Select the attribute names matching any of the given regular
        # expressions (expressions comes from the method signature, which is
        # not visible in this extract).
        attribute_names = set()

        compiled_expressions = [re.compile(expression) for expression in expressions]
        for expression in compiled_expressions:
            for attribute_name in self.get_all_attribute_names():
                if re.match(expression, attribute_name):
                    attribute_names.add(attribute_name)
                else:
                    # NOTE(review): debug leftover — fires for every
                    # non-matching name (Python-2 print statement).
                    print "tzifos"
        return list(attribute_names)
357
358
361
362 """
363 def get_nested_attributes(self):
364
365 propagated_parallelsentences = []
366 propagated_attribute_names = set()
367 for psentence in self.parallelsentences:
368 psentence.propagate_attributes()
369 propagated_parallelsentences.append(psentence)
370 propagated_attribute_names.add( psentence.get_attributes() )
371 self.parallelsentences = propagated_parallelsentences
372 self.attribute_names = list( propagated_attribute_names )
373 """
374
        """
        Compare this dataset with another one, sentence by sentence, reporting
        the first mismatch found.
        @todo comparison doesn't really work
        """
        i = 0
        for ps_here, ps_other in zip(self.parallelsentences, other.parallelsentences):
            i+=1
            if not ps_here == ps_other:
                # NOTE(review): Python-2 print statement; also assumes every
                # sentence carries ps1_id/ps2_id attributes — confirm.
                print "Sentence %d with id %s-%s seems to be unequal"% (i, ps_here.get_attribute("ps1_id"), ps_here.get_attribute("ps2_id"))
                return False
        return True
386
387
388 - def compare(self, other_dataset, start=0, to=None ):
389 """
390 Compares this dataset to another, by displaying parallel sentences in pairs
391 """
392 if not to:
393 to = len(self.parallelsentences)-1
394 for ps1 in self.parallelsentences[start:to]:
395 for ps2 in other_dataset.get_parallelsentences():
396 if ps2.get_attributes()["id"] == ps1.get_attributes()["id"] and ps2.get_attributes()["testset"] == ps1.get_attributes()["testset"] and ps2.get_attributes()["langsrc"] == ps1.get_attributes()["langsrc"]:
397 print ps1.get_source().get_string() , "\n", ps2.get_source().get_string()
398 print ps1.get_attributes() , "\n", ps2.get_attributes()
399 print ps1.get_translations()[0].get_string() , "\n", ps2.get_translations()[0].get_string()
400 print ps1.get_translations()[0].get_attributes() , "\n", ps2.get_translations()[0].get_attributes()
401 print ps1.get_translations()[1].get_string() , "\n", ps2.get_translations()[1].get_string()
402 print ps1.get_translations()[1].get_attributes() , "\n", ps2.get_translations()[1].get_attributes()
403
404
405
        """
        A DataSet iterates over its basic wrapped object, ParallelSentence.
        """
        # Delegate iteration directly to the underlying list of sentences.
        return self.parallelsentences.__iter__()
411