Package sentence :: Module dataset

Source Code for Module sentence.dataset

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
@author: Eleftherios Avramidis
"""

import sys
import re
from collections import OrderedDict

class DataSet(object):
    """
    A wrapper over a list of parallel sentences. It offers convenience functions for features and
    properties that apply to the entire set of parallel sentences altogether.
    @ivar parallelsentences: a list of the contained parallel sentence instances
    @type parallelsentences: [L{ParallelSentence}, ...]
    @ivar attribute_names: (optional) keeps track of the attributes that can be found in the contained parallel sentences
    @type attribute_names: [str, ...]
    @ivar attribute_names_found: remembers whether the attribute names have been set
    @type attribute_names_found: boolean
    """

    def __init__(self, content=None, attributes_list=None, annotations=None):
        """
        @param content: the parallel sentences to be wrapped in the dataset, or another
            DataSet whose contents should be adopted
        @type content: [L{ParallelSentence}, ...] or L{DataSet}
        @param attributes_list: if the names of the attributes for the parallel sentences are known,
            they can be given here, in order to avoid extra processing. Otherwise they will be
            computed when needed.
        @type attributes_list: [str, ...]
        @param annotations: not implemented
        @type annotations: list
        """
        # None defaults avoid the mutable-default-argument pitfall
        if content is None:
            content = []
        if annotations is None:
            annotations = []

        # isinstance already covers subclasses of DataSet
        if isinstance(content, DataSet):
            self.parallelsentences = content.parallelsentences
            self.annotations = content.annotations
            self.attribute_names = content.attribute_names
            self.attribute_names_found = content.attribute_names_found
        else:
            self.parallelsentences = content
            self.annotations = annotations
            if attributes_list:
                self.attribute_names = attributes_list
                self.attribute_names_found = True
            else:
                self.attribute_names_found = False
                self.attribute_names = []
            self.ensure_judgment_ids()

    def ensure_judgment_ids(self):
        """
        Processes the contained parallel sentences one by one and ensures that they have judgment
        ids; otherwise an incremental value is assigned.
        """
        i = 0
        try:
            for parallelsentence in self.parallelsentences:
                i += 1
                if not parallelsentence.has_judgment_id():
                    parallelsentence.add_judgment_id(i)
        except AttributeError:
            # tolerate contents that do not support judgment ids
            pass

    def get_parallelsentences(self):
        return self.parallelsentences

75 """ 76 Group the contained parallel sentences by sentence id 77 @return: a dictionary with lists of parallel sentences for each sentence id 78 @rtype: dict(String, list(sentence.parallelsentence.ParallelSentence)) 79 """ 80 ps_sid = {} 81 for parallelsentence in self.parallelsentences: 82 #get the id of the particular multiple ranking (judgment) or create a new one 83 sentence_id = parallelsentence.get_compact_id() 84 if not ps_sid.has_key(sentence_id): 85 ps_sid[sentence_id] = [parallelsentence] 86 else: 87 ps_sid[sentence_id].append(parallelsentence) 88 return ps_sid
89 90
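    # Illustrative sketch of the grouping above (hypothetical compact ids assumed):
    # parallel sentences whose get_compact_id() returns the same value land in the
    # same bucket, e.g.
    #
    #     groups = dataset.get_parallelsentences_per_sentence_id()
    #     # groups == {"15": [ps_a, ps_b], "16": [ps_c], ...}
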
92 """ 93 Parallel sentences often come with multiple occurences, where a judgment id is unique. 94 This functions returns a dictionary of all the parallel sentences mapped to their respective judgment id. 95 If a judment id is missing, it gets assigned the incremental value showing the order of the entry in the set. 96 @return: A dictionary of all the parallel sentences mapped to their respective judgment id. 97 @rtype: dict 98 """ 99 ps_jid = {} 100 j = 0 101 for parallelsentence in self.parallelsentences: 102 #get the id of the particular multiple ranking (judgment) or create a new one 103 try: 104 judgement_id = parallelsentence.get_attribute("judgment_id") 105 except AttributeError: 106 judgement_id = str(j) 107 j += 1 108 109 #add the pair into the dictionary 110 ps_jid[judgement_id] = parallelsentence 111 return ps_jid
112 113
    def get_annotations(self):
        return self.annotations

    def get_attribute_names(self):
        if not self.attribute_names_found:
            self.attribute_names = self._retrieve_attribute_names()
            self.attribute_names_found = True
        return self.attribute_names

    def get_all_attribute_names(self):
        # copy the list, so that extending it does not modify the cached attribute names in place
        all_attribute_names = list(self.get_attribute_names())
        all_attribute_names.extend(self.get_nested_attribute_names())
        return list(set(all_attribute_names))

    def get_nested_attribute_names(self):
        nested_attribute_names = set()
        for parallelsentence in self.parallelsentences:
            nested_attribute_names.update(parallelsentence.get_nested_attributes().keys())
        return list(nested_attribute_names)

    def _retrieve_attribute_names(self):
        attribute_names = set()
        for parallelsentence in self.parallelsentences:
            attribute_names.update(parallelsentence.get_attribute_names())
        return list(attribute_names)

    def get_discrete_attribute_values(self, discrete_attribute_names):
        attvalues = {}
        for parallelsentence in self.parallelsentences:
            allattributes = {}
            allattributes.update(parallelsentence.get_nested_attributes())
            allattributes.update(parallelsentence.attributes)
            for attname in discrete_attribute_names:
                if attname in allattributes:
                    attvalue = allattributes[attname]
                    try:
                        attvalues[attname].add(attvalue)
                    except KeyError:
                        attvalues[attname] = set([attvalue])
        return attvalues

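    # Illustrative sketch for get_discrete_attribute_values: for every requested
    # attribute, the distinct values found across the dataset are collected into a
    # set ("langsrc" and "testset" as used elsewhere in this module), e.g.
    #
    #     values = dataset.get_discrete_attribute_values(["langsrc", "testset"])
    #     # values == {"langsrc": set(["de", "en"]), "testset": set(["wmt11"])}
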
    def confirm_attributes(self, desired_attributes=[], meta_attributes=[]):
        """
        Convenience function that checks whether the user-requested attributes (possibly
        via the config file) exist in the current dataset's list. If not, it raises an error
        to warn of a possible typo.
        @param desired_attributes: attributes that need to participate in the ML process
        @type desired_attributes: [str, ...]
        @param meta_attributes: attributes that need not participate in the ML process (meta)
        @type meta_attributes: [str, ...]
        """
        attribute_names = self.get_all_attribute_names()
        # list.extend() returns None, so concatenate instead of extending in place
        asked_attributes = set(desired_attributes + meta_attributes)
        for asked_attribute in asked_attributes:
            if asked_attribute not in attribute_names:
                sys.stderr.write("Requested feature %s probably not available\n" % asked_attribute)
                raise KeyError(asked_attribute)

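    # Usage sketch for confirm_attributes ("lm_prob" is a hypothetical feature name):
    #
    #     dataset.confirm_attributes(desired_attributes=["lm_prob"],
    #                                meta_attributes=["judgment_id"])
    #
    # raises KeyError if e.g. "lm_prob" is misspelled and not present in the dataset.
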
    def append_dataset(self, add_dataset):
        """
        Appends a given dataset to the end of the current dataset in place
        @param add_dataset: dataset to be appended
        @type add_dataset: L{DataSet}
        """
        self.parallelsentences.extend(add_dataset.get_parallelsentences())
        existing_attribute_names = set(self.get_attribute_names())
        new_attribute_names = set(add_dataset.get_attribute_names())
        merged_attribute_names = existing_attribute_names.union(new_attribute_names)
        self.attribute_names = list(merged_attribute_names)

    #attribute_replacements = {"rank": "predicted_rank"}
    def merge_dataset(self, dataset_for_merging_with, attribute_replacements={}, merging_attributes=["id"], merge_strict=False, **kwargs):
        """
        Takes a dataset which contains the same parallel sentences, but with different attributes.
        Incoming parallel sentences are matched with the existing parallel sentences based on the
        merging attributes. Incoming attributes can be renamed, so that they don't replace existing
        attributes.
        @param dataset_for_merging_with: the dataset whose contents are to be merged with the current dataset
        @type dataset_for_merging_with: L{DataSet}
        @param attribute_replacements: the renamings that need to be applied to the incoming attributes before they are merged
        @type attribute_replacements: {str: str, ...}
        @param merging_attributes: the names of the attributes that signify that two parallel sentences are the same, though with possibly different attributes
        @type merging_attributes: [str, ...]
        """
        incoming_parallelsentences_indexed = OrderedDict()
        incoming_parallelsentences = dataset_for_merging_with.get_parallelsentences()

        # index incoming parallel sentences into a dict, keyed by a tuple of the merging
        # attributes (tuples preserve the order of merging_attributes)
        for incoming_ps in incoming_parallelsentences:
            key = tuple([incoming_ps.get_attribute(att) for att in merging_attributes])
            incoming_parallelsentences_indexed[key] = incoming_ps

        for i in range(len(self.parallelsentences)):
            if self.parallelsentences[i]:
                key = tuple([self.parallelsentences[i].get_attribute(att) for att in merging_attributes])
                try:
                    incoming_ps = incoming_parallelsentences_indexed[key]
                    self.parallelsentences[i].merge_parallelsentence(incoming_ps, attribute_replacements, **kwargs)
                except KeyError:
                    # %s on a tuple key would fail for multiple merging attributes, so stringify it first
                    sys.stderr.write("Didn't find key while merging sentence %s\n" % str(key))
                    if merge_strict:
                        self.parallelsentences[i] = None

    #attribute_replacements = {"rank": "predicted_rank"}
    def merge_dataset_symmetrical(self, dataset_for_merging_with, attribute_replacements={}, confirm_attribute=""):
        """
        Merge the current dataset in place with another symmetrical dataset of the same size and
        the same original content, but possibly with different attributes per parallel sentence
        @param dataset_for_merging_with: the symmetrical dataset with the same order of parallel sentences
        @type dataset_for_merging_with: L{DataSet}
        @param attribute_replacements: a dict of attribute replacements that need to take place before merging occurs
        @type attribute_replacements: {str: str, ...}
        """
        incoming_parallelsentences = dataset_for_merging_with.get_parallelsentences()
        if len(self.parallelsentences) != len(incoming_parallelsentences):
            raise IndexError("Error, datasets not symmetrical")
        if confirm_attribute != "":
            vector1 = [ps.get_attribute(confirm_attribute) for ps in self.get_parallelsentences()]
            vector2 = [ps.get_attribute(confirm_attribute) for ps in dataset_for_merging_with.get_parallelsentences()]
            if vector1 != vector2:
                raise IndexError("Error, datasets not symmetrical, concerning the identifier attribute {}".format(confirm_attribute))

        for i in range(len(self.parallelsentences)):
            incoming_ps = incoming_parallelsentences[i]
            self.parallelsentences[i].merge_parallelsentence(incoming_ps, attribute_replacements)

    def import_target_attributes_onsystem(self, dataset, target_attribute_names, keep_attributes_general=[], keep_attributes_source=[], keep_attributes_target=[]):
        new_parallelsentences = []
        incoming_parallelsentences = dict([(p.get_attribute("judgement_id"), p) for p in dataset.get_parallelsentences()])

        for existing_parallelsentence in self.parallelsentences:
            jid = existing_parallelsentence.get_attribute("judgement_id")
            try:
                incoming_parallelsentence = incoming_parallelsentences[jid]
                existing_parallelsentence.import_indexed_parallelsentence(incoming_parallelsentence, target_attribute_names, keep_attributes_general, keep_attributes_source, keep_attributes_target)
            except KeyError:
                sys.stderr.write("Warning: could not get a sentence for judgement_id={}\n".format(jid))
                #existing_parallelsentence.import_missing_parallelsentence(target_attribute_names, keep_attributes_general, keep_attributes_source, keep_attributes_target)
            new_parallelsentences.append(existing_parallelsentence)
        self.parallelsentences = new_parallelsentences

    def merge_references_symmetrical(self, dataset_for_merging_with):
        incoming_parallelsentences = dataset_for_merging_with.get_parallelsentences()
        if len(self.parallelsentences) != len(incoming_parallelsentences):
            raise IndexError("Error, datasets not symmetrical")
        for i in range(len(self.parallelsentences)):
            self.parallelsentences[i].ref = incoming_parallelsentences[i].ref

    # NOTE: the original signature line was missing from this listing; the method name is inferred
    def get_translations_count_vector(self):
        return [len(ps.get_translations()) for ps in self.get_parallelsentences()]

    def get_singlesource_strings(self):
        return [ps.get_source().get_string() for ps in self.parallelsentences]

    def write_singlesource_strings_file(self, filename=None):
        import tempfile
        if not filename:
            # tempfile.mkstemp returns a (descriptor, path) tuple, not a file object,
            # so use a named temporary file that persists after closing instead
            output_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
            filename = output_file.name
        else:
            output_file = open(filename, 'w')
        for source in self.get_singlesource_strings():
            output_file.write(source)
            output_file.write('\n')
        output_file.close()
        return filename

    def get_multisource_strings(self):
        raise NotImplementedError

    def get_target_strings(self):
        output = []
        for ps in self.parallelsentences:
            output.append([tgt.get_string() for tgt in ps.get_translations()])
        return output

    def modify_singlesource_strings(self, strings=[]):
        for string, ps in zip(strings, self.parallelsentences):
            ps.src.string = string

    def modify_target_strings(self, strings=[]):
        for stringlist, ps in zip(strings, self.parallelsentences):
            for string, tgt in zip(stringlist, ps.tgt):
                tgt.string = string

    def remove_ties(self):
        """
        Modifies the current dataset by removing ranking ties
        """
        for ps in self.parallelsentences:
            ps.remove_ties()

    def get_size(self):
        return len(self.parallelsentences)

    def get_head_sentences(self, n):
        return self.parallelsentences[:n]

    def get_tail_sentences(self, n):
        return self.parallelsentences[-n:]

    def split(self, ratio):
        size = int(round(ratio * len(self.parallelsentences)))
        # split at the computed boundary, keeping every sentence in exactly one half
        return DataSet(self.parallelsentences[:size]), DataSet(self.parallelsentences[size:])

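    # Usage sketch for split (ratio between 0 and 1):
    #
    #     training_set, test_set = dataset.split(0.9)
    #     # training_set holds the first 90% of the parallel sentences,
    #     # test_set the remaining 10%
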
    def add_attribute_vector(self, att_vector, target="tgt", item=0):
        # zip pairs the attribute dicts with the parallel sentences in order,
        # so no reversal of att_vector is needed
        for ps, atts in zip(self.parallelsentences, att_vector):
            atts = OrderedDict([(k, str(v)) for k, v in atts.iteritems()])
            if target == "ps":
                ps.add_attributes(atts)
            elif target == "tgt":
                ps.tgt[item].add_attributes(atts)
            elif target == "src":
                ps.src.add_attributes(atts)

    def select_attribute_names(self, expressions=[]):
        attribute_names = set()
        # compile the list of expressions first, so that there is minimal overhead
        compiled_expressions = [re.compile(expression) for expression in expressions]
        for expression in compiled_expressions:
            for attribute_name in self.get_all_attribute_names():
                if expression.match(attribute_name):
                    attribute_names.add(attribute_name)
        return list(attribute_names)

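    # Usage sketch for select_attribute_names (hypothetical feature-name patterns):
    #
    #     names = dataset.select_attribute_names([r"^lm_.*", r"^parse_.*"])
    #
    # returns every available attribute name matching one of the regular expressions.
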
    def clone(self):
        return DataSet(self.parallelsentences, self.attribute_names)

    """
    def get_nested_attributes(self):
        propagated_parallelsentences = []
        propagated_attribute_names = set()
        for psentence in self.parallelsentences:
            psentence.propagate_attributes()
            propagated_parallelsentences.append(psentence)
            propagated_attribute_names.add(psentence.get_attributes())
        self.parallelsentences = propagated_parallelsentences
        self.attribute_names = list(propagated_attribute_names)
    """

    def __eq__(self, other):
        """
        @todo: comparison doesn't really work
        """
        i = 0
        for ps_here, ps_other in zip(self.parallelsentences, other.parallelsentences):
            i += 1
            if not ps_here == ps_other:
                print "Sentence %d with id %s-%s seems to be unequal" % (i, ps_here.get_attribute("ps1_id"), ps_here.get_attribute("ps2_id"))
                return False
        return True
        # return self.parallelsentences == other.parallelsentences

    def compare(self, other_dataset, start=0, to=None):
        """
        Compares this dataset to another, by displaying parallel sentences in pairs
        """
        if not to:
            # default to the full length, so that the last sentence is included
            to = len(self.parallelsentences)
        for ps1 in self.parallelsentences[start:to]:
            for ps2 in other_dataset.get_parallelsentences():
                if (ps2.get_attributes()["id"] == ps1.get_attributes()["id"]
                        and ps2.get_attributes()["testset"] == ps1.get_attributes()["testset"]
                        and ps2.get_attributes()["langsrc"] == ps1.get_attributes()["langsrc"]):
                    print ps1.get_source().get_string(), "\n", ps2.get_source().get_string()
                    print ps1.get_attributes(), "\n", ps2.get_attributes()
                    print ps1.get_translations()[0].get_string(), "\n", ps2.get_translations()[0].get_string()
                    print ps1.get_translations()[0].get_attributes(), "\n", ps2.get_translations()[0].get_attributes()
                    print ps1.get_translations()[1].get_string(), "\n", ps2.get_translations()[1].get_string()
                    print ps1.get_translations()[1].get_attributes(), "\n", ps2.get_translations()[1].get_attributes()

    def __iter__(self):
        """
        A DataSet iterates over its basic wrapped object, ParallelSentence
        """
        return self.parallelsentences.__iter__()
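
# Minimal usage sketch, assuming L{ParallelSentence} instances from
# sentence.parallelsentence (the exact constructor signature may differ):
#
#     from sentence.parallelsentence import ParallelSentence
#
#     sentences = [...]  # a list of ParallelSentence instances
#     dataset = DataSet(sentences)
#     print dataset.get_size()
#     print dataset.get_all_attribute_names()
#     training_set, test_set = dataset.split(0.9)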