Package sentence :: Module coupleddataset

Source Code for Module sentence.coupleddataset

'''
Created on 23 Feb 2012

@author: Eleftherios Avramidis
'''
from itertools import combinations
import sys

from dataset import DataSet
from coupledparallelsentence import CoupledParallelSentence
from dataprocessor.input.orangereader import OrangeData
from dataprocessor.sax.saxps2jcml import IncrementalJcml
from dataprocessor.input.jcmlreader import JcmlReader

class CoupledDataSet(DataSet):
    '''
    A coupled data set contains all possible couples of parallel sentences of a simple dataset.
    @ivar parallelsentences: a list of the coupled parallel sentences
    @type parallelsentences: [L{CoupledParallelSentence}, ...]
    '''
    def __init__(self, **kwargs):
        '''
        Builds a coupled dataset in one of three ways:
        @keyword construct: an existing simple L{DataSet} whose parallel sentences are paired into all possible couples
        @keyword readfile: the filename of a JCML file whose contents are already coupled parallel sentences
        @keyword wrap: an existing L{DataSet} whose parallel sentences are already coupled
        '''
        self.parallelsentences = []
        self.attribute_names_found = False
        self.attribute_names = []

        if "construct" in kwargs:
            dataset = kwargs["construct"]
            parallelsentences = dataset.get_parallelsentences()
            # all unordered pairs of parallel sentences
            ps_combinations = combinations(parallelsentences, 2)
            self.parallelsentences = [CoupledParallelSentence(ps1, ps2) for ps1, ps2 in ps_combinations]
        elif "readfile" in kwargs:
            dataset = JcmlReader(kwargs["readfile"]).get_dataset()
            already_coupled_parallelsentences = dataset.get_parallelsentences()
            self.parallelsentences = [CoupledParallelSentence(ps) for ps in already_coupled_parallelsentences]
        elif "wrap" in kwargs:
            dataset = kwargs["wrap"]
            already_coupled_parallelsentences = dataset.get_parallelsentences()
            self.parallelsentences = [CoupledParallelSentence(ps) for ps in already_coupled_parallelsentences]
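    # Usage sketch (illustrative addition, not part of the original module;
    # the filename is hypothetical):
    #
    #     simple_dataset = JcmlReader("training.jcml").get_dataset()
    #     coupled = CoupledDataSet(construct=simple_dataset)
    #     restored = coupled.get_single_set()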
    def get_single_set(self, critical_attribute=None):
        '''
        Reconstructs the original data set, with only one sentence per entry.
        @param critical_attribute: not used by this method (typically "predicted_rank" in callers)
        @return: simple dataset that contains the simplified parallel sentences
        @rtype: L{DataSet}
        '''
        single_parallelsentences = {}
        for coupled_parallelsentence in self.parallelsentences:
            # index each member of the couple by its tuple id, so duplicates collapse
            ps1, ps2 = coupled_parallelsentence.get_couple()
            single_parallelsentences[ps1.get_tuple_id()] = ps1
            single_parallelsentences[ps2.get_tuple_id()] = ps2

        sorted_keys = sorted(single_parallelsentences)
        sorted_ps = [single_parallelsentences[key] for key in sorted_keys]
        return DataSet(sorted_ps)

    def get_single_set_with_soft_ranks(self, attribute1="", attribute2="", critical_attribute="rank_soft_predicted"):
        '''
        Reconstructs the original data set, with only one sentence per entry,
        scoring each sentence with the sum of the classifier probabilities over all couples it appears in.
        @return: simple dataset that contains the simplified parallel sentences
        @rtype: L{DataSet}
        '''
        single_parallelsentences = {}
        single_parallelsentences_rank = {}
        for coupled_parallelsentence in self.parallelsentences:
            ps1, ps2 = coupled_parallelsentence.get_couple()
            # a win for attribute1 indicates that the first sentence is better;
            # if the attribute holds a class label instead of a probability, give up
            if coupled_parallelsentence.get_attribute(attribute1) == "n":
                return []
            prob_neg = -1.00 * float(coupled_parallelsentence.get_attribute(attribute1))
            prob_pos = -1.00 * float(coupled_parallelsentence.get_attribute(attribute2))

            single_parallelsentences[ps1.get_tuple_id()] = ps1
            try:
                single_parallelsentences_rank[ps1.get_tuple_id()] += prob_neg
            except KeyError:
                single_parallelsentences_rank[ps1.get_tuple_id()] = prob_neg

            single_parallelsentences[ps2.get_tuple_id()] = ps2
            try:
                single_parallelsentences_rank[ps2.get_tuple_id()] += prob_pos
            except KeyError:
                single_parallelsentences_rank[ps2.get_tuple_id()] = prob_pos

        # turn the accumulated scores into ranks; tied scores receive the same rank
        j = 0
        prev_rank = None
        prev_j = None
        normalized_rank = {}
        for key, rank in sorted(single_parallelsentences_rank.iteritems(), key=lambda (k, v): (v, k)):
            j += 1
            if rank == prev_rank:
                normalized_rank[key] = prev_j
            else:
                normalized_rank[key] = j
                prev_j = j
                prev_rank = rank

        sorted_keys = sorted(single_parallelsentences)
        sorted_ps = []
        for key in sorted_keys:
            ps = single_parallelsentences[key]
            ps.add_attributes({critical_attribute: str(normalized_rank[key])})
            sorted_ps.append(ps)
        return DataSet(sorted_ps)

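    # Worked example (illustrative addition): with accumulated scores
    # {"s1": -1.4, "s2": -0.6, "s3": -0.6}, the sort yields s1, s2, s3 and the
    # tie-aware normalization assigns ranks {"s1": 1, "s2": 2, "s3": 2};
    # s2 and s3 share rank 2 because their scores are equal.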
    def get_single_set_with_hard_ranks(self, critical_attribute=None):
        '''
        Reconstructs the original data set, with only one sentence per entry,
        scoring each sentence with the signed sum of the pairwise ranks of the couples it appears in.
        @param critical_attribute: the attribute holding the pairwise rank, e.g. "predicted_rank"
        @return: simple dataset that contains the simplified parallel sentences
        @rtype: L{DataSet}
        '''
        single_parallelsentences = {}
        single_parallelsentences_rank = {}
        for coupled_parallelsentence in self.parallelsentences:
            ps1, ps2 = coupled_parallelsentence.get_couple()
            rank = int(coupled_parallelsentence.get_attribute(critical_attribute))

            single_parallelsentences[ps1.get_tuple_id()] = ps1
            try:
                single_parallelsentences_rank[ps1.get_tuple_id()] += rank
            except KeyError:
                single_parallelsentences_rank[ps1.get_tuple_id()] = rank

            single_parallelsentences[ps2.get_tuple_id()] = ps2
            try:
                single_parallelsentences_rank[ps2.get_tuple_id()] -= rank
            except KeyError:
                single_parallelsentences_rank[ps2.get_tuple_id()] = -1 * rank

        # turn the accumulated scores into ranks; tied scores receive the same rank
        j = 0
        prev_rank = None
        prev_j = None
        normalized_rank = {}
        for key, rank in sorted(single_parallelsentences_rank.iteritems(), key=lambda (k, v): (v, k)):
            j += 1
            if rank == prev_rank:
                normalized_rank[key] = prev_j
            else:
                normalized_rank[key] = j
                prev_j = j
                prev_rank = rank

        sorted_keys = sorted(single_parallelsentences)
        sorted_ps = []
        for key in sorted_keys:
            ps = single_parallelsentences[key]
            ps.add_attributes({critical_attribute: str(normalized_rank[key])})
            sorted_ps.append(ps)
        return DataSet(sorted_ps)
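    # Sign convention (illustrative note): for a couple (ps1, ps2) with pairwise
    # value r, ps1 accumulates +r while ps2 accumulates -r, so sentences with
    # lower accumulated totals end up with better (lower) normalized ranks.
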
class CoupledDataSetDisk(CoupledDataSet):

    def __init__(self, existing_item):
        '''
        Couples an existing simple dataset lazily, so that large coupled sets
        can be streamed to disk by write() instead of being held in memory.
        @param existing_item: the simple dataset whose sentences will be coupled
        @type existing_item: L{DataSet}
        '''
        self.parallelsentences = []
        self.attribute_names_found = False
        self.attribute_names = []

        if isinstance(existing_item, DataSet):
            dataset = existing_item
            parallelsentences = dataset.get_parallelsentences()
            # keep the couples as a lazy generator; it is consumed once by write()
            self.parallelsentences = combinations(parallelsentences, 2)

    def write(self, filename, filter_att=False, diff=0, compact=False):
        writer = IncrementalJcml(filename)
        filtered = 0
        total = 0
        for ps1, ps2 in self.parallelsentences:
            total += 1
            # optionally skip couples whose first translations differ by no more
            # than [diff] on the attribute [filter_att]
            if filter_att and (abs(float(ps1.get_translations()[0].get_attribute(filter_att)) - float(ps2.get_translations()[0].get_attribute(filter_att))) <= diff):
                filtered += 1
                continue
            coupled_ps = CoupledParallelSentence(ps1, ps2, compact=compact)
            writer.add_parallelsentence(coupled_ps)
        percentage = 100.00 * filtered / total
        print "filtered {0} from {1} sentences ({2} %)".format(filtered, total, percentage)
        writer.close()

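    # Usage sketch (illustrative addition; the filenames and the attribute name
    # are hypothetical):
    #
    #     coupled_disk = CoupledDataSetDisk(JcmlReader("training.jcml").get_dataset())
    #     coupled_disk.write("training.coupled.jcml", filter_att="rank", diff=0)
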
class OrangeCoupledDataSet(OrangeData):
    """
    A wrapper for the Orange ExampleTable that can be initialized upon a CoupledDataSet.
    @todo: maybe change that to a function of the previous class and break it down to the parallel sentences
    """
    def _get_orange_header(self, dataset, class_name, attribute_names, desired_attributes=[], meta_attributes=[]):
        # construct the three header lines of the Orange tab-delimited format
        line_1 = ""  # names of the attributes
        line_2 = ""  # types of the attributes
        line_3 = ""  # role flags: c = class, m = meta
        print "Getting attributes"

        # if no desired attributes were given, use all of them
        if desired_attributes == []:
            desired_attributes = attribute_names

        print "Constructing file"
        # prepare the heading
        for attribute_name in attribute_names:
            # line 1 holds just the names
            attribute_name = str(attribute_name)
            line_1 += attribute_name + "\t"

            # TODO: find a way to define continuous and discrete args
            # line 2 holds the attribute type
            if attribute_name == class_name:
                line_2 += "discrete\t"
            elif attribute_name in desired_attributes and attribute_name not in meta_attributes:
                line_2 += "continuous\t"
            else:
                line_2 += "string\t"

            # line 3 flags the class and the meta attributes
            if attribute_name == class_name:
                line_3 += "c"
            elif attribute_name not in desired_attributes or attribute_name in meta_attributes:
                line_3 += "m"
            line_3 += "\t"

        # the two source sentences of the couple are appended as string metas
        line_2 += "string\t"
        line_3 += "m\t"
        line_1 += "src-1\t"

        line_2 += "string\t"
        line_3 += "m\t"
        line_1 += "src-2\t"

        # one column per target sentence
        i = 0
        for _tgt in dataset.get_parallelsentences()[0].get_translations():
            i += 1
            line_2 += "string\t"
            line_3 += "m\t"
            line_1 += "tgt-" + str(i) + "\t"

        # reference column
        line_2 += "string\t"
        line_3 += "m\t"
        line_1 += "ref\t"

        # terminate the three header lines
        return line_1 + "\n" + line_2 + "\n" + line_3 + "\n"
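    # Illustrative example of the header this returns (tabs drawn as "|"), for
    # class_name "rank", one desired attribute "score" and two translations:
    #
    #     rank|score|src-1|src-2|tgt-1|tgt-2|ref
    #     discrete|continuous|string|string|string|string|string
    #     c||m|m|m|m|m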
    def _getOrangeFormat(self, orange_file, dataset, class_name, desired_attributes=[], meta_attributes=[]):
        sys.stderr.write("retrieving attribute names\n")
        attribute_names = dataset.get_all_attribute_names()

        sys.stderr.write("processing orange header\n")
        output = self._get_orange_header(dataset, class_name, attribute_names, desired_attributes, meta_attributes)
        sys.stderr.write("processing content\n")

        orange_file.write(output)

        for psentence in dataset.get_parallelsentences():
            outputlines = []
            nested_attributes = psentence.get_nested_attributes()
            if nested_attributes == {}:
                nested_attributes = psentence.get_attributes()

            nested_attribute_names = nested_attributes.keys()

            for attribute_name in attribute_names:
                if attribute_name in nested_attribute_names:
                    outputlines.append(nested_attributes[attribute_name])
                # whether the attribute value exists or not, we have to tab
                outputlines.append("\t")

            # the two source sentences of the couple
            outputlines.append(psentence.get_source()[0].get_string())
            outputlines.append("\t")
            outputlines.append(psentence.get_source()[1].get_string())
            outputlines.append("\t")

            for tgt in psentence.get_translations():
                outputlines.append(tgt.get_string())
                outputlines.append("\t")
            # a missing reference leaves the column empty
            try:
                outputlines.append(psentence.get_reference().get_string())
                outputlines.append("\t")
            except AttributeError:
                outputlines.append("\t")
            outputlines.append("\n")
            orange_file.writelines(outputlines)
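    # Row layout produced above (illustrative): one tab-separated value per
    # attribute column (empty where an attribute is missing), followed by the
    # two sources, the target strings and, if present, the reference:
    #
    #     <attr values...>|<src-1>|<src-2>|<tgt-1>...<tgt-n>|<ref>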