
Source Code for Module dataprocessor.input.orangereader

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
@author: Eleftherios Avramidis
"""

import os
import sys
import orange, orngTest, orngStat, orngTree
from tempfile import mktemp
from sentence.dataset import DataSet
from sentence.parallelsentence import ParallelSentence
from sentence.sentence import SimpleSentence
import sentence
from copy import deepcopy

class OrangeData:
    """
    Handles the conversion of the generic data objects to a format handled by the Orange library
    """

    def __init__(self, dataSet, class_name="", desired_attributes=[], meta_attributes=[], chosen_orangefilename=False, keep_empty=False):
        if isinstance(dataSet, orange.ExampleTable):
            self.data = dataSet

        elif isinstance(dataSet, sentence.dataset.DataSet):
            print "desired attributes", desired_attributes
            print "meta attributes", meta_attributes

            # serialize the data into the Orange tab-delimited file format
            orange_file = self._get_temp_file(chosen_orangefilename)
            self._getOrangeFormat(orange_file, dataSet, class_name, desired_attributes, meta_attributes)

            # the data now sits in a temporary file;
            # not secure, but we trust our hard disk
            orangefilename = orange_file.name
            orange_file.close()

            dataSet = None
            # load the data back through Orange
            print "Feeding file to Orange"
            if not keep_empty:
                self.data = orange.ExampleTable(orangefilename)
                print "Loaded", len(self.data), "sentences from file", orangefilename
            # get rid of the temp file, unless the caller asked for a specific filename
            if not chosen_orangefilename:
                os.unlink(orangefilename)

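    # A minimal usage sketch for the constructor, kept in comments so that the
    # module stays importable. The variable `dataset` and the attribute names
    # below are assumptions for illustration only:
    #
    #   od = OrangeData(dataset, class_name="rank",
    #                   desired_attributes=["length"], meta_attributes=["id"])
    #   table = od.get_data()   # the underlying orange.ExampleTable
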
    def get_data(self):
        return self.data

    def get_dataset(self):
        data = self.data
        attribute_names = set()  # set containing the attribute names
        new_data = []  # list containing the data, one parallel sentence per entry

        for item in data:
            sentence_attributes = {}
            sentence_attributes[item.domain.classVar.name] = str(item.getclass().value)

            # first get the normal features
            for att in item.domain.attributes:
                sentence_attributes[att.name] = str(item[att].value)
                attribute_names.add(att.name)

            metas = item.getmetas()

            src = SimpleSentence()
            tgt_dic = {}  # TODO: this will break if more than two SimpleSentences()
            tgt = []
            ref = SimpleSentence()

            # then get the meta attributes
            for key in metas:
                attribute_name = metas[key].variable.name

                if attribute_name == 'src':
                    src = SimpleSentence(metas[key].value)
                elif attribute_name == 'ref':
                    try:
                        ref = SimpleSentence(metas[key].value)
                    except KeyError:
                        pass
                elif attribute_name.startswith('tgt') and attribute_name.find('_') == -1:
                    tag, index = attribute_name.split("-")
                    # assume the translations appear in the right order
                    tgt_dic[int(index) - 1] = SimpleSentence(metas[key].value)
                else:
                    # any attribute which is not src|ref|tgt
                    sentence_attributes[attribute_name] = unicode(metas[key].value)
                    attribute_names.add(attribute_name)

            # sort the translations by their index, then create a new
            # parallel sentence and add it to the list
            for index in range(len(tgt_dic.keys())):
                tgt.append(tgt_dic[index])

            new_parallelsentence = ParallelSentence(src, tgt, ref, sentence_attributes)
            new_parallelsentence.recover_attributes()
            new_data.append(new_parallelsentence)

        return DataSet(new_data, attribute_names)

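    # Round-trip sketch (assuming `od` wraps data that originally came from a
    # sentence.dataset.DataSet): get_dataset() rebuilds src/tgt/ref sentences
    # and their attributes from the Orange table.
    #
    #   recovered = od.get_dataset()
    #   first = recovered.get_parallelsentences()[0]
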
    def print_statistics(self):
        data = self.data
        # report on the number of classes and attributes
        print "Classes:", len(data.domain.classVar.values)
        print "Attributes:", len(data.domain.attributes), ",",

        # count the continuous and discrete attributes
        ncont = 0; ndisc = 0
        for a in data.domain.attributes:
            if a.varType == orange.VarTypes.Discrete:
                ndisc = ndisc + 1
            else:
                ncont = ncont + 1
        print ncont, "continuous,", ndisc, "discrete"

        # obtain the class distribution
        c = [0] * len(data.domain.classVar.values)
        for e in data:
            c[int(e.getclass())] += 1
        print "Instances: ", len(data), "total",
        r = [0.] * len(c)
        for i in range(len(c)):
            r[i] = c[i] * 100. / len(data)
        for i in range(len(data.domain.classVar.values)):
            print ", %d(%4.1f%s) with class %s" % (c[i], r[i], '%', data.domain.classVar.values[i]),
        print

        # missing values per attribute
        natt = len(data.domain.attributes)
        missing = [0.] * natt
        for i in data:
            for j in range(natt):
                if i[j].isSpecial():
                    missing[j] += 1
        missing = map(lambda x, l=len(data): x / l * 100., missing)

        print "Missing values per attribute:"
        atts = data.domain.attributes
        for i in range(natt):
            print " %5.1f%s %s" % (missing[i], '%', atts[i].name)

        # domain distributions
        dist = orange.DomainDistributions(data)

        print "Average values and mean square errors:"
        for i in range(len(data.domain.attributes)):
            if data.domain.attributes[i].varType == orange.VarTypes.Continuous:
                print "%s, mean=%5.2f +- %5.2f" % \
                    (data.domain.attributes[i].name, dist[i].average(), dist[i].error())

        print "\nFrequencies for values of discrete attributes:"
        for i in range(len(data.domain.attributes)):
            a = data.domain.attributes[i]
            if a.varType == orange.VarTypes.Discrete:
                print "%s:" % a.name
                for j in range(len(a.values)):
                    print " %s: %d" % (a.values[j], int(dist[i][j]))

    def _get_temp_file(self, orangefilename):
        if not orangefilename:
            orangefilename = mktemp(dir=u'.', suffix=u'.tab')

        orange_file = open(orangefilename, 'w')
        return orange_file

    def _writeTempFile(self, data, orangefilename):
        if not orangefilename:
            orangefilename = mktemp(dir=u'.', suffix=u'.tab')
        file_object = open(orangefilename, 'w')
        if type(data) is unicode:
            file_object.write(data.encode('utf8'))
        elif type(data) is str:
            file_object.write(data)
        else:
            file_object.write(str(data))
        file_object.close()

        return orangefilename

    def _get_orange_header(self, dataset, class_name, attribute_names, desired_attributes=[], meta_attributes=[]):
        # first construct the three header lines of the Orange tab format
        line_1 = ""  # line for the names of the attributes
        line_2 = ""  # line for the types of the attributes
        line_3 = ""  # line for the flags (class and meta declarations)
        print "Getting attributes"

        dataset.confirm_attributes(desired_attributes, meta_attributes)

        # if no desired attributes are defined, use all of them
        if desired_attributes == []:
            desired_attributes = attribute_names

        print "Constructing file"
        # prepare the heading
        for attribute_name in attribute_names:
            # line 1 holds just the names
            attribute_name = str(attribute_name)
            line_1 += attribute_name + "\t"

            # TODO: find a way to define continuous and discrete attributes
            # line 2 holds the attribute type
            if attribute_name == class_name:
                line_2 += "discrete\t"
            elif attribute_name in desired_attributes and attribute_name not in meta_attributes:
                line_2 += "continuous\t"
            else:
                line_2 += "string\t"

            # line 3 declares the class and the meta attributes
            if attribute_name == class_name:
                line_3 = line_3 + "c"
            elif attribute_name not in desired_attributes or attribute_name in meta_attributes:
                line_3 = line_3 + "m"
            line_3 = line_3 + "\t"

        # src
        line_2 += "string\t"
        line_3 += "m\t"
        line_1 += "src\t"
        # target(s)
        i = 0
        for tgt in dataset.get_parallelsentences()[0].get_translations():
            i += 1
            line_2 += "string\t"
            line_3 += "m\t"
            line_1 += "tgt-" + str(i) + "\t"
        # ref
        line_2 += "string\t"
        line_3 += "m\t"
        line_1 += "ref\t"

        # break the lines in the end
        line_1 = line_1 + "\n"
        line_2 = line_2 + "\n"
        line_3 = line_3 + "\n"
        output = line_1 + line_2 + line_3
        return output

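    # For reference, _get_orange_header() emits the three header lines of
    # Orange's tab-delimited format: names, types, then flags ("c" marks the
    # class, "m" marks metas). With class_name="rank", one continuous desired
    # attribute "length" (a hypothetical name) and one translation, the header
    # would look roughly like this, with <TAB> standing for a tab character:
    #
    #   rank<TAB>length<TAB>src<TAB>tgt-1<TAB>ref
    #   discrete<TAB>continuous<TAB>string<TAB>string<TAB>string
    #   c<TAB><TAB>m<TAB>m<TAB>m
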
    def _getOrangeFormat(self, orange_file, dataset, class_name, desired_attributes=[], meta_attributes=[]):
        sys.stderr.write("retrieving attribute names\n")
        attribute_names = dataset.get_all_attribute_names()

        sys.stderr.write("processing orange header\n")
        header_output = self._get_orange_header(dataset, class_name, attribute_names, desired_attributes, meta_attributes)
        sys.stderr.write("processing content\n")
        orange_file.write(header_output)

        for psentence in dataset.get_parallelsentences():
            outputlines = []
            nested_attributes = psentence.get_nested_attributes()
            nested_attribute_names = nested_attributes.keys()

            for attribute_name in attribute_names:
                if attribute_name in nested_attribute_names:
                    outputlines.append(nested_attributes[attribute_name])

                # whether the attribute value exists or not, we have to add a tab
                outputlines.append("\t")

            outputlines.append(psentence.get_source().get_string())
            outputlines.append("\t")
            for tgt in psentence.get_translations():
                outputlines.append(tgt.get_string())
                outputlines.append("\t")
            try:
                outputlines.append(psentence.get_reference().get_string())
                outputlines.append("\t")
            except:
                # no reference available; leave the column empty
                outputlines.append("\t")
            outputlines.append("\n")

            orange_file.writelines(outputlines)

    def split_data(self, percentage):
        size = len(self.data)
        testSize = round(size * percentage)

        print "Splitting data"

        indices = orange.MakeRandomIndices2(p0=testSize)
        indices.stratified = indices.Stratified
        ind = indices(self.data)

        testSet = self.data.select(ind, 0)
        trainingSet = self.data.select(ind, 1)

        return [trainingSet, testSet]

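    # Usage sketch, assuming `od` is an OrangeData instance: hold out roughly
    # 10% of the instances for testing, stratified by class.
    #
    #   training_set, test_set = od.split_data(0.1)
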
    def cross_validation(self):
        data = self.data
        # set up the learners
        bayes = orange.BayesLearner()
        tree = orngTree.TreeLearner(mForPruning=2)
        bayes.name = "bayes"
        tree.name = "tree"

        l = orange.SVMLearner()
        l.name = "SVM"
        l.svm_type = orange.SVMLearner.Nu_SVC
        l.nu = 0.3
        l.probability = True

        learners = [bayes, tree, l]

        # compute accuracies on the data with 10-fold cross-validation;
        # note that a class value named '-1' is expected to exist
        res = orngTest.crossValidation(learners, data, folds=10)
        cm = orngStat.computeConfusionMatrices(res,
                classIndex=data.domain.classVar.values.index('-1'))

        stat = (('CA', 'CA(res)'),
                ('Sens', 'sens(cm)'),
                ('Spec', 'spec(cm)'),
                ('AUC', 'AUC(res)'),
                ('IS', 'IS(res)'),
                ('Brier', 'BrierScore(res)'),
                ('F1', 'F1(cm)'),
                ('F2', 'Falpha(cm, alpha=2.0)'),
                ('MCC', 'MCC(cm)'),
                ('sPi', 'scottsPi(cm)'),
                )

        scores = [eval("orngStat." + s[1]) for s in stat]
        print "Learner " + "".join(["%-7s" % s[0] for s in stat])
        for (i, l) in enumerate(learners):
            print "%-8s " % l.name + "".join(["%5.3f " % s[i] for s in scores])

        return None

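    # Usage sketch: prints a scoreboard (CA, sensitivity, specificity, AUC,
    # etc.) for Naive Bayes, a pruned tree and a Nu-SVC over 10 folds.
    #
    #   od.cross_validation()
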
    def get_SVM(self):
        l = orange.SVMLearner()
        l.svm_type = orange.SVMLearner.Nu_SVC
        l.nu = 0.3
        l.probability = True
        return l(self.data)

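    # Sketch of training on one split and labelling another (assuming
    # `training_set` and `test_set` come from split_data above):
    #
    #   classifier = OrangeData(training_set).get_SVM()
    #   labelled = OrangeData(test_set).classify_with(classifier)
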
    def classify_with(self, classifier):
        """
        Utility function which classifies the test data with the given classifier
        """
        mydata = self.data

        for i in range(len(mydata)):
            instance = mydata[i]
            new_value = classifier(instance)
            mydata[i].setclass(new_value.value)
        return OrangeData(mydata)

    def classify_accuracy(self, classifier):
        """
        Utility function which classifies the test data with the given classifier
        and also returns the accuracy and a Kendall-tau-like score
        """
        mydata = self.data
        correct = 0.0
        wrong = 0.0
        for i in range(len(mydata)):
            new_value = classifier(mydata[i])
            if new_value == mydata[i].getclass():
                correct += 1
            else:
                wrong += 1
            mydata[i].setclass(new_value.value)

        # (correct - wrong) / n, analogous to Kendall's tau
        taukendall = (correct - wrong) / len(mydata)
        accuracy = correct / len(mydata)
        return OrangeData(mydata), accuracy, taukendall

    def get_accuracy(self, classifiers):
        correct = [0.0] * len(classifiers)
        wrong = [0.0] * len(classifiers)
        for ex in self.data:
            for i in range(len(classifiers)):
                try:
                    if classifiers[i](ex) == ex.getclass():
                        correct[i] += 1
                    else:
                        wrong[i] += 1
                except:
                    print "classifier could not be applied to this example"

        for i in range(len(correct)):
            # (correct - wrong) / n, analogous to Kendall's tau
            wrong[i] = (correct[i] - wrong[i]) / len(self.data)
            correct[i] = correct[i] / len(self.data)
        return (correct, wrong)
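
if __name__ == "__main__":
    # A minimal end-to-end sketch. The filename "mydata.tab" is an assumption
    # for illustration; any Orange tab-delimited file with a discrete class
    # variable would do.
    wrapper = OrangeData(orange.ExampleTable("mydata.tab"))
    wrapper.print_statistics()

    # hold out 30% of the instances for testing
    training_set, test_set = wrapper.split_data(0.3)

    # train a Nu-SVC on the training part and measure it on the held-out part
    classifier = OrangeData(training_set).get_SVM()
    labelled, accuracy, tau = OrangeData(test_set).classify_accuracy(classifier)
    print "accuracy: %.3f, tau-like score: %.3f" % (accuracy, tau)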