Package dataprocessor :: Package input :: Module orangeprocessor
[hide private]
[frames] | [no frames]

Source Code for Module dataprocessor.input.orangeprocessor

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3   
  4   
  5  """ 
  6   
  7  @author: Eleftherios Avramidis 
  8  """ 
  9   
 10  import codecs 
 11  import os 
 12  import sys 
 13  import orange, orngTest, orngStat, orngTree   
 14  from tempfile import mktemp 
 15  from sentence.dataset import DataSet 
 16  from sentence.parallelsentence import ParallelSentence 
 17  from sentence.sentence import SimpleSentence 
 18  import sentence  
 19  from copy import deepcopy 
 20  from xml.sax import make_parser 
 21   
 22   
 23   
class AttributesReader:
    """
    SAX-style content handler that collects attribute names while a
    parallel-sentence document is parsed and, at the end of the document,
    writes the three-line Orange .tab header (attribute names, attribute
    types, class/meta flags) to the given file.
    """

    def __init__(self, file, class_name, meta_attributes, desired_attributes = [] ):
        """
        @param file: open file object that receives the generated header
        @type file: file
        @param class_name: name of the attribute to declare as the class
        @type class_name: str
        @param meta_attributes: attributes to declare as meta ("m") columns
        @type meta_attributes: list
        @param desired_attributes: attributes to declare as continuous ("c")
            features; when empty, all collected attributes are used
        @type desired_attributes: list
        """
        self.attribute_names = set()
        self.tgt_count = 0
        self.file = file
        # BUG FIX: class_name was never stored, so endDocument() raised
        # AttributeError when it read self.class_name.
        self.class_name = class_name
        self.desired_attributes = desired_attributes
        self.meta_attributes = meta_attributes

    def startElement(self, name, attrs = []):
        """
        Signals the start of an element (simplesentence or parallelsentence)
        and records its attribute names, prefixed according to the element role.
        @param name: the name of the element
        @type name: str
        @param attrs: the attributes of the element
        @type attrs: Attributes
        """
        # NOTE(review): self.TAG_SRC / TAG_TGT / TAG_REF are not defined in
        # this class -- presumably supplied by a subclass or mixin; confirm.
        for att_name in attrs.getNames():
            if name == self.TAG_SRC:
                att_name = "src_%s" % att_name
                self.tgt_count = 0
            elif name == self.TAG_TGT:
                self.tgt_count += 1
                # BUG FIX: the original formatted two placeholders with a
                # single string argument, which raises TypeError.
                att_name = "tgt%d_%s" % (self.tgt_count, att_name)
            elif name == self.TAG_REF:
                att_name = "ref_%s" % att_name
            self.attribute_names.add(att_name)

    def endDocument(self):
        """
        Emit the three-line Orange .tab header for the collected attributes,
        followed by the src / tgt-i / ref string meta columns, to self.file.
        """
        #first construct the lines for the declaration
        line_1 = ""  #line for the name of the arguments
        line_2 = ""  #line for the type of the arguments
        line_3 = ""  #line for the definition of the class
        print("Getting attributes")
        desired_attributes = self.desired_attributes
        meta_attributes = self.meta_attributes

        print(self.attribute_names)
        #if no desired attributes were defined, use all of them
        if self.desired_attributes == []:
            desired_attributes = self.attribute_names

        print("Constructing file")
        #prepare heading
        for attribute_name in self.attribute_names :
            #line 1 holds just the names
            line_1 = "%s%s\t" % (line_1, attribute_name)

            #TODO: find a way to define continuous and discrete arg
            #line 2 holds the attribute type: d(iscrete), c(ontinuous)
            if attribute_name == self.class_name:
                line_2 = "%sd\t" % line_2
            elif attribute_name in desired_attributes and attribute_name not in meta_attributes:
                line_2 = "%sc\t" % line_2
            else:
                line_2 = "%sd\t" % line_2

            #line 3 flags the class ("c") and the metadata ("m")
            if attribute_name == self.class_name:
                line_3 = line_3 + "c"
            elif attribute_name not in desired_attributes or attribute_name in meta_attributes:
                line_3 = line_3 + "m"
            line_3 = line_3 + "\t"

        #src column
        line_2 += "string\t"
        line_3 += "m\t"
        line_1 += "src\t"
        #target columns, one per translation seen while parsing
        # BUG FIX: the original looped over range(1, tgt_count) and then
        # incremented i again inside the body, producing tgt_count - 1
        # columns labelled tgt-2 .. tgt-N; now emits tgt-1 .. tgt-N exactly
        # like OrangeProcessor.__get_orange_header__ does.
        for i in range(1, self.tgt_count + 1):
            line_2 = "%sstring\t" % line_2
            line_3 = "%sm\t" % line_3
            line_1 = "%stgt-%d\t" % (line_1, i)
        #ref column
        line_2 += "string\t"
        line_3 += "m\t"
        line_1 += "ref\t"

        #break the lines in the end
        line_1 = line_1 + "\n"
        line_2 = line_2 + "\n"
        line_3 = line_3 + "\n"
        output = line_1 + line_2 + line_3
        self.file.write(output)
125
class SentenceReader:
    """
    SAX-style handler skeleton for reading sentence elements.
    NOTE(review): the body of __init__ (original line 129) is not visible in
    the extracted source; it has been reconstructed to accept -- optionally --
    the four arguments that OrangeProcessor.__init__ passes, since the
    original no-argument signature made that call a TypeError.
    """

    def __init__(self, file=None, class_name="", meta_attributes=None, desired_attributes=None):
        """
        @param file: file object that receives processed output (optional)
        @param class_name: name of the class attribute
        @param meta_attributes: attributes treated as meta columns
        @param desired_attributes: attributes treated as features
        """
        self.file = file
        self.class_name = class_name
        self.meta_attributes = meta_attributes if meta_attributes is not None else []
        self.desired_attributes = desired_attributes if desired_attributes is not None else []

    def startElement(self, name, atts = []):
        """
        Signals the start of an element.
        NOTE(review): self.TAG_SENT is not defined in this class -- presumably
        supplied by a subclass or mixin; confirm.
        """
        if name == self.TAG_SENT:
            pass
class OrangeProcessor:
    """
    Handles the conversion of the generic data objects (sentence.dataset
    DataSets / ParallelSentences) to and from the format handled by the
    Orange machine-learning library (ExampleTable / .tab files), and offers
    convenience wrappers for training, evaluation and classification.
    """
140 - def __init__ (self, dataSet, class_name="", desired_attributes=[], meta_attributes=[], keep_temp=False):
141 if isinstance ( dataSet , orange.ExampleTable ): 142 self.data = dataSet 143 144 elif isinstance ( dataSet , sentence.dataset.DataSet ): 145 146 #get the data in Orange file format 147 148 #write the data in a temporary file 149 #not secure but we trust our hard disk 150 tmpFileName = mktemp(dir='.', suffix='.tab') 151 file = codecs.open(tmpFileName, 'w', 'utf-8') 152 153 attReader = AttributesReader(file, class_name, meta_attributes, desired_attributes) 154 #proceed with parcing 155 myparser = make_parser() 156 myparser.setContentHandler(attReader) 157 myparser.parse(tmpFileName) 158 159 sentReader = SentenceReader(file, class_name, meta_attributes, desired_attributes) 160 161 # 162 # #load the data 163 # print "Feeding file to Orange" 164 # self.data = orange.ExampleTable(tmpFileName) 165 # print "Loaded ", len(self.data) , " sentences from file " , tmpFileName 166 # #get rid of the temp file 167 # if not keep_temp: 168 os.unlink(tmpFileName) 169 170 return None
171 172
173 - def get_data(self):
174 return self.data
175 176
    def get_dataset(self):
        """
        Convert the wrapped orange.ExampleTable back into a sentence DataSet.
        Regular attributes and the class value become string attributes of
        each ParallelSentence; the 'src', 'tgt-N' and 'ref' meta columns are
        turned back into SimpleSentence objects.
        @return: the reconstructed dataset
        @rtype: sentence.dataset.DataSet
        """
        data = self.data
        attribute_names = set() #set containing the attribute names
        new_data = [] #list containing the data, one parallelsentence per entry

        for item in data:
            sentence_attributes = {}

            # the class value is stored as a plain string attribute
            sentence_attributes[item.domain.classVar.name] = str(item.getclass().value)

            #first get normal features
            for att in item.domain.attributes:
                sentence_attributes[att.name] = str(item[att].value)
                attribute_names.add(att.name)

            metas = item.getmetas()

            src = SimpleSentence()
            tgt = [SimpleSentence(), SimpleSentence()] #TODO: this will break if more than two SimpleSentences()
            ref = SimpleSentence()

            #then get metas
            for key in metas:
                attribute_name = metas[key].variable.name

                if attribute_name == 'src':
                    src = SimpleSentence(metas[key].value)
                elif attribute_name == 'ref':
                    # NOTE(review): unclear how SimpleSentence() could raise
                    # KeyError here -- confirm whether this guard is needed.
                    try:
                        ref = SimpleSentence(metas[key].value)
                    except KeyError:
                        pass
                elif (attribute_name.startswith('tgt') and attribute_name.find('_') == -1):
                    # meta names like "tgt-2" map back onto target slots
                    tag, index = attribute_name.split( "-")
                    #assume they appear the right order
                    tgt[int(index)-1] = SimpleSentence(metas[key].value)
                    #tgt.append( SimpleSentence ( metas[key].value ) )

                else:
                    #if not attribute_names = src|ref|tgt
                    sentence_attributes[attribute_name] = str(metas[key].value)
                    attribute_names.add(attribute_name)

            #create a new sentence and add it to the list
            new_parallelsentence = ParallelSentence(src, tgt, ref, sentence_attributes)
            new_parallelsentence.recover_attributes()
            new_data.append(new_parallelsentence)

        return DataSet( new_data, attribute_names )
233
234 - def print_statistics(self):
235 data=self.data 236 # report on number of classes and attributes 237 print "Classes:", len(data.domain.classVar.values) 238 print "Attributes:", len(data.domain.attributes), ",", 239 240 print "Classes:", len(data.domain.classVar.values) 241 print "Attributes:", len(data.domain.attributes), ",", 242 243 # count number of continuous and discrete attributes 244 ncont=0; ndisc=0 245 for a in data.domain.attributes: 246 if a.varType == orange.VarTypes.Discrete: 247 ndisc = ndisc + 1 248 else: 249 ncont = ncont + 1 250 print ncont, "continuous,", ndisc, "discrete" 251 252 # obtain class distribution 253 c = [0] * len(data.domain.classVar.values) 254 for e in data: 255 c[int(e.getclass())] += 1 256 print "Instances: ", len(data), "total", 257 r = [0.] * len(c) 258 for i in range(len(c)): 259 r[i] = c[i]*100./len(data) 260 for i in range(len(data.domain.classVar.values)): 261 print ", %d(%4.1f%s) with class %s" % (c[i], r[i], '%', data.domain.classVar.values[i]), 262 print 263 264 265 #missing values 266 267 natt = len(data.domain.attributes) 268 missing = [0.] 
* natt 269 for i in data: 270 for j in range(natt): 271 if i[j].isSpecial(): 272 missing[j] += 1 273 missing = map(lambda x, l=len(data):x/l*100., missing) 274 275 print "Missing values per attribute:" 276 atts = data.domain.attributes 277 for i in range(natt): 278 print " %5.1f%s %s" % (missing[i], '%', atts[i].name) 279 280 281 #Domain distributions 282 283 dist = orange.DomainDistributions(data) 284 285 print "Average values and mean square errors:" 286 for i in range(len(data.domain.attributes)): 287 if data.domain.attributes[i].varType == orange.VarTypes.Continuous: 288 print "%s, mean=%5.2f +- %5.2f" % \ 289 (data.domain.attributes[i].name, dist[i].average(), dist[i].error()) 290 291 print "\nFrequencies for values of discrete attributes:" 292 for i in range(len(data.domain.attributes)): 293 a = data.domain.attributes[i] 294 if a.varType == orange.VarTypes.Discrete: 295 print "%s:" % a.name 296 for j in range(len(a.values)): 297 print " %s: %d" % (a.values[j], int(dist[i][j]))
298 299 300
301 - def __createTempFile__(self):
302 tmpFileName = mktemp(dir='.', suffix='.tab') 303 file_object = codecs.open(tmpFileName, 'w', 'utf-8') 304 return file_object
305
306 - def __writeTempFile__(self, data):
307 308 tmpFileName = mktemp(dir='.', suffix='.tab') 309 file_object = codecs.open(tmpFileName, 'w', 'utf-8') 310 file_object.write(data) 311 file_object.close() 312 313 return tmpFileName
314 315
316 - def __get_orange_header__(self, dataset, class_name, attribute_names, desired_attributes=[], meta_attributes=[]):
317 318 #first construct the lines for the declaration 319 line_1 = "" #line for the name of the arguments 320 line_2 = "" #line for the type of the arguments 321 line_3 = "" #line for the definition of the class 322 print "Getting attributes" 323 324 325 print attribute_names 326 if desired_attributes == []: 327 desired_attributes = attribute_names 328 329 #if no desired attribute define, get all of them 330 #if not desired_attributes: 331 # desired_attributes = attribute_names 332 333 print "Constructing file" 334 #prepare heading 335 for attribute_name in attribute_names : 336 #line 1 holds just the names 337 line_1 += attribute_name +"\t" 338 339 #TODO: find a way to define continuous and discrete arg 340 #line 2 holds the class type 341 if attribute_name == class_name: 342 line_2 += "d\t" 343 elif attribute_name in desired_attributes and attribute_name not in meta_attributes: 344 #line_2 += "%s\t" % desired_attributes[attribute_name] 345 line_2 += "c\t" 346 347 else: 348 line_2 += "d\t" 349 350 351 #line 3 defines the class and the metadata 352 if attribute_name == class_name: 353 line_3 = line_3 + "c" 354 elif attribute_name not in desired_attributes or attribute_name in meta_attributes: 355 line_3 = line_3 + "m" 356 line_3 = line_3 + "\t" 357 358 #src 359 line_2 += "string\t" 360 line_3 += "m\t" 361 line_1 += "src\t" 362 #target 363 i=0 364 for tgt in dataset.get_parallelsentences()[0].get_translations(): 365 i+=1 366 line_2 += "string\t" 367 line_3 += "m\t" 368 line_1 += "tgt-" + str(i) + "\t" 369 #ref 370 line_2 += "string\t" 371 line_3 += "m\t" 372 line_1 += "ref\t" 373 374 #break the line in the end 375 line_1 = line_1 + "\n" 376 line_2 = line_2 + "\n" 377 line_3 = line_3 + "\n" 378 output = line_1 + line_2 + line_3 379 return output
380 381
382 - def __getOrangeFormat__(self, dataset, class_name, desired_attributes=[], meta_attributes=[]):
383 sys.stderr.write("retrieving attribute names\n") 384 self.attribute_names = dataset.get_all_attribute_names() 385 386 sys.stderr.write("processing orange header\n") 387 output = self.__get_orange_header__(dataset, class_name, self.attribute_names, desired_attributes, meta_attributes) 388 389 sys.stderr.write("processing content\n") 390 391 outputlines = [] 392 393 for psentence in dataset.get_parallelsentences(): 394 return self.process_parallelsentence(psentence) 395 output += "".join(outputlines) 396 return output
397
398 - def process_parallelsentence(self, psentence):
399 #sys.stderr.write("getting nested attributes\n") 400 nested_attributes = psentence.get_nested_attributes() 401 nested_attribute_names = nested_attributes.keys() 402 attribute_names = self.attribute_names 403 outputlines = [] 404 405 #sys.stderr.write("printing content\n") 406 for attribute_name in attribute_names: 407 if attribute_name in nested_attribute_names: 408 outputlines.append(nested_attributes[attribute_name]) 409 410 #even if attribute value exists or not, we have to tab 411 outputlines.append ("\t") 412 outputlines.append( psentence.get_source().get_string()) 413 outputlines.append("\t") 414 for tgt in psentence.get_translations(): 415 outputlines.append(tgt.get_string()) 416 outputlines.append("\t") 417 try: 418 outputlines.append(psentence.get_reference().get_string()) 419 outputlines.append("\t") 420 except: 421 outputlines.append("\t") 422 outputlines.append("\n") 423 return outputlines
424
425 - def split_data(self, percentage):
426 size = len (self.data) 427 testSize = round (size * percentage) 428 429 print "Splitting data" 430 431 indices = orange.MakeRandomIndices2(p0=testSize) 432 indices.stratified = indices.Stratified 433 ind = indices(self.data) 434 435 testSet = self.data.select(ind, 0) 436 trainingSet = self.data.select(ind, 1) 437 438 return [trainingSet, testSet]
439 440 441
442 - def cross_validation(self):
443 444 data = self.data 445 # set up the learners 446 bayes = orange.BayesLearner() 447 tree = orngTree.TreeLearner(mForPruning=2) 448 bayes.name = "bayes" 449 tree.name = "tree" 450 451 l = orange.SVMLearner() 452 l.name = "SVM" 453 454 l=orange.SVMLearner() 455 l.svm_type=orange.SVMLearner.Nu_SVC 456 l.nu=0.3 457 l.probability=True 458 459 learners = [bayes, tree, l] 460 deepcopy 461 # compute accuracies on data 462 463 464 res = orngTest.crossValidation(learners, data, folds=10) 465 cm = orngStat.computeConfusionMatrices(res, 466 classIndex=data.domain.classVar.values.index('-1')) 467 468 stat = (('CA', 'CA(res)'), 469 ('Sens', 'sens(cm)'), 470 ('Spec', 'spec(cm)'), 471 ('AUC', 'AUC(res)'), 472 ('IS', 'IS(res)'), 473 ('Brier', 'BrierScore(res)'), 474 ('F1', 'F1(cm)'), 475 ('F2', 'Falpha(cm, alpha=2.0)'), 476 ('MCC', 'MCC(cm)'), 477 ('sPi', 'scottsPi(cm)'), 478 ) 479 480 scores = [eval("orngStat."+s[1]) for s in stat] 481 print 482 print "Learner " + "".join(["%-7s" % s[0] for s in stat]) 483 for (i, l) in enumerate(learners): 484 print "%-8s " % l.name + "".join(["%5.3f " % s[i] for s in scores]) 485 486 return None
487
488 - def get_SVM(self):
489 l=orange.SVMLearner() 490 l.svm_type=orange.SVMLearner.Nu_SVC 491 l.nu=0.3 492 l.probability=True 493 return l(self.data)
494 495
496 - def classify_with(self,classifier):
497 """ 498 Utility function which classifies the test data with the given classifier 499 """ 500 mydata = self.data 501 502 for i in range(len(mydata)): 503 504 #for ex in mydata: 505 try: 506 new_value = classifier(mydata[i]) 507 except: 508 print "couldn't apply classifier" 509 new_value = "0" 510 mydata[i].setclass(new_value) 511 return mydata
512 513
514 - def get_accuracy(self, classifiers):
515 correct = [0.0]*len(classifiers) 516 wrong = [0.0]*len(classifiers) 517 for ex in self.data: 518 for i in range(len(classifiers)): 519 try: 520 if classifiers[i](ex) == ex.getclass(): 521 correct[i] += 1 522 else: 523 wrong[i] += 1 524 except: 525 print "kind of error" 526 527 for i in range(len(correct)): 528 wrong[i] = (correct[i] - wrong[i]) / len(self.data) 529 correct[i] = correct[i] / len(self.data) 530 return (correct, wrong)
531