1
2
3
4
5 """
6
7 @author: Eleftherios Avramidis
8 """
9
10 import codecs
11 import os
12 import sys
13 import orange, orngTest, orngStat, orngTree
14 from tempfile import mktemp
15 from sentence.dataset import DataSet
16 from sentence.parallelsentence import ParallelSentence
17 from sentence.sentence import SimpleSentence
18 import sentence
19 from copy import deepcopy
20 from xml.sax import make_parser
21
22
23
25
def __init__(self, file, class_name, meta_attributes, desired_attributes=None):
    """
    SAX handler state for collecting attribute names from a parallel-sentence
    XML file, so that an Orange .tab header can be written afterwards.

    @param file: open file object that will receive the generated .tab header
    @type file: file
    @param class_name: name of the attribute to be used as the class variable
    @type class_name: str
    @param meta_attributes: attribute names to be marked as meta ("m") columns
    @type meta_attributes: list
    @param desired_attributes: attribute names to keep as continuous ("c")
        columns; empty/None means "use every attribute seen during parsing"
    @type desired_attributes: list
    """
    self.attribute_names = set()
    self.tgt_count = 0
    self.file = file
    # bug fix: class_name was accepted but never stored, although the
    # header-writing code reads self.class_name when labelling columns
    self.class_name = class_name
    # bug fix: avoid the shared mutable default argument ([]) by using a
    # None sentinel; behavior for callers passing a list is unchanged
    self.desired_attributes = desired_attributes if desired_attributes is not None else []
    self.meta_attributes = meta_attributes
39
def startElement(self, name, attrs):
    """
    Signals the start of an element (simplesentence or parallelsentence)
    and records the (prefixed) names of its attributes.
    @param name: the name of the element
    @type name: str
    @param attrs: object of the Attributes interface containing the
        attributes of the element
    @type attrs: Attributes
    """
    # NOTE(review): the original 'def' line was lost in the numbered dump;
    # reconstructed as startElement since the handler is registered via
    # setContentHandler and the docstring describes a SAX start-element event.
    if name == self.TAG_SRC:
        # a new source element begins a sentence; restart target numbering
        # (bug fix: the reset used to run once per attribute inside the loop)
        self.tgt_count = 0
    elif name == self.TAG_TGT:
        # bug fix: count one target per element, not one per attribute —
        # otherwise attributes of a single target got different prefixes
        self.tgt_count += 1
    for att_name in attrs.getNames():
        if name == self.TAG_SRC:
            att_name = "src_%s" % att_name
        elif name == self.TAG_TGT:
            # bug fix: the original '"tgt%d_%s" % att_name' supplied one
            # argument to a two-placeholder format string (TypeError)
            att_name = "tgt%d_%s" % (self.tgt_count, att_name)
        elif name == self.TAG_REF:
            att_name = "ref_%s" % att_name
        self.attribute_names.add(att_name)
59
def endDocument(self):
    """
    Writes the three-line Orange .tab header (names / types / flags) for the
    attributes gathered during parsing, followed by the text columns
    src, tgt-1..tgt-N and ref, to self.file.

    Column type line: "d" (discrete) for the class attribute and undesired
    attributes, "c" (continuous) for desired feature attributes, "string"
    for the sentence-text columns.
    Column flag line: "c" marks the class attribute, "m" marks meta columns.
    """
    # NOTE(review): the original 'def' line was lost in the numbered dump;
    # reconstructed as endDocument since this runs after attribute
    # collection is finished — confirm against upstream history.
    line_1 = ""
    line_2 = ""
    line_3 = ""
    sys.stdout.write("Getting attributes\n")
    desired_attributes = self.desired_attributes
    meta_attributes = self.meta_attributes
    sys.stdout.write("%s\n" % (self.attribute_names,))
    # an empty desired list means "keep everything we saw"
    if self.desired_attributes == []:
        desired_attributes = self.attribute_names

    sys.stdout.write("Constructing file\n")
    for attribute_name in self.attribute_names:
        line_1 = "%s%s\t" % (line_1, attribute_name)
        # type line: class and non-desired attributes are discrete,
        # desired non-meta attributes are continuous
        if attribute_name == self.class_name:
            line_2 = "%sd\t" % line_2
        elif attribute_name in desired_attributes and attribute_name not in meta_attributes:
            line_2 = "%sc\t" % line_2
        else:
            line_2 = "%sd\t" % line_2
        # flag line: "c" for the class, "m" for metas/undesired, blank otherwise
        if attribute_name == self.class_name:
            line_3 = line_3 + "c"
        elif attribute_name not in desired_attributes or attribute_name in meta_attributes:
            line_3 = line_3 + "m"
        line_3 = line_3 + "\t"

    # source-sentence text column
    line_2 += "string\t"
    line_3 += "m\t"
    line_1 += "src\t"

    # one text column per target sentence, labelled tgt-1 .. tgt-N.
    # bug fix: the old loop 'for i in range(1, self.tgt_count)' with an extra
    # 'i += 1' inside produced labels tgt-2..tgt-N — it skipped tgt-1 and
    # emitted one column fewer than there are targets (the parallel
    # header-builder elsewhere in this file labels them 1..N).
    for i in range(1, self.tgt_count + 1):
        line_2 = "%sstring\t" % line_2
        line_3 = "%sm\t" % line_3
        line_1 = "%stgt-%d\t" % (line_1, i)

    # reference-sentence text column
    line_2 += "string\t"
    line_3 += "m\t"
    line_1 += "ref\t"

    output = line_1 + "\n" + line_2 + "\n" + line_3 + "\n"
    self.file.write(output)
125
133
134
# NOTE(review): this is the docstring and constructor of a converter class
# whose 'class' line (embedded line 134/135) was lost in the numbered dump.
# Several interior lines of the constructor were dropped too (gaps in the
# embedded numbering), so the original control flow cannot be fully recovered.
136 """
137 Handles the conversion of the generic data objects to a format handled by Orange library
138 """
139
# Accepts either an already-built orange.ExampleTable or a project DataSet.
# NOTE(review): desired_attributes=[] and meta_attributes=[] are mutable
# default arguments — shared across calls; should be None sentinels.
140 - def __init__ (self, dataSet, class_name="", desired_attributes=[], meta_attributes=[], keep_temp=False):
141 if isinstance ( dataSet , orange.ExampleTable ):
142 self.data = dataSet
143
144 elif isinstance ( dataSet , sentence.dataset.DataSet ):
145
146
147
148
149
# A temporary .tab file is created and opened for writing.
150 tmpFileName = mktemp(dir='.', suffix='.tab')
151 file = codecs.open(tmpFileName, 'w', 'utf-8')
152
153 attReader = AttributesReader(file, class_name, meta_attributes, desired_attributes)
154
# NOTE(review): as preserved, this parses the freshly created (empty) temp
# file with the SAX attribute reader — presumably the dropped lines parsed
# the dataset's XML instead; verify against the original source.
155 myparser = make_parser()
156 myparser.setContentHandler(attReader)
157 myparser.parse(tmpFileName)
158
159 sentReader = SentenceReader(file, class_name, meta_attributes, desired_attributes)
160
161
162
163
164
165
166
167
# The temp file is removed unconditionally here; the keep_temp parameter is
# never consulted in the preserved lines — likely used in the dropped ones.
168 os.unlink(tmpFileName)
169
170 return None
171
172
175
176
# NOTE(review): the 'def' line for this method (embedded line 177) was lost
# in the numbered dump. It converts self.data (an orange.ExampleTable) back
# into a project DataSet of ParallelSentence objects — name unconfirmed.
178 data = self.data
179 attribute_names = set()
180 new_data = []
181
182
183
184 for item in data:
185 sentence_attributes = {}
186
187
# class value is carried over as a plain string attribute
188 sentence_attributes[item.domain.classVar.name] = str(item.getclass().value)
189
190
# regular (non-meta) attributes become string-valued sentence attributes
191 for att in item.domain.attributes:
192 sentence_attributes[att.name] = str(item[att].value)
193 attribute_names.add(att.name)
194
195 metas = item.getmetas()
196
# default placeholders; overwritten below if the metas provide the text.
# NOTE(review): tgt is pre-sized to exactly 2 translations — items with
# more than two 'tgt-N' metas would raise IndexError; confirm intent.
197 src = SimpleSentence()
198 tgt = [SimpleSentence(), SimpleSentence()]
199 ref = SimpleSentence()
200
201
# meta columns hold the sentence text (src / ref / tgt-N) plus any
# remaining attributes that were stored as metas
202 for key in metas:
203 attribute_name = metas[key].variable.name
204
205 if attribute_name == 'src':
206 src = SimpleSentence(metas[key].value)
207 elif attribute_name == 'ref':
# NOTE(review): the KeyError guard around a constructor call looks
# misplaced — metas[key] was already read one line above
208 try:
209 ref = SimpleSentence(metas[key].value)
210 except KeyError:
211 pass
# 'tgt' metas without an underscore are the numbered text columns 'tgt-N'
212 elif (attribute_name.startswith('tgt') and attribute_name.find('_') == -1):
213 tag, index = attribute_name.split( "-")
214
215 tgt[int(index)-1] = SimpleSentence(metas[key].value)
216
217
218 else:
219
220 sentence_attributes[attribute_name] = str(metas[key].value)
221 attribute_names.add(attribute_name)
222
223
224
225
226
227
228 new_parallelsentence = ParallelSentence(src, tgt, ref, sentence_attributes)
# recover_attributes presumably redistributes the flat prefixed attributes
# back onto src/tgt/ref — TODO confirm against sentence.parallelsentence
229 new_parallelsentence.recover_attributes()
230 new_data.append(new_parallelsentence)
231
232 return DataSet( new_data, attribute_names )
233
# NOTE(review): the 'def' line for this method (embedded line 234) was lost
# in the numbered dump. It prints descriptive statistics (class counts,
# attribute types, missing values, distributions) for self.data using
# Python 2 print statements and the Orange 2.x API.
235 data=self.data
236
# NOTE(review): the two lines below are printed twice verbatim — likely a
# copy/paste duplicate rather than intentional.
237 print "Classes:", len(data.domain.classVar.values)
238 print "Attributes:", len(data.domain.attributes), ",",
239
240 print "Classes:", len(data.domain.classVar.values)
241 print "Attributes:", len(data.domain.attributes), ",",
242
243
# count continuous vs. discrete attributes
244 ncont=0; ndisc=0
245 for a in data.domain.attributes:
246 if a.varType == orange.VarTypes.Discrete:
247 ndisc = ndisc + 1
248 else:
249 ncont = ncont + 1
250 print ncont, "continuous,", ndisc, "discrete"
251
252
# per-class instance counts and percentages
253 c = [0] * len(data.domain.classVar.values)
254 for e in data:
255 c[int(e.getclass())] += 1
256 print "Instances: ", len(data), "total",
257 r = [0.] * len(c)
258 for i in range(len(c)):
259 r[i] = c[i]*100./len(data)
260 for i in range(len(data.domain.classVar.values)):
261 print ", %d(%4.1f%s) with class %s" % (c[i], r[i], '%', data.domain.classVar.values[i]),
262 print
263
264
265
266
# percentage of missing (special) values per attribute
267 natt = len(data.domain.attributes)
268 missing = [0.] * natt
269 for i in data:
270 for j in range(natt):
271 if i[j].isSpecial():
272 missing[j] += 1
273 missing = map(lambda x, l=len(data):x/l*100., missing)
274
275 print "Missing values per attribute:"
276 atts = data.domain.attributes
277 for i in range(natt):
278 print " %5.1f%s %s" % (missing[i], '%', atts[i].name)
279
280
281
282
# per-attribute distributions: mean/error for continuous, value counts
# for discrete
283 dist = orange.DomainDistributions(data)
284
285 print "Average values and mean square errors:"
286 for i in range(len(data.domain.attributes)):
287 if data.domain.attributes[i].varType == orange.VarTypes.Continuous:
288 print "%s, mean=%5.2f +- %5.2f" % \
289 (data.domain.attributes[i].name, dist[i].average(), dist[i].error())
290
291 print "\nFrequencies for values of discrete attributes:"
292 for i in range(len(data.domain.attributes)):
293 a = data.domain.attributes[i]
294 if a.varType == orange.VarTypes.Discrete:
295 print "%s:" % a.name
296 for j in range(len(a.values)):
297 print " %s: %d" % (a.values[j], int(dist[i][j]))
299
300
# NOTE(review): the 'def' line (embedded line 301) was lost in the numbered
# dump. Creates a temporary .tab file in the working directory and returns
# the open UTF-8 writer. mktemp is race-prone (deprecated); mkstemp would
# be safer — flagged, not changed.
302 tmpFileName = mktemp(dir='.', suffix='.tab')
303 file_object = codecs.open(tmpFileName, 'w', 'utf-8')
304 return file_object
305
# NOTE(review): the 'def' line (embedded line 306) was lost in the numbered
# dump; 'data' is presumably its string parameter. Writes the given text to
# a fresh temporary .tab file and returns the file's name (the caller is
# responsible for deleting it). mktemp is race-prone — see note above the
# sibling helper.
307
308 tmpFileName = mktemp(dir='.', suffix='.tab')
309 file_object = codecs.open(tmpFileName, 'w', 'utf-8')
310 file_object.write(data)
311 file_object.close()
312
313 return tmpFileName
314
315
# NOTE(review): the 'def' line and part of the signature (embedded lines
# 315-316) were lost in the numbered dump; 'attribute_names',
# 'desired_attributes', 'meta_attributes', 'class_name' and 'dataset' are
# free names here and were presumably its parameters. Builds and RETURNS the
# same three-line Orange .tab header that the SAX-based writer produces,
# taking the target count from the first parallel sentence of 'dataset'.
# This duplicates the header-writing logic earlier in the file.
317
318
319 line_1 = ""
320 line_2 = ""
321 line_3 = ""
322 print "Getting attributes"
323
324
325 print attribute_names
# an empty desired list means "keep every attribute"
326 if desired_attributes == []:
327 desired_attributes = attribute_names
328
329
330
331
332
333 print "Constructing file"
334
335 for attribute_name in attribute_names :
336
337 line_1 += attribute_name +"\t"
338
339
# type line: "d" for the class and undesired attributes, "c" for
# desired non-meta attributes
340
341 if attribute_name == class_name:
342 line_2 += "d\t"
343 elif attribute_name in desired_attributes and attribute_name not in meta_attributes:
344
345 line_2 += "c\t"
346
347 else:
348 line_2 += "d\t"
349
350
# flag line: "c" for the class, "m" for metas/undesired
351
352 if attribute_name == class_name:
353 line_3 = line_3 + "c"
354 elif attribute_name not in desired_attributes or attribute_name in meta_attributes:
355 line_3 = line_3 + "m"
356 line_3 = line_3 + "\t"
357
358
# source text column
359 line_2 += "string\t"
360 line_3 += "m\t"
361 line_1 += "src\t"
362
# one text column per translation of the first sentence: tgt-1 .. tgt-N
363 i=0
364 for tgt in dataset.get_parallelsentences()[0].get_translations():
365 i+=1
366 line_2 += "string\t"
367 line_3 += "m\t"
368 line_1 += "tgt-" + str(i) + "\t"
369
# reference text column
370 line_2 += "string\t"
371 line_3 += "m\t"
372 line_1 += "ref\t"
373
374
375 line_1 = line_1 + "\n"
376 line_2 = line_2 + "\n"
377 line_3 = line_3 + "\n"
378 output = line_1 + line_2 + line_3
379 return output
380
381
397
399
# NOTE(review): the 'def' line (embedded line 398; a large span 382-397 is
# missing entirely) was lost in the numbered dump; 'psentence' is presumably
# its ParallelSentence parameter. Produces the tab-separated value fragments
# for one data row, in the same column order as the generated header:
# attribute values, then src / tgt-1..tgt-N / ref sentence strings.
400 nested_attributes = psentence.get_nested_attributes()
401 nested_attribute_names = nested_attributes.keys()
402 attribute_names = self.attribute_names
403 outputlines = []
404
405
# emit a value for each known attribute; NOTE(review): nothing is emitted
# for attributes the sentence lacks, so columns could shift — confirm that
# every sentence carries all attributes
406 for attribute_name in attribute_names:
407 if attribute_name in nested_attribute_names:
408 outputlines.append(nested_attributes[attribute_name])
409
410
411 outputlines.append ("\t")
412 outputlines.append( psentence.get_source().get_string())
413 outputlines.append("\t")
414 for tgt in psentence.get_translations():
415 outputlines.append(tgt.get_string())
416 outputlines.append("\t")
# missing reference falls back to an empty column.
# NOTE(review): bare 'except:' swallows every error, not just a missing
# reference — should catch the specific exception type
417 try:
418 outputlines.append(psentence.get_reference().get_string())
419 outputlines.append("\t")
420 except:
421 outputlines.append("\t")
422 outputlines.append("\n")
423 return outputlines
424
# NOTE(review): the 'def' line (embedded line 425) was lost in the numbered
# dump; 'percentage' is presumably its parameter (fraction of instances to
# put in the test set). Returns [trainingSet, testSet] via a stratified
# random split of self.data.
426 size = len (self.data)
427 testSize = round (size * percentage)
428
429 print "Splitting data"
430
# p0 is the number of instances selected into fold 0 (the test set)
431 indices = orange.MakeRandomIndices2(p0=testSize)
432 indices.stratified = indices.Stratified
433 ind = indices(self.data)
434
435 testSet = self.data.select(ind, 0)
436 trainingSet = self.data.select(ind, 1)
437
438 return [trainingSet, testSet]
439
440
441
# NOTE(review): the 'def' line (embedded line 442) was lost in the numbered
# dump. Runs 10-fold cross-validation of Bayes, tree and Nu-SVC learners on
# self.data and prints a table of evaluation scores. Always returns None.
443
444 data = self.data
445
446 bayes = orange.BayesLearner()
447 tree = orngTree.TreeLearner(mForPruning=2)
448 bayes.name = "bayes"
449 tree.name = "tree"
450
# NOTE(review): this first SVM learner (with name "SVM") is immediately
# overwritten by the re-assignment below and never used
451 l = orange.SVMLearner()
452 l.name = "SVM"
453
454 l=orange.SVMLearner()
455 l.svm_type=orange.SVMLearner.Nu_SVC
456 l.nu=0.3
457 l.probability=True
458
459 learners = [bayes, tree, l]
# NOTE(review): stray expression statement — evaluates the imported name
# 'deepcopy' and discards it; a leftover, does nothing
460 deepcopy
461
462
463
464 res = orngTest.crossValidation(learners, data, folds=10)
# confusion matrices computed with class '-1' as the positive class
465 cm = orngStat.computeConfusionMatrices(res,
466 classIndex=data.domain.classVar.values.index('-1'))
467
# (label, expression) pairs; the expressions are eval'd against orngStat
468 stat = (('CA', 'CA(res)'),
469 ('Sens', 'sens(cm)'),
470 ('Spec', 'spec(cm)'),
471 ('AUC', 'AUC(res)'),
472 ('IS', 'IS(res)'),
473 ('Brier', 'BrierScore(res)'),
474 ('F1', 'F1(cm)'),
475 ('F2', 'Falpha(cm, alpha=2.0)'),
476 ('MCC', 'MCC(cm)'),
477 ('sPi', 'scottsPi(cm)'),
478 )
479
# NOTE(review): eval on constructed strings — safe here (literals above),
# but direct function references would be clearer
480 scores = [eval("orngStat."+s[1]) for s in stat]
481 print
482 print "Learner " + "".join(["%-7s" % s[0] for s in stat])
483 for (i, l) in enumerate(learners):
484 print "%-8s " % l.name + "".join(["%5.3f " % s[i] for s in scores])
485
486 return None
487
# NOTE(review): the 'def' line (embedded line 488) was lost in the numbered
# dump. Trains and returns a Nu-SVC SVM classifier (nu=0.3, probability
# estimates enabled) on self.data.
489 l=orange.SVMLearner()
490 l.svm_type=orange.SVMLearner.Nu_SVC
491 l.nu=0.3
492 l.probability=True
493 return l(self.data)
494
495
# NOTE(review): the 'def' line (embedded line 496) was lost in the numbered
# dump; 'classifier' is presumably its parameter. Applies the classifier to
# every instance of self.data in place and returns the mutated table.
497 """
498 Utility function which classifies the test data with the given classifier
499 """
500 mydata = self.data
501
502 for i in range(len(mydata)):
503
504
# NOTE(review): bare 'except:' hides the real failure cause and silently
# substitutes class "0"; should catch the specific exception and log it
505 try:
506 new_value = classifier(mydata[i])
507 except:
508 print "couldn't apply classifier"
509 new_value = "0"
510 mydata[i].setclass(new_value)
511 return mydata
512
513
# NOTE(review): the 'def' line (embedded line 514) was lost in the numbered
# dump; 'classifiers' is presumably its parameter. Scores each classifier on
# self.data and returns (accuracy, margin) lists, where margin is
# (correct - wrong) / total.
515 correct = [0.0]*len(classifiers)
516 wrong = [0.0]*len(classifiers)
517 for ex in self.data:
518 for i in range(len(classifiers)):
# NOTE(review): bare 'except:' — a failing classifier contributes to
# neither count, silently skewing the totals
519 try:
520 if classifiers[i](ex) == ex.getclass():
521 correct[i] += 1
522 else:
523 wrong[i] += 1
524 except:
525 print "kind of error"
526
# convert raw counts to fractions; 'wrong' is reused to hold the margin
527 for i in range(len(correct)):
528 wrong[i] = (correct[i] - wrong[i]) / len(self.data)
529 correct[i] = correct[i] / len(self.data)
530 return (correct, wrong)
531