1
2
3
4
5 """
6
7 @author: Eleftherios Avramidis
8 """
9
10 import os
11 import sys
12 import orange, orngTest, orngStat, orngTree
13 from tempfile import mktemp
14 from sentence.dataset import DataSet
15 from sentence.parallelsentence import ParallelSentence
16 from sentence.sentence import SimpleSentence
17 import sentence
18 from copy import deepcopy
19
21 """
22 Handles the conversion of the generic data objects to a format handled by Orange library
23 """
24
25 - def __init__ (self, dataSet, class_name="", desired_attributes=[], meta_attributes=[], chosen_orangefilename=False, keep_empty=False):
26 if isinstance ( dataSet , orange.ExampleTable ):
27 self.data = dataSet
28
29 elif isinstance ( dataSet , sentence.dataset.DataSet ):
30
31 print "desired attributes" , desired_attributes
32 print "meta attributes" , meta_attributes
33
34 orange_file = self._get_temp_file(chosen_orangefilename)
35 self._getOrangeFormat(orange_file, dataSet, class_name, desired_attributes, meta_attributes)
36
37
38
39 orangefilename = orange_file.name
40 orange_file.close()
41
42 dataSet = None
43
44 print "Feeding file to Orange"
45 if not keep_empty:
46 self.data = orange.ExampleTable(orangefilename)
47 print "Loaded ", len(self.data) , " sentences from file " , orangefilename
48
49 if not chosen_orangefilename:
50 os.unlink(orangefilename)
51
52
53
56
57
# --- method body (its "def" line is missing from this chunk) ---
# Converts the wrapped orange.ExampleTable back into a sentence-level
# DataSet: ordinary attributes and the class value become per-sentence
# string attributes, while the meta attributes 'src', 'ref' and 'tgt-N'
# are rebuilt into SimpleSentence objects.
59 data = self.data
60 attribute_names = set()
61 new_data = []
62
63 for item in data:
64 sentence_attributes = {}
65
66
# The class value is exported like any other (string) attribute.
67 sentence_attributes[item.domain.classVar.name] = str(item.getclass().value)
68
69
# Copy ordinary (non-meta) feature values, remembering their names.
70 for att in item.domain.attributes:
71 sentence_attributes[att.name] = str(item[att].value)
72 attribute_names.add(att.name)
73
74 metas = item.getmetas()
75
# Placeholders kept in case the corresponding meta attributes are absent.
76 src = SimpleSentence()
77 tgt_dic = {}
78 tgt = []
79 ref = SimpleSentence()
80
81
82 for key in metas:
83 attribute_name = metas[key].variable.name
84
85 if attribute_name == 'src':
86 src = SimpleSentence(metas[key].value)
87 elif attribute_name == 'ref':
# NOTE(review): the KeyError guard suggests 'ref' can be missing or
# unreadable for some rows; the empty placeholder is then kept.
88 try:
89 ref = SimpleSentence(metas[key].value)
90 except KeyError:
91 pass
# Target sentences are stored as 'tgt-1', 'tgt-2', ...; names that contain
# an underscore are deliberately excluded here and fall through to the
# generic attribute branch below.
92 elif (attribute_name.startswith('tgt') and attribute_name.find('_') == -1):
93 tag, index = attribute_name.split( "-")
94
# 'tgt-N' is 1-based; the dictionary key is the 0-based position.
95 tgt_dic[int(index)-1] = SimpleSentence(metas[key].value)
96
97
98 else:
99
# Any other meta attribute is kept as a plain unicode-valued attribute.
100 sentence_attributes[attribute_name] = unicode(metas[key].value)
101 attribute_names.add(attribute_name)
102
103
104
105
106
107
108
# Reassemble the target list in positional order; assumes the 'tgt-N'
# indices are contiguous from 1 (a gap would raise KeyError here).
109 for index in range(len(tgt_dic.keys())):
110 tgt.append(tgt_dic[index])
111
112 new_parallelsentence = ParallelSentence(src, tgt, ref, sentence_attributes)
# recover_attributes() presumably redistributes the flat attribute dict back
# onto the nested sentences -- defined in sentence.parallelsentence, not
# visible here; TODO confirm.
113 new_parallelsentence.recover_attributes()
114 new_data.append(new_parallelsentence)
115
116 return DataSet( new_data, attribute_names )
117
# --- method body (its "def" line is missing from this chunk) ---
# Prints descriptive statistics of self.data to stdout: class/attribute
# counts, class distribution, missing values per attribute, and per-attribute
# averages (continuous) / value frequencies (discrete). Returns nothing.
119 data=self.data
120
121 print "Classes:", len(data.domain.classVar.values)
122 print "Attributes:", len(data.domain.attributes), ",",
123
# NOTE(review): the two lines below duplicate the two prints above verbatim;
# the class/attribute counts are printed twice.
124 print "Classes:", len(data.domain.classVar.values)
125 print "Attributes:", len(data.domain.attributes), ",",
126
127
# Count discrete vs. continuous attributes.
128 ncont=0; ndisc=0
129 for a in data.domain.attributes:
130 if a.varType == orange.VarTypes.Discrete:
131 ndisc = ndisc + 1
132 else:
133 ncont = ncont + 1
134 print ncont, "continuous,", ndisc, "discrete"
135
136
# Class distribution: absolute counts per class value, then percentages.
137 c = [0] * len(data.domain.classVar.values)
138 for e in data:
139 c[int(e.getclass())] += 1
140 print "Instances: ", len(data), "total",
141 r = [0.] * len(c)
142 for i in range(len(c)):
143 r[i] = c[i]*100./len(data)
144 for i in range(len(data.domain.classVar.values)):
145 print ", %d(%4.1f%s) with class %s" % (c[i], r[i], '%', data.domain.classVar.values[i]),
146 print
147
148
149
150
# Percentage of missing ("special") values per attribute.
151 natt = len(data.domain.attributes)
152 missing = [0.] * natt
153 for i in data:
154 for j in range(natt):
155 if i[j].isSpecial():
156 missing[j] += 1
157 missing = map(lambda x, l=len(data):x/l*100., missing)
158
159 print "Missing values per attribute:"
160 atts = data.domain.attributes
161 for i in range(natt):
162 print " %5.1f%s %s" % (missing[i], '%', atts[i].name)
163
164
165
166
# Per-attribute value distributions computed by Orange.
167 dist = orange.DomainDistributions(data)
168
169 print "Average values and mean square errors:"
170 for i in range(len(data.domain.attributes)):
171 if data.domain.attributes[i].varType == orange.VarTypes.Continuous:
172 print "%s, mean=%5.2f +- %5.2f" % \
173 (data.domain.attributes[i].name, dist[i].average(), dist[i].error())
174
175 print "\nFrequencies for values of discrete attributes:"
176 for i in range(len(data.domain.attributes)):
177 a = data.domain.attributes[i]
178 if a.varType == orange.VarTypes.Discrete:
179 print "%s:" % a.name
180 for j in range(len(a.values)):
181 print " %s: %d" % (a.values[j], int(dist[i][j]))
182
183
184
# --- method body (its "def" line is missing from this chunk) ---
# Opens (and if necessary names) the intermediate Orange .tab file for
# writing and returns the open file object; the caller closes it.
186 if not orangefilename:
# NOTE(review): tempfile.mktemp only returns a name and is race-prone;
# tempfile.NamedTemporaryFile/mkstemp would be safer.
187 orangefilename = mktemp(dir=u'.', suffix=u'.tab')
188
189 orange_file = open(orangefilename, 'w')
190 return orange_file
191
192
# --- method body (its "def" line is missing from this chunk) ---
# Writes the given data to a (possibly temporary) .tab file and returns the
# filename. Unicode text is encoded as UTF-8; anything that is not already
# a string is stringified with str().
194 if not orangefilename:
# NOTE(review): mktemp is race-prone; see note on _get_temp_file.
195 orangefilename = mktemp(dir=u'.', suffix=u'.tab')
196 file_object = open(orangefilename, 'w')
197 if type(data) is unicode:
198 file_object.write(data.encode('utf8'))
199 elif type(data) is str:
200 file_object.write(data)
201 else:
202 file_object.write(str(data))
203 file_object.close()
204
205 return orangefilename
206
207
208
210
211
# --- method body (its "def" line is missing from this chunk) ---
# Builds the three header lines of an Orange .tab file and returns them as
# one string: line_1 = attribute names, line_2 = types (discrete/continuous/
# string), line_3 = flags ("c" class, "m" meta, empty = ordinary feature).
# Writing of the data rows is not visible in this chunk.
212 line_1 = ""
213 line_2 = ""
214 line_3 = ""
215 print "Getting attributes"
216
217 dataset.confirm_attributes(desired_attributes, meta_attributes)
218
219
# An empty desired-attributes list means "use all attributes".
220 if desired_attributes == []:
221 desired_attributes = attribute_names
222
223
224
# NOTE(review): attribute_names is referenced below but its assignment is in
# lines missing from this chunk -- presumably collected from the dataset.
225
226
227
228 print "Constructing file"
229
230 for attribute_name in attribute_names :
231
232 attribute_name = str(attribute_name)
233 line_1 += attribute_name +"\t"
234
235
# Type row: the class column is discrete, desired non-meta features are
# continuous, everything else is a string.
236
237 if attribute_name == class_name:
238 line_2 += "discrete\t"
239 elif attribute_name in desired_attributes and attribute_name not in meta_attributes:
240 line_2 += "continuous\t"
241 else:
242 line_2 += "string\t"
243
244
# Flag row: "c" marks the class, "m" marks metas; ordinary features get an
# empty cell (only the tab separator is appended).
245
246 if attribute_name == class_name:
247 line_3 = line_3 + "c"
248 elif attribute_name not in desired_attributes or attribute_name in meta_attributes:
249
250 line_3 = line_3 + "m"
251 line_3 = line_3 + "\t"
252
253
# Fixed trailing columns: the source sentence, one column per target
# translation of the first parallel sentence, and the reference -- all
# stored as string metas.
254
255 line_2 += "string\t"
256 line_3 += "m\t"
257 line_1 += "src\t"
258
259 i=0
260 for tgt in dataset.get_parallelsentences()[0].get_translations():
261 i+=1
262 line_2 += "string\t"
263 line_3 += "m\t"
264 line_1 += "tgt-" + str(i) + "\t"
265
266 line_2 += "string\t"
267 line_3 += "m\t"
268 line_1 += "ref\t"
269
270
271 line_1 = line_1 + "\n"
272 line_2 = line_2 + "\n"
273 line_3 = line_3 + "\n"
274 output = line_1 + line_2 + line_3
275 return output
276
277
278
317
318
319
320
# --- method body (its "def" line is missing from this chunk) ---
# Randomly splits self.data into [trainingSet, testSet]: `percentage` of the
# examples go to the test set, the rest to training. Stratified by class.
322 size = len (self.data)
323 testSize = round (size * percentage)
324
325 print "Splitting data"
326
# MakeRandomIndices2 with p0=testSize labels testSize examples with 0;
# select(ind, 0) therefore yields the test set, select(ind, 1) the rest.
327 indices = orange.MakeRandomIndices2(p0=testSize)
328 indices.stratified = indices.Stratified
329 ind = indices(self.data)
330
331 testSet = self.data.select(ind, 0)
332 trainingSet = self.data.select(ind, 1)
333
334 return [trainingSet, testSet]
335
336
338
# --- method body (its "def" line is missing from this chunk) ---
# Runs 10-fold cross-validation of Bayes, tree and Nu-SVC SVM learners on
# self.data and prints a table of evaluation scores. Always returns None.
339 data = self.data
340
341 bayes = orange.BayesLearner()
342 tree = orngTree.TreeLearner(mForPruning=2)
343 bayes.name = "bayes"
344 tree.name = "tree"
345
# NOTE(review): this first SVMLearner (with name "SVM") is discarded when
# l is rebound two lines below; only the Nu-SVC learner is evaluated, and
# it never gets a .name, so the printout row label for it may be odd.
346 l = orange.SVMLearner()
347 l.name = "SVM"
348
349 l=orange.SVMLearner()
350 l.svm_type=orange.SVMLearner.Nu_SVC
351 l.nu=0.3
352 l.probability=True
353
354 learners = [bayes, tree, l]
# NOTE(review): bare "deepcopy" is a no-op expression statement -- probably
# leftover from a removed copy of the learner list.
355 deepcopy
356
357
358
359 res = orngTest.crossValidation(learners, data, folds=10)
# Confusion matrices are computed against the class value '-1'.
360 cm = orngStat.computeConfusionMatrices(res,
361 classIndex=data.domain.classVar.values.index('-1'))
362
# Pairs of (column label, orngStat expression evaluated below via eval).
363 stat = (('CA', 'CA(res)'),
364 ('Sens', 'sens(cm)'),
365 ('Spec', 'spec(cm)'),
366 ('AUC', 'AUC(res)'),
367 ('IS', 'IS(res)'),
368 ('Brier', 'BrierScore(res)'),
369 ('F1', 'F1(cm)'),
370 ('F2', 'Falpha(cm, alpha=2.0)'),
371 ('MCC', 'MCC(cm)'),
372 ('sPi', 'scottsPi(cm)'),
373 )
374
# eval here only runs the fixed strings above (not external input), but a
# table of callables would be safer and clearer.
375 scores = [eval("orngStat."+s[1]) for s in stat]
376 print "Learner " + "".join(["%-7s" % s[0] for s in stat])
377 for (i, l) in enumerate(learners):
378 print "%-8s " % l.name + "".join(["%5.3f " % s[i] for s in scores])
379
380 return None
381
382
# --- method body (its "def" line is missing from this chunk) ---
# Trains a Nu-SVC support vector machine (nu=0.3, probability estimates
# enabled) on self.data and returns the resulting classifier.
384 l=orange.SVMLearner()
385 l.svm_type=orange.SVMLearner.Nu_SVC
386 l.nu=0.3
387 l.probability=True
388 return l(self.data)
389
390
392 """
393 Utility function which classifies the test data with the given classifier
394 """
395 mydata = self.data
396
397 for i in range(len(mydata)):
398
399
400
401 instance = mydata[i]
402 new_value = classifier(instance)
403
404
405
406
407 mydata[i].setclass(new_value.value)
408 return OrangeData(mydata)
409
410
412 """
413 Utility function which classifies the test data with the given classifier
414 """
415 mydata = self.data
416 correct = 0.0
417 wrong = 0.0
418 for i in range(len(mydata)):
419
420
421
422 new_value = classifier(mydata[i])
423 if new_value == mydata[i].getclass():
424 correct += 1
425 else:
426 wrong += 1
427
428
429
430
431 mydata[i].setclass(new_value.value)
432
433 taukendall = (correct - wrong) / len(mydata)
434 accuracy = correct / len(mydata)
435 return OrangeData(mydata), accuracy, taukendall
436
437
# --- method body (its "def" line is missing from this chunk) ---
# Scores each classifier against the stored labels of self.data and returns
# (per-classifier accuracy, per-classifier (correct-wrong)/n score).
439 correct = [0.0]*len(classifiers)
440 wrong = [0.0]*len(classifiers)
441 for ex in self.data:
442 for i in range(len(classifiers)):
443 try:
444 if classifiers[i](ex) == ex.getclass():
445 correct[i] += 1
446 else:
447 wrong[i] += 1
# NOTE(review): bare except silently swallows all failures (the example is
# counted as neither correct nor wrong); a narrower exception type and a
# more informative message would be better.
448 except:
449 print "kind of error"
450
# NOTE(review): the lists are repurposed here -- on return "wrong" actually
# holds the (correct - wrong)/n agreement score and "correct" the accuracy,
# which the names no longer reflect.
451 for i in range(len(correct)):
452 wrong[i] = (correct[i] - wrong[i]) / len(self.data)
453 correct[i] = correct[i] / len(self.data)
454 return (correct, wrong)
455