1 '''
2 Created on 07 Mar 2012
3 @author: Eleftherios Avramidis
4 '''
5 import logging
6 import copy
7 from collections import OrderedDict
8 from Orange.regression.linear import LinearRegressionLearner
9 from Orange.regression.pls import PLSRegressionLearner
10 from Orange.regression.lasso import LassoRegressionLearner
11 from Orange.regression.earth import EarthLearner
12 from Orange.regression.tree import TreeLearner
13 from Orange.classification.rules import CN2Learner, CN2UnorderedLearner, CN2SDUnorderedLearner, CN2EVCUnorderedLearner
14 from Orange import feature
15
16 from Orange.classification.bayes import NaiveLearner
17 from Orange.classification.knn import kNNLearner
18
19 from Orange.classification.svm import SVMLearnerEasy as SVMEasyLearner
20 from Orange.classification.tree import TreeLearner
21 from Orange.classification.tree import C45Learner
22 from Orange.classification.logreg import LogRegLearner
23 from Orange.classification.logreg import LibLinearLogRegLearner
24 from Orange import evaluation
25
26 from dataprocessor.input.jcmlreader import JcmlReader
27
28 from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml
29 from dataprocessor.sax.saxjcml2orange import SaxJcml2Orange
30 from dataprocessor.ce.cejcml2orange import CElementTreeJcml2Orange
31 from classifier.classifier import OrangeClassifier
32 from Orange.data import Table
33 from datetime import datetime
34
35 from featuregenerator.diff_generator import DiffGenerator
36 from sentence.pairwisedataset import AnalyticPairwiseDataset, CompactPairwiseDataset, RawPairwiseDataset
37 from sentence.dataset import DataSet
38 from sentence.scoring import Scoring
39
40 import time
41
42
43 import random
44 import sys
45 import shutil
46 import pickle
47 import os
48
49 from expsuite import PyExperimentSuite
50
51
52
# PyExperimentSuite flag: this suite supports restoring interrupted runs
# (see the n-indexed artifact reloads in the restore logic further down).
restore_supported = True
55
def reset(self, params, rep):
    """Initialise one experiment repetition from the PyExperimentSuite
    configuration dictionary.

    Reads the classifier choice, attribute-set selections and pipeline
    switches out of *params*, resolves the learner class, and stores
    everything as attributes on self for the iterate() stages.

    params -- experiment configuration mapping (mutated: setdefault is
              used for optional switches, and the expanded attribute
              lists are written back into it)
    rep    -- repetition index (unused here)
    """
    self.restore_supported = True

    # Resolve the learner class from its configured short name,
    # e.g. "SVMEasy" -> SVMEasyLearner.
    # NOTE(security): eval() on a config value — acceptable only for
    # trusted, locally-authored experiment configs.
    classifier_name = params["classifier"] + "Learner"
    self.learner = eval(classifier_name)
    try:
        # Optional per-classifier keyword arguments, given as a Python
        # literal under e.g. params["params_svmeasy"].
        self.classifier_params = eval(params["params_{}".format(params["classifier"]).lower()])
    except Exception:
        # Entry missing or unparsable: fall back to learner defaults.
        # (Narrowed from a bare "except:" which also swallowed
        # KeyboardInterrupt/SystemExit.)
        self.classifier_params = {}

    sys.stderr.write("Accepted classifier parameters: {}\n".format(self.classifier_params))
    self.remove_infinite = False
    self.delay_accuracy = False
    if classifier_name == "SVMEasyLearner":
        # SVMEasy is slow and sensitive to infinite feature values.
        self.classifier_params["verbose"] = True
        self.remove_infinite = True
        self.delay_accuracy = True

    self.meta_attributes = params["meta_attributes"].split(",")
    self.include_references = params.setdefault("include_references", False)
    self.replacement = params.setdefault("replacement", True)
    self.filter_unassigned = params.setdefault("filter_unassigned", False)
    self.restrict_ranks = params.setdefault("restrict_ranks", [])

    self.delay_accuracy = params.setdefault("delay_accuracy", self.delay_accuracy)
    # NOTE(review): unlike delay_accuracy above, the default here is a
    # hard False rather than the SVMEasy-derived self.remove_infinite —
    # confirm this asymmetry is intended.
    self.remove_infinite = params.setdefault("remove_infinite", False)
    self.nullimputation = params.setdefault("nullimputation", False)

    self.invert_ranks = params.setdefault("invert_ranks", False)
    self.evaluation_invert_ranks = params.setdefault("evaluation_invert_ranks", False)

    if self.restrict_ranks:
        self.restrict_ranks = self.restrict_ranks.split(",")

    # The attribute-set id in params["att"] selects which
    # "<id>_source/_target/_general" comma-lists are active.
    source_attributes = params["{}_source".format(params["att"])].split(",")
    target_attributes = params["{}_target".format(params["att"])].split(",")
    general_attributes = params["{}_general".format(params["att"])].split(",")

    # Write the expanded lists back so later stages/logs can see them.
    params["source_attributes"] = source_attributes
    params["target_attributes"] = target_attributes
    params["general_attributes"] = general_attributes

    self.active_attributes = []
    # "".split(",") yields [""], which means "none configured".
    if general_attributes != [""]:
        self.active_attributes.extend(general_attributes)
    if source_attributes != [""]:
        self.active_attributes.extend(["src_{}".format(att) for att in source_attributes])
    if target_attributes != [""]:
        # Pairwise setting: every target feature exists once per system.
        self.active_attributes.extend(["tgt-1_{}".format(att) for att in target_attributes])
        self.active_attributes.extend(["tgt-2_{}".format(att) for att in target_attributes])

    if self.active_attributes == [""]:
        self.active_attributes = []
    self.discretization = False
    # "key in dict" replaces the Python-2-only dict.has_key().
    if "discretization" in params:
        self.discretization = params["discretization"]

    self.hidden_attributes = params["hidden_attributes"].split(",")
    self.discrete_attributes = params["discrete_attributes"].split(",")

    self.class_name = params["class_name"]
    self.class_type = params["class_type"]

    # Filenames may interpolate other params entries via {placeholders}.
    self.training_sets = params["training_sets"].format(**params).split(',')
    self.testset = params["test_set"].format(**params)
    self.ties = params["ties"]
123
def iterate(self, params, rep, n):
    """Run one numbered stage of the ranking-experiment pipeline.

    PyExperimentSuite calls this once per iteration *n*; each stage
    consumes artifacts left by earlier stages (files on disk and/or
    attributes set on self by reset() or previous iterations) and
    produces its own, so interrupted runs can be restored mid-pipeline.

    Returns a dict of result metrics — populated by the evaluation
    stages (n == 85/185 and n == 120), empty otherwise.
    """
    ret = {}

    # --- n == 0: load the raw training data --------------------------
    if n == 0:
        print "fetch training set"
        parallelsentences = []
        for training_set in self.training_sets:
            parallelsentences.extend(JcmlReader(training_set).get_parallelsentences())

        self.trainset = DataSet(parallelsentences)

    # --- n == 10: load the raw test data -----------------------------
    if n == 10:
        print "fetch test set"
        # Keep a local copy so later stages and restores can re-read it.
        shutil.copy(self.testset, "testset.jcml")
        self.testset = JcmlReader("testset.jcml").get_dataset()

    # --- n == 20: expand training ranks into pairwise comparisons ----
    if n == 20:
        print "pairwise training set"

        self.trainset = AnalyticPairwiseDataset(
            self.trainset, include_references = self.include_references,
            replacement = self.replacement,
            filter_unassigned = self.filter_unassigned,
            restrict_ranks = self.restrict_ranks,
            invert_ranks = self.invert_ranks
        )

        if not self.ties:
            self.trainset.remove_ties()

        Parallelsentence2Jcml(self.trainset).write_to_file("pairwise_trainset.jcml")

    # --- n == 30: expand test ranks into pairwise comparisons --------
    if n == 30:
        print "pairwise testset"
        self.testset = AnalyticPairwiseDataset(self.testset, replacement = self.replacement, invert_ranks = self.invert_ranks)

    # --- n == 40: placeholder stage ----------------------------------
    if n == 40:
        pass

    # --- n == 50: record the pairwise test filename ------------------
    if n == 50:
        # Filename re-read by the evaluation stage (n == 100).
        # NOTE(review): nothing visible in this extract actually writes
        # pairwise_testset.jcml — presumably done by code removed from
        # stages 30-50; verify before relying on a fresh run.
        self.pairwise_test_filename = "pairwise_testset.jcml"

        pass

    # --- n == 60: serialise pairwise training set to an Orange .tab --
    if n == 60:
        print "produce orange trainset"

        input_file = "pairwise_trainset.jcml"
        self.trainset_orange_filename = "trainset.tab"

        # Prefer node-local scratch space when it exists.
        if os.path.isdir("/local"):
            dir = "/local"
        else:
            dir = "."

        CElementTreeJcml2Orange(input_file,
            self.class_name,
            self.active_attributes,
            self.meta_attributes,
            self.trainset_orange_filename,
            compact_mode = True,
            discrete_attributes=self.discrete_attributes,
            hidden_attributes=self.hidden_attributes,
            get_nested_attributes=True,
            dir=dir,
            remove_infinite=self.remove_infinite,
            nullimputation=self.nullimputation,
        ).convert()

    # --- n == 70: serialise pairwise test set to an Orange .tab ------
    if n == 70:
        print "produce orange testset"

        input_file = "pairwise_testset.jcml"
        self.testset_orange_filename = "testset.tab"

        if os.path.isdir("/local"):
            dir = "/local"
        else:
            dir = "."

        CElementTreeJcml2Orange(input_file,
            self.class_name,
            self.active_attributes,
            self.meta_attributes,
            self.testset_orange_filename,
            compact_mode = True,
            discrete_attributes=self.discrete_attributes,
            hidden_attributes=self.hidden_attributes,
            get_nested_attributes=True,
            dir=dir,
            remove_infinite=self.remove_infinite,
            nullimputation=self.nullimputation,
        ).convert()

    # --- n == 80: train the pairwise classifier ----------------------
    if n == 80:
        print "train classifier"
        input_file = self.trainset_orange_filename
        # NOTE(review): output_file is set but not written in this
        # extract — the restore logic expects "classifier.clsf" pickled.
        self.output_file = "classifier.clsf"

        trainset = Table(input_file)

        mylearner = self.learner(**self.classifier_params)
        trained_classifier = mylearner(trainset)
        self.classifier = OrangeClassifier(trained_classifier)
        self.classifier.print_content()

    # --- n == 85 / 185: 10-fold cross-validated accuracy -------------
    # Delayed to n == 185 for slow learners (delay_accuracy, e.g.
    # SVMEasy) so the main pipeline results are produced first.
    if (n == 85 and not self.delay_accuracy) or (n == 185 and self.delay_accuracy):
        print "evaluate classifier with cross-fold validation"
        orangeData = Table(self.trainset_orange_filename)
        learner = self.learner(**self.classifier_params)
        cv = evaluation.testing.cross_validation([learner], orangeData, 10)
        ret["CA"] = evaluation.scoring.CA(cv)[0]
        ret["AUC"] = evaluation.scoring.AUC(cv)[0]

    # --- n == 90: classify the pairwise test set ---------------------
    if n == 90:
        print "test_classifier"

        input_file = self.testset_orange_filename

        print "performing classification"
        orangedata = Table(input_file)

        classified_set_vector = self.classifier.classify_orange_table(orangedata)
        # Keep both the hard decisions and the ("-1", "1") class
        # probabilities for the two reconstruction variants below.
        self.classified_values_vector = [str(v[0]) for v in classified_set_vector]
        self.classified_probs_vector = [(v[1]["-1"], v[1]["1"]) for v in classified_set_vector]

    # --- n == 100: re-attach predictions and rebuild full rankings ---
    if n == 100:
        print "EVALUATION"
        print "reloading coupled test set"
        self.simple_testset = JcmlReader(self.pairwise_test_filename).get_dataset()

        print "reconstructing test set"
        # One attribute dict per pairwise sentence, in classifier order.
        att_vector = [{"rank_predicted": v} for v in self.classified_values_vector]
        att_prob_neg = [{"prob_-1": v[0]} for v in self.classified_probs_vector]
        att_prob_pos = [{"prob_1": v[1]} for v in self.classified_probs_vector]

        print "adding guessed rank"
        self.simple_testset.add_attribute_vector(att_vector, "ps")
        self.simple_testset.add_attribute_vector(att_prob_neg, "ps")
        self.simple_testset.add_attribute_vector(att_prob_pos, "ps")

        Parallelsentence2Jcml(self.simple_testset).write_to_file("testset-pairwise-with-estranks.jcml")

        self.simple_testset = RawPairwiseDataset(cast=self.simple_testset)

        # Collapse pairwise decisions back into full rankings: once
        # from the hard labels, once from the soft probabilities.
        self.reconstructed_hard_testset = self.simple_testset.get_single_set_with_hard_ranks("rank_predicted", "rank_hard")
        self.reconstructed_soft_testset = self.simple_testset.get_single_set_with_soft_ranks("prob_-1", "prob_1", "rank_soft_predicted", "rank_soft")
        self.simple_testset = None

    # --- n == 120: correlation metrics vs. the gold ranking ----------
    if n == 120:
        print "Scoring correlation"
        print "ranks inverted ", self.evaluation_invert_ranks
        ret.update(score(self.reconstructed_hard_testset, self.class_name, "hard", "rank_hard", self.evaluation_invert_ranks))
        ret.update(score(self.reconstructed_soft_testset, self.class_name, "soft", "rank_soft", self.evaluation_invert_ranks))
        # Alphabetically ordered keys for stable reporting.
        ret = OrderedDict(sorted(ret.items(), key=lambda t: t[0]))

    print ret
    return ret
334
335
336
337
338
369
370
371
372
# NOTE(review): body of the suite's state-restore method — its "def"
# line falls outside this extract. For a restart at iteration n it
# reloads every artifact an earlier iterate() stage left behind.
self.class_name = "rank"  # NOTE(review): hard-coded here although reset() reads it from params — confirm intended
if n > 0 and n <=20 :
    self.trainset = JcmlReader("trainset.jcml").get_dataset()

if n > 10 and n <=30 :
    self.testset = JcmlReader("testset.jcml").get_dataset()

if n > 20 and n <=40:
    self.trainset = JcmlReader("pairwise_trainset.jcml").get_dataset()

if n > 30 and n <=50:
    self.testset = JcmlReader("pairwise_testset.jcml").get_dataset()

if n > 50:
    # Filenames recorded by iterate() stages 50/60.
    self.pairwise_test_filename = "pairwise_testset.jcml"
    self.trainset_orange_filename = "trainset.tab"

if n > 70:
    self.testset_orange_filename = "testset.tab"

if n > 80 and n <= 90:
    # Reload the pickled trained classifier.
    objectfile = open("classifier.clsf", 'r')
    self.classifier = OrangeClassifier(pickle.load(objectfile))
    objectfile.close()
if n > 90:
    # Reload hard decisions and tab-separated class probabilities.
    classified_vector_file = open("classified.hard.txt", 'r')
    self.classified_values_vector = classified_vector_file.readlines()
    classified_vector_file.close()
    classified_prob_file = open("classified.soft.txt", 'r')
    self.classified_probs_vector = [tuple(line.split('\t')) for line in classified_prob_file]
    classified_prob_file.close()
if n > 100:
    # Reload the reconstructed rankings produced at n == 100.
    self.reconstructed_hard_testset = JcmlReader("testset.reconstructed.hard.jcml").get_dataset()
    self.reconstructed_soft_testset = JcmlReader("testset.reconstructed.soft.jcml").get_dataset()
410
411
412
413
414
415
416
417
418 - def _get_testset(self, test_filename, mode = "", ratio=0.7):
419 if not test_filename == "":
420 print "arbitrarily split given set to training and test sets 90% + 10%"
421 simple_trainset = JcmlReader("trainset.jcml").get_dataset()
422
423 if mode == "development":
424 simple_trainset, a = simple_trainset.split(0.03)
425
426 simple_trainset, simple_testset = simple_trainset.split(ratio)
427 Parallelsentence2Jcml(simple_trainset).write_to_file("trainset.jcml")
428 Parallelsentence2Jcml(simple_testset).write_to_file("testset.jcml")
429 else:
430 shutil.copy(test_filename, "testset.jcml")
431
432
def get_scoring(testset, class_name, xid, featurename):
    """Compute ranking-quality metrics for *featurename* against the
    gold *class_name* ranks of *testset*.

    testset     -- dataset accepted by Scoring
    class_name  -- attribute holding the gold ranks
    xid         -- label (e.g. "hard"/"soft") woven into every metric key
    featurename -- attribute holding the predicted ranks

    Returns a dict of metric name -> value.
    """
    scoringset = Scoring(testset)
    ret = {}
    # Kendall tau in three flavours: default, including ties ("-ntp"),
    # and not penalizing predicted ties ("-nt").
    ret.update(scoringset.get_kendall_tau(featurename, class_name, prefix="{}-".format(xid)))
    ret.update(scoringset.get_kendall_tau(featurename, class_name, prefix="{}-".format(xid), suffix="-ntp", exclude_ties=False))
    ret.update(scoringset.get_kendall_tau(featurename, class_name, prefix="{}-".format(xid), suffix="-nt", penalize_predicted_ties=False))

    ret["kendalltau_b-%s"%xid], ret["kendalltau_b-%s-pi"%xid] = scoringset.get_kendall_tau_b(featurename, class_name)
    ret["b1-acc-1-%s"%xid], ret["b1-acc-%s-any"%xid] = scoringset.selectbest_accuracy(featurename, class_name)
    ret["fr-%s"%xid] = scoringset.avg_first_ranked(featurename, class_name)
    ret["pr-%s"%xid] = scoringset.avg_predicted_ranked(featurename, class_name)

    sb_percentages = scoringset.best_predicted_vs_human(featurename, class_name)
    # .items() replaces the Python-2-only .iteritems(): identical
    # iteration content, but the function stays importable on Python 3.
    for rank, percentage in sb_percentages.items():
        ret["sb-{}-{}".format(rank, xid)] = str(percentage)
    return ret
449
def score(testset, class_name, xid, featurename, invert_ranks=False):
    """Return the Scoring metric dict for *featurename* against the
    gold *class_name* ranks, every key prefixed with *xid*."""
    return Scoring(testset, invert_ranks=invert_ranks).get_metrics_scores(featurename, class_name, prefix=xid)
453
455 """
456 Fake file-like stream object that redirects writes to a logger instance.
457 """
def __init__(self, logger, log_level=logging.INFO):
    """Remember the target *logger* and the severity used for every
    redirected write."""
    self.log_level = log_level
    self.logger = logger
    # Kept for file-like-object compatibility.
    self.linebuf = ''
462
# NOTE(review): body of the stream's write(buf) method — its "def" line
# falls outside this extract. Forwards each line of buf to the bound
# logger, dropping trailing whitespace and the trailing blank line.
for line in buf.rstrip().splitlines():
    self.logger.log(self.log_level, line.rstrip())
466
469
if __name__ == '__main__':
    # Run the experiment suite; PyExperimentSuite reads its own config
    # file and drives reset()/iterate() per repetition.
    # (The original also built a logging FORMAT string and a timestamp
    # that were never used afterwards; removed as dead code.)
    mysuite = AutorankingSuite()
    mysuite.start()
478