Package app :: Package autoranking :: Module suite
[hide private]
[frames] | [no frames]

Source Code for Module app.autoranking.suite

  1  ''' 
  2  Created on 07 Mar 2012 
  3  @author: Eleftherios Avramidis 
  4  ''' 
  5  import logging 
  6  import copy 
  7  from collections import OrderedDict 
  8  from Orange.regression.linear import LinearRegressionLearner  
  9  from Orange.regression.pls import PLSRegressionLearner 
 10  from Orange.regression.lasso import LassoRegressionLearner 
 11  from Orange.regression.earth import EarthLearner 
 12  from Orange.regression.tree import TreeLearner 
 13  from Orange.classification.rules import CN2Learner,  CN2UnorderedLearner, CN2SDUnorderedLearner, CN2EVCUnorderedLearner 
 14  from Orange import feature 
 15   
 16  from Orange.classification.bayes import NaiveLearner 
 17  from Orange.classification.knn import kNNLearner 
 18  #from Orange.classification.svm import SVMLearnerEasy as SVMEasyLearner 
 19  from Orange.classification.svm import SVMLearnerEasy as SVMEasyLearner 
 20  from Orange.classification.tree import TreeLearner 
 21  from Orange.classification.tree import C45Learner 
 22  from Orange.classification.logreg import LogRegLearner 
 23  from Orange.classification.logreg import LibLinearLogRegLearner 
 24  from Orange import evaluation 
 25   
 26  from dataprocessor.input.jcmlreader import JcmlReader 
 27  #from sentence.coupleddataset import CoupledDataSet, OrangeCoupledDataSet, CoupledDataSetDisk 
 28  from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml 
 29  from dataprocessor.sax.saxjcml2orange import SaxJcml2Orange 
 30  from dataprocessor.ce.cejcml2orange import CElementTreeJcml2Orange  
 31  from classifier.classifier import OrangeClassifier 
 32  from Orange.data import Table 
 33  from datetime import datetime 
 34   
 35  from featuregenerator.diff_generator import DiffGenerator 
 36  from sentence.pairwisedataset import AnalyticPairwiseDataset, CompactPairwiseDataset, RawPairwiseDataset 
 37  from sentence.dataset import DataSet 
 38  from sentence.scoring import Scoring 
 39   
 40  import time 
 41   
 42   
 43  import random 
 44  import sys 
 45  import shutil 
 46  import pickle 
 47  import os 
 48   
 49  from expsuite import PyExperimentSuite 
 50   
 51   
 52   
class AutorankingSuite(PyExperimentSuite):
    """
    Experiment suite (PyExperimentSuite subclass) that trains and evaluates a
    pairwise ranking classifier for machine translation outputs.

    The work is split into numbered pipeline steps driven by ``iterate(n)``:
    reading JCML train/test sets, converting them to pairwise datasets,
    exporting Orange ``.tab`` files, training an Orange classifier, classifying
    the test set, reconstructing rankings and scoring them. ``save_state`` /
    ``restore_state`` persist intermediate artifacts to disk so interrupted
    runs can resume.
    """
    # Tells the experiment runner that interrupted runs can be resumed
    # via restore_state().
    restore_supported = True

    def reset(self, params, rep):
        """
        Initialize one repetition: resolve the learner class and its
        parameters from the config, and cache all attribute/filter settings
        used by the later pipeline steps.

        @param params: experiment configuration dictionary (mutated in place
            via setdefault and by storing the resolved attribute lists)
        @param rep: repetition index (unused here)
        """
        self.restore_supported = True

        # SECURITY/NOTE(review): eval() on config-supplied strings resolves
        # the learner class and its parameter dict; safe only if the
        # configuration file is trusted.
        classifier_name = params["classifier"] + "Learner"
        self.learner = eval(classifier_name)
        try:
            self.classifier_params = eval(params["params_{}".format(params["classifier"]).lower()])
        except:
            # no (or unparsable) per-classifier parameters configured
            self.classifier_params = {}

        sys.stderr.write("Accepted classifier parameters: {}\n".format(self.classifier_params))
        self.remove_infinite = False
        self.delay_accuracy = False
        if classifier_name == "SVMEasyLearner":
            # SVM-easy needs verbose output, cleaned infinite values and a
            # delayed cross-validation step (it is slow to train).
            self.classifier_params["verbose"] = True
            self.remove_infinite = True
            self.delay_accuracy = True

        self.meta_attributes = params["meta_attributes"].split(",")
        self.include_references = params.setdefault("include_references", False)
        self.replacement = params.setdefault("replacement", True)
        self.filter_unassigned = params.setdefault("filter_unassigned", False)
        self.restrict_ranks = params.setdefault("restrict_ranks", [])

        self.delay_accuracy = params.setdefault("delay_accuracy", self.delay_accuracy)
        # NOTE(review): unlike delay_accuracy above, this uses a hard False
        # default, so the SVMEasyLearner-specific remove_infinite=True set a
        # few lines earlier is discarded when the key is absent from params —
        # confirm this asymmetry is intended.
        self.remove_infinite = params.setdefault("remove_infinite", False)
        self.nullimputation = params.setdefault("nullimputation", False)

        self.invert_ranks = params.setdefault("invert_ranks", False)
        self.evaluation_invert_ranks = params.setdefault("evaluation_invert_ranks", False)

        if self.restrict_ranks:
            # configured as a comma-separated string; normalize to a list
            self.restrict_ranks = self.restrict_ranks.split(",")

        # Attribute lists are selected by the 'att' key, e.g. att=foo reads
        # params["foo_source"], params["foo_target"], params["foo_general"].
        source_attributes = params["{}_source".format(params["att"])].split(",")
        target_attributes = params["{}_target".format(params["att"])].split(",")
        general_attributes = params["{}_general".format(params["att"])].split(",")

        params["source_attributes"] = source_attributes
        params["target_attributes"] = target_attributes
        params["general_attributes"] = general_attributes

        # Build the flat list of active feature names; "".split(",") yields
        # [""], which is treated as "no attributes configured".
        self.active_attributes = []
        if general_attributes != [""]:
            self.active_attributes.extend(general_attributes) #TODOL check whether ps prefix is needed
        if source_attributes != [""]:
            self.active_attributes.extend(["src_{}".format(att) for att in source_attributes])
        if target_attributes != [""]:
            # pairwise comparison: the same target features appear once per
            # system output (tgt-1 vs tgt-2)
            self.active_attributes.extend(["tgt-1_{}".format(att) for att in target_attributes])
            self.active_attributes.extend(["tgt-2_{}".format(att) for att in target_attributes])

        if self.active_attributes == [""]:
            self.active_attributes = []
        self.discretization = False
        # has_key is Python 2 only ("discretization" in params elsewhere)
        if params.has_key("discretization"):
            self.discretization = params["discretization"]

        self.hidden_attributes = params["hidden_attributes"].split(",")
        self.discrete_attributes = params["discrete_attributes"].split(",")

        self.class_name = params["class_name"]
        self.class_type = params["class_type"]

        # filenames may contain {placeholders} resolved from params itself
        self.training_sets = params["training_sets"].format(**params).split(',')
        self.testset = params["test_set"].format(**params)
        self.ties = params["ties"]

    def iterate(self, params, rep, n):
        """
        Execute one pipeline step selected by the iteration counter ``n``
        (0, 10, 20, ... as scheduled by PyExperimentSuite) and return a dict
        of result metrics (populated only by the evaluation steps).

        Steps: 0 read training set, 10 read test set, 20/30 pairwise
        conversion, 40/50 (disabled diff features), 60/70 Orange .tab export,
        80 train classifier, 85/185 cross-validation, 90 classify test set,
        100 reconstruct rankings, 120 score correlations.
        """
        ret = {}

#        print "app", os.getcwd()
#        print "iteration", n

#        if n == 0:
#            import annotate_updated as annotate
#            from ruffus import pipeline_run
#            pipeline_run([annotate.analyze_external_features])

        if n == 0:
            print "fetch training set"
            # concatenate all configured training files into one dataset
            parallelsentences = []
            for training_set in self.training_sets:
                parallelsentences.extend(JcmlReader(training_set).get_parallelsentences())

            self.trainset = DataSet(parallelsentences)

            #TODO: alter training filters?

        if n == 10:
            print "fetch test set"
            # work on a local copy; self.testset switches from filename to DataSet
            shutil.copy(self.testset, "testset.jcml")
            self.testset = JcmlReader("testset.jcml").get_dataset()

        if n == 20:
            print "pairwise training set"

            self.trainset = AnalyticPairwiseDataset(
                self.trainset, include_references = self.include_references,
                replacement = self.replacement,
                filter_unassigned = self.filter_unassigned,
                restrict_ranks = self.restrict_ranks,
                invert_ranks = self.invert_ranks
            )

            if not self.ties:
                self.trainset.remove_ties()

            #SAVE
            Parallelsentence2Jcml(self.trainset).write_to_file("pairwise_trainset.jcml")

        if n == 30:
            print "pairwise testset"
            self.testset = AnalyticPairwiseDataset(self.testset, replacement = self.replacement, invert_ranks = self.invert_ranks)

        if n == 40:
            # difference-feature generation for the trainset is disabled
            #print "add difference features : trainset"
            #parallelsentences = self.trainset.get_parallelsentences()
            #parallelsentences = DiffGenerator().add_features_batch(parallelsentences)
            #Parallelsentence2Jcml(parallelsentences).write_to_file("diff_trainset.jcml")
            pass

        if n == 50:
            # difference-feature generation for the testset is disabled, but
            # the filename is still needed by step 100 and save/restore_state
            #print "add difference feature : testset"
            self.pairwise_test_filename = "pairwise_testset.jcml"

            #parallelsentences = self.testset.get_parallelsentences()
            #parallelsentences = DiffGenerator().add_features_batch(parallelsentences)
            #Parallelsentence2Jcml(parallelsentences).write_to_file(self.pairwise_test_filename)
            pass

        if n == 60:
            print "produce orange trainset"

            input_file = "pairwise_trainset.jcml"
            self.trainset_orange_filename = "trainset.tab"

            # prefer node-local scratch space when available
            # (note: 'dir' shadows the builtin)
            if os.path.isdir("/local"):
                dir = "/local"
            else:
                dir = "."

            CElementTreeJcml2Orange(input_file,
                                    self.class_name,
                                    self.active_attributes,
                                    self.meta_attributes,
                                    self.trainset_orange_filename,
                                    compact_mode = True,
                                    discrete_attributes=self.discrete_attributes,
                                    hidden_attributes=self.hidden_attributes,
                                    get_nested_attributes=True,
                                    dir=dir,
                                    remove_infinite=self.remove_infinite,
                                    nullimputation=self.nullimputation,
                                    #filter_attributes={"rank" : "0"},
                                    # class_type=class_type
                                    ).convert()

        if n == 70:
            print "produce orange testset"

            input_file = "pairwise_testset.jcml"
            self.testset_orange_filename = "testset.tab"

            # same scratch-dir selection as for the trainset export
            if os.path.isdir("/local"):
                dir = "/local"
            else:
                dir = "."

            CElementTreeJcml2Orange(input_file,
                                    self.class_name,
                                    self.active_attributes,
                                    self.meta_attributes,
                                    self.testset_orange_filename,
                                    compact_mode = True,
                                    discrete_attributes=self.discrete_attributes,
                                    hidden_attributes=self.hidden_attributes,
                                    get_nested_attributes=True,
                                    dir=dir,
                                    remove_infinite=self.remove_infinite,
                                    nullimputation=self.nullimputation,
                                    #filter_attributes={"rank" : "0"},
                                    # class_type=class_type
                                    ).convert()

        if n == 80:
            print "train classifier"
            input_file = self.trainset_orange_filename
            self.output_file = "classifier.clsf"

            trainset = Table(input_file)

            mylearner = self.learner(**self.classifier_params)
            trained_classifier = mylearner(trainset)
            self.classifier = OrangeClassifier(trained_classifier)
            self.classifier.print_content()

        #give the possibility to calculate classification accuracy in the end
        if (n == 85 and not self.delay_accuracy) or (n == 185 and self.delay_accuracy):
            print "evaluate classifier with cross-fold validation"
            # 10-fold cross-validation on the training data with a fresh learner
            orangeData = Table(self.trainset_orange_filename)
            learner = self.learner(**self.classifier_params)
            cv = evaluation.testing.cross_validation([learner], orangeData, 10)
            ret["CA"] = evaluation.scoring.CA(cv)[0]
            ret["AUC"] = evaluation.scoring.AUC(cv)[0]

        if n == 90:
            print "test_classifier"

            input_file = self.testset_orange_filename
#            output_file = "classified.tab"

            print "performing classification"
            orangedata = Table(input_file)

            # each classification result is (predicted value, probability
            # distribution over the classes "-1" and "1")
            classified_set_vector = self.classifier.classify_orange_table(orangedata)
            self.classified_values_vector = [str(v[0]) for v in classified_set_vector]
            self.classified_probs_vector = [(v[1]["-1"], v[1]["1"]) for v in classified_set_vector]

#            print [str(v[1]["-1"]) for v in classified_set_vector]
#            print classified_set_vector

#        if n == 95:
#            print "accuracy over test set"
#            orangedata = Table(self.testset_orange_filename)
#            cv = evaluation.testing.default_evaluation([self.learner(**self.classifier_params)], orangedata)
#            ret["CA_test"] = evaluation.scoring.CA(cv)
#            ret["AUC_test"] = evaluation.scoring.AUC(cv)

        if n == 100:
            print "EVALUATION"
            print "reloading coupled test set"
            self.simple_testset = JcmlReader(self.pairwise_test_filename).get_dataset()

            print "reconstructing test set"
            # attach predictions back onto the pairwise sentences, one dict
            # per sentence, aligned by position with the classified vectors
            att_vector = [{"rank_predicted": v} for v in self.classified_values_vector]
            att_prob_neg = [{"prob_-1": v[0]} for v in self.classified_probs_vector]
            att_prob_pos = [{"prob_1": v[1]} for v in self.classified_probs_vector]
#            print att_vector

            print "adding guessed rank"
            self.simple_testset.add_attribute_vector(att_vector, "ps")
            self.simple_testset.add_attribute_vector(att_prob_neg, "ps")
            self.simple_testset.add_attribute_vector(att_prob_pos, "ps")

            Parallelsentence2Jcml(self.simple_testset).write_to_file("testset-pairwise-with-estranks.jcml")

            self.simple_testset = RawPairwiseDataset(cast=self.simple_testset) #this
#            self.simple_testset = CompactPairwiseDataset(self.simple_testset) #and this should have no effect

            # rebuild full rankings from the pairwise decisions: once from the
            # hard predicted labels, once from the soft class probabilities
            self.reconstructed_hard_testset = self.simple_testset.get_single_set_with_hard_ranks("rank_predicted", "rank_hard")
            self.reconstructed_soft_testset = self.simple_testset.get_single_set_with_soft_ranks("prob_-1", "prob_1", "rank_soft_predicted", "rank_soft")
            self.simple_testset = None

        if n == 120:
            print "Scoring correlation"
            print "ranks inverted ", self.evaluation_invert_ranks
            ret.update(score(self.reconstructed_hard_testset, self.class_name, "hard", "rank_hard", self.evaluation_invert_ranks))
            ret.update(score(self.reconstructed_soft_testset, self.class_name, "soft", "rank_soft", self.evaluation_invert_ranks))
            # alphabetically ordered metrics for stable reporting
            ret = OrderedDict(sorted(ret.items(), key=lambda t: t[0]))

        print ret
        return ret

    def save_state(self, params, rep, n):
        """
        Persist the intermediate artifact produced by pipeline step ``n`` so
        that restore_state() can resume a run from disk.
        """
        if n == 0:
            Parallelsentence2Jcml(self.trainset).write_to_file("trainset.jcml")
        if n == 20:
            Parallelsentence2Jcml(self.trainset).write_to_file("pairwise_trainset.jcml")
        if n == 30:
            Parallelsentence2Jcml(self.testset).write_to_file("pairwise_testset.jcml")
        if n == 40:
            pass
        if n == 50:
            Parallelsentence2Jcml(self.testset).write_to_file(self.pairwise_test_filename)

        if n == 80:
            # NOTE(review): binary pickle written via a text-mode handle;
            # works on POSIX with protocol 0 but is fragile — confirm.
            objectfile = open(self.output_file, 'w')
            pickle.dump(self.classifier.classifier, objectfile)
            objectfile.close()
        if n == 90:
            # hard predictions: one label per line
            classified_vector_file = open("classified.hard.txt", 'w')
            for value in self.classified_values_vector:
                classified_vector_file.write("{0}\n".format(value))

            classified_vector_file.close()
            # soft predictions: tab-separated probability pair per line
            classified_prob_file = open("classified.soft.txt", 'w')
            for value1, value2 in self.classified_probs_vector:
                classified_prob_file.write("{}\t{}\n".format(value1, value2))
            classified_prob_file.close()
        if n == 100:
#            Parallelsentence2Jcml(self.simple_testset).write_to_file("testset.classified.jcml")
            Parallelsentence2Jcml(self.reconstructed_hard_testset).write_to_file("testset.reconstructed.hard.jcml")
            Parallelsentence2Jcml(self.reconstructed_soft_testset).write_to_file("testset.reconstructed.soft.jcml")
#        if n == 110:
#            Parallelsentence2Jcml(self.reconstructed_hard_testset).write_to_file("testset.reconstructed.org.hard.jcml")
#            Parallelsentence2Jcml(self.reconstructed_soft_testset).write_to_file("testset.reconstructed.org.soft.jcml")

    def restore_state(self, params, rep, n):
        """
        Reload from disk whatever state the pipeline needs to resume at step
        ``n``; the overlapping ranges mirror which artifact each step reads.
        """
        self.class_name = "rank" #TODO: hardcoded
        if n > 0 and n <= 20:
            self.trainset = JcmlReader("trainset.jcml").get_dataset()

        if n > 10 and n <= 30:
            self.testset = JcmlReader("testset.jcml").get_dataset()

        if n > 20 and n <= 40:
            self.trainset = JcmlReader("pairwise_trainset.jcml").get_dataset()

        if n > 30 and n <= 50:
            self.testset = JcmlReader("pairwise_testset.jcml").get_dataset()

        if n > 50:
            self.pairwise_test_filename = "pairwise_testset.jcml"
            self.trainset_orange_filename = "trainset.tab"

        if n > 70:
            self.testset_orange_filename = "testset.tab"

        if n > 80 and n <= 90:
            objectfile = open("classifier.clsf", 'r')
            self.classifier = OrangeClassifier(pickle.load(objectfile))
            objectfile.close()
        if n > 90:

            # NOTE(review): readlines() keeps the trailing '\n' on every
            # label, and the tab-split prob pairs keep it too — iterate()
            # step 90 stores them without newlines, so restored and fresh
            # state differ; confirm downstream consumers strip whitespace.
            classified_vector_file = open("classified.hard.txt", 'r')
            self.classified_values_vector = classified_vector_file.readlines()
            classified_vector_file.close()
            classified_prob_file = open("classified.soft.txt", 'r')
            self.classified_probs_vector = [tuple(line.split('\t')) for line in classified_prob_file]
            classified_prob_file.close()
        if n > 100:
#            self.simple_testset = JcmlReader("testset.classified.jcml").get_dataset
            self.reconstructed_hard_testset = JcmlReader("testset.reconstructed.hard.jcml").get_dataset()
            self.reconstructed_soft_testset = JcmlReader("testset.reconstructed.soft.jcml").get_dataset()
#        if n == 10:
#            self.reconstructed_hard_testset = JcmlReader("testset.reconstructed.org.hard.jcml").get_dataset()
#            self.reconstructed_soft_testset = JcmlReader("testset.reconstructed.org.soft.jcml").get_dataset()
        ##############################

    def _get_testset(self, test_filename, mode = "", ratio=0.7):
        """
        Prepare testset.jcml: either split trainset.jcml into train/test
        portions, or copy the given test file into place.

        @param test_filename: path of an externally provided test set
        @param mode: "development" additionally shrinks the trainset to 3%
        @param ratio: train share of the split (default 0.7)
        """
        # NOTE(review): the condition looks inverted — when a test_filename
        # IS given this branch splits the trainset and ignores it, while an
        # empty filename is passed to shutil.copy; confirm against callers.
        if not test_filename == "":
            print "arbitrarily split given set to training and test sets 90% + 10%"
            simple_trainset = JcmlReader("trainset.jcml").get_dataset()

            if mode == "development":
                simple_trainset, a = simple_trainset.split(0.03)

            simple_trainset, simple_testset = simple_trainset.split(ratio)
            Parallelsentence2Jcml(simple_trainset).write_to_file("trainset.jcml")
            Parallelsentence2Jcml(simple_testset).write_to_file("testset.jcml")
        else:
            shutil.copy(test_filename, "testset.jcml")
def get_scoring(testset, class_name, xid, featurename):
    """
    Collect ranking-quality metrics comparing a predicted rank attribute
    against the gold class attribute of a reconstructed test set.

    @param testset: reconstructed dataset carrying both rank attributes
    @param class_name: name of the gold rank attribute
    @param xid: identifier ("hard"/"soft") mixed into every result key
    @param featurename: name of the predicted rank attribute
    @return: dict mapping metric names (suffixed/prefixed with xid) to scores
    """
    scorer = Scoring(testset)
    results = {}
    key_prefix = "{}-".format(xid)

    # Kendall tau in three tie-handling variants: default, not excluding
    # ties (-ntp), and not penalizing predicted ties (-nt).
    results.update(scorer.get_kendall_tau(featurename, class_name, prefix=key_prefix))
    results.update(scorer.get_kendall_tau(featurename, class_name, prefix=key_prefix, suffix="-ntp", exclude_ties=False))
    results.update(scorer.get_kendall_tau(featurename, class_name, prefix=key_prefix, suffix="-nt", penalize_predicted_ties=False))
#    results["mrr"] = scorer.mrr(featurename, class_name)

    tau_b, tau_b_pi = scorer.get_kendall_tau_b(featurename, class_name)
    results["kendalltau_b-%s" % xid] = tau_b
    results["kendalltau_b-%s-pi" % xid] = tau_b_pi

    acc_one, acc_any = scorer.selectbest_accuracy(featurename, class_name)
    results["b1-acc-1-%s" % xid] = acc_one
    results["b1-acc-%s-any" % xid] = acc_any

    results["fr-%s" % xid] = scorer.avg_first_ranked(featurename, class_name)
    results["pr-%s" % xid] = scorer.avg_predicted_ranked(featurename, class_name)

    # per-rank percentage of cases where the predicted best matches a human best
    for rank, percentage in scorer.best_predicted_vs_human(featurename, class_name).iteritems():
        results["sb-{}-{}".format(rank, xid)] = str(percentage)
    return results
449
def score(testset, class_name, xid, featurename, invert_ranks=False):
    """
    Compute the standard metric scores for one reconstructed ranking.

    @param testset: reconstructed dataset to evaluate
    @param class_name: name of the gold rank attribute
    @param xid: prefix added to every metric key (e.g. "hard", "soft")
    @param featurename: name of the predicted rank attribute
    @param invert_ranks: whether ranks should be inverted before scoring
    @return: dict of metric names to scores
    """
    evaluator = Scoring(testset, invert_ranks=invert_ranks)
    return evaluator.get_metrics_scores(featurename, class_name, prefix=xid)
class StreamToLogger(object):
    """
    Fake file-like stream object that redirects writes to a logger instance.

    Intended as a replacement for sys.stdout/sys.stderr so that print output
    ends up in the logging system, one log record per non-empty line.
    """

    def __init__(self, logger, log_level=logging.INFO):
        # destination logger and the severity used for every redirected line
        self.logger = logger
        self.log_level = log_level
        self.linebuf = ''

    def write(self, buf):
        """Emit one log record per line of the written chunk."""
        trimmed = buf.rstrip()
        if trimmed:
            for line in trimmed.splitlines():
                self.logger.log(self.log_level, line.rstrip())

    def flush(self):
        """No-op: nothing is buffered, present only for file-API compliance."""
        pass
469 470 if __name__ == '__main__': 471 FORMAT = "%(asctime)-15s [%(process)d:%(thread)d] %(message)s " 472 now = datetime.strftime(datetime.now(), "%Y-%m-%d_%H-%M-%S") 473 # logging.basicConfig(filename='autoranking-{}.log'.format(now),level=logging.DEBUG, format=FORMAT) 474 # sys.stderr = StreamToLogger(logging.getLogger('STDERR'), logging.INFO) 475 # sys.stdout = StreamToLogger(logging.getLogger('STDOUT'), logging.INFO) 476 mysuite = AutorankingSuite(); 477 mysuite.start() 478