
Source Code for Module app.autoranking.decode_batch

'''
Created on 07 Mar 2012
@author: Eleftherios Avramidis
'''
import logging
import copy
from collections import OrderedDict
from Orange.regression.linear import LinearRegressionLearner
from Orange.regression.pls import PLSRegressionLearner
from Orange.regression.lasso import LassoRegressionLearner
from Orange.regression.earth import EarthLearner
from Orange.regression.tree import TreeLearner
from Orange.classification.rules import CN2Learner, CN2UnorderedLearner, CN2SDUnorderedLearner, CN2EVCUnorderedLearner
from Orange import feature

from Orange.classification.bayes import NaiveLearner
from Orange.classification.knn import kNNLearner
#from Orange.classification.svm import SVMLearnerEasy as SVMEasyLearner
from Orange.classification.svm import SVMLearnerEasy as SVMEasyLearner
from Orange.classification.tree import TreeLearner
from Orange.classification.tree import C45Learner
from Orange.classification.logreg import LogRegLearner
from Orange import evaluation

from dataprocessor.input.jcmlreader import JcmlReader
#from sentence.coupleddataset import CoupledDataSet, OrangeCoupledDataSet, CoupledDataSetDisk
from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml
from dataprocessor.sax.saxjcml2orange import SaxJcml2Orange
from dataprocessor.ce.cejcml import CEJcmlReader
from dataprocessor.ce.cejcml2orange import CElementTreeJcml2Orange
from dataprocessor.output.wmt11tabwriter import Wmt11TabWriter
from classifier.classifier import OrangeClassifier
from Orange.data import Table
from datetime import datetime
from copy import deepcopy

from featuregenerator.diff_generator import DiffGenerator
from sentence.pairwisedataset import AnalyticPairwiseDataset, CompactPairwiseDataset, RawPairwiseDataset
from sentence.dataset import DataSet
from sentence.scoring import Scoring

import time

import random
import sys
import shutil
import pickle
import os

from expsuite import PyExperimentSuite


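# AutorankingSuite implements the test-time (decoding) half of the automatic ranking
# experiments on top of PyExperimentSuite: it loads a previously trained pairwise
# classifier, converts the test set into pairwise Orange format, classifies every
# pairwise comparison and then reconstructs, exports and scores full sentence-level
# rankings (a "hard" and a "soft" variant).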
class AutorankingSuite(PyExperimentSuite):
    restore_supported = True

    def reset(self, params, rep):
        self.restore_supported = True

        self.remove_infinite = False

        self.meta_attributes = params["meta_attributes"].split(",")
        self.include_references = params.setdefault("include_references", False)
        self.replacement = params.setdefault("replacement", True)
        self.filter_unassigned = params.setdefault("filter_unassigned", False)
        self.restrict_ranks = params.setdefault("restrict_ranks", [])

        self.delay_accuracy = params.setdefault("delay_accuracy", False)
        self.remove_infinite = params.setdefault("remove_infinite", False)

        if self.restrict_ranks:
            self.restrict_ranks = self.restrict_ranks.split(",")

        source_attributes = params["{}_source".format(params["att"])].split(",")
        target_attributes = params["{}_target".format(params["att"])].split(",")
        general_attributes = params["{}_general".format(params["att"])].split(",")

        params["source_attributes"] = source_attributes
        params["target_attributes"] = target_attributes
        params["general_attributes"] = general_attributes

        self.active_attributes = []
        if general_attributes != [""]:
            self.active_attributes.extend(general_attributes) #TODO: check whether ps prefix is needed
        if source_attributes != [""]:
            self.active_attributes.extend(["src_{}".format(att) for att in source_attributes])
        if target_attributes != [""]:
            self.active_attributes.extend(["tgt-1_{}".format(att) for att in target_attributes])
            self.active_attributes.extend(["tgt-2_{}".format(att) for att in target_attributes])

        if self.active_attributes == [""]:
            self.active_attributes = []

        self.discretization = False
        if params.has_key("discretization"):
            self.discretization = params["discretization"]

        self.hidden_attributes = params["hidden_attributes"].split(",")
        self.discrete_attributes = params["discrete_attributes"].split(",")

        self.class_name = params["class_name"]
        self.class_type = params["class_type"]

        self.testset = params["test_set"].format(**params)
        self.ties = params["ties"]

        objectfile = open(params["trained_classifier"], 'rb')
        self.classifier = OrangeClassifier(pickle.load(objectfile))
        objectfile.close()
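
    # For illustration only: reset() reads its configuration from the PyExperimentSuite
    # params dictionary. A hypothetical experiments.cfg entry supplying the keys read
    # above could look roughly as follows; every name and value below is an assumption,
    # not taken from this module, and the usual expsuite bookkeeping keys (path,
    # repetitions, iterations) are needed as well:
    #
    #   [decode_batch_example]
    #   att = myfeatureset
    #   myfeatureset_general = length_ratio
    #   myfeatureset_source = length
    #   myfeatureset_target = length,unknown_words
    #   meta_attributes = testset,langsrc,langtgt,id
    #   hidden_attributes =
    #   discrete_attributes =
    #   class_name = rank
    #   class_type = discrete
    #   ties = False
    #   test_set = /path/to/{att}_testset.jcml
    #   trained_classifier = /path/to/classifier.pickle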

    def iterate(self, params, rep, n):
        ret = {}

        if n == 10:
            print "fetch test set"
            shutil.copy(self.testset, "testset.jcml")
            self.testset = JcmlReader("testset.jcml").get_dataset()

        if n == 30:
            print "pairwise testset"
            self.testset = AnalyticPairwiseDataset(self.testset, replacement=self.replacement, rankless=True)

        if n == 50:
            #print "add difference feature : testset"
            self.pairwise_test_filename = "pairwise_testset.jcml"

            #parallelsentences = self.testset.get_parallelsentences()
            #parallelsentences = DiffGenerator().add_features_batch(parallelsentences)
            #Parallelsentence2Jcml(parallelsentences).write_to_file(self.pairwise_test_filename)

        if n == 70:
            print "produce orange testset"

            input_file = "pairwise_testset.jcml"
            self.testset_orange_filename = "testset.tab"

            if os.path.isdir("/local"):
                dir = "/local"
            else:
                dir = "."

            CElementTreeJcml2Orange(input_file,
                                    self.class_name,
                                    self.active_attributes,
                                    self.meta_attributes,
                                    self.testset_orange_filename,
                                    compact_mode=True,
                                    discrete_attributes=self.discrete_attributes,
                                    hidden_attributes=self.hidden_attributes,
                                    get_nested_attributes=True,
                                    dir=dir,
                                    remove_infinite=self.remove_infinite
                                    #filter_attributes={"rank": "0"},
                                    #class_type=class_type
                                    ).convert()

        if n == 90:
            print "test_classifier"
            input_file = self.testset_orange_filename
            #output_file = "classified.tab"

            print "performing classification"
            orangedata = Table(input_file)

            classified_set_vector = self.classifier.classify_orange_table(orangedata)

            self.classified_values_vector = [str(v[0]) for v in classified_set_vector]
            self.classified_probs_vector = [(v[1]["-1"], v[1]["1"]) for v in classified_set_vector]

        if n == 100:
            print "reloading coupled test set"
            self.simple_testset = CEJcmlReader(self.pairwise_test_filename).get_dataset()
            Parallelsentence2Jcml(self.simple_testset).write_to_file("testset-pairwise.reloaded.debug.jcml")

            print "reconstructing test set"
            att_vector = [{"rank_predicted": v} for v in self.classified_values_vector]
            att_prob_neg = [{"prob_-1": v[0]} for v in self.classified_probs_vector]
            att_prob_pos = [{"prob_1": v[1]} for v in self.classified_probs_vector]
            #print att_vector

            print "adding guessed rank"
            self.simple_testset.add_attribute_vector(att_vector, "ps")
            self.simple_testset.add_attribute_vector(att_prob_neg, "ps")
            self.simple_testset.add_attribute_vector(att_prob_pos, "ps")

            Parallelsentence2Jcml(self.simple_testset).write_to_file("testset-pairwise-with-estranks.jcml")

            self.simple_testset = RawPairwiseDataset(cast=self.simple_testset) #this
            #self.simple_testset = CompactPairwiseDataset(self.simple_testset) #and this should have no effect

            reconstructed_hard_testset = self.simple_testset.get_single_set_with_hard_ranks("rank_predicted", "rank_hard")
            reconstructed_soft_testset = self.simple_testset.get_single_set_with_soft_ranks("prob_-1", "prob_1", "rank_soft_predicted", "rank_soft")

            Parallelsentence2Jcml(reconstructed_hard_testset).write_to_file("reconstructed.hard.light.jcml")
            Parallelsentence2Jcml(reconstructed_soft_testset).write_to_file("reconstructed.soft.light.jcml")

            self.testset = JcmlReader("testset.jcml").get_dataset()
            self.final_reconstructed_hard = deepcopy(self.testset)
            self.final_reconstructed_hard.import_target_attributes_onsystem(reconstructed_hard_testset, ["rank_hard"], ['langsrc', 'id', 'langtgt'], [], ['rank', 'system'])
            self.final_reconstructed_soft = deepcopy(self.testset)
            self.final_reconstructed_soft.import_target_attributes_onsystem(reconstructed_soft_testset, ["rank_soft"], ['langsrc', 'id', 'langtgt'], [], ['rank', 'system'])

            self.simple_testset = None

        if n == 110:
            print "Exporting results"
            writer = Wmt11TabWriter(self.final_reconstructed_soft, "dfki_{}".format(params["att"]), "testset", "rank_soft")
            writer.write_to_file("ranked.soft.tab")

            writer = Wmt11TabWriter(self.final_reconstructed_hard, "dfki_{}".format(params["att"]), "testset", "rank_hard")
            writer.write_to_file("ranked.hard.tab")

        if n == 120:
            print "Scoring correlation"
            ret.update(score(self.final_reconstructed_soft, self.class_name, "soft", "rank_soft"))
            ret = OrderedDict(sorted(ret.items(), key=lambda t: t[0]))

            print ret

        return ret
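
    # Step numbers used by iterate() (and mirrored in save_state()/restore_state()):
    #   n == 10   fetch the test set and parse it from JCML
    #   n == 30   expand it into an analytic pairwise dataset
    #   n == 50   pairwise diff features (currently commented out)
    #   n == 70   convert the pairwise test set into an Orange .tab file
    #   n == 90   classify every pairwise example with the trained classifier
    #   n == 100  re-attach the predictions and reconstruct hard/soft rankings
    #   n == 110  export the rankings in WMT11 tab format
    #   n == 120  score the soft ranking against the gold ranks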

    def save_state(self, params, rep, n):
        if n == 30:
            Parallelsentence2Jcml(self.testset).write_to_file("pairwise_testset.jcml")
        if n == 50:
            #Parallelsentence2Jcml(self.testset).write_to_file(self.pairwise_test_filename)
            pass

        if n == 90:
            classified_vector_file = open("classified.hard.txt", 'w')
            for value in self.classified_values_vector:
                classified_vector_file.write("{0}\n".format(value))
            classified_vector_file.close()

            classified_prob_file = open("classified.soft.txt", 'w')
            for value1, value2 in self.classified_probs_vector:
                classified_prob_file.write("{}\t{}\n".format(value1, value2))
            classified_prob_file.close()

        if n == 100:
            Parallelsentence2Jcml(self.final_reconstructed_hard).write_to_file("testset.reconstructed.hard.jcml")
            Parallelsentence2Jcml(self.final_reconstructed_soft).write_to_file("testset.reconstructed.soft.jcml")

        #if n == 110:
        #    Parallelsentence2Jcml(self.reconstructed_hard_testset).write_to_file("testset.reconstructed.org.hard.jcml")
        #    Parallelsentence2Jcml(self.reconstructed_soft_testset).write_to_file("testset.reconstructed.org.soft.jcml")

    def restore_state(self, params, rep, n):
        self.class_name = "rank" #TODO: hardcoded

        if n > 10 and n <= 30:
            self.testset = JcmlReader("testset.jcml").get_dataset()

        if n > 30 and n <= 50:
            #self.testset = JcmlReader("pairwise_testset.jcml").get_dataset()
            pass
        if n > 50:
            self.pairwise_test_filename = "pairwise_testset.jcml"

        if n > 70:
            self.testset_orange_filename = "testset.tab"

        if n > 90:
            classified_vector_file = open("classified.hard.txt", 'r')
            self.classified_values_vector = [int(line.strip()) for line in classified_vector_file]
            classified_vector_file.close()

            classified_prob_file = open("classified.soft.txt", 'r')
            self.classified_probs_vector = [tuple(line.strip().split('\t')) for line in classified_prob_file]
            self.classified_probs_vector = [(float(a), float(b)) for a, b in self.classified_probs_vector]
            classified_prob_file.close()

        if n > 100:
            pass
            #self.reconstructed_hard_testset = JcmlReader("testset.reconstructed.hard.jcml").get_dataset()
            #self.reconstructed_soft_testset = JcmlReader("testset.reconstructed.soft.jcml").get_dataset()

        #if n == 10:
        #    self.reconstructed_hard_testset = JcmlReader("testset.reconstructed.org.hard.jcml").get_dataset()
        #    self.reconstructed_soft_testset = JcmlReader("testset.reconstructed.org.soft.jcml").get_dataset()
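
    # Checkpointing: save_state() persists the intermediate artefacts of each step
    # (pairwise_testset.jcml, classified.hard.txt / classified.soft.txt and the
    # reconstructed JCML files) and restore_state() reloads exactly those files,
    # which is what allows PyExperimentSuite to resume an interrupted run
    # (restore_supported = True above).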

    def _get_testset(self, test_filename, mode="", ratio=0.7):
        if test_filename == "":
            # no test file given: arbitrarily split the given training set into
            # training and test portions according to the given ratio
            print "arbitrarily split given set to training and test sets"
            simple_trainset = JcmlReader("trainset.jcml").get_dataset()

            if mode == "development":
                simple_trainset, a = simple_trainset.split(0.03)

            simple_trainset, simple_testset = simple_trainset.split(ratio)
            Parallelsentence2Jcml(simple_trainset).write_to_file("trainset.jcml")
            Parallelsentence2Jcml(simple_testset).write_to_file("testset.jcml")
        else:
            # a test file was given: simply copy it into place
            shutil.copy(test_filename, "testset.jcml")

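# Note: _get_testset() prepares trainset.jcml/testset.jcml but is not called from the
# iterate() steps above; in this module the test set location comes directly from the
# "test_set" parameter handled in reset().
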
def get_scoring(testset, class_name, xid, featurename):
    scoringset = Scoring(testset)
    ret = {}
    ret.update(scoringset.get_kendall_tau(featurename, class_name, prefix="{}-".format(xid)))
    ret.update(scoringset.get_kendall_tau(featurename, class_name, prefix="{}-".format(xid), suffix="-ntp", exclude_ties=False))
    ret.update(scoringset.get_kendall_tau(featurename, class_name, prefix="{}-".format(xid), suffix="-nt", penalize_predicted_ties=False))
    #ret["mrr"] = scoringset.mrr(featurename, class_name)
    ret["kendalltau_b-%s" % xid], ret["kendalltau_b-%s-pi" % xid] = scoringset.get_kendall_tau_b(featurename, class_name)
    ret["b1-acc-1-%s" % xid], ret["b1-acc-%s-any" % xid] = scoringset.selectbest_accuracy(featurename, class_name)
    ret["fr-%s" % xid] = scoringset.avg_first_ranked(featurename, class_name)
    ret["pr-%s" % xid] = scoringset.avg_predicted_ranked(featurename, class_name)

    sb_percentages = scoringset.best_predicted_vs_human(featurename, class_name)
    for rank, percentage in sb_percentages.iteritems():
        ret["sb-{}-{}".format(rank, xid)] = str(percentage)
    return ret

def score(testset, class_name, xid, featurename):
    scoringset = Scoring(testset)
    return scoringset.get_metrics_scores(featurename, class_name, prefix=xid)

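# score() is the scorer actually used in step n == 120 above and delegates to
# Scoring.get_metrics_scores(); get_scoring() computes a larger set of ranking metrics
# (Kendall tau variants, select-best accuracy, average first/predicted rank) but is
# not invoked anywhere in this module.
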
class StreamToLogger(object):
    """
    Fake file-like stream object that redirects writes to a logger instance.
    """
    def __init__(self, logger, log_level=logging.INFO):
        self.logger = logger
        self.log_level = log_level
        self.linebuf = ''

    def write(self, buf):
        for line in buf.rstrip().splitlines():
            self.logger.log(self.log_level, line.rstrip())

    def flush(self):
        pass

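# StreamToLogger can be used (see the commented-out lines in the __main__ block below)
# to redirect sys.stdout/sys.stderr into the logging module, so that the print output
# of the pipeline steps ends up in a log file instead of the console.
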
if __name__ == '__main__':
    FORMAT = "%(asctime)-15s [%(process)d:%(thread)d] %(message)s "
    now = datetime.strftime(datetime.now(), "%Y-%m-%d_%H-%M-%S")
    #logging.basicConfig(filename='autoranking-{}.log'.format(now), level=logging.DEBUG, format=FORMAT)
    #sys.stderr = StreamToLogger(logging.getLogger('STDERR'), logging.INFO)
    #sys.stdout = StreamToLogger(logging.getLogger('STDOUT'), logging.INFO)
    mysuite = AutorankingSuite()
    mysuite.start()
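
# A minimal sketch of how such a suite is typically launched, assuming the standard
# expsuite (PyExperimentSuite) conventions of an experiments.cfg file in the working
# directory; the exact command-line flags depend on the installed expsuite version:
#
#   python decode_batch.py                 # run the experiments defined in experiments.cfg
#   python decode_batch.py -c other.cfg    # use an alternative configuration file
#
# start() parses the command line, expands the configured parameter combinations and
# calls reset()/iterate() for every repetition, using save_state()/restore_state() to
# resume interrupted runs.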