ml.lib.orange

1 ''' 2 Created on 19 Apr 2013 3 4 @author: Eleftherios Avramidis 5 ''' 6 7 import cPickle as pickle 8 import sys 9 10 from dataprocessor.ce.cejcml2orange import CElementTreeJcml2Orange 11 #from ml.var import Classifier 12 13 from sentence.dataset import DataSet 14 from sentence.pairwisedataset import AnalyticPairwiseDataset 15 from sentence.pairwiseparallelsentenceset import CompactPairwiseParallelSentenceSet 16 17 from Orange.data import Table 18 from Orange.data import Instance, Value, Domain 19 #from Orange.evaluation.scoring import CA, Precision, Recall, F1 20 from Orange.evaluation.testing import cross_validation 21 from Orange.classification.rules import rule_to_string 22 from Orange.classification.svm import get_linear_svm_weights 23 from Orange.classification import logreg 24 25 #import Orange Learners 26 from Orange.classification.bayes import NaiveLearner 27 from Orange.classification.knn import kNNLearner 28 from Orange.classification.svm import SVMLearnerEasy as SVMEasyLearner 29 from Orange.classification.tree import TreeLearner 30 from Orange.classification.tree import C45Learner 31 from Orange.classification.logreg import LogRegLearner #,LibLinearLogRegLearner 32 from Orange.classification import Classifier 33 from Orange.feature import Continuous 34 35

def forname(name, **kwargs):
    """
    Resolve an orange learner class from its name and instantiate it.

    The name is evaluated as a Python expression inside this module's
    namespace; every keyword argument is handed over unchanged to the
    resolved class.
    @param name: the name of the learner class to be instantiated
    @type name: string
    @param kwargs: initialization parameters for the learner
    @return: the object obtained by calling the resolved class
    """
    # NOTE(review): eval() runs arbitrary expressions -- the name must only
    # ever come from trusted configuration, never from external input.
    learner_class = eval(name)
    learner = learner_class(**kwargs)
    return learner

47 48

def runtime_ranker_forname(name, **kwargs):
    """
    Instantiate the class named by a string and wrap the resulting object
    into an L{OrangeRuntimeRanker}.
    @param name: the name of the class to be instantiated
    @type name: string
    @param kwargs: initialization parameters for the named class
    @return: the ranker wrapping the new object
    @rtype: L{OrangeRuntimeRanker}
    """
    # NOTE(review): eval() runs arbitrary expressions -- the name must only
    # ever come from trusted configuration, never from external input.
    wrapped_class = eval(name)
    wrapped_object = wrapped_class(**kwargs)
    # NOTE(review): OrangeRuntimeRanker.__init__ in this module expects the
    # filename of a pickled classifier, whereas an instantiated object is
    # passed here -- confirm the intended usage.
    return OrangeRuntimeRanker(wrapped_object)

56 57

def parallelsentence_to_instance(domain, parallelsentence):
    """
    Receive a parallel sentence and convert it into a memory instance for
    the machine learner.

    Only the features listed in the domain are extracted, in the order of
    the domain. A feature that the sentence does not provide is reported on
    stderr and gets the value 0.
    @param domain: the feature description the instance has to follow
        (presumably an Orange.data.Domain; only its C{features} are read)
    @param parallelsentence: the sentence whose nested attributes provide
        the feature values
    @type parallelsentence: L{sentence.parallelsentence.ParallelSentence}
    @return: an orange instance built on a domain without class value
    @rtype: Orange.data.Instance
    """
    attributes = parallelsentence.get_nested_attributes()

    # features required by the model need to be retrieved from the
    # dict of attributes containing feature values for this sentence
    domain_features = domain.features

    values = []
    for feature in domain_features:
        feature_name = feature.name
        try:
            value = attributes[feature_name]
        except KeyError:
            sys.stderr.write("Feature '{}' not given by the enabled generators\n".format(feature_name))
            value = 0

        # calling the feature descriptor casts the raw value we produced
        # into an orange value object
        values.append(feature(value))

    # create a domain without the class value and use it for the new instance
    classless_domain = Domain(domain_features, False)
    return Instance(classless_domain, values)

93 94

def dataset_to_instances(domain, dataset):
    """
    Receive a dataset and convert it into a memory table for the machine learner
    @param domain: the feature description every instance has to follow
        (only its C{features} are read)
    @param dataset: an iterable of parallel sentences
    @return: a table holding one instance per parallel sentence of the dataset
    @rtype: Orange.data.Table
    """
    # convert every sentence (not only the last one) and hand the domain
    # over to the converter, which requires it as its first argument
    instances = [parallelsentence_to_instance(domain, parallelsentence)
                 for parallelsentence in dataset]

    # every converted instance carries its own class-less domain object, so
    # the table is built on an explicit class-less domain; this also keeps
    # the result well defined for an empty dataset
    classless_domain = Domain(domain.features, False)
    return Table(classless_domain, instances)

102 103 104

class OrangeRuntimeRanker:
    """
    This class represents a ranker implemented over pairwise orange classifiers.
    This ranker is loaded into the memory from a dump file which contains an already trained
    model and provides functions to rank one source sentence + translations at a time
    @ivar classifier: the orange classifier object
    @type classifier: Orange.classification.Classifier
    """

    def __init__(self, classifier_filename):
        """
        Load previously trained classifier given existing filename
        @param classifier_filename: the filename which contains the trained classifier
        @type classifier_filename: str
        """
        # NOTE(review): unpickling can execute arbitrary code -- only load
        # model files that come from a trusted source.
        # Pickles are binary data, hence the "rb" mode; the context manager
        # closes the file even if loading fails.
        with open(classifier_filename, "rb") as classifier_file:
            self.classifier = pickle.load(classifier_file)

    def _get_description(self, resultvector):
        """
        Compose a textual report about the model and its pairwise decisions
        @param resultvector: one entry per pairwise comparison, each a dict
            with the keys 'systems', 'value', 'instance' and 'distribution'
        @type resultvector: list of dict
        @return: the report, i.e. the logreg dump of the classifier, its
            domain and one paragraph per pairwise decision
        @rtype: str
        """
        output = []
        output.append("Used linear regression with Stepwise Feature Selection with the following weights")
        coefficients = logreg.dump(self.classifier)
        output.append(coefficients)

        output.append("\n\n")
        output.append("domain: {}\n\n".format(self.classifier.domain))

        for resultentry in resultvector:
            system_names = resultentry['systems']
            value = resultentry['value']
            instance = resultentry['instance']
            distribution = resultentry['distribution']

            # a predicted value of -1 is reported as "first < second",
            # anything else as "first > second"
            if value == -1:
                output.append("System{} < System{}".format(system_names[0], system_names[1]))
            else:
                output.append("System{} > System{}".format(system_names[0], system_names[1]))
            output.append(" \n instance: {} \n probabilities: {}\n".format(instance, distribution))
        return "".join(output)

    def rank_sentence(self, parallelsentence):
        """
        Receive a parallel sentence with features and perform ranking
        @param parallelsentence: an object containing the parallel sentence
        @type parallelsentence: L{sentence.parallelsentence.ParallelSentence}
        @return: a list of (rank, translation) tuples for the re-assembled
            sentence, followed by a textual description of the decisions
        @rtype: tuple(list, str)
        """
        # this will instruct orange to provide both binary decision and probability
        return_type = Classifier.GetBoth

        # follow the feature description as needed by the loaded classifier
        domain = self.classifier.domain

        resultvector = []

        # de-compose multiranked sentence into pairwise comparisons
        pairwise_parallelsentences = parallelsentence.get_pairwise_parallelsentences()

        # list that will hold the pairwise parallel sentences including the classifier's decision
        classified_pairwise_parallelsentences = []

        for pairwise_parallelsentence in pairwise_parallelsentences:
            # convert pairwise parallel sentence into an orange instance
            instance = parallelsentence_to_instance(domain, pairwise_parallelsentence)

            # run classifier for this instance
            value, distribution = self.classifier(instance, return_type)

            system_names = pairwise_parallelsentence.get_system_names()
            predicted_rank = float(value.value)

            sys.stderr.write("{}, {}, {}\n".format(system_names, value, distribution))

            resultvector.append({'systems': system_names,
                                 'value': predicted_rank,
                                 'distribution': distribution,
                                 'instance': instance})
            pairwise_parallelsentence.add_attributes({"rank_predicted": predicted_rank,
                                                      "prob_-1": distribution[0],
                                                      "prob_1": distribution[1]})

            classified_pairwise_parallelsentences.append(pairwise_parallelsentence)

        # gather all classified pairwise comparisons into one sentence again
        sentenceset = CompactPairwiseParallelSentenceSet(classified_pairwise_parallelsentences)
        ranked_sentence = sentenceset.get_multiranked_sentence("rank_predicted")

        result = [(t.get_attribute("rank"), t) for t in ranked_sentence.get_translations()]
        description = self._get_description(resultvector)
        return result, description

204 205 206 207 208 209 210 211 212 213 214

class OrangeClassifier(Classifier):
    '''
    Wrapper around an orange classifier object
    @ivar learner: the wrapped orange class
    @ivar training_data_filename: the converted (tab) training file
    @type training_data_filename: str
    @ivar training_table: an Orange "table" of examples containing training instances
    @type training_table: L{Orange.data.Table}
    @ivar model: the trained classifier
    @type model: Orange.classification.Classifier
    @ivar test_data_filename: the converted test file
    @type test_data_filename: str
    @ivar test_table: the Orange "table" of test examples
    @type test_table: L{Orange.data.Table}
    '''

    def __init__(self, learner, **kwargs):
        '''
        Constructor.
        @param learner: an orange learner class whose functionality is to be
            wrapped; it is instantiated with the given keyword arguments
        @param kwargs: initialization parameters for the learner
        '''
        self.learner = learner(**kwargs)
        self.datafile = None
        self.training_data_filename = None
        self.training_table = None
        self.model = None
        # the test-data attributes exist from the start, so that they can
        # be read before set_test_data/load_test_data have been called
        self.test_data_filename = None
        self.test_table = None

    def set_training_data(self, jcml_filename,
                          class_name,
                          desired_attributes,
                          meta_attributes,
                          **kwargs):
        '''
        Read the data from an XML file, convert them to the proper format
        and remember its location
        @param jcml_filename: full path of the XML file where data reside
        @type jcml_filename: string
        @param class_name: name of class
        @type class_name: string
        @param desired_attributes: desired attributes
        @type desired_attributes: list of strings
        @param meta_attributes: meta attributes
        @type meta_attributes: list of strings
        @param kwargs: further parameters passed on to the converter
        '''
        # derive the name of the converted file from the input name; it must
        # differ from the input, because it is handed to the converter as
        # its output path
        if jcml_filename.endswith(".jcml"):
            output_file = jcml_filename[:-len(".jcml")] + ".tab"
        else:
            output_file = jcml_filename + ".tab"

        convertor = CElementTreeJcml2Orange(jcml_filename,
                                            class_name,
                                            desired_attributes,
                                            meta_attributes,
                                            output_file,
                                            compact_mode=True,
                                            **kwargs)

        convertor.convert()
        self.training_data_filename = output_file

    def load_training_data(self):
        '''
        Load the previously defined/converted training data in place
        '''
        self.training_table = Table(self.training_data_filename)

    def unload_training_data(self):
        '''
        Free up the memory occupied by the training data
        '''
        self.training_table = None

    def cross_validation_scores(self, folds=10):
        '''
        Perform cross validation on the training data.
        @param folds: number of cross-validation folds
        @type folds: int
        @return: the classification accuracy as computed by Orange's CA
            scorer over the cross-validation results
        '''
        # imported locally, because the module-level import of the scoring
        # functions is commented out
        from Orange.evaluation.scoring import CA

        cv = cross_validation([self.learner], self.training_table, folds)
        return CA(cv)

    def train(self):
        '''
        Train the wrapped learner on the loaded training table and dump the
        resulting model next to the training data, with the suffix ".clsf"
        instead of ".tab"
        '''
        self.model = self.learner(self.training_table)
        objectfile = self.training_data_filename.replace(".tab", ".clsf")
        # pickle.dump needs an open (binary) file object, not a filename
        with open(objectfile, "wb") as model_file:
            pickle.dump(self.model, model_file)

    # The following are algorithm-specific functions to write down details
    # about the produced model. They are best-effort: a model of another
    # kind simply does not produce the respective description.

    def _write_model_svm(self, basename):
        '''
        Write the fitted parameters and the linear SVM weights of the model
        into "<basename>.weights.txt"
        @param basename: prefix of the file to be written
        @type basename: string
        @return: True if the description was written, False otherwise
        @rtype: bool
        '''
        try:
            weights = get_linear_svm_weights(self.model)
            fitted_parameters = self.model.fitted_parameters
            textfilename = "{}.weights.txt".format(basename)
            with open(textfilename, "w") as f:
                f.write("Fitted parameters: \nnu = {0}\ngamma = {1}\n\nWeights: \n".format(fitted_parameters[0], fitted_parameters[1]))
                for weight_name, weight_value in weights.items():
                    f.write("{0}\t{1}\n".format(weight_name, weight_value))
            return True
        except Exception:
            return False

    def _write_model_rules(self, basename):
        '''
        Write the rules of the model, one per line, into "<basename>.rules.txt"
        @param basename: prefix of the file to be written
        @type basename: string
        '''
        try:
            rules = self.model.rules
            textfilename = "{}.rules.txt".format(basename)
            with open(textfilename, "w") as f:
                for r in rules:
                    f.write("{}\n".format(rule_to_string(r)))
        except Exception:
            # the model provides no rules or the file could not be written
            pass

    def _write_model_tree(self, basename):
        '''
        Write the tree of the model as text into "<basename>.tree.txt" and
        as a graph into "<basename>.tree.dot"
        @param basename: prefix of the files to be written
        @type basename: string
        '''
        try:
            textfilename = "{}.tree.txt".format(basename)
            with open(textfilename, "w") as f:
                f.write(self.model.to_string("leaf", "node"))

            graphics_filename = "{}.tree.dot".format(basename)
            self.model.dot(graphics_filename, "leaf", "node")
        except Exception:
            # the model is not a tree or the files could not be written
            pass

    def write_model_description(self, basename):
        '''
        Method-specific functions for writing the model characteristics into a file
        @param basename: specify part of the filename which will be written
        @type basename: string
        '''
        # the helpers need the basename in order to name their files
        self._write_model_svm(basename)
        self._write_model_rules(basename)
        # NOTE(review): _write_model_tree is not invoked here -- confirm
        # whether this is intended.

        try:
            textfilename = "{}.logreg.dump.txt".format(basename)
            with open(textfilename, 'w') as f:
                f.write(logreg.dump(self.model))
        except Exception:
            # the model cannot be dumped by logreg or the file could not be written
            pass

    def set_test_data(self, jcml_filename,
                      class_name,
                      desired_attributes,
                      meta_attributes,
                      output_file,
                      **kwargs):
        '''
        Read the data from an XML file, convert them to the proper format
        and remember its location
        @param jcml_filename: full path of the XML file where data reside
        @type jcml_filename: string
        @param class_name: name of class
        @type class_name: string
        @param desired_attributes: desired attributes
        @type desired_attributes: list of strings
        @param meta_attributes: meta attributes
        @type meta_attributes: list of strings
        @param output_file: path of the converted file to be produced
        @type output_file: string
        @param kwargs: further parameters passed on to the converter
        '''
        convertor = CElementTreeJcml2Orange(jcml_filename,
                                            class_name,
                                            desired_attributes,
                                            meta_attributes,
                                            output_file,
                                            compact_mode=True,
                                            **kwargs)

        convertor.convert()
        self.test_data_filename = output_file

    def load_test_data(self):
        '''
        Load the previously defined/converted test data in place
        '''
        self.test_table = Table(self.test_data_filename)

    def unload_test_data(self):
        '''
        Free up the memory occupied by the test data
        '''
        self.test_table = None

    def unload(self):
        '''
        Free up the memory occupied by the training data, the test data
        and the trained model
        '''
        self.unload_training_data()
        self.unload_test_data()
        self.model = None

411

Source Code for Module ml.lib.orange