1 '''
2 Created on 07 Mar 2012
3 @author: Eleftherios Avramidis
4 '''
5 import logging
6 import copy
7 from collections import OrderedDict
8 from Orange.regression.linear import LinearRegressionLearner
9 from Orange.regression.pls import PLSRegressionLearner
10 from Orange.regression.lasso import LassoRegressionLearner
11 from Orange.regression.earth import EarthLearner
12 from Orange.regression.tree import TreeLearner
13 from Orange.classification.rules import CN2Learner, CN2UnorderedLearner, CN2SDUnorderedLearner, CN2EVCUnorderedLearner
14 from Orange import feature
15
16 from Orange.classification.bayes import NaiveLearner
17 from Orange.classification.knn import kNNLearner
18
19 from Orange.classification.svm import SVMLearnerEasy as SVMEasyLearner
20 from Orange.classification.tree import TreeLearner
21 from Orange.classification.tree import C45Learner
22 from Orange.classification.logreg import LogRegLearner
23 from Orange import evaluation
24
25 from dataprocessor.input.jcmlreader import JcmlReader
26
27 from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml
28 from dataprocessor.sax.saxjcml2orange import SaxJcml2Orange
29 from dataprocessor.ce.cejcml import CEJcmlReader
30 from dataprocessor.ce.cejcml2orange import CElementTreeJcml2Orange
31 from dataprocessor.output.wmt11tabwriter import Wmt11TabWriter
32 from classifier.classifier import OrangeClassifier
33 from Orange.data import Table
34 from datetime import datetime
35 from copy import deepcopy
36
37 from featuregenerator.diff_generator import DiffGenerator
38 from sentence.pairwisedataset import AnalyticPairwiseDataset, CompactPairwiseDataset, RawPairwiseDataset
39 from sentence.dataset import DataSet
40 from sentence.scoring import Scoring
41
42 import time
43
44
45 import random
46 import sys
47 import shutil
48 import pickle
49 import os
50
51 from expsuite import PyExperimentSuite
52
53
54
# Presumably read by PyExperimentSuite (expsuite) to decide whether an
# interrupted run may be resumed via restore_state() -- confirm against
# the expsuite API; it is also (re)set on the instance in reset().
restore_supported = True
57
def reset(self, params, rep):
    """Initialize one experiment repetition from the expsuite parameters.

    Parses the attribute configuration, resolves the test-set location and
    loads the pre-trained pairwise classifier, storing everything on the
    instance for use by iterate().

    @param params: parameter dict supplied by PyExperimentSuite
    @param rep: repetition index (required by the expsuite API; unused here)
    """
    self.restore_supported = True
    self.remove_infinite = False

    self.meta_attributes = params["meta_attributes"].split(",")
    self.include_references = params.setdefault("include_references", False)
    self.replacement = params.setdefault("replacement", True)
    self.filter_unassigned = params.setdefault("filter_unassigned", False)
    self.restrict_ranks = params.setdefault("restrict_ranks", [])

    self.delay_accuracy = params.setdefault("delay_accuracy", False)
    self.remove_infinite = params.setdefault("remove_infinite", False)

    if self.restrict_ranks:
        self.restrict_ranks = self.restrict_ranks.split(",")

    # Feature lists are selected per attribute-set key: params["att"] names
    # the prefix of the "<att>_source"/"<att>_target"/"<att>_general" entries.
    source_attributes = params["{}_source".format(params["att"])].split(",")
    target_attributes = params["{}_target".format(params["att"])].split(",")
    general_attributes = params["{}_general".format(params["att"])].split(",")

    params["source_attributes"] = source_attributes
    params["target_attributes"] = target_attributes
    params["general_attributes"] = general_attributes

    # Build the flat list of active features; source/target features are
    # prefixed so the two compared translations remain distinguishable.
    # An empty config entry splits to [""], which must contribute nothing.
    self.active_attributes = []
    if general_attributes != [""]:
        self.active_attributes.extend(general_attributes)
    if source_attributes != [""]:
        self.active_attributes.extend(["src_{}".format(att) for att in source_attributes])
    if target_attributes != [""]:
        self.active_attributes.extend(["tgt-1_{}".format(att) for att in target_attributes])
        self.active_attributes.extend(["tgt-2_{}".format(att) for att in target_attributes])

    if self.active_attributes == [""]:
        self.active_attributes = []

    # dict.get() replaces the Python-2-only has_key() check; same behavior.
    self.discretization = params.get("discretization", False)

    self.hidden_attributes = params["hidden_attributes"].split(",")
    self.discrete_attributes = params["discrete_attributes"].split(",")

    self.class_name = params["class_name"]
    self.class_type = params["class_type"]

    self.testset = params["test_set"].format(**params)
    self.ties = params["ties"]

    # BUGFIX: pickled data must be read in binary mode ('rb'); text mode can
    # corrupt the stream. The with-statement also guarantees the handle is
    # closed even if unpickling raises.
    with open(params["trained_classifier"], 'rb') as objectfile:
        self.classifier = OrangeClassifier(pickle.load(objectfile))
110
def iterate(self, params, rep, n):
    """Run one stage of the ranking-evaluation pipeline.

    PyExperimentSuite calls this repeatedly with increasing *n*; each
    milestone value performs one stage:
    10 fetch test set -> 30 pairwise expansion -> 50 name pairwise file ->
    70 export Orange table -> 90 classify -> 100 reconstruct rankings ->
    110 export WMT11 tables -> 120 score.

    @return: dict of scores (populated only at the n == 120 stage)
    """
    ret = {}

    if n == 10:
        # Copy the configured test set locally and parse it.
        print "fetch test set"
        shutil.copy(self.testset, "testset.jcml")
        self.testset = JcmlReader("testset.jcml").get_dataset()

    if n == 30:
        # Expand every multi-system sentence into pairwise comparisons.
        print "pairwise testset"
        self.testset = AnalyticPairwiseDataset(self.testset, replacement = self.replacement, rankless=True)

    if n == 50:
        # Only the filename is fixed here; the actual serialization happens
        # in the (separately shown) save_state stage for n == 30.
        self.pairwise_test_filename = "pairwise_testset.jcml"

    if n == 70:
        # Convert the pairwise JCML file into Orange's tab-separated format.
        print "produce orange testset"

        input_file = "pairwise_testset.jcml"
        self.testset_orange_filename = "testset.tab"

        # Prefer fast node-local scratch space when available.
        # NOTE: 'dir' shadows the builtin; kept as-is (doc-only edit).
        if os.path.isdir("/local"):
            dir = "/local"
        else:
            dir = "."

        CElementTreeJcml2Orange(input_file,
                                self.class_name,
                                self.active_attributes,
                                self.meta_attributes,
                                self.testset_orange_filename,
                                compact_mode = True,
                                discrete_attributes=self.discrete_attributes,
                                hidden_attributes=self.hidden_attributes,
                                get_nested_attributes=True,
                                dir=dir,
                                remove_infinite=self.remove_infinite
                                ).convert()

    if n == 90:
        # Classify each pairwise instance with the pre-trained classifier.
        print "test_classifier"
        input_file = self.testset_orange_filename

        print "performing classification"
        orangedata = Table(input_file)

        classified_set_vector = self.classifier.classify_orange_table(orangedata)

        # Keep the predicted label ("-1"/"1") and both class probabilities
        # per instance for the hard/soft reconstructions below.
        self.classified_values_vector = [str(v[0]) for v in classified_set_vector]
        self.classified_probs_vector = [(v[1]["-1"], v[1]["1"]) for v in classified_set_vector]

    if n == 100:
        # Re-read the pairwise set and attach the predictions to it.
        print "reloading coupled test set"
        self.simple_testset = CEJcmlReader(self.pairwise_test_filename).get_dataset()
        Parallelsentence2Jcml(self.simple_testset).write_to_file("testset-pairwise.reloaded.debug.jcml")

        print "reconstructing test set"
        # One attribute dict per pairwise sentence, in classifier output order.
        att_vector = [{"rank_predicted": v} for v in self.classified_values_vector]
        att_prob_neg = [{"prob_-1": v[0]} for v in self.classified_probs_vector]
        att_prob_pos = [{"prob_1": v[1]} for v in self.classified_probs_vector]

        print "adding guessed rank"
        self.simple_testset.add_attribute_vector(att_vector, "ps")
        self.simple_testset.add_attribute_vector(att_prob_neg, "ps")
        self.simple_testset.add_attribute_vector(att_prob_pos, "ps")

        Parallelsentence2Jcml(self.simple_testset).write_to_file("testset-pairwise-with-estranks.jcml")

        self.simple_testset = RawPairwiseDataset(cast=self.simple_testset)

        # Collapse pairwise decisions back into full per-sentence rankings:
        # "hard" uses the predicted labels, "soft" the class probabilities.
        reconstructed_hard_testset = self.simple_testset.get_single_set_with_hard_ranks("rank_predicted", "rank_hard")
        reconstructed_soft_testset = self.simple_testset.get_single_set_with_soft_ranks("prob_-1", "prob_1", "rank_soft_predicted", "rank_soft")

        Parallelsentence2Jcml(reconstructed_hard_testset).write_to_file("reconstructed.hard.light.jcml")
        Parallelsentence2Jcml(reconstructed_soft_testset).write_to_file("reconstructed.soft.light.jcml")

        # Merge the reconstructed ranks back onto the original test set,
        # matching sentences on (langsrc, id, langtgt) per system.
        self.testset = JcmlReader("testset.jcml").get_dataset()
        self.final_reconstructed_hard = deepcopy(self.testset)
        self.final_reconstructed_hard.import_target_attributes_onsystem(reconstructed_hard_testset, ["rank_hard"],['langsrc','id','langtgt'],[],['rank','system'])
        self.final_reconstructed_soft = deepcopy(self.testset)
        self.final_reconstructed_soft.import_target_attributes_onsystem(reconstructed_soft_testset, ["rank_soft"],['langsrc','id','langtgt'],[],['rank','system'])

        # Release the intermediate pairwise dataset.
        self.simple_testset = None

    if n == 110:
        # Emit the WMT11 shared-task submission tables.
        print "Exporting results"
        writer = Wmt11TabWriter(self.final_reconstructed_soft, "dfki_{}".format(params["att"]), "testset", "rank_soft")
        writer.write_to_file("ranked.soft.tab")

        writer = Wmt11TabWriter(self.final_reconstructed_hard, "dfki_{}".format(params["att"]), "testset", "rank_hard")
        writer.write_to_file("ranked.hard.tab")

    if n == 120:
        # NOTE(review): score() is called here but its body is missing from
        # this chunk; presumably it wraps get_scoring() -- confirm.
        print "Scoring correlation"
        ret.update(score(self.final_reconstructed_soft, self.class_name, "soft", "rank_soft"))
        ret = OrderedDict(sorted(ret.items(), key=lambda t: t[0]))

        print ret

    return ret
230
231
232
233
234
236
    # NOTE(review): the enclosing "def" line for this body was lost in
    # extraction. Judging by the PyExperimentSuite API and the mirrored
    # stage numbers of iterate(), this is most likely the body of
    # save_state(self, params, rep, n) -- confirm against the original file.

    if n == 30:
        # Persist the pairwise-expanded test set produced at stage 30.
        Parallelsentence2Jcml(self.testset).write_to_file("pairwise_testset.jcml")
    if n == 50:

        pass


    if n == 90:
        # Persist the hard classifications, one label per line.
        classified_vector_file = open("classified.hard.txt", 'w')
        for value in self.classified_values_vector:
            classified_vector_file.write("{0}\n".format(value))

        classified_vector_file.close()
        # Persist the (prob_-1, prob_1) pairs, tab-separated per line.
        classified_prob_file = open("classified.soft.txt", 'w')
        for value1, value2 in self.classified_probs_vector:
            classified_prob_file.write("{}\t{}\n".format(value1, value2))
        classified_prob_file.close()
    if n == 100:
        # Persist both reconstructed (hard/soft) test sets.
        Parallelsentence2Jcml(self.final_reconstructed_hard).write_to_file("testset.reconstructed.hard.jcml")

        Parallelsentence2Jcml(self.final_reconstructed_soft).write_to_file("testset.reconstructed.soft.jcml")
259
260
261
262
    # NOTE(review): the enclosing "def" line for this body was lost in
    # extraction -- most likely restore_state(self, params, rep, n), which
    # PyExperimentSuite calls when resuming an interrupted run. Confirm
    # against the original file.
    self.class_name = "rank"

    # Each guard rebuilds the state that the completed stages would have
    # left behind, from the files written by the save_state fragment.
    if n > 10 and n <=30 :
        self.testset = JcmlReader("testset.jcml").get_dataset()


    if n > 30 and n <=50:

        pass
    if n > 50:
        self.pairwise_test_filename = "pairwise_testset.jcml"

    if n > 70:
        self.testset_orange_filename = "testset.tab"

    if n > 90:
        # NOTE(review): iterate() stores the hard labels as strings
        # (str(v[0])) but they are restored here as ints -- looks
        # inconsistent; verify downstream consumers tolerate both.
        classified_vector_file = open("classified.hard.txt", 'r')
        self.classified_values_vector = [int(line.strip()) for line in classified_vector_file]
        classified_vector_file.close()
        classified_prob_file = open("classified.soft.txt", 'r')
        self.classified_probs_vector = [tuple(line.strip().split('\t')) for line in classified_prob_file]
        self.classified_probs_vector = [(float(a),float(b)) for a,b in self.classified_probs_vector]
        classified_prob_file.close()
    if n > 100:
        pass
290
291
292
293
294
295
296
297
298
299
def _get_testset(self, test_filename, mode = "", ratio=0.7):
    """Prepare testset.jcml for the experiment.

    If *test_filename* is given, it is copied to testset.jcml. Otherwise
    trainset.jcml is split into training and test parts according to
    *ratio* (in "development" mode only 3% of the data is kept first).

    BUGFIX: the original condition was inverted ("if not test_filename ==
    ''"), which split the trainset when a test file WAS supplied and tried
    to shutil.copy the empty filename when it was not.

    @param test_filename: path to an existing test set, or "" to split
    @param mode: "development" shrinks the data for fast trial runs
    @param ratio: fraction kept as training data when splitting
                  (note: the printed "90% + 10%" message predates the
                  configurable ratio and may overstate the split)
    """
    if test_filename == "":
        print("arbitrarily split given set to training and test sets 90% + 10%")
        simple_trainset = JcmlReader("trainset.jcml").get_dataset()

        if mode == "development":
            # Keep only a tiny sample so development runs finish quickly.
            simple_trainset, a = simple_trainset.split(0.03)

        simple_trainset, simple_testset = simple_trainset.split(ratio)
        Parallelsentence2Jcml(simple_trainset).write_to_file("trainset.jcml")
        Parallelsentence2Jcml(simple_testset).write_to_file("testset.jcml")
    else:
        shutil.copy(test_filename, "testset.jcml")
313
314
def get_scoring(testset, class_name, xid, featurename):
    """Collect ranking-quality scores of *featurename* against *class_name*.

    Computes several Kendall tau variants, select-best accuracy and
    average-rank measures over *testset*, tagging every result key with
    the experiment id *xid*. Returns a flat dict of score names to values.
    """
    scores = Scoring(testset)
    tag = "{}-".format(xid)
    result = {}

    # Three Kendall tau flavors: default, no-tie-penalty-on-prediction
    # plus ties excluded (-ntp), and no penalty for predicted ties (-nt).
    result.update(scores.get_kendall_tau(featurename, class_name, prefix=tag))
    result.update(scores.get_kendall_tau(featurename, class_name, prefix=tag, suffix="-ntp", exclude_ties=False))
    result.update(scores.get_kendall_tau(featurename, class_name, prefix=tag, suffix="-nt", penalize_predicted_ties=False))

    tau_b, tau_b_pi = scores.get_kendall_tau_b(featurename, class_name)
    result["kendalltau_b-%s" % xid] = tau_b
    result["kendalltau_b-%s-pi" % xid] = tau_b_pi

    acc_best, acc_any = scores.selectbest_accuracy(featurename, class_name)
    result["b1-acc-1-%s" % xid] = acc_best
    result["b1-acc-%s-any" % xid] = acc_any

    result["fr-%s" % xid] = scores.avg_first_ranked(featurename, class_name)
    result["pr-%s" % xid] = scores.avg_predicted_ranked(featurename, class_name)

    # Percentage of sentences where the predicted best matches each human rank.
    percentages_by_rank = scores.best_predicted_vs_human(featurename, class_name)
    for rank, percentage in percentages_by_rank.iteritems():
        result["sb-{}-{}".format(rank, xid)] = str(percentage)

    return result
331
# NOTE(review): lines were lost in extraction here. The body of score()
# is missing (iterate() calls it at stage n == 120 and expects a dict,
# presumably built via get_scoring()), and the docstring below belongs
# to a stream-to-logger adapter class whose "class" statement is also
# missing. Restore both from the original file.
def score(testset, class_name, xid, featurename):

    """
    Fake file-like stream object that redirects writes to a logger instance.
    """
def __init__(self, logger, log_level=logging.INFO):
    """Bind the destination *logger* and the level used for redirected writes.

    Part of the file-like stream-to-logger adapter (its class statement is
    missing from this chunk -- confirm against the original file).
    """
    self.logger = logger          # destination logging.Logger instance
    self.log_level = log_level    # level passed to logger.log() per line
    self.linebuf = ''             # kept for file-like API compatibility
344
    # NOTE(review): the enclosing "def write(self, buf):" line is missing
    # from this chunk (file-like API of the stream-to-logger adapter).
    # Logs every non-empty line of the written buffer at self.log_level.
    for line in buf.rstrip().splitlines():
        self.logger.log(self.log_level, line.rstrip())
348
351
352
353
if __name__ == '__main__':
    # NOTE(review): FORMAT and now are currently unused -- the
    # logging.basicConfig(...) call that would consume them appears to
    # have been lost from this chunk; confirm against the original file.
    FORMAT = "%(asctime)-15s [%(process)d:%(thread)d] %(message)s "
    now = datetime.strftime(datetime.now(), "%Y-%m-%d_%H-%M-%S")

    # AutorankingSuite is the PyExperimentSuite subclass whose class
    # statement is not visible in this chunk (its methods appear above).
    mysuite = AutorankingSuite();
    mysuite.start()
362