1 '''
2 Created on 19 Apr 2013
3
4 @author: Eleftherios Avramidis
5 '''
6
7 import cPickle as pickle
8 import sys
9
10 from dataprocessor.ce.cejcml2orange import CElementTreeJcml2Orange
11
12
13 from sentence.dataset import DataSet
14 from sentence.pairwisedataset import AnalyticPairwiseDataset
15 from sentence.pairwiseparallelsentenceset import CompactPairwiseParallelSentenceSet
16
17 from Orange.data import Table
18 from Orange.data import Instance, Value, Domain
19
20 from Orange.evaluation.testing import cross_validation
21 from Orange.classification.rules import rule_to_string
22 from Orange.classification.svm import get_linear_svm_weights
23 from Orange.classification import logreg
24
25
26 from Orange.classification.bayes import NaiveLearner
27 from Orange.classification.knn import kNNLearner
28 from Orange.classification.svm import SVMLearnerEasy as SVMEasyLearner
29 from Orange.classification.tree import TreeLearner
30 from Orange.classification.tree import C45Learner
31 from Orange.classification.logreg import LogRegLearner
32 from Orange.classification import Classifier
33 from Orange.feature import Continuous
34
35
"""
Pythonic way to initialize and return an orange learner.

The learner class is resolved by evaluating ``name`` with ``eval`` and is
then called with the keyword arguments that were passed in.
@param name: the name of the learner to be returned (evaluated with eval)
@type name: string
@param kwargs: keyword arguments forwarded to the resolved class
@return: an orange learner, i.e. the result of calling the resolved class
@rtype: Orange.classification.Classifier
"""
# NOTE(review): eval() executes arbitrary expressions -- confirm that
# name never originates from untrusted input
orangeclass = eval(name)
return orangeclass(**kwargs)
47
48
"""
Return particular ranker class given a string

``name`` is evaluated with ``eval``; the resulting class is called with
the given keyword arguments and the object it returns is wrapped in an
OrangeRuntimeRanker.
@param name: the name of the class to be instantiated (evaluated with eval)
@type name: string
@param kwargs: keyword arguments forwarded to the resolved class
@return: an OrangeRuntimeRanker built around the instantiated object
"""
# NOTE(review): eval() executes arbitrary expressions -- confirm that
# name never originates from untrusted input
orangeclass = eval(name)
return OrangeRuntimeRanker(orangeclass(**kwargs))
56
57
"""
Receive a parallel sentence and convert it into a memory instance for
the machine learner.

For every feature of the given domain, the value is looked up by feature
name in the nested attributes of the parallel sentence. A feature that
is missing from these attributes is reported on stderr and receives the
value 0.
@param parallelsentence: the sentence whose nested attributes provide the feature values
@type parallelsentence: L{sentence.parallelsentence.ParallelSentence}
@param domain: object whose C{features} attribute lists the features to be filled
@return: an orange instance
@rtype: Orange.data.Instance
"""
attributes = parallelsentence.get_nested_attributes()

values = []

domain_features = domain.features

for feature in domain_features:
    # feature_type is read but not used further in this block
    feature_type = feature.var_type
    feature_name = feature.name

    try:
        value = attributes[feature_name]
    except KeyError:
        # Missing features do not abort the conversion; they default to 0
        sys.stderr.write("Feature '{}' not given by the enabled generators\n".format(feature_name))
        value = 0

    # Calling the feature descriptor turns the raw value into an Orange value
    orange_value = feature(value)
    values.append(orange_value)

# Presumably a domain without class variable, as the variable name
# suggests -- TODO confirm the meaning of the second Domain argument
classless_domain = Domain(domain_features, False)
instance = Instance(classless_domain, values)
return instance
93
94
102
103
104
106 """
107 This class represents a ranker implemented over pairwise orange classifiers.
108 This ranker is loaded into the memory from a dump file which contains an already trained
109 model and provides functions to rank one source sentence + translations at a time
110 @ivar classifier: the orange classifier object
111 @type classifier: Orange.classification.Classifier
112 """
113
def __init__(self, classifier_filename):
    """
    Load previously trained classifier given existing filename
    @param classifier_filename: the filename which contains the trained classifier
    @type classifier_filename: str
    @raise IOError: if the file cannot be opened
    """
    # NOTE(review): unpickling can execute arbitrary code; only load
    # classifier files that come from a trusted source
    # The context manager closes the file even when unpickling fails
    with open(classifier_filename) as classifier_file:
        self.classifier = pickle.load(classifier_file)
123
124
# Build a textual description of one ranking decision: a heading, the
# dump of the classifier produced by logreg.dump, the classifier domain
# and one entry per pairwise comparison contained in resultvector.
# Each entry of resultvector is a dict with the keys 'systems', 'value',
# 'instance' and 'distribution'. The pieces are concatenated without
# separator and returned as one string.
output = []
output.append("Used linear regression with Stepwise Feature Selection with the following weights")
coefficients = logreg.dump(self.classifier)
output.append(coefficients)

output.append("\n\n")
output.append("domain: {}\n\n".format(self.classifier.domain))

for resultentry in resultvector:
    system_names = resultentry['systems']
    value = resultentry['value']
    instance = resultentry['instance']
    distribution = resultentry['distribution']

    # A value of -1 is rendered as "first system < second system",
    # any other value as "first system > second system"
    if value == -1:
        output.append("System{} < System{}".format(system_names[0], system_names[1]))
    else:
        output.append("System{} > System{}".format(system_names[0], system_names[1]))
    output.append(" \n instance: {} \n probabilities: {}\n".format(instance, distribution))
return "".join(output)
147
"""
Receive a parallel sentence with features and perform ranking

The sentence is split into pairwise parallel sentences, each of them is
converted to an instance and classified, and the pairwise decisions are
merged back into one multi-ranked sentence.
@param parallelsentence: an object containing the parallel sentence
@type parallelsentence: L{sentence.parallelsentence.ParallelSentence}
@return: a tuple (result, description); result is a list of
(value of the "rank" attribute, translation) pairs for the translations
of the multi-ranked sentence, description is the text produced by
_get_description for the pairwise decisions
"""

# Ask the classifier for both the predicted value and the distribution
return_type = Classifier.GetBoth

domain = self.classifier.domain

# One dict per pairwise decision, used only for the textual description
resultvector = []

pairwise_parallelsentences = parallelsentence.get_pairwise_parallelsentences()

classified_pairwise_parallelsentences = []

for pairwise_parallelsentence in pairwise_parallelsentences:

    instance = parallelsentence_to_instance(domain, pairwise_parallelsentence)

    value, distribution = self.classifier(instance, return_type)

    # Every pairwise decision is traced on stderr
    sys.stderr.write("{}, {}, {}\n".format(pairwise_parallelsentence.get_system_names(), value, distribution))

    resultvector.append({'systems' : pairwise_parallelsentence.get_system_names(),
                         'value' : (float(value.value)),
                         'distribution': distribution,
                         'instance' : instance})
    # Store the prediction on the pairwise sentence itself; the first two
    # entries of the distribution are stored as "prob_-1" and "prob_1"
    pairwise_parallelsentence.add_attributes({"rank_predicted":float(value.value),
                                              "prob_-1":distribution[0],
                                              "prob_1":distribution[1]
                                              })

    classified_pairwise_parallelsentences.append(pairwise_parallelsentence)

# Merge the classified pairs into one sentence ranked by "rank_predicted"
sentenceset = CompactPairwiseParallelSentenceSet(classified_pairwise_parallelsentences)
ranked_sentence = sentenceset.get_multiranked_sentence("rank_predicted")

result = [(t.get_attribute("rank"), t) for t in ranked_sentence.get_translations()]

description = self._get_description(resultvector)
return result, description
204
205
206
207
208
209
210
211
212
213
214
'''
Wrapper around an orange classifier object
@ivar learner: the wrapped orange learner, instantiated from the class given to the constructor
@ivar training_data_filename: the file that the jcml training data have been converted to
@type training_data_filename: str
@ivar training_table: an Orange "table" of examples containing training instances
@type training_table: L{Orange.data.Table}
@ivar model: the trained classifier
@type model: Orange.classification.Classifier
@ivar test_data_filename: the file that the jcml test data have been converted to
@type test_data_filename: str
@ivar test_table: the Orange "table" of test examples
@type test_table: L{Orange.data.Table}
'''
'''
Constructor.
@param learner: an orange classifier whose functionality is to be wrapped;
it is called with the given keyword arguments and the result is kept
@type learner: callable (presumably an Orange learner class -- TODO confirm)
@param kwargs: keyword arguments forwarded to the learner
'''
self.learner = learner(**kwargs)
self.datafile = None
# The training attributes start empty; they are filled in later by the
# data-setting, loading and training methods
self.training_data_filename = None
self.training_table = None
self.model = None
# test_data_filename and test_table are not initialised here; they only
# exist after the corresponding test-data methods have been called
242
243
244
def set_training_data(self, jcml_filename,
                      class_name,
                      desired_attributes,
                      meta_attributes,
                      **kwargs):
    '''
    Read the data from an XML file, convert them to the proper format
    and remember the location of the converted file
    @param jcml_filename: full path of the XML file where data reside
    @type jcml_filename: string
    @param class_name: name of class
    @type class_name: string
    @param desired_attributes: desired attributes
    @type desired_attributes: list of strings
    @param meta_attributes: meta attributes
    @type meta_attributes: list of strings
    @param kwargs: further keyword arguments forwarded to the convertor
    '''
    # The converted file is named after the jcml file. The suffix used to
    # be misspelt as ".jmcl", so for a regular ".jcml" file nothing was
    # replaced and the output path was identical to the input path.
    output_file = jcml_filename.replace(".jcml", ".tab")
    if output_file == jcml_filename:
        # No ".jcml" in the name: never use the input path as output path
        output_file = jcml_filename + ".tab"

    convertor = CElementTreeJcml2Orange(jcml_filename,
                                        class_name,
                                        desired_attributes,
                                        meta_attributes,
                                        output_file,
                                        compact_mode=True,
                                        **kwargs)

    convertor.convert()
    self.training_data_filename = output_file
276
277
'''
Load the previously defined/converted training data in place:
the file named by training_data_filename is read into an Orange table,
which is stored in training_table
'''
self.training_table = Table(self.training_data_filename)
283
'''
Free up the memory occupied by the training data by dropping the
reference to the training table
'''
self.training_table = None
289
290
'''
Perform cross validation on the training data.
@param folds: number of cross-validation folds
@type folds: int
@return: the value of the classification accuracy, i.e. the result of
applying CA to the cross-validation results
'''
cv = cross_validation([self.learner], self.training_table, folds)
# NOTE(review): CA is not imported in the visible part of this file
# (presumably Orange.evaluation.scoring.CA) -- confirm, otherwise this
# line raises a NameError
ca = CA(cv)
return ca
302
# Train the wrapped learner on the loaded training table and keep the
# resulting model
self.model = self.learner(self.training_table)
# The model is stored next to the training data, with the suffix
# ".tab" replaced by ".clsf"
objectfile = self.training_data_filename.replace(".tab", ".clsf")
# pickle.dump needs an open file object; it used to receive the filename
# string itself, which always failed. The file is opened in text mode,
# the same mode in which the runtime ranker of this module opens
# classifier files for unpickling.
with open(objectfile, "w") as model_file:
    pickle.dump(self.model, model_file)
307
308
309
310
311
313 try:
314 weights = get_linear_svm_weights(self.model)
315 textfilename = "{}.weights.txt".format(basename)
316 f = open(textfilename, "w")
317 f.write("Fitted parameters: \nnu = {0}\ngamma = {1}\n\nWeights: \n".format(self.model.fitted_parameters[0], self.model.fitted_parameters[1]))
318 for weight_name, weight_value in weights.iteritems():
319 f.write("{0}\t{1}\n".format(weight_name, weight_value))
320 f.close()
321 return True
322 except:
323 return False
324
325
# Write the rules of the model, one per line, to "<basename>.rules.txt".
# This is a best-effort operation: failures (e.g. a model without a
# "rules" attribute) are ignored and nothing is returned either way.
try:
    # The rules are fetched before the file is opened, so that no file
    # is created for models that have none
    rules = self.model.rules
    textfilename = "{}.rules.txt".format(basename)
    # The context manager closes the file even if writing fails
    with open(textfilename, "w") as f:
        for r in rules:
            f.write("{}\n".format(rule_to_string(r)))
except Exception:
    # Still deliberately lenient, but KeyboardInterrupt and SystemExit
    # are no longer swallowed
    pass
337
338
# Write a textual representation of the model to "<basename>.tree.txt"
# and a graph description to "<basename>.tree.dot". This is a
# best-effort operation: failures (presumably for models that are not
# trees -- confirm) are ignored.
try:
    textfilename = "{}.tree.txt".format(basename)
    # The context manager closes the file even if to_string or the
    # write fails
    with open(textfilename, "w") as f:
        f.write(self.model.to_string("leaf", "node"))

    graphics_filename = "{}.tree.dot".format(basename)
    self.model.dot(graphics_filename, "leaf", "node")
except Exception:
    # Still deliberately lenient, but KeyboardInterrupt and SystemExit
    # are no longer swallowed
    pass
350
351
'''
Method-specific functions for writing the model characteristics into a file.
Every writer is attempted on a best-effort basis; writers that do not
apply to the trained model are expected to fail silently.
@param basename: specify part of the filename which will be written
@type basename: string
'''

# The helper writers build their filenames from basename, so it has to
# be handed over; they used to be called without any argument
self._write_model_svm(basename)
self._write_model_rules(basename)

try:
    textfilename = "{}.logreg.dump.txt".format(basename)
    # The context manager closes the file even if the dump fails
    with open(textfilename, 'w') as f:
        f.write(logreg.dump(self.model))
except Exception:
    # Still deliberately lenient, but KeyboardInterrupt and SystemExit
    # are no longer swallowed
    pass
369
370
def set_test_data(self, jcml_filename,
                  class_name,
                  desired_attributes,
                  meta_attributes,
                  output_file,
                  **kwargs):
    '''
    Convert the test data of an XML file to the proper format and
    remember where the converted data have been written
    @param jcml_filename: full path of the XML file where data reside
    @type jcml_filename: string
    @param class_name: name of class
    @type class_name: string
    @param desired_attributes: desired attributes
    @type desired_attributes: list of strings
    @param meta_attributes: meta attributes
    @type meta_attributes: list of strings
    @param output_file: the file handed to the convertor as its output
    @type output_file: string
    @param kwargs: further keyword arguments forwarded to the convertor
    '''
    # Positional arguments of the convertor, in the order it expects them
    conversion_args = (jcml_filename,
                       class_name,
                       desired_attributes,
                       meta_attributes,
                       output_file)

    CElementTreeJcml2Orange(*conversion_args,
                            compact_mode=True,
                            **kwargs).convert()

    self.test_data_filename = output_file
400
# Read the file named by test_data_filename into an Orange table and keep it
self.test_table = Table(self.test_data_filename)
403
# Drop the reference to the test table so that its memory can be reclaimed
self.test_table = None
406
411