'''
Created on 25 Mar 2014

@author: Eleftherios Avramidis
'''
import numpy as np
from ml.learner import Regressor
from learn_model import set_learning_method, set_selection_method, scale_datasets_crossvalidation
from sklearn import cross_validation
from sklearn.metrics import make_scorer
import logging as log

from sklearn.externals.joblib import Parallel, delayed
from sklearn.base import is_classifier, clone
import numbers
from sklearn.cross_validation import check_cv, _PartitionIterator, KFold


def dataset_to_instances(dataset,
                         class_name,
                         desired_parallel_attributes=[],
                         desired_source_attributes=[],
                         desired_target_attributes=[],
                         meta_attributes=[],
                         class_level="target"):

    att_table = []
    class_vector = []

    # Debug dumps of the feature matrix and the class vector as they are built
    f = open("data.tab", 'w')
    c = open("class.tab", 'w')

    for parallelsentence in dataset.get_parallelsentences():
        for translation in parallelsentence.get_translations():

            if class_name:
                if class_level == "target":
                    class_vector.append(float(translation.get_attribute(class_name)))
                    c.write("{}\n".format(translation.get_attribute(class_name)))
                elif class_level == "parallel":
                    class_vector.append(float(parallelsentence.get_attribute(class_name)))
                    c.write("{}\n".format(parallelsentence.get_attribute(class_name)))

            att_row = []
            log.debug("Target attributes: {}".format(len(desired_target_attributes)))
            for att_name in desired_target_attributes:
                if att_name != "":
                    try:
                        att_value = translation.get_attribute(att_name)
                        # Sanitize textual "inf"/"nan" values before casting to float
                        att_value = att_value.replace("inf", "99999999")
                        att_value = att_value.replace("nan", "0")
                        att_row.append(float(att_value))
                        f.write(str(att_value))
                        f.write("\t")
                    except AttributeError:
                        log.debug("target attribute {} could not be found in sentence with id={}, replacing with 0".format(att_name, parallelsentence.get_attribute("id")))
                        att_row.append(0)

            log.debug("Parallel attributes: {}".format(len(desired_parallel_attributes)))
            for att_name in desired_parallel_attributes:
                if att_name != "":
                    try:
                        att_value = parallelsentence.get_attribute(att_name)
                        att_row.append(float(att_value))
                    except AttributeError:
                        log.debug("parallel attribute {} could not be found in sentence with id={}, replacing with 0".format(att_name, parallelsentence.get_attribute("id")))
                        att_row.append(0)

            log.debug("Source attributes: {}".format(len(desired_source_attributes)))
            for att_name in desired_source_attributes:
                if att_name != "":
                    try:
                        att_value = parallelsentence.get_source().get_attribute(att_name)
                        att_value = att_value.replace("inf", "99999999")
                        att_value = att_value.replace("nan", "0")
                        att_row.append(float(att_value))
                        f.write(str(att_value))
                        f.write("\t")
                    except AttributeError:
                        log.debug("source attribute {} could not be found in sentence with id={}, replacing with 0".format(att_name, parallelsentence.get_attribute("id")))
                        att_row.append(0)

            log.debug("id: {}, row length: {}".format(parallelsentence.get_attribute("id"), len(att_row)))
            f.write("\n")
            att_table.append(att_row)

    # Close the debug dumps before any exception can be raised below
    f.close()
    c.close()

    numpy_att_table = np.asarray(att_table)
    numpy_class_vector = np.asarray(class_vector)

    if len(numpy_att_table.shape) != 2:
        log.info("Shape of loaded data: {}".format(numpy_att_table.shape))
        raise IOError("the training dataset must be a matrix of M rows and N columns")

    return numpy_att_table, numpy_class_vector
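

# A minimal usage sketch for dataset_to_instances(). The attribute names
# below ("length", "lm_prob", "HTER") are hypothetical; any attribute
# attached to the source, target or parallel sentence objects of this
# project can be requested.
def _example_dataset_to_instances(dataset):
    X, y = dataset_to_instances(dataset,
                                class_name="HTER",
                                desired_source_attributes=["length", "lm_prob"],
                                desired_target_attributes=["length", "lm_prob"],
                                class_level="target")
    log.info("Loaded feature matrix {} and class vector {}".format(X.shape, y.shape))
    return X, y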


class SkRegressor(Regressor):
    # The class definition was missing at this point in the source; the name
    # SkRegressor is a placeholder, inferred from the "skregressors" objects
    # used further below and the Regressor base class imported from
    # ml.learner above.

    def load_training_dataset(self, dataset,
                              class_name,
                              desired_parallel_attributes=[],
                              desired_source_attributes=[],
                              desired_target_attributes=[],
                              meta_attributes=[],
                              scale=True):
        # The method body was missing in the source; a minimal sketch that
        # delegates to dataset_to_instances() above. The scaling step implied
        # by the 'scale' flag (cf. the imported scale_datasets_crossvalidation
        # helper) is left out, as its call signature is not shown here.
        self.X_train, self.y_train = dataset_to_instances(dataset,
                                                          class_name,
                                                          desired_parallel_attributes,
                                                          desired_source_attributes,
                                                          desired_target_attributes,
                                                          meta_attributes)

    def cross_validate_start(self, cv=10, n_jobs=1, scorer=None, threshold=None, fixed_folds=None):
        # The method header was missing in the source; its name and parameters
        # are reconstructed from the body below and from the equally named
        # method of the TER class further down.
        config = self.config

        transformer = set_selection_method(config, threshold)

        if transformer is not None:
            log.info("Running feature selection %s" % str(transformer))
            log.info("X_train dimensions before fit_transform(): %s,%s" % self.X_train.shape)
            log.info("y_train dimensions before fit_transform(): %s" % self.y_train.shape)

            # Keep the reduced feature matrix so the learner below is trained
            # on the selected features
            self.X_train = transformer.fit_transform(self.X_train, self.y_train)

            log.info("Dimensions after fit_transform(): %s,%s" % self.X_train.shape)

        self.estimator, self.scorers = set_learning_method(self.config, self.X_train, self.y_train)

        if not scorer:
            scorer = make_scorer(self.scorers[0][1])
        log.info("Running cross validator with %s" % str(self.estimator))
        if not fixed_folds:
            cv = KFold(len(self.y_train), n_folds=cv, indices=True)
            log.debug("test instances:\n{}".format([fold[1] for fold in cv]))
        else:
            log.info("proceeding with fixed folds provided")
            cv = FixedFolds(len(self.y_train), fixed_folds)

        scores = cross_validation.cross_val_score(self.estimator, self.X_train, self.y_train, cv=cv, n_jobs=n_jobs, scoring=scorer)
        return scores

    def train_test(self, X_test, verbose=0, fit_params=None, roundup=False):
        # The parameter names were garbled in the source ("blah", "dummy") and
        # the body was missing; the signature is aligned with the TER class
        # method of the same name below. Minimal sketch: fit on the stored
        # training data and predict the given test set.
        self.estimator.fit(self.X_train, self.y_train)
        y_predict = self.estimator.predict(X_test)
        if roundup:
            y_predict = np.rint(y_predict)
        return y_predict


class FixedFolds(_PartitionIterator):
    # Cross-validation iterator over a caller-supplied list of test folds,
    # so that repeated experiments reuse exactly the same splits. The class
    # and method headers were missing in the source and are reconstructed
    # here from the _PartitionIterator protocol imported above.

    def __init__(self, n, existing_test_indices):
        self.test_folds = existing_test_indices
        self.indices = True
        self.n = n

    def _iter_test_indices(self):
        for test_fold in self.test_folds:
            yield test_fold

    def __repr__(self):
        return '{}.{} (n={})'.format(
            self.__class__.__module__,
            self.__class__.__name__,
            len(self.test_folds)
        )

    def __len__(self):
        return len(self.test_folds)
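

# Usage sketch for FixedFolds: wrap precomputed test-index arrays so that
# repeated experiments run on identical splits (indices are illustrative).
def _example_fixed_folds():
    folds = FixedFolds(6, [np.array([0, 1]), np.array([2, 3]), np.array([4, 5])])
    for train, test in folds:
        log.info("train={} test={}".format(train, test))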


def ter_train_test(estimators, Xs_train, ys_train, X_test, denominator, verbose, fit_params, roundup=False):
    estimations = []

    # Each estimator predicts one TER edit-type count; Xs_train/ys_train map
    # each estimator to its own training data
    for estimator in estimators:
        X_train = Xs_train[estimator]
        y_train = ys_train[estimator]
        estimator.fit(X_train, y_train)
        y_predict = estimator.predict(X_test)
        if roundup:
            y_predict = np.rint(y_predict)
        estimations.append(y_predict)

    all_estimations = np.column_stack(estimations)
    log.info("all_estimations.shape = {}".format(all_estimations.shape))

    sum_estimations = np.sum(all_estimations, axis=1)
    log.info("sum_estimations.shape = {}".format(sum_estimations.shape))

    # TER = summed edit counts divided by the reference length
    ter = np.divide(sum_estimations, denominator)

    for i in range(len(ter)):
        components = " + ".join("{:.3g}".format(est[i]) for est in estimations)
        log.info("ter{} = ({}) / {} = {:.3g}".format(i, components, denominator[i], ter[i]))
    return ter


def ter_cross_validate_fold(estimators, X_dic, y_dic, denominator, tergold, scorer, train, test, verbose, fit_params, roundup=False):
    estimations = []
    denom_test = denominator[test]
    tergold_test = tergold[test]
    for estimator in estimators:
        X = X_dic[estimator]
        y = y_dic[estimator]
        X_train = [X[idx] for idx in train]
        X_test = [X[idx] for idx in test]
        y_train = y[train]
        y_test = y[test]
        estimator.fit(X_train, y_train)
        y_predict = estimator.predict(X_test)
        if roundup:
            y_predict = np.rint(y_predict)
        estimations.append(y_predict)

    all_estimations = np.column_stack(estimations)
    log.info("all_estimations.shape = {}".format(all_estimations.shape))

    sum_estimations = np.sum(all_estimations, axis=1)
    log.info("sum_estimations.shape = {}".format(sum_estimations.shape))

    ter = np.divide(sum_estimations, denom_test)
    # Log the first few test items; guard against folds smaller than 10
    for i in range(min(10, len(ter))):
        components = " + ".join("{:.3g}".format(est[i]) for est in estimations)
        log.info("ter{} = ({}) / {} = {:.3g} [{:.3g}]".format(i, components, denom_test[i], ter[i], tergold_test[i]))

    score = scorer(ter, tergold_test)
    return score


class TerRegressor(object):
    # The class definition was missing at this point in the source; the name
    # TerRegressor is a placeholder. It combines one wrapped regressor per
    # TER edit type into a single TER estimator.

    def __init__(self, config, skregressors, tergold):
        self.tergold = tergold
        self.config = config
        self.estimators = [skregressor.estimator for skregressor in skregressors]
        self.scorers = skregressors[0].scorers
        self.X_train = {}
        self.y_train = {}
        for skregressor in skregressors:
            self.X_train[skregressor.estimator] = skregressor.X_train
            self.y_train[skregressor.estimator] = skregressor.y_train
        self.size = len(self.y_train[skregressors[0].estimator])
        # The first feature column is expected to hold the reference length
        # (the TER denominator)
        self.denominator = self.X_train[skregressors[0].estimator][:,0]

    def cross_validate_start(self, cv=10, n_jobs=15, verbose=0, pre_dispatch='2*n_jobs', fit_params=None, fixed_folds=None, roundup=False):
        if not fixed_folds:
            cvfolds = KFold(self.size, n_folds=cv, indices=True)
        else:
            log.info("proceeding with fixed folds provided")
            cvfolds = FixedFolds(self.size, fixed_folds)
        parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                            pre_dispatch=pre_dispatch)
        scorer = self.scorers[0][1]
        scores = parallel(
            delayed(ter_cross_validate_fold)(self.estimators, self.X_train, self.y_train, self.denominator, self.tergold, scorer, train, test, verbose, fit_params, roundup)
            for train, test in cvfolds)
        scores = np.array(scores)
        return scores

    def train_test(self, X_test, verbose=0, fit_params=None, roundup=False):
        # The method body was missing in the source; a minimal sketch that
        # delegates to ter_train_test() above, assuming the first feature
        # column of the test set holds the TER denominator, as in training.
        return ter_train_test(self.estimators, self.X_train, self.y_train,
                              X_test, X_test[:,0], verbose, fit_params, roundup)
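

# Usage sketch for the TER regressor above, assuming skregressors is a list
# of trained SkRegressor-style wrappers (one per TER edit type) and ter_gold
# holds the reference TER scores:
def _example_ter_cross_validation(config, skregressors, ter_gold):
    compound = TerRegressor(config, skregressors, ter_gold)
    scores = compound.cross_validate_start(cv=10, n_jobs=4)
    log.info("cross-validation scores: mean = {:.3f}".format(scores.mean()))
    return scores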


from sklearn.svm import SVR


class TerEnsemble(object):
    # The class and constructor headers were missing at this point in the
    # source; the name TerEnsemble is a placeholder. It fits several
    # sub-estimators on the same data and combines their rounded predictions
    # into a TER estimate.

    def __init__(self, estimators):
        self.estimators = estimators

    def fit(self, X, y):
        for estimator in self.estimators:
            estimator.fit(X, y)
        return self

    def predict(self, X):
        # The method header was missing in the source and is reconstructed
        # from the body below
        estimations = []

        for estimator in self.estimators:
            ex = estimator.predict(X)
            estimation = np.rint(ex)
            log.info("estimation.shape = {}".format(estimation.shape))
            estimations.append(estimation)

        all_estimations = np.column_stack(estimations)
        log.info("all_estimations.shape = {}".format(all_estimations.shape))

        sum_estimations = np.sum(all_estimations, axis=1)
        log.info("sum_estimations.shape = {}".format(sum_estimations.shape))
        log.info("tokens.shape = {}".format(X[:,0].shape))

        # Column 0 is expected to hold the token count (TER denominator)
        ter = np.divide(sum_estimations, X[:,0])
        # Log the first few items; guard against inputs shorter than 10
        for i in range(min(10, len(ter))):
            components = " + ".join("{:.3g}".format(est[i]) for est in estimations)
            log.info("ter{} = ({}) / {} = {:.3g}".format(i, components, X[i,0], ter[i]))

        return ter
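

# A self-contained usage sketch for the ensemble above, on hypothetical data:
# four SVR sub-estimators, one per TER edit type. In the real pipeline each
# edit type would have its own target vector; here all four share one for
# brevity. Column 0 of X is assumed to hold the token count.
def _example_ter_ensemble():
    X = np.abs(np.random.randn(20, 5)) + 1
    y = np.random.rand(20)
    ensemble = TerEnsemble([SVR() for _ in range(4)])
    ensemble.fit(X, y)
    return ensemble.predict(X)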