# Module: ml.lib.scikit.scikit

1 ''' 2 Created on 25 Mar 2014 3 4 @author: Eleftherios Avramidis 5 ''' 6 import numpy as np 7 from ml.learner import Regressor 8 from learn_model import set_learning_method, set_selection_method, scale_datasets_crossvalidation 9 from sklearn import cross_validation 10 from sklearn.metrics import make_scorer 11 import logging as log 12 13 from sklearn.externals.joblib import Parallel, delayed 14 from sklearn.base import is_classifier, clone 15 import numbers 16 from sklearn.cross_validation import is_classifier, check_cv, _PartitionIterator, KFold 17 18 19

def dataset_to_instances(dataset,
                         class_name,
                         desired_parallel_attributes=(),
                         desired_source_attributes=(),
                         desired_target_attributes=(),
                         meta_attributes=(),
                         class_level="target"):
    """Convert a dataset of parallel sentences into numpy feature/label arrays.

    One feature row is produced per translation of every parallel sentence.
    Within a row the values appear in this order: target attributes,
    parallel attributes, source attributes. Empty attribute names are skipped.

    As a side effect the function (re)writes two files in the current working
    directory: ``data.tab`` (tab-separated target and source attribute values,
    one line per row) and ``class.tab`` (one class value per line, only when
    ``class_level`` is ``"target"``).

    Args:
        dataset: object providing ``get_parallelsentences()``; every parallel
            sentence must provide ``get_translations()``, ``get_source()``
            and ``get_attribute(name)``.
        class_name: name of the attribute holding the label. When falsy no
            labels are collected and the returned class vector is empty.
        desired_parallel_attributes: attribute names read from the parallel
            sentence itself. These are converted with ``float()`` as they
            are and are not written to ``data.tab``.
        desired_source_attributes: attribute names read from the source.
        desired_target_attributes: attribute names read from the translation.
        meta_attributes: accepted for interface compatibility; not used.
        class_level: ``"target"`` reads the label from each translation,
            ``"parallel"`` reads it from the parallel sentence. Any other
            value collects no labels.

    Returns:
        A tuple ``(numpy_att_table, numpy_class_vector)``.

    Raises:
        IOError: if the collected rows do not form a two-dimensional matrix
            (this includes an empty dataset).
    """
    att_table = []
    class_vector = []

    # The context manager guarantees both files are closed even when an
    # attribute value cannot be converted or the shape check below fails.
    with open("data.tab", 'w') as data_file, open("class.tab", 'w') as class_file:
        for parallelsentence in dataset.get_parallelsentences():
            for translation in parallelsentence.get_translations():
                if class_name:
                    if class_level == "target":
                        class_value = translation.get_attribute(class_name)
                        class_vector.append(float(class_value))
                        class_file.write("{}\n".format(class_value))
                    elif class_level == "parallel":
                        class_vector.append(
                            float(parallelsentence.get_attribute(class_name)))

                att_row = []
                _append_attribute_values(
                    att_row, translation.get_attribute,
                    desired_target_attributes, "target",
                    parallelsentence, data_file)
                _append_attribute_values(
                    att_row, parallelsentence.get_attribute,
                    desired_parallel_attributes, "parallel",
                    parallelsentence, None)
                # The source is looked up lazily, per attribute, so that a
                # missing source is handled like a missing attribute.
                _append_attribute_values(
                    att_row,
                    lambda name: parallelsentence.get_source().get_attribute(name),
                    desired_source_attributes, "source",
                    parallelsentence, data_file)

                log.debug("id: {}, row length: {}".format(
                    parallelsentence.get_attribute("id"), len(att_row)))
                data_file.write("\n")
                att_table.append(att_row)

    numpy_att_table = np.asarray(att_table)
    numpy_class_vector = np.asarray(class_vector)

    if len(numpy_att_table.shape) != 2:
        log.info("Shape of loaded data: {}".format(numpy_att_table.shape))
        raise IOError("the training dataset must be in the format of a matrix with M lines and N columns.")

    return numpy_att_table, numpy_class_vector


def _append_attribute_values(att_row, get_value, att_names, level,
                             parallelsentence, data_file=None):
    """Append one numeric value per non-empty name in ``att_names`` to ``att_row``.

    ``get_value(name)`` fetches the raw value. When ``data_file`` is given the
    raw value is treated as a string: "inf" is replaced by "99999999" and
    "nan" by "0" before conversion, and the cleaned text is written to the
    file followed by a tab. When ``data_file`` is None the value is converted
    with ``float()`` unchanged and nothing is written.

    An ``AttributeError`` raised while fetching or cleaning a value is logged
    and replaced by 0; other conversion errors propagate to the caller.
    """
    log.debug("{} attributes: {}".format(level.capitalize(), len(att_names)))
    for att_name in att_names:
        if att_name == "":
            continue
        try:
            att_value = get_value(att_name)
            if data_file is not None:
                att_value = att_value.replace("inf", "99999999")
                att_value = att_value.replace("nan", "0")
            att_row.append(float(att_value))
            if data_file is not None:
                data_file.write(str(att_value))
                data_file.write("\t")
        except AttributeError:
            log.debug("{} attribute {} could not be found in sentence with id={}, replacing with 0".format(
                level, att_name, parallelsentence.get_attribute("id")))
            att_row.append(0)

105 106 107 # from sklearn.pipeline import Pipeline 108 # def set_pipeline(config, X_train, y_train): 109 # learning_cfg = config.get("learning", None) 110 # if learning_cfg: 111 # pipe = Pipeline(steps=[('gaussian', None), ('SVM', None)]) 112 113

class SkRegressor(Regressor):
    """Regressor driven by an estimator chosen through the ``learn_model`` helpers.

    The instance keeps the training matrix in ``self.X_train``, the labels in
    ``self.y_train`` and, after ``set_learning_method()``, the estimator in
    ``self.estimator`` and its scorers in ``self.scorers``.
    """

    def __init__(self, config=None):
        """Store the configuration later handed to the ``learn_model`` helpers."""
        self.config = config

    def load_training_dataset(self, dataset,
                              class_name,
                              desired_parallel_attributes=(),
                              desired_source_attributes=(),
                              desired_target_attributes=(),
                              meta_attributes=(),
                              scale=True):
        """Load ``dataset`` into ``self.X_train`` / ``self.y_train``.

        The arguments are forwarded to ``dataset_to_instances``. When
        ``scale`` is true the feature matrix is additionally passed through
        ``scale_datasets_crossvalidation``.
        """
        self.X_train, self.y_train = dataset_to_instances(
            dataset,
            class_name,
            desired_parallel_attributes,
            desired_source_attributes,
            desired_target_attributes,
            meta_attributes)

        if scale:
            self.X_train = scale_datasets_crossvalidation(self.X_train)

    def feature_selection(self, threshold=.25):
        """Fit the configured feature-selection transformer and log the shapes.

        NOTE(review): the transformed matrix is only used for logging and is
        not stored back into ``self.X_train``, so later training still sees
        every column. This is kept as is because TerRegressor reads column 0
        of ``X_train`` -- confirm the intent before changing it.
        """
        transformer = set_selection_method(self.config, threshold)

        # Nothing to do when no selection method is configured.
        if transformer is None:
            return

        log.info("Running feature selection %s" % str(transformer))
        log.info("X_train dimensions before fit_transform(): %s,%s" % self.X_train.shape)
        log.info("y_train dimensions before fit_transform(): %s" % self.y_train.shape)

        X_train = transformer.fit_transform(self.X_train, self.y_train)

        log.info("Dimensions after fit_transform(): %s,%s" % X_train.shape)

    def set_learning_method(self):
        """Create ``self.estimator`` and ``self.scorers`` from the configuration."""
        self.estimator, self.scorers = set_learning_method(
            self.config, self.X_train, self.y_train)

    def cross_validate_start(self, cv=10, n_jobs=15, scorer=None, fixed_folds=None):
        """Cross-validate the estimator on the training data.

        Args:
            cv: number of folds for ``KFold``; ignored when ``fixed_folds``
                is given.
            n_jobs: number of parallel jobs passed to ``cross_val_score``.
            scorer: scoring callable; when falsy, one is built with
                ``make_scorer`` from the function of the first entry in
                ``self.scorers``.
            fixed_folds: optional sequence of test-index collections, one per
                fold, used in place of ``KFold``.

        Returns:
            The scores returned by ``cross_validation.cross_val_score``.
        """
        if not scorer:
            scorer = make_scorer(self.scorers[0][1])
        log.info("Running cross validator with %s" % str(self.estimator))

        if not fixed_folds:
            cv = KFold(len(self.y_train), n_folds=cv, indices=True)
            # Reported through the logger like every other message of this
            # class, instead of a bare print to stdout.
            log.info("test instances:\n%s", [fold[1] for fold in cv])
        else:
            log.info("proceeding with fixed folds provided")
            cv = FixedFolds(len(self.y_train), fixed_folds)

        scores = cross_validation.cross_val_score(
            self.estimator, self.X_train, self.y_train,
            cv=cv, n_jobs=n_jobs, scoring=scorer)
        return scores

    def train_test(self, X_test, blah, dummy, roundup=None):
        """Fit on the training data and return predictions for ``X_test``.

        ``X_test`` is passed through ``scale_datasets_crossvalidation`` before
        prediction. ``blah``, ``dummy`` and ``roundup`` are not used.
        """
        self.estimator.fit(self.X_train, self.y_train)
        X_test = scale_datasets_crossvalidation(X_test)
        return self.estimator.predict(X_test)

178 179 180 181

class FixedFolds(_PartitionIterator):
    """Partition iterator that replays a predefined list of test folds."""

    def __init__(self, n, existing_test_indices):
        """Remember the sample count ``n`` and the given test folds."""
        self.test_folds = existing_test_indices
        self.n = n
        self.indices = True

    def _iter_test_indices(self):
        """Yield the stored test folds one at a time, in the given order."""
        for fold in self.test_folds:
            yield fold

    def __repr__(self):
        """Return ``module.Class (n=<number of stored folds>)``."""
        cls = self.__class__
        fold_count = len(self.test_folds)
        return '{}.{} (n={})'.format(cls.__module__, cls.__name__, fold_count)

    def __len__(self):
        """Return the number of stored folds."""
        return len(self.test_folds)

201 202

def ter_train_test(estimators, Xs_train, ys_train, X_test, denominator, verbose, fit_params, roundup=False):
    """Fit every estimator, sum their predictions and divide by ``denominator``.

    Args:
        estimators: iterable of estimators exposing ``fit`` and ``predict``;
            each one is also the key of its data in ``Xs_train``/``ys_train``.
        Xs_train: mapping from estimator to its training feature matrix.
        ys_train: mapping from estimator to its training labels.
        X_test: feature matrix every estimator predicts on.
        denominator: value(s) the per-row sum of predictions is divided by.
        verbose: not used.
        fit_params: not used.
        roundup: when true, each estimator's predictions are rounded to the
            nearest integer with ``np.rint`` before summing.

    Returns:
        numpy array with one value per test row (``sum / denominator``).
    """
    estimations = []
    for estimator in estimators:
        estimator.fit(Xs_train[estimator], ys_train[estimator])
        y_predict = estimator.predict(X_test)
        if roundup:
            y_predict = np.rint(y_predict)
        estimations.append(y_predict)

    # One column per estimator, one row per test instance.
    all_estimations = np.column_stack(estimations)
    log.info("all_estimations.shape = {}".format(all_estimations.shape))

    sum_estimations = np.sum(all_estimations, axis=1)
    log.info("sum_estimations.shape = {}".format(sum_estimations.shape))

    ter = np.divide(sum_estimations, denominator)

    # Log the terms of every row for any number of estimators, and show the
    # denominator that was actually used (broadcast so that a scalar works).
    _, denominators = np.broadcast_arrays(ter, denominator)
    for i, (row, denom, value) in enumerate(zip(all_estimations, denominators, ter)):
        terms = " + ".join("{:.3g}".format(term) for term in row)
        log.info("ter{} = {} / {} = {:.3g}".format(i, terms, denom, value))
    return ter

229

def ter_cross_validate_fold(estimators, X_dic, y_dic, denominator, tergold, scorer, train, test, verbose, fit_params, roundup=False):
    """Score one cross-validation fold of the summed-estimator model.

    Every estimator is fitted on the ``train`` rows of its own data and
    predicts the ``test`` rows. The predictions are summed per row, divided
    by ``denominator[test]`` and compared with ``tergold[test]``.

    Args:
        estimators: iterable of estimators exposing ``fit`` and ``predict``;
            each one is also the key of its data in ``X_dic``/``y_dic``.
        X_dic: mapping from estimator to its full feature matrix.
        y_dic: mapping from estimator to its full label array.
        denominator: array indexed with ``test`` to obtain the divisors.
        tergold: array indexed with ``test`` to obtain the reference values.
        scorer: callable invoked as ``scorer(estimates, references)``.
            NOTE(review): scikit-learn metrics usually take the reference
            first -- confirm the argument order for asymmetric metrics.
        train: indices of the training rows of this fold.
        test: indices of the test rows of this fold.
        verbose: not used.
        fit_params: not used.
        roundup: when true, each estimator's predictions are rounded to the
            nearest integer with ``np.rint`` before summing.

    Returns:
        Whatever ``scorer`` returns for this fold.
    """
    denom_test = denominator[test]
    tergold_test = tergold[test]

    estimations = []
    for estimator in estimators:
        X = X_dic[estimator]
        y = y_dic[estimator]
        X_train = [X[idx] for idx in train]
        X_test = [X[idx] for idx in test]
        estimator.fit(X_train, y[train])
        y_predict = estimator.predict(X_test)
        if roundup:
            y_predict = np.rint(y_predict)
        estimations.append(y_predict)

    # One column per estimator, one row per test instance.
    all_estimations = np.column_stack(estimations)
    log.info("all_estimations.shape = {}".format(all_estimations.shape))

    sum_estimations = np.sum(all_estimations, axis=1)
    log.info("sum_estimations.shape = {}".format(sum_estimations.shape))

    ter = np.divide(sum_estimations, denom_test)

    # Preview at most the first ten rows. Slicing keeps this safe for folds
    # with fewer rows, and the join handles any number of estimators. The
    # logged divisor is the one of the test row itself.
    for i, row in enumerate(all_estimations[:10]):
        terms = " + ".join("{:.3g}".format(term) for term in row)
        log.info("ter{} = {} / {} = {:.3g} [{:.3g}]".format(
            i, terms, denom_test[i], ter[i], tergold_test[i]))

    return scorer(ter, tergold_test)

265 266

class TerRegressor(SkRegressor):
    """Combine several fitted-out SkRegressor objects into one estimate.

    The predictions of all wrapped estimators are summed per instance and
    divided by the first feature column (see ``ter_train_test`` and
    ``ter_cross_validate_fold``).
    """

    def __init__(self, config, skregressors, tergold):
        """Collect estimators and training data from ``skregressors``.

        Args:
            config: configuration object, stored as ``self.config``.
            skregressors: non-empty sequence of objects exposing
                ``estimator``, ``scorers``, ``X_train`` and ``y_train``.
            tergold: reference values used to score cross-validation folds.
        """
        self.tergold = tergold
        self.config = config
        self.estimators = [skregressor.estimator for skregressor in skregressors]
        self.scorers = skregressors[0].scorers

        # Training data is kept per estimator, keyed by the estimator object.
        self.X_train = {}
        self.y_train = {}
        for skregressor in skregressors:
            self.X_train[skregressor.estimator] = skregressor.X_train
            self.y_train[skregressor.estimator] = skregressor.y_train

        first_estimator = skregressors[0].estimator
        self.size = len(self.y_train[first_estimator])
        # Column 0 of the first regressor's features is the divisor.
        self.denominator = self.X_train[first_estimator][:, 0]

    def cross_validate_start(self, cv=10, n_jobs=15, verbose=0, pre_dispatch='2*n_jobs', fit_params=None, fixed_folds=None, roundup=False):
        """Run ``ter_cross_validate_fold`` on every fold in parallel.

        Args:
            cv: number of folds for ``KFold``; ignored when ``fixed_folds``
                is given.
            n_jobs, verbose, pre_dispatch: passed to joblib ``Parallel``.
            fit_params: forwarded to ``ter_cross_validate_fold``.
            fixed_folds: optional sequence of test-index collections, one per
                fold, used in place of ``KFold``.
            roundup: forwarded to ``ter_cross_validate_fold``.

        Returns:
            numpy array with one score per fold, computed with the function
            of the first entry in ``self.scorers``.
        """
        if not fixed_folds:
            cvfolds = KFold(self.size, n_folds=cv, indices=True)
        else:
            log.info("proceeding with fixed folds provided")
            cvfolds = FixedFolds(self.size, fixed_folds)

        parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                            pre_dispatch=pre_dispatch)
        scorer = self.scorers[0][1]
        scores = parallel(
            delayed(ter_cross_validate_fold)(
                self.estimators, self.X_train, self.y_train,
                self.denominator, self.tergold, scorer,
                train, test, verbose, fit_params, roundup)
            for train, test in cvfolds)
        return np.array(scores)

    def train_test(self, X_test, verbose, fit_params, roundup=False):
        """Fit all estimators and return the combined estimate for ``X_test``.

        ``X_test`` is passed through ``scale_datasets_crossvalidation`` first;
        column 0 of the scaled matrix is used as the divisor. ``roundup`` is
        forwarded to ``ter_train_test`` (it used to be ignored and always
        treated as False).
        """
        X_test = scale_datasets_crossvalidation(X_test)
        return ter_train_test(self.estimators, self.X_train, self.y_train,
                              X_test, X_test[:, 0], verbose, fit_params,
                              roundup=roundup)

299 300 301 302 303 304 305 from sklearn.svm import SVR 306

class TerSVR(SVR):
    """Estimator that sums the rounded predictions of several estimators.

    NOTE(review): ``SVR.__init__`` is not called, so none of the SVR
    parameters are initialised on instances -- confirm whether inheriting
    from SVR is required at all.
    """

    def __init__(self, estimators):
        """Store the estimators whose predictions are combined."""
        self.estimators = estimators

    def fit(self, X, y):
        """Fit every wrapped estimator on the same ``X`` and ``y``; return self."""
        for estimator in self.estimators:
            estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the per-row sum of rounded predictions divided by ``X[:, 0]``.

        Each wrapped estimator predicts on ``X``; the predictions are rounded
        to the nearest integer with ``np.rint``, summed per row and divided
        by the first column of ``X``.
        """
        estimations = []
        for estimator in self.estimators:
            estimation = np.rint(estimator.predict(X))
            log.info("estimation.shape = {}".format(estimation.shape))
            estimations.append(estimation)

        # One column per estimator, one row per instance.
        all_estimations = np.column_stack(estimations)
        log.info("all_estimations.shape = {}".format(all_estimations.shape))

        sum_estimations = np.sum(all_estimations, axis=1)
        log.info("sum_estimations.shape = {}".format(sum_estimations.shape))
        log.info("tokens.shape = {}".format(X[:, 0].shape))

        ter = np.divide(sum_estimations, X[:, 0])

        # Preview at most the first ten rows. Slicing keeps this safe for
        # short inputs and the join handles any number of estimators.
        for i, row in enumerate(all_estimations[:10]):
            terms = " + ".join("{:.3g}".format(term) for term in row)
            log.info("ter{} = {} / {} = {:.3g}".format(i, terms, X[i, 0], ter[i]))

        return ter

345

# Source code for module ml.lib.scikit.scikit