ml.lib.scikit.learn

1 #!/usr/bin/env python 2 # encoding: utf-8 3 ''' 4 learn_model -- Program that learns machine translation quality estimation 5 models 6 7 learn_model is a program with which is possible to learn models for 8 sentence-pair quality estimation models using the algorithms implemented in the 9 scikit-learn machine learning toolkit. 10 11 It defines functions to work with different machine learning algorithms as well 12 as feature selection techniques and features preprocessing. The only dependency 13 so far is the sklearn package. ConfigParser is used to parse the configuration 14 file which has a similar layout to the Java properties file. 15 16 @author: Jose' de Souza 17 18 @copyright: 2012. All rights reserved. 19 20 @license: Apache License 2.0 21 22 @contact: jose.camargo.souza@gmail.com 23 @deffield updated: Updated 24 ''' 25 26 from argparse import ArgumentParser, RawDescriptionHelpFormatter 27 from evaluation_measures import root_mean_squared_error, mean_absolute_error 28 from sklearn.ensemble.forest import ExtraTreesClassifier 29 from sklearn.grid_search import GridSearchCV 30 from sklearn.linear_model.coordinate_descent import LassoCV 31 from sklearn.linear_model.least_angle import LassoLarsCV, LassoLars 32 from sklearn.linear_model. randomized_l1 import RandomizedLasso 33 from sklearn.metrics.metrics import mean_squared_error, f1_score, \ 34 precision_score, recall_score 35 from sklearn.svm.classes import SVR, SVC 36 from sklearn_utils import scale_datasets, open_datasets, assert_number, \ 37 assert_string 38 from sklearn import cross_validation 39 import logging as log 40 import numpy as np 41 import os 42 import sys 43 import yaml 44 from sklearn_utils import open_datasets_crossvalidation,\ 45 scale_datasets_crossvalidation 46 47 __all__ = [] 48 __version__ = 0.1 49 __date__ = '2012-11-01' 50 __updated__ = '2012-11-01' 51 52 DEBUG = 0 53 PROFILE = 0 54 55 DEFAULT_SEP = "\t" 56

class CLIError(Exception):
    '''Generic exception to raise and log different fatal errors.

    The text given at construction time is kept in ``msg`` with an "E: "
    prefix; that prefixed text is what str() and unicode() return.
    '''

    def __init__(self, msg):
        '''
        @param msg: description of the fatal error, without the "E: " prefix.
        '''
        # super(CLIError) without the instance creates an unbound super object,
        # so the previous call never reached Exception.__init__. Bind it to
        # self so the base class is initialised properly.
        super(CLIError, self).__init__(msg)
        self.msg = "E: %s" % msg

    def __str__(self):
        return self.msg

    def __unicode__(self):
        return self.msg

66

def set_selection_method(config, threshold=.25):
    """
    Builds the feature selection object described by the "feature_selection"
    section of the configuration.

    Only the "RandomizedLasso" and "ExtraTreesClassifier" method names are
    recognised. When the section has no "parameters" entry the class is
    instantiated with its own defaults.

    TODO: implement the same method using reflection (load the class
    dynamically at runtime)

    @param config: the configuration file object loaded using yaml.load()
    @param threshold: value passed as selection_threshold to RandomizedLasso
    when explicit parameters are configured.
    @return: the configured selection object, or None when no selection
    section is present or the method name is not recognised.
    """
    section = config.get("feature_selection", None)
    if not section:
        return None

    chosen = section.get("method", None)

    if chosen == "RandomizedLasso":
        opts = section.get("parameters", None)
        if not opts:
            return RandomizedLasso()

        lasso_kwargs = {
            "alpha": opts.get("alpha", "aic"),
            "scaling": opts.get("scaling", .5),
            "sample_fraction": opts.get('sample_fraction', .75),
            "n_resampling": opts.get('n_resampling', 200),
            "selection_threshold": threshold,
            "fit_intercept": opts.get('fit_intercept', True),
            # TODO: set verbosity according to global level
            "verbose": True,
            "normalize": opts.get('normalize', True),
            "max_iter": opts.get('max_iter', 500),
            "n_jobs": opts.get('n_jobs', 1),
        }
        return RandomizedLasso(**lasso_kwargs)

    if chosen == "ExtraTreesClassifier":
        opts = section.get("parameters", None)
        if not opts:
            return ExtraTreesClassifier()

        trees_kwargs = {
            "n_estimators": opts.get('n_estimators', 10),
            "max_depth": opts.get('max_depth', None),
            "min_samples_split": opts.get('min_samples_split', 1),
            "min_samples_leaf": opts.get('min_samples_leaf', 1),
            "min_density": opts.get('min_density', 1),
            "max_features": opts.get('max_features', 'auto'),
            "bootstrap": opts.get('bootstrap', False),
            "compute_importances": opts.get('compute_importances', True),
            "n_jobs": opts.get('n_jobs', 1),
            "random_state": opts.get('random_state', None),
            # TODO: set verbosity according to global level
            "verbose": True,
        }
        return ExtraTreesClassifier(**trees_kwargs)

    # unknown or missing method name: no feature selection
    return None

126 127

def set_scorer_functions(scorers):
    """
    Translates scorer names into (name, function) pairs.

    Recognised names are 'mae', 'rmse', 'mse', 'f1_score',
    'precision_score' and 'recall_score'. Any other name is skipped.

    @param scorers: iterable with the scorer names.
    @return: list of (name, function) tuples, in the order the recognised
    names appear in scorers.
    """
    selected = []
    for name in scorers:
        if name == 'mae':
            func = mean_absolute_error
        elif name == 'rmse':
            func = root_mean_squared_error
        elif name == 'mse':
            func = mean_squared_error
        elif name == 'f1_score':
            func = f1_score
        elif name == 'precision_score':
            func = precision_score
        elif name == 'recall_score':
            func = recall_score
        else:
            # names without a known function are ignored
            continue
        selected.append((name, func))

    return selected

145 146

def set_optimization_params(opt):
    """
    Builds the parameter grid used for hyper-parameter tuning from the
    "optimize" section of the configuration.

    Only list values are considered; scalar entries such as cv and n_jobs
    are ignored. A three-element list accepted by assert_number() is read as
    [start, stop, number of points] and expanded with np.linspace (end point
    included). Any other list accepted by assert_string() is used as is.
    NOTE(review): assert_number/assert_string come from sklearn_utils and
    presumably check the type of every element - confirm there.

    @param opt: dictionary with the optimization settings.
    @return: dictionary mapping each tunable parameter name to its candidate
    values.
    """
    params = {}
    for key, item in opt.items():
        if not isinstance(item, list):
            # not a list of candidate values (e.g. cv, n_jobs, verbose)
            continue

        if len(item) == 3 and assert_number(item):
            # create linear space for each parameter to be tuned
            params[key] = np.linspace(item[0], item[1], num=item[2],
                                      endpoint=True)
        elif assert_string(item):
            # was a bare print statement; keep the trace but send it to the
            # logger so it only shows up in debug runs
            log.debug("%s %s", key, item)
            params[key] = item

    return params

160 161

def optimize_model(estimator, X_train, y_train, params, scores, folds, verbose, n_jobs):
    """
    Runs one grid search per scoring function and returns the best estimator
    of the last search.

    Each search is a GridSearchCV over params, with the scoring function
    passed as loss_func. The searches are independent: results of earlier
    scoring functions are only logged, not returned.

    @param estimator: the estimator whose hyper-parameters are tuned.
    @param X_train: training feature matrix.
    @param y_train: training response values.
    @param params: parameter grid passed to GridSearchCV.
    @param scores: list of (name, function) tuples.
    @param folds: value passed as cv to GridSearchCV.
    @param verbose: value passed as verbose to GridSearchCV.
    @param n_jobs: value passed as n_jobs to GridSearchCV.
    @return: best_estimator_ of the grid search run for the last entry of
    scores.
    @raise ValueError: if scores is empty.
    """
    if not scores:
        # without this check the function failed with an AttributeError on
        # None after the loop, hiding the real cause
        raise ValueError("At least one scoring function is required to "
                         "optimize the model.")

    clf = None
    for score_name, score_func in scores:
        log.info("Tuning hyper-parameters for %s" % score_name)

        log.debug(params)
        log.debug(scores)

        clf = GridSearchCV(estimator, params, loss_func=score_func,
                           cv=folds, verbose=verbose, n_jobs=n_jobs)

        clf.fit(X_train, y_train)

        log.info("Best parameters set found on development set:")
        log.info(clf.best_params_)

    return clf.best_estimator_

179 180

def set_learning_method(config, X_train, y_train):
    """
    Instantiates the sklearn's class corresponding to the value set in the
    configuration file for running the learning method.

    Supported method names: "SVR", "SVC", "LassoCV", "LassoLars" and
    "LassoLarsCV". For SVR, SVC and LassoLars an "optimize" section takes
    precedence over "parameters" and triggers a grid search on the training
    data. LassoCV and LassoLarsCV only read "parameters".

    TODO: use reflection to instantiate the classes

    @param config: configuration object
    @param X_train: training feature matrix (used only when optimizing).
    @param y_train: training response values (used only when optimizing).
    @return: a tuple (estimator, scorers). estimator is None when the
    "learning" section is missing or the method name is not recognised;
    scorers is the list of (name, function) tuples built from the "scorer"
    option (default ['mae', 'rmse']), or an empty list when there is no
    "learning" section.
    """
    estimator = None
    # bound up front so the return statement cannot hit an unbound local
    # when the configuration has no "learning" section
    scorers = []

    learning_cfg = config.get("learning", None)
    if not learning_cfg:
        return estimator, scorers

    p = learning_cfg.get("parameters", None)
    o = learning_cfg.get("optimize", None)
    scorers = \
        set_scorer_functions(learning_cfg.get("scorer", ['mae', 'rmse']))

    def tuned(base_estimator):
        # grid search over the ranges declared in the "optimize" section
        return optimize_model(base_estimator, X_train, y_train,
                              set_optimization_params(o),
                              scorers,
                              o.get("cv", 5),
                              o.get("verbose", True),
                              o.get("n_jobs", 1))

    method_name = learning_cfg.get("method", None)
    if method_name == "SVR":
        if o:
            estimator = tuned(SVR())
        elif p:
            estimator = SVR(C=p.get("C", 10),
                            epsilon=p.get('epsilon', 0.01),
                            kernel=p.get('kernel', 'rbf'),
                            degree=p.get('degree', 3),
                            gamma=p.get('gamma', 0.0034),
                            tol=p.get('tol', 1e-3),
                            verbose=False)
        else:
            estimator = SVR()

    elif method_name == "SVC":
        if o:
            estimator = tuned(SVC())
        elif p:
            estimator = SVC(C=p.get('C', 1.0),
                            kernel=p.get('kernel', 'rbf'),
                            degree=p.get('degree', 3),
                            gamma=p.get('gamma', 0.0),
                            coef0=p.get('coef0', 0.0),
                            tol=p.get('tol', 1e-3),
                            verbose=p.get('verbose', False))
        else:
            estimator = SVC()

    elif method_name == "LassoCV":
        if p:
            estimator = LassoCV(eps=p.get('eps', 1e-3),
                                n_alphas=p.get('n_alphas', 100),
                                normalize=p.get('normalize', False),
                                precompute=p.get('precompute', 'auto'),
                                max_iter=p.get('max_iter', 1000),
                                tol=p.get('tol', 1e-4),
                                cv=p.get('cv', 10),
                                verbose=False)
        else:
            estimator = LassoCV()

    elif method_name == "LassoLars":
        if o:
            estimator = tuned(LassoLars())
        # must be elif: with a plain "if" the tuned estimator above was
        # always replaced by one of the two branches below
        elif p:
            estimator = LassoLars(alpha=p.get('alpha', 1.0),
                                  fit_intercept=p.get('fit_intercept', True),
                                  verbose=p.get('verbose', False),
                                  normalize=p.get('normalize', True),
                                  max_iter=p.get('max_iter', 500),
                                  fit_path=p.get('fit_path', True))
        else:
            estimator = LassoLars()

    elif method_name == "LassoLarsCV":
        if p:
            estimator = LassoLarsCV(max_iter=p.get('max_iter', 500),
                                    normalize=p.get('normalize', True),
                                    max_n_alphas=p.get('max_n_alphas', 1000),
                                    n_jobs=p.get('n_jobs', 1),
                                    cv=p.get('cv', 10),
                                    verbose=False)
        else:
            estimator = LassoLarsCV()

    return estimator, scorers

288 289

def fit_predict(config, X_train, y_train, X_test=None, y_test=None):
    '''
    Uses the configuration dictionary settings to train a model using the
    specified training algorithm. If both X_test and y_test are given, also
    predicts the test set and logs every configured score. Additionally,
    performs feature selection and model parameters optimization when they
    are configured.

    @param config: the configuration dictionary obtained parsing the
    configuration file.
    @param X_train: the np.array object for the matrix containing the feature
    values for each instance in the training set.
    @param y_train: the np.array object for the response values of each instance
    in the training set.
    @param X_test: the np.array object for the matrix containing the feature
    values for each instance in the test set. Default is None.
    @param y_test: the np.array object for the response values of each instance
    in the test set. Default is None.
    @return: the predictions for X_test, or None when X_test or y_test is
    missing.
    '''
    # sets the selection method
    transformer = set_selection_method(config)

    # if the system is configured to run feature selection
    # runs it and modifies the datasets to the new dimensions
    if transformer is not None:
        log.info("Running feature selection %s" % str(transformer))

        log.debug("X_train dimensions before fit_transform(): %s,%s" % X_train.shape)
        log.debug("y_train dimensions before fit_transform(): %s" % y_train.shape)

        X_train = transformer.fit_transform(X_train, y_train)

        log.debug("Dimensions after fit_transform(): %s,%s" % X_train.shape)

        if X_test is not None:
            # the test set must be reduced to the same selected features
            X_test = transformer.transform(X_test)

    # sets learning algorithm and runs it over the training data
    estimator, scorers = set_learning_method(config, X_train, y_train)
    log.info("Running learning algorithm %s" % str(estimator))
    estimator.fit(X_train, y_train)

    y_hat = None
    if (X_test is not None) and (y_test is not None):
        log.info("Predicting unseen data using the trained model...")
        y_hat = estimator.predict(X_test)

        log.info("Evaluating prediction on the test set...")
        for scorer_name, scorer_func in scorers:
            v = scorer_func(y_test, y_hat)
            log.info("%s = %s" % (scorer_name, v))

    # previously the predictions were computed and then dropped
    return y_hat

340 341 342

def cross_validate(config, X_train, y_train, cv=10):
    '''
    Uses the configuration dictionary settings to cross-validate the
    specified training algorithm on the training data. Additionally,
    performs feature selection when it is configured (the selection is
    fitted once on the whole training set, before the folds are created).

    @param config: the configuration dictionary obtained parsing the
    configuration file.
    @param X_train: the np.array object for the matrix containing the feature
    values for each instance in the training set.
    @param y_train: the np.array object for the response values of each instance
    in the training set.
    @param cv: value passed as cv to cross_val_score. Default is 10.
    @return: a dictionary mapping each configured scorer name to the array of
    per-fold values returned by cross_val_score for that scorer.
    '''
    # sets the selection method
    transformer = set_selection_method(config)

    # if the system is configured to run feature selection
    # runs it and modifies the datasets to the new dimensions
    if transformer is not None:
        log.info("Running feature selection %s" % str(transformer))

        log.debug("X_train dimensions before fit_transform(): %s,%s" % X_train.shape)
        log.debug("y_train dimensions before fit_transform(): %s" % y_train.shape)

        X_train = transformer.fit_transform(X_train, y_train)

        log.debug("Dimensions after fit_transform(): %s,%s" % X_train.shape)

    def as_scorer(metric):
        # cross_val_score expects scoring to be a callable taking
        # (estimator, X, y); the configured metrics take (y_true, y_pred)
        def score(fitted, X, y):
            return metric(y, fitted.predict(X))
        return score

    # sets learning algorithm and runs it over the training data
    estimator, scorers = set_learning_method(config, X_train, y_train)
    log.info("Running cross validator with %s" % str(estimator))

    # the list of (name, function) tuples cannot be handed to scoring as is,
    # so every metric gets its own cross-validation run
    scores = {}
    for scorer_name, scorer_func in scorers:
        scores[scorer_name] = \
            cross_validation.cross_val_score(estimator, X_train, y_train,
                                             cv=cv,
                                             scoring=as_scorer(scorer_func))
    return scores

384 385

def run(config):
    '''
    Runs the main code of the program. Checks for mandatory parameters, opens
    input files and performs the learning steps.

    Mandatory options: "x_train", "y_train" and "learning". Optional
    options: "x_test", "y_test", "separator" (default DEFAULT_SEP), "labels"
    and "scale" (default True).

    @param config: the configuration dictionary obtained parsing the
    configuration file.
    @return: whatever fit_predict() returns for the loaded datasets.
    @raise Exception: if one of the mandatory options is missing or empty.
    '''
    # check if the mandatory parameters are set in the config file.
    # The messages are built with adjacent literals: a backslash continuation
    # inside the string would embed the source indentation in the message.
    x_train_path = config.get("x_train", None)
    if not x_train_path:
        msg = ("'x_train' option not found in the configuration file. "
               "The training dataset is mandatory.")
        raise Exception(msg)

    y_train_path = config.get("y_train", None)
    if not y_train_path:
        msg = ("'y_train' option not found in the configuration file. "
               "The training dataset is mandatory.")
        raise Exception(msg)

    learning = config.get("learning", None)
    if not learning:
        msg = ("'learning' option not found. At least one "
               "learning method must be set.")
        raise Exception(msg)

    # checks for the optional parameters
    x_test_path = config.get("x_test", None)
    y_test_path = config.get("y_test", None)

    separator = config.get("separator", DEFAULT_SEP)

    labels_path = config.get("labels", None)

    scale = config.get("scale", True)

    log.info("Opening input files ...")
    log.debug("X_train: %s" % x_train_path)
    log.debug("y_train: %s" % y_train_path)
    log.debug("X_test: %s" % x_test_path)
    log.debug("y_test_path: %s" % y_test_path)

    # open feature and response files (labels is loaded but not used here)
    X_train, y_train, X_test, y_test, labels = \
        open_datasets(x_train_path, y_train_path, x_test_path,
                      y_test_path, separator, labels_path)

    if scale:
        # preprocess and execute mean removal
        X_train, X_test = scale_datasets(X_train, X_test)

    # fits training data and predicts the test set using the trained model
    return fit_predict(config, X_train, y_train, X_test, y_test)

437 438 439 440

def run_crossvalidation(config):
    '''
    Runs the cross validation only variant of the program. Checks for
    mandatory parameters, opens input files, cross-validates the configured
    learning method on the training data and prints the scores.

    Mandatory options: "x_train", "y_train" and "learning". Optional
    options: "separator" (default DEFAULT_SEP), "labels" and "scale"
    (default True).

    @param config: the configuration dictionary obtained parsing the
    configuration file.
    @return: the scores returned by cross_validate().
    @raise Exception: if one of the mandatory options is missing or empty.
    '''
    # check if the mandatory parameters are set in the config file.
    # The messages are built with adjacent literals: a backslash continuation
    # inside the string would embed the source indentation in the message.
    x_train_path = config.get("x_train", None)
    if not x_train_path:
        msg = ("'x_train' option not found in the configuration file. "
               "The training dataset is mandatory.")
        raise Exception(msg)

    y_train_path = config.get("y_train", None)
    if not y_train_path:
        msg = ("'y_train' option not found in the configuration file. "
               "The training dataset is mandatory.")
        raise Exception(msg)

    learning = config.get("learning", None)
    if not learning:
        msg = ("'learning' option not found. At least one "
               "learning method must be set.")
        raise Exception(msg)

    separator = config.get("separator", DEFAULT_SEP)

    labels_path = config.get("labels", None)

    scale = config.get("scale", True)

    log.info("Opening input files ...")
    log.debug("X_train: %s" % x_train_path)
    log.debug("y_train: %s" % y_train_path)

    # open feature and response files; only the training data is used below.
    # NOTE(review): the five-value unpacking mirrors open_datasets() - confirm
    # that open_datasets_crossvalidation() really returns five values.
    X_train, y_train, X_test, y_test, labels = \
        open_datasets_crossvalidation(x_train_path, y_train_path, separator, labels_path)

    if scale:
        # preprocess and execute mean removal
        X_train = scale_datasets_crossvalidation(X_train)

    # cross-validates the configured model on the training data
    scores = cross_validate(config, X_train, y_train)
    # single-argument call form prints the same text on Python 2 and 3
    print(scores)
    return scores

486 487 488

def main(argv=None):  # IGNORE:C0111
    '''Command line options.

    Parses the command line, configures logging, loads the YAML
    configuration file and hands it to run().

    @param argv: list with the command line arguments (without the program
    name). When None, the arguments are taken from sys.argv.
    @return: 0 on success or when interrupted from the keyboard.
    '''
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)

    # the short description is the second line of the docstring of the
    # script being executed; tolerate a missing or one-line docstring
    doc_lines = (__import__('__main__').__doc__ or "").split("\n")
    program_shortdesc = doc_lines[1] if len(doc_lines) > 1 else ""
    program_license = '''%s

  Created by José de Souza on %s.
  Copyright 2012. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    try:
        # Setup argument parser
        parser = ArgumentParser(description=program_license,
                                formatter_class=RawDescriptionHelpFormatter)

        parser.add_argument("configuration_file", action="store",
                            help="path to the configuration file (YAML file).")
        parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                            help="set verbosity level [default: %(default)s]")
        parser.add_argument('-V', '--version', action='version',
                            version=program_version_message)

        # Process arguments. Parsing argv directly (argparse falls back to
        # sys.argv[1:] when it is None) avoids mutating the global sys.argv.
        args = parser.parse_args(argv)

        cfg_path = args.configuration_file

        if args.verbose:
            log.basicConfig(level=log.DEBUG)
        else:
            log.basicConfig(level=log.INFO)

        # opens the config file
        with open(cfg_path, "r") as cfg_file:
            # NOTE(review): yaml.load() without an explicit Loader can build
            # arbitrary Python objects from the file; the configuration only
            # needs plain mappings, lists and scalars, so safe_load is used.
            config = yaml.safe_load(cfg_file)

        run(config)
        return 0

    except KeyboardInterrupt:
        ### handle keyboard interrupt ###
        return 0


if __name__ == "__main__":
    if DEBUG:
        sys.argv.append("-v")

    sys.exit(main())

Source Code for Module ml.lib.scikit.learn_model