1
2
3 '''
4 learn_model -- Program that learns machine translation quality estimation
5 models
6
7 learn_model is a program for learning sentence-pair quality estimation
8 models using the algorithms implemented in the scikit-learn machine learning
9 toolkit.
10
11 It defines functions to work with different machine learning algorithms as well
12 as feature selection techniques and feature preprocessing. Besides the
13 standard library it depends on the sklearn, numpy and yaml (PyYAML) packages.
14 The configuration file is a YAML file, parsed with the yaml package.
15
16 @author: Jose' de Souza
17
18 @copyright: 2012. All rights reserved.
19
20 @license: Apache License 2.0
21
22 @contact: jose.camargo.souza@gmail.com
23 @deffield updated: Updated
24 '''
25
26 from argparse import ArgumentParser, RawDescriptionHelpFormatter
27 from evaluation_measures import root_mean_squared_error, mean_absolute_error
28 from sklearn.ensemble.forest import ExtraTreesClassifier
29 from sklearn.grid_search import GridSearchCV
30 from sklearn.linear_model.coordinate_descent import LassoCV
31 from sklearn.linear_model.least_angle import LassoLarsCV, LassoLars
32 from sklearn.linear_model. randomized_l1 import RandomizedLasso
33 from sklearn.metrics.metrics import mean_squared_error, f1_score, \
34 precision_score, recall_score
35 from sklearn.svm.classes import SVR, SVC
36 from sklearn_utils import scale_datasets, open_datasets, assert_number, \
37 assert_string
38 from sklearn import cross_validation
39 import logging as log
40 import numpy as np
41 import os
42 import sys
43 import yaml
44 from sklearn_utils import open_datasets_crossvalidation,\
45 scale_datasets_crossvalidation
46
# Names exported by a star-import of this module; intentionally empty.
47 __all__ = []
# Version and dates: main() formats __version__ and __updated__ into the
# --version message and __date__ into the license/description text.
48 __version__ = 0.1
49 __date__ = '2012-11-01'
50 __updated__ = '2012-11-01'
51
# When DEBUG is truthy the __main__ guard appends "-v" to sys.argv.
52 DEBUG = 0
# PROFILE is not referenced anywhere in the visible part of this file.
53 PROFILE = 0
54
# Column separator used when the configuration has no "separator" entry.
55 DEFAULT_SEP = "\t"
56
58 '''Generic exception to raise and log different fatal errors.'''
66
def set_selection_method(config):
    """
    Given the configuration settings, instantiate the configured feature
    selection method initialized with the preset parameters.

    The optional "feature_selection" section of the configuration is read:
    its "method" entry selects the class ("RandomizedLasso" or
    "ExtraTreesClassifier") and its optional "parameters" mapping supplies
    the constructor arguments. Without "parameters" the class is
    instantiated with its own defaults.

    TODO: implement the same method using reflection (load the class
    dynamically at runtime)

    @param config: the configuration file object loaded using yaml.load()
    @return: the configured feature selection object (callers use its
    fit_transform() and transform() methods), or None when the section is
    missing/empty or names a method that is not handled here.
    """
    selection_cfg = config.get("feature_selection", None)
    if not selection_cfg:
        return None

    method_name = selection_cfg.get("method", None)
    p = selection_cfg.get("parameters", None)

    if method_name == "RandomizedLasso":
        if not p:
            return RandomizedLasso()
        return RandomizedLasso(
            alpha=p.get("alpha", "aic"),
            scaling=p.get("scaling", .5),
            sample_fraction=p.get('sample_fraction', .75),
            n_resampling=p.get('n_resampling', 200),
            # Read the threshold from the parameters like every other
            # argument; the bare, undefined name `threshold` used here
            # before raised NameError. 0.25 is RandomizedLasso's own
            # documented default.
            selection_threshold=p.get('selection_threshold', .25),
            fit_intercept=p.get('fit_intercept', True),
            verbose=True,
            normalize=p.get('normalize', True),
            max_iter=p.get('max_iter', 500),
            n_jobs=p.get('n_jobs', 1))

    if method_name == "ExtraTreesClassifier":
        if not p:
            return ExtraTreesClassifier()
        return ExtraTreesClassifier(
            n_estimators=p.get('n_estimators', 10),
            max_depth=p.get('max_depth', None),
            min_samples_split=p.get('min_samples_split', 1),
            min_samples_leaf=p.get('min_samples_leaf', 1),
            min_density=p.get('min_density', 1),
            max_features=p.get('max_features', 'auto'),
            bootstrap=p.get('bootstrap', False),
            compute_importances=p.get('compute_importances', True),
            n_jobs=p.get('n_jobs', 1),
            random_state=p.get('random_state', None),
            verbose=True)

    # Unknown or missing method name: no feature selection is performed.
    return None
126
127
145
146
def set_optimization_params(opt):
    """
    Build the parameter grid for the grid search from the "optimize"
    section of the configuration.

    Only entries whose value is a list are considered:
    - a 3-element list accepted by assert_number() is read as
      [start, stop, num] and expanded with np.linspace (stop included);
    - any other list accepted by assert_string() is used as is.
    Every other entry (for instance scalar settings) is left out.

    NOTE(review): assert_number() and assert_string() come from
    sklearn_utils; presumably they check that all elements are numbers /
    strings -- confirm against that module.

    @param opt: dictionary with the "optimize" settings.
    @return: dictionary mapping parameter names to the values to try.
    """
    params = {}
    for key, item in opt.items():
        if not isinstance(item, list):
            continue

        if len(item) == 3 and assert_number(item):
            params[key] = np.linspace(item[0], item[1], num=item[2],
                                      endpoint=True)
        elif assert_string(item):
            # Was a Python-2-only `print key, item` writing to stdout; the
            # module logs everything else through `log`.
            log.debug("%s %s", key, item)
            params[key] = item

    return params
160
161
def optimize_model(estimator, X_train, y_train, params, scores, folds, verbose, n_jobs):
    """
    Tune the hyper-parameters of an estimator with an exhaustive grid search.

    One grid search is run per scorer, in the order given. Only the best
    estimator found with the LAST scorer is returned; the earlier searches
    are only reported through the log.

    @param estimator: the estimator instance to tune.
    @param X_train: feature matrix used for the search.
    @param y_train: response values used for the search.
    @param params: parameter grid handed to GridSearchCV.
    @param scores: iterable of (name, function) pairs; each function is
    passed to GridSearchCV as its loss function.
    @param folds: value for the GridSearchCV ``cv`` argument.
    @param verbose: value for the GridSearchCV ``verbose`` argument.
    @param n_jobs: value for the GridSearchCV ``n_jobs`` argument.
    @return: the ``best_estimator_`` of the last grid search.
    @raise ValueError: if ``scores`` yields no scorer.
    """
    clf = None
    for score_name, score_func in scores:
        log.info("Tuning hyper-parameters for %s", score_name)

        log.debug(params)
        log.debug(scores)

        clf = GridSearchCV(estimator, params, loss_func=score_func,
                           cv=folds, verbose=verbose, n_jobs=n_jobs)

        clf.fit(X_train, y_train)

        log.info("Best parameters set found on development set:")
        log.info(clf.best_params_)

    if clf is None:
        # No scorer means no search was run; without this check the function
        # failed with an AttributeError on None.
        raise ValueError("optimize_model needs at least one "
                         "(name, function) scorer to run the grid search.")

    return clf.best_estimator_
179
180
def _optimize_from_config(base_estimator, X_train, y_train, opt_cfg, scorers):
    """Grid-search base_estimator with the settings of an "optimize" section."""
    return optimize_model(base_estimator, X_train, y_train,
                          set_optimization_params(opt_cfg),
                          scorers,
                          opt_cfg.get("cv", 5),
                          opt_cfg.get("verbose", True),
                          opt_cfg.get("n_jobs", 1))


def set_learning_method(config, X_train, y_train):
    """
    Instantiates the sklearn's class corresponding to the value set in the
    configuration file for running the learning method.

    The "learning" section is read: "method" names the estimator (SVR, SVC,
    LassoCV, LassoLars or LassoLarsCV), "parameters" supplies constructor
    arguments, "optimize" (SVR, SVC and LassoLars only) triggers a grid
    search on the training data, and "scorer" lists the scorers to use
    (default ['mae', 'rmse']). When both "optimize" and "parameters" are
    present the grid search takes precedence.

    TODO: use reflection to instantiate the classes

    @param config: configuration object
    @param X_train: training feature matrix (used only by the grid search).
    @param y_train: training response values (used only by the grid search).
    @return: a tuple (estimator, scorers). The estimator has fit() and
    predict() methods and is None when the method name is missing or not
    handled; scorers is what set_scorer_functions() returns (callers iterate
    it as (name, function) pairs), or an empty list when the configuration
    has no "learning" section.
    """
    estimator = None
    # Bound up front so the return below cannot hit an unbound local when
    # the "learning" section is missing.
    scorers = []

    learning_cfg = config.get("learning", None)
    if not learning_cfg:
        return estimator, scorers

    p = learning_cfg.get("parameters", None)
    o = learning_cfg.get("optimize", None)
    scorers = \
        set_scorer_functions(learning_cfg.get("scorer", ['mae', 'rmse']))

    method_name = learning_cfg.get("method", None)

    if method_name == "SVR":
        if o:
            estimator = _optimize_from_config(SVR(), X_train, y_train,
                                              o, scorers)
        elif p:
            estimator = SVR(C=p.get("C", 10),
                            epsilon=p.get('epsilon', 0.01),
                            kernel=p.get('kernel', 'rbf'),
                            degree=p.get('degree', 3),
                            gamma=p.get('gamma', 0.0034),
                            tol=p.get('tol', 1e-3),
                            verbose=False)
        else:
            estimator = SVR()

    elif method_name == "SVC":
        if o:
            estimator = _optimize_from_config(SVC(), X_train, y_train,
                                              o, scorers)
        elif p:
            estimator = SVC(C=p.get('C', 1.0),
                            kernel=p.get('kernel', 'rbf'),
                            degree=p.get('degree', 3),
                            gamma=p.get('gamma', 0.0),
                            coef0=p.get('coef0', 0.0),
                            tol=p.get('tol', 1e-3),
                            verbose=p.get('verbose', False))
        else:
            estimator = SVC()

    elif method_name == "LassoCV":
        if p:
            estimator = LassoCV(eps=p.get('eps', 1e-3),
                                n_alphas=p.get('n_alphas', 100),
                                normalize=p.get('normalize', False),
                                precompute=p.get('precompute', 'auto'),
                                max_iter=p.get('max_iter', 1000),
                                tol=p.get('tol', 1e-4),
                                cv=p.get('cv', 10),
                                verbose=False)
        else:
            estimator = LassoCV()

    elif method_name == "LassoLars":
        if o:
            estimator = _optimize_from_config(LassoLars(), X_train, y_train,
                                              o, scorers)
        # `elif`, as in the SVR/SVC branches: with a plain `if` the
        # grid-searched estimator was always overwritten by one of the two
        # branches below.
        elif p:
            estimator = LassoLars(alpha=p.get('alpha', 1.0),
                                  fit_intercept=p.get('fit_intercept', True),
                                  verbose=p.get('verbose', False),
                                  normalize=p.get('normalize', True),
                                  max_iter=p.get('max_iter', 500),
                                  fit_path=p.get('fit_path', True))
        else:
            estimator = LassoLars()

    elif method_name == "LassoLarsCV":
        if p:
            estimator = LassoLarsCV(max_iter=p.get('max_iter', 500),
                                    normalize=p.get('normalize', True),
                                    max_n_alphas=p.get('max_n_alphas', 1000),
                                    n_jobs=p.get('n_jobs', 1),
                                    cv=p.get('cv', 10),
                                    verbose=False)
        else:
            estimator = LassoLarsCV()

    return estimator, scorers
288
289
def fit_predict(config, X_train, y_train, X_test=None, y_test=None):
    '''
    Uses the configuration dictionary settings to train a model using the
    specified training algorithm. If set, also evaluates the trained model
    in a test set. Additionally, performs feature selection and model
    parameters optimization.

    @param config: the configuration dictionary obtained parsing the
    configuration file.
    @param X_train: the np.array object for the matrix containing the feature
    values for each instance in the training set.
    @param y_train: the np.array object for the response values of each
    instance in the training set.
    @param X_test: the np.array object for the matrix containing the feature
    values for each instance in the test set. Default is None.
    @param y_test: the np.array object for the response values of each
    instance in the test set. Default is None.
    @return: the predictions for X_test when both X_test and y_test are
    given, None otherwise.
    @raise ValueError: if no estimator could be built from the configuration.
    '''

    def shape_str(array):
        # "rows,cols" for a matrix, "n" for a vector. Unlike
        # `"%s" % array.shape` this works for any number of dimensions.
        return ",".join(str(dim) for dim in array.shape)

    # Optional feature selection, fitted on the training data only.
    transformer = set_selection_method(config)

    if transformer is not None:
        log.info("Running feature selection %s", transformer)

        log.debug("X_train dimensions before fit_transform(): %s",
                  shape_str(X_train))
        log.debug("y_train dimensions before fit_transform(): %s",
                  shape_str(y_train))

        X_train = transformer.fit_transform(X_train, y_train)

        log.debug("Dimensions after fit_transform(): %s", shape_str(X_train))

        # The test set must go through the same column selection.
        if X_test is not None:
            X_test = transformer.transform(X_test)

    estimator, scorers = set_learning_method(config, X_train, y_train)
    if estimator is None:
        # Missing or unknown learning method: fail with an explicit message
        # instead of an AttributeError on None below.
        raise ValueError("No learning method could be instantiated from the "
                         "'learning' section of the configuration.")

    log.info("Running learning algorithm %s", estimator)
    estimator.fit(X_train, y_train)

    y_hat = None
    if (X_test is not None) and (y_test is not None):
        log.info("Predicting unseen data using the trained model...")
        y_hat = estimator.predict(X_test)

        log.info("Evaluating prediction on the test set...")
        for scorer_name, scorer_func in scorers:
            v = scorer_func(y_test, y_hat)
            log.info("%s = %s", scorer_name, v)

    return y_hat
340
341
342
def _as_cv_scorer(score_func):
    """Wrap a metric score_func(y_true, y_pred) as scorer(estimator, X, y)."""
    def scorer(estimator, X, y):
        return score_func(y, estimator.predict(X))
    return scorer


def cross_validate(config, X_train, y_train, X_test=None, y_test=None, cv=10):
    '''
    Uses the configuration dictionary settings to cross-validate a model
    built with the specified training algorithm on the training set.
    Additionally, performs feature selection and model parameters
    optimization.

    NOTE(review): feature selection (and the optional grid search) are
    fitted on the whole training set before it is split into folds, exactly
    as before this change.

    @param config: the configuration dictionary obtained parsing the
    configuration file.
    @param X_train: the np.array object for the matrix containing the feature
    values for each instance in the training set.
    @param y_train: the np.array object for the response values of each
    instance in the training set.
    @param X_test: not used by the cross validation. Default is None.
    @param y_test: not used by the cross validation. Default is None.
    @param cv: value for the ``cv`` argument of cross_val_score. Default
    is 10.
    @return: dictionary mapping each scorer name to the per-fold scores
    returned by cross_val_score for that scorer.
    @raise ValueError: if no estimator could be built from the configuration.
    '''

    transformer = set_selection_method(config)

    if transformer is not None:
        log.info("Running feature selection %s", transformer)

        log.debug("X_train dimensions before fit_transform(): %s",
                  X_train.shape)
        log.debug("y_train dimensions before fit_transform(): %s",
                  y_train.shape)

        X_train = transformer.fit_transform(X_train, y_train)

        log.debug("Dimensions after fit_transform(): %s", X_train.shape)

    estimator, scorers = set_learning_method(config, X_train, y_train)
    if estimator is None:
        raise ValueError("No learning method could be instantiated from the "
                         "'learning' section of the configuration.")

    log.info("Running cross validator with %s", estimator)

    # `scorers` holds (name, function) pairs whose functions take
    # (y_true, y_pred). cross_val_score needs one string or one
    # scorer(estimator, X, y) callable, so the pair list cannot be passed
    # directly; each metric is adapted and run separately.
    scores = {}
    for scorer_name, scorer_func in scorers:
        scores[scorer_name] = cross_validation.cross_val_score(
            estimator, X_train, y_train, cv=cv,
            scoring=_as_cv_scorer(scorer_func))
    return scores
384
385
def _require_option(config, key, message):
    """Return config[key]; raise Exception(message) if it is missing or empty."""
    value = config.get(key, None)
    if not value:
        raise Exception(message)
    return value


def run(config):
    '''
    Runs the main code of the program. Checks for mandatory parameters, opens
    input files and performs the learning steps.

    Mandatory configuration entries: "x_train", "y_train" and "learning".
    Optional entries: "x_test", "y_test", "separator" (default DEFAULT_SEP),
    "labels" and "scale" (default True).

    @param config: the configuration dictionary obtained parsing the
    configuration file.
    @raise Exception: if one of the mandatory entries is missing or empty.
    '''
    x_train_path = _require_option(
        config, "x_train",
        "'x_train' option not found in the configuration file. "
        "The training dataset is mandatory.")

    y_train_path = _require_option(
        config, "y_train",
        "'y_train' option not found in the configuration file. "
        "The training dataset is mandatory.")

    # Only checked for presence here; the section itself is read later by
    # the learning step.
    _require_option(
        config, "learning",
        "'learning' option not found. At least one "
        "learning method must be set.")

    x_test_path = config.get("x_test", None)
    y_test_path = config.get("y_test", None)

    separator = config.get("separator", DEFAULT_SEP)

    labels_path = config.get("labels", None)

    scale = config.get("scale", True)

    log.info("Opening input files ...")
    log.debug("X_train: %s", x_train_path)
    log.debug("y_train: %s", y_train_path)
    log.debug("X_test: %s", x_test_path)
    log.debug("y_test_path: %s", y_test_path)

    # `labels` is returned by open_datasets but not used by this function.
    X_train, y_train, X_test, y_test, labels = \
        open_datasets(x_train_path, y_train_path, x_test_path,
                      y_test_path, separator, labels_path)

    if scale:
        X_train, X_test = scale_datasets(X_train, X_test)

    # Evaluation results are reported through the log.
    fit_predict(config, X_train, y_train, X_test, y_test)
437
438
439
440
442 '''
443 Runs the cross-validation-only pipeline. Checks for mandatory parameters, opens
444 input files and runs the cross validation.
445 '''
446
447 x_train_path = config.get("x_train", None)
448 if not x_train_path:
449 msg = "'x_train' option not found in the configuration file. \
450 The training dataset is mandatory."
451 raise Exception(msg)
452
453 y_train_path = config.get("y_train", None)
454 if not y_train_path:
455 msg = "'y_train' option not found in the configuration file. \
456 The training dataset is mandatory."
457 raise Exception(msg)
458
459 learning = config.get("learning", None)
460 if not learning:
461 msg = "'learning' option not found. At least one \
462 learning method must be set."
463 raise Exception(msg)
464
465 separator = config.get("separator", DEFAULT_SEP)
466
467 labels_path = config.get("labels", None)
468
469 scale = config.get("scale", True)
470
471 log.info("Opening input files ...")
472 log.debug("X_train: %s" % x_train_path)
473 log.debug("y_train: %s" % y_train_path)
474
475
476 X_train, y_train, X_test, y_test, labels = \
477 open_datasets_crossvalidation(x_train_path, y_train_path, separator, labels_path)
478
479 if scale:
480
481 X_train = scale_datasets_crossvalidation(X_train)
482
483
484 scores = cross_validate(config, X_train, y_train)
485 print scores
486
487
488
def main(argv=None):
    '''
    Command line entry point.

    Parses the command line, configures the logging level, loads the YAML
    configuration file and hands the resulting dictionary to run().

    @param argv: optional list of extra command line arguments, parsed after
    the ones already present in sys.argv[1:]. Default is None (only sys.argv
    is used).
    @return: 0 when the run completes or is interrupted from the keyboard.
    '''
    # Same effective argument list as before (sys.argv plus the extra
    # arguments), but built locally instead of extending the global sys.argv.
    cli_args = list(sys.argv[1:])
    if argv is not None:
        cli_args.extend(argv)

    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)

    # Second line of this module's docstring. Using the module's own __doc__
    # (not the one of whatever runs as __main__) and tolerating a missing or
    # short docstring keeps main() callable from other programs.
    doc_lines = (__doc__ or "").split("\n")
    program_shortdesc = doc_lines[1] if len(doc_lines) > 1 else ""
    program_license = '''%s

Created by José de Souza on %s.
Copyright 2012. All rights reserved.

Licensed under the Apache License 2.0
http://www.apache.org/licenses/LICENSE-2.0

Distributed on an "AS IS" basis without warranties
or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    try:
        parser = ArgumentParser(description=program_license,
                                formatter_class=RawDescriptionHelpFormatter)

        parser.add_argument("configuration_file", action="store",
                            help="path to the configuration file (YAML file).")
        parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                            default=0,
                            help="set verbosity level [default: %(default)s]")
        parser.add_argument('-V', '--version', action='version',
                            version=program_version_message)

        args = parser.parse_args(cli_args)

        cfg_path = args.configuration_file

        log.basicConfig(level=log.DEBUG if args.verbose else log.INFO)

        # safe_load instead of yaml.load: the configuration only needs plain
        # mappings/lists/scalars, and yaml.load without an explicit Loader
        # can build arbitrary Python objects from the file and is rejected
        # by recent PyYAML releases.
        with open(cfg_path, "r") as cfg_file:
            config = yaml.safe_load(cfg_file)

        run(config)
        return 0

    except KeyboardInterrupt:
        # Ctrl-C is treated as a normal termination.
        return 0
549
if __name__ == "__main__":
    # Script entry point: with the module-level DEBUG switch on, force the
    # verbose flag before the command line is parsed.
    if DEBUG:
        sys.argv += ["-v"]

    # The value returned by main() becomes the process exit status.
    exit_status = main()
    sys.exit(exit_status)
555