Package ml :: Package lib :: Package scikit :: Module sklearn_utils
[hide private]
[frames] | [no frames]

Source Code for Module ml.lib.scikit.sklearn_utils

  1  ''' 
  2  sklearn_utils -- Helper functions to deal with data in the formats sklearn uses. 
  3  Utilities to read from text files to numpy arrays used by sklearn. 
  4    
  5  @author:     Jose' de Souza 
  6           
  7  @copyright:  2012. All rights reserved. 
  8           
  9  @license:    Apache License 2.0 
 10   
 11  @contact:    jose.camargo.souza@gmail.com 
 12  @deffield    updated: Updated 
 13  ''' 
 14   
 15  from features_file_utils import read_labels_file, read_features_file, \ 
 16      read_reference_file 
 17   
 18  from sklearn import preprocessing 
 19  import logging as log 
 20  import numpy as np 
 21  import os 
 22   
def assert_number(generic_list):
    '''
    Check whether a list contains only numeric values.

    @param generic_list: a list containing any object type.
    @return: True if every element is an int or float (an empty list
             also yields True). False otherwise.
    '''
    return all(isinstance(item, (int, float)) for item in generic_list)
34
def assert_string(generic_list):
    '''
    Check whether a list contains only strings.

    @param generic_list: a list containing any object type.
    @return: True if every element is a str (an empty list also yields
             True). False otherwise.
    '''
    return all(isinstance(item, str) for item in generic_list)
40 41 42
def open_datasets(train_path, train_ref_path, test_path,
                  test_ref_path, delim, labels_path=None):
    '''
    Read the train/test feature matrices and their reference files,
    validating every path and the dataset shapes along the way.

    @param train_path: path to the training features file.
    @param train_ref_path: path to the training references file.
    @param test_path: path to the test features file.
    @param test_ref_path: path to the test references file.
    @param delim: column delimiter used in the input files.
    @param labels_path: optional path to a file listing feature labels.
    @return: tuple (X_train, y_train, X_test, y_test, labels); X_* are
             2D feature matrices, y_* the reference vectors, labels a
             (possibly empty) list of feature labels.
    @raise IOError: if any path is invalid or the dataset shapes disagree.
    '''
    def _require_file(path, description):
        # Fail fast with a clear message when an input file is missing.
        if not os.path.isfile(os.path.abspath(path)):
            raise IOError("%s is not valid: %s" % (description, path))

    _require_file(train_path, "training dataset path")
    _require_file(train_ref_path, "training references path")
    _require_file(test_path, "test dataset path")
    _require_file(test_ref_path, "test references path")

    labels = []
    if labels_path is not None:
        _require_file(labels_path, "labels file")
        labels = read_labels_file(labels_path, delim)

    X_train = read_features_file(train_path, delim)
    y_train = read_reference_file(train_ref_path, delim)

    X_test = read_features_file(test_path, delim)
    y_test = read_reference_file(test_ref_path, delim)

    # Shape sanity checks: features must be 2D matrices whose row counts
    # match their reference vectors, and train/test must share columns.
    if len(X_train.shape) != 2:
        raise IOError("the training dataset must be in the format of a matrix with M lines and N columns.")

    if len(X_test.shape) != 2:
        raise IOError("the test dataset must be in the format of a matrix with M lines and N columns.")

    if X_train.shape[0] != y_train.shape[0]:
        raise IOError("the number of instances in the train features file does not match the number of references given.")

    if X_test.shape[0] != y_test.shape[0]:
        raise IOError("the number of instances in the test features file does not match the number of references given.")

    if X_train.shape[1] != X_test.shape[1]:
        raise IOError("the number of features in train and test datasets is different.")

    return X_train, y_train, X_test, y_test, labels
87 88
def open_datasets_crossvalidation(train_path, train_ref_path, delim, labels_path=None):
    '''
    Read a single feature matrix and its references for cross-validation,
    validating paths and shapes.

    @param train_path: path to the training features file.
    @param train_ref_path: path to the training references file.
    @param delim: column delimiter used in the input files.
    @param labels_path: optional path to a file listing feature labels.
    @return: tuple (X_train, y_train, labels); labels may be empty.
    @raise IOError: if any path is invalid or the dataset shapes disagree.
    '''
    checks = ((train_path, "training dataset path is not valid: %s"),
              (train_ref_path, "training references path is not valid: %s"))
    for candidate, message in checks:
        if not os.path.isfile(os.path.abspath(candidate)):
            raise IOError(message % candidate)

    labels = []
    if labels_path is not None:
        if not os.path.isfile(os.path.abspath(labels_path)):
            raise IOError("labels file is not valid: %s" % labels_path)
        labels = read_labels_file(labels_path, delim)

    X_train = read_features_file(train_path, delim)
    y_train = read_reference_file(train_ref_path, delim)

    # Features must form a 2D matrix with one reference per row.
    if len(X_train.shape) != 2:
        raise IOError("the training dataset must be in the format of a matrix with M lines and N columns.")
    if X_train.shape[0] != y_train.shape[0]:
        raise IOError("the number of instances in the train features file does not match the number of references given.")

    return X_train, y_train, labels
113 114
def scale_datasets(X_train, X_test):
    '''
    Scale the train and test feature matrices jointly.

    Both matrices are stacked before calling preprocessing.scale so the
    scaling statistics come from the combined distribution, then the
    result is split back into its original two parts.

    @param X_train: 2D training feature matrix.
    @param X_test: 2D test feature matrix (same number of columns).
    @return: tuple (X_train, X_test) of the scaled matrices.
    '''
    log.info("Scaling datasets...")

    log.debug("X_train shape = %s,%s" % X_train.shape)
    log.debug("X_test shape = %s,%s" % X_test.shape)

    n_train = X_train.shape[0]
    # Stack both splits so scaling is done over the same distribution.
    scaled = preprocessing.scale(np.concatenate((X_train, X_test)))

    # Split the scaled matrix back into the original train/test parts.
    X_train, X_test = scaled[:n_train], scaled[n_train:]

    log.debug("X_train after scaling = %s,%s" % X_train.shape)
    log.debug("X_test after scaling = %s,%s" % X_test.shape)

    return X_train, X_test
134
def scale_datasets_crossvalidation(X_train):
    '''
    Scale a single feature matrix (cross-validation variant of
    scale_datasets, where there is no separate test split).

    @param X_train: 2D training feature matrix.
    @return: the scaled feature matrix.
    '''
    log.info("Scaling datasets...")

    log.debug("X_train shape = %s,%s" % X_train.shape)

    # Only one dataset here, so unlike scale_datasets there is nothing to
    # concatenate or split back: scale the matrix directly. (The previous
    # version carried over a no-op concatenation comment and a redundant
    # full-length slice from the two-dataset variant.)
    X_train = preprocessing.scale(X_train)

    log.debug("X_train after scaling = %s,%s" % X_train.shape)

    return X_train
152 153 if __name__ == '__main__': 154 pass 155