1 '''
2 sklearn_utils -- Helper functions to deal with data in the formats sklearn uses.
3 Utilities to read from text files to numpy arrays used by sklearn.
4
5 @author: Jose' de Souza
6
7 @copyright: 2012. All rights reserved.
8
9 @license: Apache License 2.0
10
11 @contact: jose.camargo.souza@gmail.com
12 @deffield updated: Updated
13 '''
14
15 from features_file_utils import read_labels_file, read_features_file, \
16 read_reference_file
17
18 from sklearn import preprocessing
19 import logging as log
20 import numpy as np
21 import os
22
def assert_number(generic_list):
    '''
    Checks whether the list is composed only by numeric datatypes.

    @param generic_list: a list containing any object type.
    @return: True if the list contains only numeric objects. False otherwise.
    '''
    # NOTE: bool is a subclass of int, so booleans also count as numeric here.
    for i in generic_list:
        if not isinstance(i, (int, float)):
            return False
    return True
34
def assert_string(generic_list):
    '''
    Checks whether the list is composed only by string datatypes.

    @param generic_list: a list containing any object type.
    @return: True if the list contains only string objects. False otherwise.
    '''
    for i in generic_list:
        if not isinstance(i, str):
            return False
    return True
40
41
42
def open_datasets(train_path, train_ref_path, test_path,
                  test_ref_path, delim, labels_path=None):
    '''
    Reads train/test feature matrices and their reference files, validating
    paths and shape consistency before returning them.

    @param train_path: path to the training features file.
    @param train_ref_path: path to the training references file.
    @param test_path: path to the test features file.
    @param test_ref_path: path to the test references file.
    @param delim: column delimiter used in all files.
    @param labels_path: optional path to a feature-labels file.
    @return: tuple (X_train, y_train, X_test, y_test, labels).
    @raise IOError: if any path is invalid, either dataset is not a 2-D
        matrix, instance counts mismatch their references, or train and
        test have a different number of features.
    '''
    if not os.path.isfile(os.path.abspath(train_path)):
        raise IOError("training dataset path is not valid: %s" % train_path)

    if not os.path.isfile(os.path.abspath(train_ref_path)):
        raise IOError("training references path is not valid: %s" % train_ref_path)

    if not os.path.isfile(os.path.abspath(test_path)):
        raise IOError("test dataset path is not valid: %s" % test_path)

    if not os.path.isfile(os.path.abspath(test_ref_path)):
        raise IOError("test references path is not valid: %s" % test_ref_path)

    labels = []
    if labels_path is not None:
        if not os.path.isfile(os.path.abspath(labels_path)):
            raise IOError("labels file is not valid: %s" % labels_path)

        labels = read_labels_file(labels_path, delim)

    X_train = read_features_file(train_path, delim)
    y_train = read_reference_file(train_ref_path, delim)

    X_test = read_features_file(test_path, delim)
    y_test = read_reference_file(test_ref_path, delim)

    if len(X_train.shape) != 2:
        raise IOError("the training dataset must be in the format of a matrix with M lines and N columns.")

    if len(X_test.shape) != 2:
        raise IOError("the test dataset must be in the format of a matrix with M lines and N columns.")

    if X_train.shape[0] != y_train.shape[0]:
        raise IOError("the number of instances in the train features file does not match the number of references given.")

    if X_test.shape[0] != y_test.shape[0]:
        raise IOError("the number of instances in the test features file does not match the number of references given.")

    if X_train.shape[1] != X_test.shape[1]:
        raise IOError("the number of features in train and test datasets is different.")

    return X_train, y_train, X_test, y_test, labels
87
88
def open_datasets_crossvalidation(train_path, train_ref_path, delim,
                                  labels_path=None):
    '''
    Reads a single feature matrix and its reference file (for cross-validation,
    where no separate test set is supplied), validating paths and shapes.

    @param train_path: path to the training features file.
    @param train_ref_path: path to the training references file.
    @param delim: column delimiter used in all files.
    @param labels_path: optional path to a feature-labels file.
    @return: tuple (X_train, y_train, labels).
    @raise IOError: if any path is invalid, the dataset is not a 2-D matrix,
        or the instance count does not match the number of references.
    '''
    if not os.path.isfile(os.path.abspath(train_path)):
        raise IOError("training dataset path is not valid: %s" % train_path)

    if not os.path.isfile(os.path.abspath(train_ref_path)):
        raise IOError("training references path is not valid: %s" % train_ref_path)

    labels = []
    if labels_path is not None:
        if not os.path.isfile(os.path.abspath(labels_path)):
            raise IOError("labels file is not valid: %s" % labels_path)

        labels = read_labels_file(labels_path, delim)

    X_train = read_features_file(train_path, delim)
    y_train = read_reference_file(train_ref_path, delim)

    if len(X_train.shape) != 2:
        raise IOError("the training dataset must be in the format of a matrix with M lines and N columns.")

    if X_train.shape[0] != y_train.shape[0]:
        raise IOError("the number of instances in the train features file does not match the number of references given.")

    return X_train, y_train, labels
113
114
def scale_datasets(X_train, X_test):
    '''
    Standardizes train and test feature matrices jointly (zero mean, unit
    variance per column), so both share the same scaling parameters.

    @param X_train: 2-D numpy array of training features.
    @param X_test: 2-D numpy array of test features (same column count).
    @return: tuple (X_train, X_test) with the scaled matrices.
    '''
    log.info("Scaling datasets...")

    log.debug("X_train shape = %s,%s" % X_train.shape)
    log.debug("X_test shape = %s,%s" % X_test.shape)

    # Stack both sets so scaling statistics are computed over all instances,
    # then split them back at the original boundary.
    dataset = np.concatenate((X_train, X_test))
    scaled_dataset = preprocessing.scale(dataset)

    X_train = scaled_dataset[:X_train.shape[0]]
    X_test = scaled_dataset[X_train.shape[0]:]

    log.debug("X_train after scaling = %s,%s" % X_train.shape)
    log.debug("X_test after scaling = %s,%s" % X_test.shape)

    return X_train, X_test
134
def scale_datasets_crossvalidation(X_train):
    '''
    Standardizes a single feature matrix (zero mean, unit variance per
    column), for the cross-validation setting where no test set exists.

    @param X_train: 2-D numpy array of training features.
    @return: the scaled training matrix.
    '''
    log.info("Scaling datasets...")

    log.debug("X_train shape = %s,%s" % X_train.shape)

    dataset = X_train
    scaled_dataset = preprocessing.scale(dataset)

    # Slice is a full-length no-op kept for symmetry with scale_datasets().
    X_train = scaled_dataset[:X_train.shape[0]]

    log.debug("X_train after scaling = %s,%s" % X_train.shape)

    return X_train
152
if __name__ == '__main__':
    # This module only provides helper functions; there is nothing to run
    # when executed directly.
    pass
155