1 '''
2 sklearn_utils -- Helper functions to deal with data in the formats sklearn uses.
3 Utilities to read from text files to numpy arrays used by sklearn.
4
5 @author: Jose' de Souza
6
7 @copyright: 2012. All rights reserved.
8
9 @license: Apache License 2.0
10
11 @contact: jose.camargo.souza@gmail.com
12 @deffield updated: Updated
13 '''
14
15 from features_file_utils import read_labels_file, read_features_file, \
16 read_reference_file
17
18 from sklearn import preprocessing
19 import logging as log
20 import numpy as np
21 import os
22
def assert_number(generic_list):
    '''
    Checks whether the list is composed only by numeric datatypes.

    @param generic_list: a list containing any object type.
    @return: True if the list contains only numeric objects. False otherwise.
    '''
    # NOTE: bool is a subclass of int, so booleans also count as numeric here.
    for i in generic_list:
        if not isinstance(i, (int, float)):
            return False
    return True
34
def assert_string(generic_list):
    '''
    Checks whether the list is composed only by string datatypes.

    @param generic_list: a list containing any object type.
    @return: True if the list contains only string objects. False otherwise.
    '''
    for i in generic_list:
        if not isinstance(i, str):
            return False
    return True
40
41
42
def open_datasets(train_path, train_ref_path, test_path,
                  test_ref_path, delim, labels_path=None):
    '''
    Reads train/test feature matrices and their reference files, validating
    paths and shape consistency before returning them.

    @param train_path: path to the training features file.
    @param train_ref_path: path to the training references file.
    @param test_path: path to the test features file.
    @param test_ref_path: path to the test references file.
    @param delim: column delimiter used in all files.
    @param labels_path: optional path to a feature-labels file.
    @return: tuple (X_train, y_train, X_test, y_test, labels).
    @raise IOError: if any path is invalid, either dataset is not a 2-D
        matrix, instance counts mismatch their references, or train and
        test have a different number of features.
    '''
    if not os.path.isfile(os.path.abspath(train_path)):
        raise IOError("training dataset path is not valid: %s" % train_path)

    if not os.path.isfile(os.path.abspath(train_ref_path)):
        raise IOError("training references path is not valid: %s" % train_ref_path)

    if not os.path.isfile(os.path.abspath(test_path)):
        raise IOError("test dataset path is not valid: %s" % test_path)

    if not os.path.isfile(os.path.abspath(test_ref_path)):
        raise IOError("test references path is not valid: %s" % test_ref_path)

    labels = []
    if labels_path is not None:
        if not os.path.isfile(os.path.abspath(labels_path)):
            raise IOError("labels file is not valid: %s" % labels_path)

        labels = read_labels_file(labels_path, delim)

    X_train = read_features_file(train_path, delim)
    y_train = read_reference_file(train_ref_path, delim)

    X_test = read_features_file(test_path, delim)
    y_test = read_reference_file(test_ref_path, delim)

    if len(X_train.shape) != 2:
        raise IOError("the training dataset must be in the format of a matrix with M lines and N columns.")

    if len(X_test.shape) != 2:
        raise IOError("the test dataset must be in the format of a matrix with M lines and N columns.")

    if X_train.shape[0] != y_train.shape[0]:
        raise IOError("the number of instances in the train features file does not match the number of references given.")

    if X_test.shape[0] != y_test.shape[0]:
        raise IOError("the number of instances in the test features file does not match the number of references given.")

    if X_train.shape[1] != X_test.shape[1]:
        raise IOError("the number of features in train and test datasets is different.")

    return X_train, y_train, X_test, y_test, labels
87
88
def open_datasets_crossvalidation(train_path, train_ref_path, delim,
                                  labels_path=None):
    '''
    Reads a single feature matrix and its reference file (for cross-validation,
    where no separate test set is supplied), validating paths and shapes.

    @param train_path: path to the training features file.
    @param train_ref_path: path to the training references file.
    @param delim: column delimiter used in all files.
    @param labels_path: optional path to a feature-labels file.
    @return: tuple (X_train, y_train, labels).
    @raise IOError: if any path is invalid, the dataset is not a 2-D matrix,
        or the instance count does not match the number of references.
    '''
    if not os.path.isfile(os.path.abspath(train_path)):
        raise IOError("training dataset path is not valid: %s" % train_path)

    if not os.path.isfile(os.path.abspath(train_ref_path)):
        raise IOError("training references path is not valid: %s" % train_ref_path)

    labels = []
    if labels_path is not None:
        if not os.path.isfile(os.path.abspath(labels_path)):
            raise IOError("labels file is not valid: %s" % labels_path)

        labels = read_labels_file(labels_path, delim)

    X_train = read_features_file(train_path, delim)
    y_train = read_reference_file(train_ref_path, delim)

    if len(X_train.shape) != 2:
        raise IOError("the training dataset must be in the format of a matrix with M lines and N columns.")

    if X_train.shape[0] != y_train.shape[0]:
        raise IOError("the number of instances in the train features file does not match the number of references given.")

    return X_train, y_train, labels
113
114
def scale_datasets(X_train, X_test):
    '''
    Standardizes train and test feature matrices jointly (zero mean, unit
    variance per column), so both share the same scaling parameters.

    @param X_train: 2-D numpy array of training features.
    @param X_test: 2-D numpy array of test features (same column count).
    @return: tuple (X_train, X_test) with the scaled matrices.
    '''
    log.info("Scaling datasets...")

    log.debug("X_train shape = %s,%s" % X_train.shape)
    log.debug("X_test shape = %s,%s" % X_test.shape)

    # Stack both sets so scaling statistics are computed over all instances,
    # then split them back at the original boundary.
    dataset = np.concatenate((X_train, X_test))
    scaled_dataset = preprocessing.scale(dataset)

    X_train = scaled_dataset[:X_train.shape[0]]
    X_test = scaled_dataset[X_train.shape[0]:]

    log.debug("X_train after scaling = %s,%s" % X_train.shape)
    log.debug("X_test after scaling = %s,%s" % X_test.shape)

    return X_train, X_test
134
def scale_datasets_crossvalidation(X_train):
    '''
    Standardizes a single feature matrix (zero mean, unit variance per
    column), for the cross-validation setting where no test set exists.

    @param X_train: 2-D numpy array of training features.
    @return: the scaled training matrix.
    '''
    log.info("Scaling datasets...")

    log.debug("X_train shape = %s,%s" % X_train.shape)

    dataset = X_train
    scaled_dataset = preprocessing.scale(dataset)

    # Slice is a full-length no-op kept for symmetry with scale_datasets().
    X_train = scaled_dataset[:X_train.shape[0]]

    log.debug("X_train after scaling = %s,%s" % X_train.shape)

    return X_train
152
if __name__ == '__main__':
    # This module only provides helper functions; there is nothing to run
    # when executed directly.
    pass
155