ml.lib.scikit.features_file

1 ''' 2 Created on Aug 29, 2012 3 4 @author: desouza 5 ''' 6 import codecs 7 import numpy as np 8 import logging as log 9

def read_labels_file(path, delim, encoding='utf-8'):
    '''Reads the labels of each column in the training and test files
    (features and reference files).

    Only the first line of the file is used; a warning is logged when the
    file contains more than one line.

    @param path: the path of the labels file
    @param delim: the character used to separate the label strings.
    @param encoding: the character encoding used to read the file.
    Default is 'utf-8'.

    @return: a list of strings representing each feature column.
    @raise IndexError: if the labels file is empty.
    '''
    # The context manager closes the handle even if decoding fails midway.
    with codecs.open(path, 'r', encoding) as labels_file:
        lines = labels_file.readlines()

    if not lines:
        log.error("labels file is empty: %s" % path)
        # Raise explicitly instead of falling through to lines[0]; the
        # exception type is the one that lookup would have produced.
        raise IndexError("labels file is empty: %s" % path)

    if len(lines) > 1:
        # log.warning is the supported spelling; log.warn is deprecated.
        log.warning("labels file has more than one line, using the first.")

    return lines[0].strip().split(delim)

33 34

def read_reference_file(path, delim, encoding='utf-8'):
    """Parses the file that contains the references and stores it in a numpy
    array.

    Only the first column of every line is used; any further columns are
    ignored.

    @param path: the path of the file.
    @param delim: the character used to separate values.
    @param encoding: the character encoding used to read the file.
    Default is 'utf-8'.

    @return: a one-dimensional numpy float array with one response value per
    line of the file.
    @raise ValueError: if a first-column value cannot be converted to a float
    (this includes blank lines, whose first column is the empty string).
    """
    # The context manager closes the handle even if decoding fails midway.
    with codecs.open(path, 'r', encoding) as refs_file:
        first_cols = [line.strip().split(delim)[0] for line in refs_file]

    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype performs the same string-to-float conversion.
    return np.asarray(first_cols, dtype=np.float64)

55 56

def read_features_file(path, delim, encoding='utf-8'):
    '''
    Reads the features for each instance and stores it on an numpy array.

    Blank (whitespace-only) lines are skipped. Empty tokens, such as those
    produced by repeated delimiters, are ignored. A token that cannot be
    parsed as a float is logged as an error and dropped, which leaves that
    row shorter than the others.

    @param path: the path to the file containing the feature set.
    @param delim: the character used to separate the values in the file
    pointed by path.
    @param encoding: the character encoding used to read the file.
    Default is 'utf-8'.

    @return: an numpy array where the columns are the features and the rows
    are the instances.
    '''
    # this method is memory inefficient as all the data is kept in memory
    feats_lines = []
    # The context manager closes the handle even if decoding fails midway.
    with codecs.open(path, 'r', encoding) as feats_file:
        # line_num is the zero-based index of the physical line in the file.
        for line_num, line in enumerate(feats_file):
            stripped = line.strip()
            # Lines yielded by the file iterator keep their terminator, so
            # comparing the raw line with "" never matches; test the
            # stripped text to actually skip blank lines.
            if not stripped:
                continue

            cols = []
            for t in stripped.split(delim):
                if t == '':
                    continue
                try:
                    cols.append(float(t))
                except ValueError as e:
                    log.error("%s line %s: %s", e, line_num, t)

            feats_lines.append(cols)

    return np.asarray(feats_lines)

if __name__ == '__main__':
    # Nothing to do when executed directly; the guard body is a no-op.
    pass

Source Code for Module ml.lib.scikit.features_file_utils