1 '''
2 Created on Aug 29, 2012
3
4 @author: desouza
5 '''
6 import codecs
7 import numpy as np
8 import logging as log
9
def read_labels_file(path, delim, encoding='utf-8'):
    '''Reads the labels of each column in the training and test files
    (features and reference files).

    Only the first line of the file is used; a warning is logged when the
    file contains more than one line.

    NOTE(review): the ``def`` line of this function was missing from the
    reviewed source. The name and signature were reconstructed from the
    docstring and from the names the body uses -- confirm against callers.

    @param path: the path of the labels file
    @param delim: the character used to separate the label strings.
    @param encoding: the character encoding used to read the file.
    Default is 'utf-8'.

    @return: a list of strings representing each feature column. An empty
    list is returned (and an error is logged) when the file is empty.
    '''
    # The context manager closes the handle even when reading fails.
    with codecs.open(path, 'r', encoding) as labels_file:
        lines = labels_file.readlines()

    if not lines:
        # Previously this fell through to lines[0] and died with IndexError.
        log.error("labels file is empty: %s", path)
        return []

    if len(lines) > 1:
        log.warning("labels file has more than one line, using the first.")

    return lines[0].strip().split(delim)
33
34
def read_reference_file(path, delim, encoding='utf-8'):
    """Parses the file that contains the references and stores it in a numpy array.

    Only the first delim-separated column of every line is read; any
    further columns are ignored.

    NOTE(review): the ``def`` line of this function was missing from the
    reviewed source. The name and signature were reconstructed from the
    docstring and from the names the body uses -- confirm against callers.

    @param path: the path of the file.
    @param delim: the character used to separate values.
    @param encoding: the character encoding used to read the file.
    Default is 'utf-8'.

    @return: a numpy array of floats representing each instance response value
    """
    # The context manager closes the handle even when reading fails.
    with codecs.open(path, 'r', encoding) as refs_file:
        refs_lines = [line.strip().split(delim)[0] for line in refs_file]

    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype performs the same string-to-float conversion.
    return np.asarray(refs_lines, dtype=float)
55
56
def read_features_file(path, delim, encoding='utf-8'):
    '''
    Reads the features for each instance and stores it on an numpy array.

    Blank lines are skipped. Empty tokens (e.g. produced by consecutive
    delimiters) are ignored. A token that cannot be parsed as a float is
    logged as an error, together with the zero-based line number, and is
    left out of its row.

    NOTE(review): the ``def`` line of this function was missing from the
    reviewed source. The name and signature were reconstructed from the
    docstring and from the names the body uses -- confirm against callers.

    @param path: the path to the file containing the feature set.
    @param delim: the character used to separate the values in the file pointed by path.
    @param encoding: the character encoding used to read the file.
    Default is 'utf-8'.

    @return: an numpy array where the columns are the features and the rows are the instances.
    '''
    feats_lines = []
    # The context manager closes the handle even when parsing fails.
    with codecs.open(path, 'r', encoding) as feats_file:
        for line_num, line in enumerate(feats_file):
            stripped = line.strip()
            # Iterated lines keep their newline, so comparing the raw line
            # with "" never matched; test the stripped text instead so that
            # blank lines do not add empty rows.
            if not stripped:
                continue

            cols = []
            for tok in stripped.split(delim):
                if tok == '':
                    continue
                try:
                    cols.append(float(tok))
                except ValueError as e:
                    log.error("%s line %s: %s", e, line_num, tok)

            feats_lines.append(cols)

    return np.asarray(feats_lines)
90
91
if __name__ == '__main__':
    # Placeholder entry point: running this module as a script does nothing.
    pass
94