1 '''
2 Created on Nov 13, 2012
3
4 @author: jogin
5 '''
6
7
8 from xml.etree.cElementTree import iterparse
9 from numpy import *
10 from optparse import OptionParser
11 import sys
12
13
class Jcml2Array(object):
    """Convert jcml data into numpy matrices of float values.

    A jcml document consists of ``judgedsentence`` elements, each holding one
    ``src`` element, any number of ``tgt`` elements and a ``ref`` element.
    Every ``tgt`` element yields one row of matrix X (global, source, target
    and reference attribute values, in that order) and one row of matrix Y
    per class attribute. Attribute values that cannot be parsed as floats are
    replaced by integer codes that are recorded in ``self.discrete``.
    """

    def __init__(self, globalAtts, srcAtts, tgtAtts, refAtts, className,
                 jcmlFile=None):
        """Store the attribute selection and prepare empty result matrices.

        @param globalAtts: list of attributes read from 'judgedsentence'
        @param srcAtts: list of attributes read from 'src'
        @param tgtAtts: list of attributes read from 'tgt'
        @param refAtts: list of attributes read from 'ref'
        @param className: list of class attributes read from 'tgt'
        @param jcmlFile: jcml filename that get_array() will parse
        """
        # attribute name -> {string value: assigned integer code}
        self.discrete = {}
        self.TAG_DOC = 'jcml'
        self.TAG_SENT = 'judgedsentence'
        self.TAG_SRC = 'src'
        self.TAG_TGT = 'tgt'
        self.TAG_REF = 'ref'

        # Kept on the instance so that get_array() does not depend on
        # module-level variables of the calling script.
        self.globalAtts = globalAtts
        self.srcAtts = srcAtts
        self.tgtAtts = tgtAtts
        self.refAtts = refAtts
        self.className = className
        self.jcmlFile = jcmlFile

        noOfXColumns = (len(globalAtts) + len(srcAtts) + len(tgtAtts)
                        + len(refAtts))
        self.x = zeros((0, noOfXColumns))
        self.y = zeros((0, 1))

        # All requested attribute names vs. the names seen so far.
        self.allAttsCheck = (set(globalAtts) | set(srcAtts) | set(tgtAtts)
                             | set(refAtts) | set(className))
        self.allAttsCheckSet = set()
        self.actualSentId = 0

    def get_array(self, returnDict):
        """Run the conversion and return the resulting matrices.

        @param returnDict: boolean value, if the string dictionary should be
                           returned (True) or not (False)
        @return: (x, y) or, when returnDict is true, (x, y, discrete) where
                 x holds attribute values, y holds class values and discrete
                 maps attribute names to their string -> number substitutions
        @raise ValueError: if no jcml filename was given to the constructor
        """
        if self.jcmlFile is None:
            raise ValueError('No jcml file was given to the constructor.')
        self.convert_jcml_attributes(self.globalAtts, self.srcAtts,
                                     self.tgtAtts, self.refAtts,
                                     self.className, self.jcmlFile)
        if returnDict:
            return self.x, self.y, self.discrete
        return self.x, self.y

    def convert_jcml_attributes(self, globalAtts, srcAtts, tgtAtts, refAtts,
                                className, jcmlFile):
        """Parse a jcml file into matrix X (self.x) and matrix Y (self.y).

        Exits via sys.exit() when a requested attribute is missing.

        @param globalAtts: list of global attributes to be parsed in jcml file
        @param srcAtts: list of source attributes to be parsed in jcml file
        @param tgtAtts: list of target attributes to be parsed in jcml file
        @param refAtts: list of reference attributes to be parsed in jcml file
        @param className: list of class attributes to be parsed in jcml file
        @param jcmlFile: jcml filename
        """
        noOfXColumns = (len(globalAtts) + len(srcAtts) + len(tgtAtts)
                        + len(refAtts))
        # Rows are collected in plain lists and converted once at the end;
        # stacking onto a numpy array per row copies the whole matrix each time.
        xRows = []
        yRows = []

        # Binary mode lets the XML parser honour the document's own encoding;
        # the with-block closes the file even if parsing fails.
        with open(jcmlFile, "rb") as sourceFile:
            context = iter(iterparse(sourceFile, events=("start", "end")))
            # The first event is the start of the root element; keep a handle
            # on it so finished sentences can be released from memory.
            event, root = next(context)

            globalRow, srcRow, tgtRows, refRow = [], [], [], []
            for event, elem in context:
                if event == "start":
                    if elem.tag == self.TAG_SENT:
                        self.actualSentId = elem.attrib.get('id')
                        globalRow.extend(
                            self._encode_attributes(elem, globalAtts))
                    elif elem.tag == self.TAG_SRC:
                        srcRow.extend(self._encode_attributes(elem, srcAtts))
                    elif elem.tag == self.TAG_TGT:
                        tgtRows.append(
                            self._encode_attributes(elem, tgtAtts))
                        for value in self._encode_attributes(elem, className):
                            yRows.append([value])
                    elif elem.tag == self.TAG_REF:
                        refRow.extend(self._encode_attributes(elem, refAtts))
                elif elem.tag == self.TAG_SENT:
                    # End of a sentence: emit one X row per target.
                    if tgtRows:
                        notFoundAtts = self.allAttsCheck - self.allAttsCheckSet
                        if notFoundAtts:
                            sys.exit("Following attributes weren't found: %s\nSentence id: %s" \
                                     % (notFoundAtts, self.actualSentId))
                    for tgtRow in tgtRows:
                        xRows.append(globalRow + srcRow + tgtRow + refRow)

                    # Start the next sentence from a clean state. The target
                    # rows must be dropped as well, otherwise they would be
                    # emitted again for every following sentence.
                    globalRow, srcRow, tgtRows, refRow = [], [], [], []
                    root.clear()

        if xRows:
            self.x = array(xRows, dtype=float)
        else:
            self.x = zeros((0, noOfXColumns))
        if yRows:
            self.y = array(yRows, dtype=float)
        else:
            self.y = zeros((0, 1))

    def _encode_attributes(self, elem, attributes):
        """Return the encoded values of the given attributes of elem."""
        return [self.encode_str(elem.attrib.get(attr), attr)
                for attr in attributes]

    def encode_str(self, elem, attr):
        """Return a numeric value for an attribute value.

        Values that parse as floats are returned as floats. Any other string
        gets a number that is unique within its attribute; the assignment is
        saved in self.discrete. Exits via sys.exit() when elem is None.

        @param elem: attribute value gained from jcml file
        @param attr: attribute name
        @return: float value or assigned unique number
        """
        self.allAttsCheckSet.add(attr)

        if elem is None:
            sys.exit('Attribute %s has a None value!\nSentence id: %s' \
                     % (attr, self.actualSentId))
        try:
            return float(elem)
        except (TypeError, ValueError):
            pass

        mapping = self.discrete.setdefault(attr, {})
        if elem not in mapping:
            # Codes are handed out consecutively from 0, so the next free
            # code equals the number of values seen so far.
            mapping[elem] = len(mapping)
        return mapping[elem]
164
165
if __name__ == '__main__':
    # Command-line entry point: read the attribute selection and the jcml
    # filename from the options and run the conversion.
    parser = OptionParser()
    parser.add_option("-g", '--globalAtts', dest='globalAtts',
                      help="global attributes to be extracted, multiple parameters are separated by comma")
    parser.add_option("-s", '--srcAtts', dest='srcAtts',
                      help="source attributes to be extracted, multiple parameters are separated by comma")
    parser.add_option("-t", '--tgtAtts', dest='tgtAtts',
                      help="target attributes to be extracted, multiple parameters are separated by comma")
    parser.add_option("-r", '--refAtts', dest='refAtts',
                      help="reference attributes to be extracted, multiple parameters are separated by comma")
    parser.add_option("-c", '--className', dest='className',
                      help="class name, it can be only 1 parameter!")
    parser.add_option("-f", '--jcmlFile', dest='jcmlFile',
                      help="path to jcml file")
    parser.add_option("-d", "--returnDict", dest="returnDict", default=False,
                      help="return dictionary with numerical string assignments (default False), for True type 'True' or '1'")

    opt, args = parser.parse_args()
    if not opt.jcmlFile:
        sys.exit('ERROR: Option --jcmlFile is missing!')

    # Comma-separated option values become lists; missing options become
    # empty lists.
    globalAtts = opt.globalAtts.split(',') if opt.globalAtts else []
    srcAtts = opt.srcAtts.split(',') if opt.srcAtts else []
    tgtAtts = opt.tgtAtts.split(',') if opt.tgtAtts else []
    refAtts = opt.refAtts.split(',') if opt.refAtts else []
    className = [opt.className] if opt.className else []

    # The option arrives as a string, and every non-empty string is truthy;
    # accept only the values the help text documents.
    returnDict = opt.returnDict in (True, 'True', 'true', '1')

    # NOTE(review): the returned matrices are not used here — confirm whether
    # the script is expected to print or store them.
    Jcml2Array(globalAtts, srcAtts, tgtAtts, refAtts, className,
               opt.jcmlFile).get_array(returnDict)
205