Package dataprocessor :: Package sax :: Module jcml2array
[hide private]
[frames] | no frames]

Source Code for Module dataprocessor.sax.jcml2array

  1  ''' 
  2  Created on Nov 13, 2012 
  3   
  4  @author: jogin 
  5  ''' 
  6   
  7   
  8  from xml.etree.cElementTree import iterparse 
  9  from numpy import * 
 10  from optparse import OptionParser 
 11  import sys 
 12   
 13   
 14  ''' 
 15  Convert jcml data format into numpy matrix of float values. 
 16  ''' 
class Jcml2Array():
    '''
    Convert jcml data format into numpy matrices of float values.

    Numeric attribute values are stored as floats. Any other value is replaced
    by an integer code that is unique per attribute name; the codes are kept
    in self.discrete as {attribute name: {string value: code}} and persist
    across conversions done with the same instance.
    '''

    def __init__(self, globalAtts, srcAtts, tgtAtts, refAtts, className,
                 jcmlFile=None):
        '''
        @param globalAtts: list of attributes read from each sentence element
        @param srcAtts: list of attributes read from each source element
        @param tgtAtts: list of attributes read from each target element
        @param refAtts: list of attributes read from each reference element
        @param className: list with the class attribute name read from each
                          target element (a bare string is accepted as well)
        @param jcmlFile: optional default jcml filename, used by get_array()
                         when it is called without a filename
        '''
        # a bare string would otherwise be iterated character by character
        if isinstance(className, str):
            className = [className]

        self.discrete = {}
        self.TAG_DOC = 'jcml'
        self.TAG_SENT = 'judgedsentence'
        self.TAG_SRC = 'src'
        self.TAG_TGT = 'tgt'
        self.TAG_REF = 'ref'

        # keep the attribute lists on the instance so that get_array() does
        # not depend on module-level variables of the calling script
        self.globalAtts = list(globalAtts)
        self.srcAtts = list(srcAtts)
        self.tgtAtts = list(tgtAtts)
        self.refAtts = list(refAtts)
        self.className = list(className)
        self.jcmlFile = jcmlFile

        noOfXColumns = len(self.globalAtts) + len(self.srcAtts) \
                       + len(self.tgtAtts) + len(self.refAtts)
        self.x = zeros((noOfXColumns))
        self.y = zeros((1))
        self.allAttsCheck = set(self.globalAtts) | set(self.srcAtts) \
                            | set(self.tgtAtts) | set(self.refAtts) \
                            | set(self.className)
        self.allAttsCheckSet = set()
        self.actualSentId = 0

    def get_array(self, returnDict=False, jcmlFile=None):
        '''
        Call conversion function and return matrices.
        @param returnDict: boolean value, if a string dictionary should be
                           returned (True) or not (False)
        @param jcmlFile: jcml filename; when omitted, the filename given to
                         the constructor is used
        @return x, y: numpy matrices X (attribute values) and Y (class names)
        @return discrete: a dictionary with assigned numerical substitutions
                          of string values that were parsed from jcml file
                          (only when returnDict is true)
        @raise ValueError: if no jcml filename is available
        '''
        if jcmlFile is None:
            jcmlFile = self.jcmlFile
        if jcmlFile is None:
            raise ValueError('No jcml file was given.')

        self.convert_jcml_attributes(self.globalAtts, self.srcAtts,
                                     self.tgtAtts, self.refAtts,
                                     self.className, jcmlFile)
        if returnDict:
            return self.x, self.y, self.discrete
        return self.x, self.y

    def convert_jcml_attributes(self, globalAtts, srcAtts, tgtAtts, refAtts,
                                className, jcmlFile):
        '''
        Parse jcml file and convert parsed values into numpy matrix X
        (attribute values) and Y (class names). The results are stored in
        self.x and self.y and replace the results of any earlier conversion.

        X gets one row per target element, laid out as
        global + source + target + reference attribute values.
        Y gets one row per target element and class attribute.

        @param globalAtts: list of global attributes to be parsed in jcml file
        @param srcAtts: list of source attributes to be parsed in jcml file
        @param tgtAtts: list of target attributes to be parsed in jcml file
        @param refAtts: list of reference attributes to be parsed in jcml file
        @param className: class name to be parsed in jcml file
        @param jcmlFile: jcml filename
        '''
        if isinstance(className, str):
            className = [className]
        noOfXColumns = len(globalAtts) + len(srcAtts) + len(tgtAtts) \
                       + len(refAtts)

        # rows are collected in plain lists and converted once at the end,
        # which avoids re-copying the whole matrix for every appended row
        xRows = []
        yValues = []

        globalRow = []
        srcRow = []
        tgtRows = []
        refRow = []

        # binary mode lets the XML parser apply the encoding declared in the
        # file; the with-block closes the file even when sys.exit() is raised
        with open(jcmlFile, "rb") as sourceFile:
            context = iter(iterparse(sourceFile, events=("start", "end")))
            # the first event is the start of the root element
            event, root = next(context)

            for event, elem in context:
                if event == "start" and elem.tag == self.TAG_SENT:
                    self.actualSentId = elem.attrib.get('id')
                    # the completeness check below is done per sentence
                    self.allAttsCheckSet = set()
                    for attr in globalAtts:
                        globalRow.append(
                            self.encode_str(elem.attrib.get(attr), attr))

                elif event == "start" and elem.tag == self.TAG_SRC:
                    for attr in srcAtts:
                        srcRow.append(
                            self.encode_str(elem.attrib.get(attr), attr))

                elif event == "start" and elem.tag == self.TAG_TGT:
                    tgtRow = []
                    for attr in tgtAtts:
                        tgtRow.append(
                            self.encode_str(elem.attrib.get(attr), attr))
                    tgtRows.append(tgtRow)
                    for attr in className:
                        yValues.append(
                            self.encode_str(elem.attrib.get(attr), attr))

                elif event == "start" and elem.tag == self.TAG_REF:
                    for attr in refAtts:
                        refRow.append(
                            self.encode_str(elem.attrib.get(attr), attr))

                elif event == "end" and elem.tag == self.TAG_SENT:
                    if tgtRows:
                        # check if all attributes were found in jcml sentence
                        notFoundAtts = self.allAttsCheck - self.allAttsCheckSet
                        if notFoundAtts:
                            sys.exit("Following attributes weren't found: %s\nSentence id: %s" \
                                     % (notFoundAtts, self.actualSentId))

                    # the reference element may follow the target elements,
                    # so full rows can only be assembled at the sentence end
                    for tgtRow in tgtRows:
                        xRows.append(globalRow + srcRow + tgtRow + refRow)

                    # delete content of previous rows (previous sentence);
                    # tgtRows must be cleared too, otherwise its rows would
                    # be emitted again for every following sentence
                    globalRow = []
                    srcRow = []
                    tgtRows = []
                    refRow = []
                    # drop already processed elements to keep memory flat
                    root.clear()

        self.x = array(xRows, dtype=float).reshape(len(xRows), noOfXColumns)
        self.y = array(yValues, dtype=float).reshape(len(yValues), 1)

    def encode_str(self, elem, attr):
        '''
        If elem is not a float, assign a unique number to a string that
        occurred in matrix X. Matrix X contains attribute values.
        Strings with an assigned numbers are saved into a dictionary.
        Exits the program (sys.exit) if elem is None, i.e. the attribute is
        missing in the parsed element.
        @param elem: attribute value gained from jcml file.
        @param attr: attribute name
        @return: float value of elem, or the assigned unique number
        '''
        # add attr to the check set
        self.allAttsCheckSet.add(attr)

        # if elem is a number, return float
        try:
            return float(elem)
        except (TypeError, ValueError):
            # TypeError: elem is None, ValueError: elem is not numeric
            pass

        if elem is None:
            sys.exit('Attribute %s has a None value!\nSentence id: %s' \
                     % (attr, self.actualSentId))

        codes = self.discrete.setdefault(attr, {})
        if elem not in codes:
            # the first string of an attribute gets 0, every further one a
            # number greater by 1 than the actual greatest one; an explicit
            # loop is used because "from numpy import *" may shadow max()
            nextCode = 0
            if codes:
                for value in codes.values():
                    if nextCode < value:
                        nextCode = value
                nextCode += 1
            codes[elem] = nextCode
        # return assigned int value
        return codes[elem]
if __name__ == '__main__':
    # command line arguments definition
    parser = OptionParser()
    parser.add_option("-g", '--globalAtts', dest='globalAtts',
        help="global attributes to be extracted, multiple parameters are separated by comma")
    parser.add_option("-s", '--srcAtts', dest='srcAtts',
        help="source attributes to be extracted, multiple parameters are separated by comma")
    parser.add_option("-t", '--tgtAtts', dest='tgtAtts',
        help="target attributes to be extracted, multiple parameters are separated by comma")
    parser.add_option("-r", '--refAtts', dest='refAtts',
        help="reference attributes to be extracted, multiple parameters are separated by comma")
    parser.add_option("-c", '--className', dest='className',
        help="class name, it can be only 1 parameter!")
    parser.add_option("-f", '--jcmlFile', dest='jcmlFile',
        help="path to jcml file")
    parser.add_option("-d", "--returnDict", dest="returnDict", default=False,
        help="return dictionary with numerical string assignments (default False), for True type 'True' or '1'")

    # command line arguments check; only the jcml file is mandatory, every
    # attribute list falls back to an empty list when its option is omitted
    opt, args = parser.parse_args()
    if not opt.jcmlFile:
        sys.exit('ERROR: Option --jcmlFile is missing!')

    globalAtts = opt.globalAtts.split(',') if opt.globalAtts else []
    srcAtts = opt.srcAtts.split(',') if opt.srcAtts else []
    tgtAtts = opt.tgtAtts.split(',') if opt.tgtAtts else []
    refAtts = opt.refAtts.split(',') if opt.refAtts else []
    className = [opt.className] if opt.className else []

    # the option value arrives as a string, so 'False' or '0' would be truthy
    # if used directly; only 'True' or '1' switch the dictionary on
    returnDict = str(opt.returnDict).strip().lower() in ('true', '1')

    # NOTE(review): the returned matrices are not printed or saved here, so
    # running the script only validates the input - confirm the intended use
    Jcml2Array(globalAtts, srcAtts, tgtAtts, refAtts, className) \
        .get_array(returnDict, opt.jcmlFile)