# Module: dataprocessor.ce.cejcml2orange

'''
Created on 26 Jun 2012

@author: Eleftherios Avramidis
'''

import codecs
import sys
import tempfile
import shutil

try:
    from xml.etree.cElementTree import iterparse
except ImportError:
    # cElementTree was removed in Python 3.9; ElementTree exposes the same API.
    from xml.etree.ElementTree import iterparse

class CElementTreeJcml2Orange(object):
    """
    This class converts jcml format to tab format (orange format).

    The jcml file is streamed twice with ``iterparse``: a first pass collects
    the names of all attributes (needed for the three header lines), a second
    pass writes one tab-separated row per sentence element. The result is
    written to a temporary file and moved to ``output_file`` once complete.
    """

    def __init__(self, input_xml_filename, class_name, desired_attributes, meta_attributes, output_file, **kwargs):
        """
        Store the conversion settings. Nothing is read or written until
        L{convert} is called.

        @param input_xml_filename: name of input jcml file
        @type input_xml_filename: string
        @param class_name: name of the attribute used as class
        @type class_name: string
        @param desired_attributes: attributes used as features; if empty, all
            attributes found in the file except the meta attributes are used
        @type desired_attributes: list of strings
        @param meta_attributes: attributes flagged as meta
        @type meta_attributes: list of strings
        @param output_file: name of the tab file to be created
        @type output_file: string
        @keyword compact_mode: write only the desired attributes and the class,
            without the sentence text columns (default False)
        @keyword discrete_attributes: features typed "d" instead of "c"
        @keyword hidden_attributes: attributes that are not written at all
        @keyword filter_attributes: dict of attribute name -> value; sentences
            having that value for that attribute are skipped
        @keyword class_type: type written for the class column (default 'd')
        @keyword class_discretize: if set, the class value is rounded to a
            multiple of this number
        @keyword dir: directory for the temporary file (default '.')
        @keyword remove_infinite: replace inf by 99999999 and nan by 0
        @keyword nullimputation: write 0 for attributes a sentence lacks
        """
        self.TAG_SENT = 'judgedsentence'
        self.TAG_SRC = 'src'
        self.TAG_TGT = 'tgt'
        self.TAG_DOC = 'jcml'

        # read kwargs
        self.compact_mode = kwargs.get('compact_mode', False)
        self.discrete_attributes = kwargs.get('discrete_attributes', [])
        self.hidden_attributes = kwargs.get('hidden_attributes', [])
        self.filter_attributes = kwargs.get('filter_attributes', {})
        self.class_type = kwargs.get('class_type', 'd')
        self.class_discretize = kwargs.get('class_discretize', False)
        self.dir = kwargs.get('dir', '.')
        self.remove_infinite = kwargs.get('remove_infinite', False)
        self.nullimputation = kwargs.get('nullimputation', False)
        sys.stderr.write("Imputation {}\n".format(self.nullimputation))
        self.input_filename = input_xml_filename
        self.class_name = class_name
        self.desired_attributes = set(desired_attributes)
        self.meta_attributes = set(meta_attributes)

        self.orange_filename = output_file
        # NOTE(review): tempfile.mktemp only reserves a name and is open to a
        # race; kept because callers may rely on no file being created before
        # convert() runs.
        self.temporary_filename = tempfile.mktemp(dir=self.dir, suffix='.tab')

    def convert(self):
        """
        Write header and content to the temporary file and move it to the
        final location. The temporary file handle is closed even if the
        conversion fails; the file is moved only on success.
        """
        self.object_file = codecs.open(self.temporary_filename, encoding='utf-8', mode='w')
        try:
            self.get_orange_header()
            self.get_orange_content()
        finally:
            self.object_file.close()
        shutil.move(self.temporary_filename, self.orange_filename)
        sys.stdout.write('Orange file %s created!\n' % self.orange_filename)

    def get_orange_header(self):
        """
        Collect the attribute names from the input file and write the three
        header lines to the open output file.
        """
        self.attribute_names, self.number_of_targets = self._get_attribute_names()
        self.object_file.write(self._get_header_text())

    @staticmethod
    def _start_parsing(xml_file):
        """
        Start an incremental parse of an open XML file.
        @return: the root element and the iterator over the remaining
            (event, element) pairs
        """
        context = iter(iterparse(xml_file, events=("start", "end")))
        _, root = next(context)
        return root, context

    @staticmethod
    def _replace_infinite(value):
        """
        Map a non-finite number to a finite stand-in: inf becomes 99999999
        (sign kept) and nan becomes 0. Only whole values are replaced, so
        strings that merely contain "inf" or "nan" are left untouched.
        """
        if value in ("inf", "+inf", "-inf"):
            return value.replace("inf", "99999999")
        if value in ("nan", "+nan", "-nan"):
            return value.replace("nan", "0")
        return value

    def _get_attribute_names(self):
        '''
        Parse once the input XML file and collect the attribute names.
        Attributes of the source are prefixed with "src_", attributes of the
        n-th target of a sentence with "tgt-n_".
        @return: the set of attribute names and the highest number of targets
            found in a single sentence
        '''
        number_of_targets = 0
        attribute_names = set()
        target_id = 0
        # binary mode: let the XML parser honour the encoding in the prolog
        with open(self.input_filename, "rb") as source_xml_file:
            root, context = self._start_parsing(source_xml_file)
            for event, elem in context:
                if event == "start":
                    if elem.tag == self.TAG_SENT:
                        # new sentence: get attributes
                        attribute_names.update(elem.attrib)
                        target_id = 0
                    elif elem.tag == self.TAG_SRC:
                        attribute_names.update("src_{}".format(key) for key in elem.attrib)
                    elif elem.tag == self.TAG_TGT:
                        target_id += 1
                        attribute_names.update("tgt-{0}_{1}".format(target_id, key) for key in elem.attrib)
                elif elem.tag == self.TAG_SENT:
                    number_of_targets = max(number_of_targets, target_id)
                    # drop the finished sentence so memory use stays flat
                    root.clear()
        return attribute_names, number_of_targets

    def _get_header_text(self):
        """
        Build the three header lines: names, types, and flags.
        @return: the header text, ending with a newline
        @raise IndexError: if a desired attribute does not occur in the input
        """
        # check if the desired attributes are in attribute names that we got from input file
        notfound = set(self.desired_attributes) - self.attribute_names
        if notfound:
            errortext = "Error: Following desired attributes weren't found in input file:\n\t{0}".format("\n\t".join(sorted(notfound)))
            sys.stderr.write(errortext + "\n")
            raise IndexError(errortext)

        # first construct the lines for the declaration
        line_1 = []  # line for the name of the arguments
        line_2 = []  # line for the type of the arguments
        line_3 = []  # line for the definition of the class

        if not self.desired_attributes:
            self.desired_attributes = self.attribute_names - self.meta_attributes

        for attribute_name in self.attribute_names:
            # skip hidden attributes
            if attribute_name in self.hidden_attributes:
                continue
            is_class = attribute_name == self.class_name
            is_desired = attribute_name in self.desired_attributes
            if self.compact_mode and not is_desired and not is_class:
                continue

            line_1.append(attribute_name)
            if is_class:
                line_2.append(self.class_type)
                line_3.append("c")
            elif is_desired and attribute_name not in self.meta_attributes:
                # TODO: find a way to define continuous and discrete arg
                line_2.append("d" if attribute_name in self.discrete_attributes else "c")
                if "id" == attribute_name or "_id" in attribute_name or "-id" in attribute_name or ".id" in attribute_name:
                    sys.stderr.write('Warning: One of the given features, {} seems to be a unique identifier\n'.format(attribute_name))
                line_3.append("")
            else:
                # neither class nor feature: carried along as string meta
                line_2.append("s")
                line_3.append("m")

        if not self.compact_mode:
            # one text column for the source and one per target
            text_columns = ["src"]
            text_columns.extend("tgt-{0}".format(i + 1) for i in range(self.number_of_targets))
            for column in text_columns:
                line_1.append(column)
                line_2.append("string")
                line_3.append("m")

        return "\n".join(["\t".join(line_1), "\t".join(line_2), "\t".join(line_3) + "\n"])

    def get_orange_content(self):
        """
        Parse the input XML file a second time and write one row per sentence
        to the open output file.
        """
        attributes = {}
        src_text = None
        tgt_text = []
        target_id = 0
        # binary mode: let the XML parser honour the encoding in the prolog
        with open(self.input_filename, "rb") as source_xml_file:
            root, context = self._start_parsing(source_xml_file)
            for event, elem in context:
                tag = elem.tag
                if event == "start":
                    if tag == self.TAG_SENT:
                        # new sentence: reset everything collected so far, so
                        # nothing leaks over from the previous sentence
                        attributes = dict(elem.attrib)
                        src_text = None
                        tgt_text = []
                        target_id = 0
                    elif tag == self.TAG_SRC:
                        for key, value in elem.attrib.items():
                            attributes["src_{}".format(key)] = value
                    elif tag == self.TAG_TGT:
                        target_id += 1
                        for key, value in elem.attrib.items():
                            attributes["tgt-{0}_{1}".format(target_id, key)] = value
                elif tag == self.TAG_SRC:
                    src_text = elem.text
                elif tag == self.TAG_TGT:
                    tgt_text.append(elem.text)
                elif tag == self.TAG_SENT:
                    # exact match: a substring test would also fire for other
                    # tags such as "sentence"
                    self._write_orange_line(attributes, src_text, tgt_text)
                    root.clear()

    def _write_orange_line(self, ps_nested_attributes, src_text, tgt_text):
        """
        Write the row of one sentence. Every cell, including the last one, is
        followed by a tab.
        @param ps_nested_attributes: attribute name -> value for this sentence
        @type ps_nested_attributes: dict
        @param src_text: text of the source, or None if it is empty
        @param tgt_text: texts of the targets (None for empty ones)
        @type tgt_text: list
        """
        # skip totally lines that have a certain value for a particular att
        for fatt in self.filter_attributes:
            if fatt in ps_nested_attributes and ps_nested_attributes[fatt] == self.filter_attributes[fatt]:
                return

        cells = []
        for attribute_name in self.attribute_names:
            if self.compact_mode and attribute_name not in self.desired_attributes and attribute_name != self.class_name:
                continue
            if attribute_name in self.hidden_attributes:
                continue
            if attribute_name in ps_nested_attributes:
                attvalue = ps_nested_attributes[attribute_name].strip()
                if attribute_name == self.class_name and self.class_discretize:
                    attvalue = str(round(float(attvalue) / self.class_discretize) * self.class_discretize)
                elif self.remove_infinite:
                    attvalue = self._replace_infinite(attvalue)
            elif self.nullimputation:
                attvalue = '0'
            else:
                # even if attribute value exists or not, we have to tab
                attvalue = ''
            cells.append(attvalue)

        if not self.compact_mode:
            # NOTE: the text is written verbatim; an empty element has no text
            # (None) and becomes an empty cell.
            cells.append(src_text or "")
            cells.extend(tgt or "" for tgt in tgt_text)

        line = "".join(cell + "\t" for cell in cells) + "\n"
        self.object_file.write(line)

# Source code for module dataprocessor.ce.cejcml2orange