Package dataprocessor :: Package sax :: Module saxjcml2orange
[hide private]
[frames | no frames]

Source Code for Module dataprocessor.sax.saxjcml2orange

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3  ''' 
  4  Created on Jul 21, 2011 
  5   
  6  @author: jogin, Eleftherios Avramidis 
  7  ''' 
  8   
  9  import codecs 
 10  import os 
 11  import sys 
 12  import tempfile 
 13  import shutil 
 14  from xml.sax import make_parser 
 15  from xml.sax.handler import ContentHandler 
 16  from sentence.sentence import SimpleSentence 
 17  from sentence.parallelsentence import ParallelSentence 
 18  from dataprocessor.input.xmlreader import XmlReader 
 19   
 20   
class SaxJcml2Orange():
    """
    Converts a jcml (XML) file into an Orange tab-delimited file.

    The conversion makes two SAX passes over the input file: the first one
    (SaxJcmlOrangeHeader) writes the header lines, the second one
    (SaxJcmlOrangeContent) writes one line per parallel sentence. Both
    passes write into a temporary file which is moved to the requested
    output path once the conversion has finished.
    """

    def __init__(self, input_xml_filename, class_name, desired_attributes, meta_attributes, output_file, **kwargs):
        """
        Run the whole conversion: create the temporary file, write header
        and content into it and move it to output_file.
        @param input_xml_filename: name of input jcml file
        @type input_xml_filename: string
        @param class_name: name of the attribute used as the class
        @type class_name: string
        @param desired_attributes: desired attributes
        @type desired_attributes: list of strings
        @param meta_attributes: meta attributes
        @type meta_attributes: list of strings
        @param output_file: path where the finished tab file is moved to
        @type output_file: string
        @keyword compact_mode: passed on to SaxJcmlOrangeContent (default False)
        @keyword discrete_attributes: attributes declared as discrete in the header
        @keyword hidden_attributes: attributes left out of header and content
        @keyword get_nested_attributes: passed on to SaxJcmlOrangeHeader (default False)
        @keyword filter_attributes: dict of attribute name -> value, passed on
        to SaxJcmlOrangeContent (default empty)
        @keyword class_type: type written for the class column (default "d")
        @keyword class_discretize: passed on to both handlers (default False)
        @keyword dir: directory in which the temporary file is created (default ".")
        """
        self.get_nested_attributes = kwargs.get("get_nested_attributes", False)
        self.compact_mode = kwargs.get("compact_mode", False)
        self.filter_attributes = kwargs.get("filter_attributes", {})
        self.class_type = kwargs.get("class_type", "d")
        self.class_discretize = kwargs.get("class_discretize", False)
        self.dir = kwargs.get("dir", ".")

        # these two are only converted to sets when the caller provides them
        self.discrete_attributes = []
        if "discrete_attributes" in kwargs:
            self.discrete_attributes = set(kwargs["discrete_attributes"])
        self.hidden_attributes = []
        if "hidden_attributes" in kwargs:
            self.hidden_attributes = set(kwargs["hidden_attributes"])

        self.input_filename = input_xml_filename
        self.class_name = class_name
        self.desired_attributes = set(desired_attributes)
        self.meta_attributes = set(meta_attributes)

        self.orange_filename = output_file
        self.temporary_filename = self._create_temporary_file()
        self.object_file = codecs.open(self.temporary_filename, encoding='utf-8', mode='w')

        finished = False
        try:
            # first pass writes the header, second pass the content
            self.get_orange_header()
            self.get_orange_content()
            finished = True
        finally:
            self.object_file.close()
            # do not leave a half-written temporary file behind on failure
            if not finished and os.path.exists(self.temporary_filename):
                os.remove(self.temporary_filename)

        shutil.move(self.temporary_filename, self.orange_filename)
        # parenthesised single-argument form prints the same text under
        # Python 2 and Python 3
        print('Orange file %s created!' % self.orange_filename)

    def _create_temporary_file(self):
        """
        Create an empty temporary '.tab' file inside self.dir.
        mkstemp creates the file atomically, unlike mktemp which only
        returns a name that another process may claim first.
        @return: path of the created file
        @rtype: string
        """
        descriptor, path = tempfile.mkstemp(dir=self.dir, suffix='.tab')
        os.close(descriptor)
        # mkstemp creates the file readable by the owner only; since this
        # file ends up being the output file, give it the permissions a
        # plain open() would have given (0666 minus the process umask)
        current_umask = os.umask(0)
        os.umask(current_umask)
        os.chmod(path, 0o666 & ~current_umask)
        return path

    def get_orange_header(self):
        """
        Parse the input file with SaxJcmlOrangeHeader, which writes the
        header lines into the temporary file.
        """
        parser = make_parser()
        header_handler = SaxJcmlOrangeHeader(self.object_file, self.class_name, self.desired_attributes, self.meta_attributes, self.discrete_attributes, self.get_nested_attributes, self.class_type, self.hidden_attributes, self.class_discretize)
        parser.setContentHandler(header_handler)
        with open(self.input_filename, 'r') as input_file:
            parser.parse(input_file)

    def get_orange_content(self):
        """
        Parse the input file with SaxJcmlOrangeContent, which writes one
        line per parallel sentence into the temporary file.
        """
        parser = make_parser()
        content_handler = SaxJcmlOrangeContent(self.object_file, self.class_name, self.meta_attributes, self.compact_mode, self.filter_attributes, self.hidden_attributes, self.class_discretize)
        parser.setContentHandler(content_handler)
        with open(self.input_filename, 'r') as input_file:
            parser.parse(input_file)

    def test_orange(self):
        """
        Test function for getting orange file: loads the input with
        XmlReader, wraps it in OrangeData and requests the dataset back.
        """
        from dataprocessor.input.orangereader import OrangeData
        dataset = XmlReader(self.input_filename).get_dataset()
        wrapped_data = OrangeData(dataset, self.class_name, self.desired_attributes, self.meta_attributes, self.orange_filename)
        wrapped_data.get_dataset()
123 124
class SaxJcmlOrangeHeader(ContentHandler):
    """
    SAX handler for the first pass over a jcml file. It collects the
    attribute names and the highest number of target sentences found in a
    parallel sentence; when the document ends it writes the three header
    lines of the Orange tab file and stores the attribute names in the
    file 'attribute_names.dat' (read later by SaxJcmlOrangeContent).
    """

    def __init__(self, o_file, class_name, desired_attributes, meta_attributes, discrete_attributes, get_nested_attributes, class_type, hidden_attributes=None, class_discretize=False):
        """
        @param o_file: file object to receive the header lines
        @type o_file: file object
        @param class_name: name of the attribute used as the class
        @type class_name: string
        @param desired_attributes: attributes to be declared as features;
        if empty, all non-meta attributes found in the input are used
        @type desired_attributes: set of strings
        @param meta_attributes: attributes to be declared as meta
        @type meta_attributes: set of strings
        @param discrete_attributes: features to be typed "d" instead of "c"
        @type discrete_attributes: collection of strings
        @param get_nested_attributes: if true, attribute names are taken
        from ParallelSentence.get_nested_attributes(), otherwise only from
        the attributes of the parallel sentence element itself
        @type get_nested_attributes: boolean
        @param class_type: type written for the class column
        @type class_type: string
        @param hidden_attributes: attributes left out of the header
        @type hidden_attributes: collection of strings
        @param class_discretize: if true, the class type is forced to "d"
        """
        ContentHandler.__init__(self)
        self.o_file = o_file
        self.desired_attributes = desired_attributes
        self.meta_attributes = meta_attributes
        self.discrete_attributes = discrete_attributes
        # a fresh list per instance instead of a shared mutable default
        self.hidden_attributes = [] if hidden_attributes is None else hidden_attributes
        self.class_name = class_name
        self.get_nested_attributes = get_nested_attributes
        self.class_type = 'd' if class_discretize else class_type

        self.attribute_names = set()
        self.number_of_targets = 0

        self.TAG_SENT = 'judgedsentence'
        self.TAG_SRC = 'src'
        self.TAG_TGT = 'tgt'
        self.TAG_DOC = 'jcml'

        self.src = None
        self.tgt = []
        self.ref = None
        self.ps_list = []
        self.is_simple_sentence = False

        self.ss_text = []
        self.ss_attributes = {}
        self.ps_attributes = {}

    def startElement(self, name, attrs):
        """
        Signals the start of an element (simplesentence or parallelsentence)
        and stores its XML attributes.
        @param name: the name of the element
        @type name: string
        @param attrs: object of the Attributes interface containing the
        attributes of the element
        @type attrs: attributes
        """
        if name in [self.TAG_SRC, self.TAG_TGT]:
            self.ss_text = []
            self.ss_attributes = dict((att_name, attrs.getValue(att_name)) for att_name in attrs.getNames())
            self.is_simple_sentence = True
        elif name == self.TAG_SENT:
            self.tgt = []
            self.ps_attributes = dict((att_name, attrs.getValue(att_name)) for att_name in attrs.getNames())

    def characters(self, ch):
        """
        The Parser will call this method to report each chunk of character data.
        We use it to collect the string of the simplesentence.
        @param ch: chunk of characters being parsed
        @type ch: str
        """
        # The parser may deliver the text of one element in several chunks,
        # so the flag stays set until the element ends (see endElement);
        # clearing it here would keep only the first chunk.
        if self.is_simple_sentence:
            self.ss_text.append(ch)

    def endElement(self, name):
        """
        Saves the data from an element that is currently ending.
        @param name: the name of the element
        @type name: string
        """
        if name in [self.TAG_SRC, self.TAG_TGT]:
            sentence = SimpleSentence("".join(self.ss_text), self.ss_attributes)
            if name == self.TAG_SRC:
                self.src = sentence
            else:
                self.tgt.append(sentence)
            # ss_text always stays a list, so a later characters() call can
            # never try to append to a string
            self.ss_text = []
            self.is_simple_sentence = False
        elif name == self.TAG_SENT:
            self.number_of_targets = max(self.number_of_targets, len(self.tgt))
            ps = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)
            self.src = u''
            self.tgt = []
            self.ref = u''
            if self.get_nested_attributes:
                found_attributes = ps.get_nested_attributes()
            else:
                found_attributes = self.ps_attributes
            for attribute in found_attributes:
                self.attribute_names.add(str(attribute))
            self.ps_attributes = {}

    @staticmethod
    def _format_row(cells):
        """
        Join the cells of one header line; every cell, including the last
        one, is followed by a tab and the line ends with a newline.
        """
        return "".join(cell + "\t" for cell in cells) + "\n"

    def endDocument(self):
        """
        Writes the three header lines (names, types, class/meta flags) to
        the output file and stores all collected attribute names, one per
        line, in 'attribute_names.dat' in the current working directory.
        """
        # check if the desired attributes are in attribute names that we got from input file
        notfound = set(self.desired_attributes) - self.attribute_names
        if notfound:
            sys.stderr.write('Warning: Following desired attributes were not found in input file:\n{0}'.format(notfound))

        # no explicit selection: every attribute that is not meta is a feature
        if not self.desired_attributes:
            self.desired_attributes = self.attribute_names.difference(self.meta_attributes)

        names = []  # line 1: names of the columns
        types = []  # line 2: types of the columns
        flags = []  # line 3: class ("c") and meta ("m") markers

        for attribute_name in self.attribute_names:
            if attribute_name in self.hidden_attributes:
                continue
            names.append(attribute_name)

            is_feature = (attribute_name in self.desired_attributes
                          and attribute_name not in self.meta_attributes)
            if attribute_name == self.class_name:
                types.append(u"%s" % self.class_type)
                flags.append("c")
            elif is_feature:
                #TODO: find a way to define continuous and discrete arg
                types.append("d" if attribute_name in self.discrete_attributes else "c")
                if "id" in attribute_name:
                    sys.stderr.write('One of the given features, {} seems to be a unique identifier\n'.format(attribute_name))
                flags.append("")
            else:
                types.append("s")
                flags.append("m")

        # source, targets and reference are string meta columns
        sentence_columns = ["src"]
        sentence_columns.extend("tgt-" + str(i + 1) for i in range(self.number_of_targets))
        sentence_columns.append("ref")
        for column in sentence_columns:
            names.append(column)
            types.append("string")
            flags.append("m")

        self.o_file.write(self._format_row(names) + self._format_row(types) + self._format_row(flags))

        # creating a temp file with attribute names for class SaxJcmlOrangeContent
        with open('attribute_names.dat', 'w') as names_file:
            for attribute_name in self.attribute_names:
                names_file.write(attribute_name + '\n')
300 301
class SaxJcmlOrangeContent(ContentHandler):
    """
    SAX handler for the second pass over a jcml file. For every parallel
    sentence it writes one tab-separated line to the output file: the
    attribute values in the order given by 'attribute_names.dat', then the
    source sentence and the target sentences.
    """

    # Replacements for attribute values that consist only of a non-finite
    # number. Only whole values are replaced, so ordinary strings that
    # merely contain "inf" or "nan" are left untouched.
    _NON_FINITE_VALUES = {
        "inf": "99999999",
        "+inf": "99999999",
        "-inf": "-99999999",
        "nan": "0",
    }

    def __init__(self, o_file, class_name, meta_attributes, compact_mode=False, filter_attributes=None, hidden_attributes=None, class_discretize=False):
        """
        Reads the attribute names from 'attribute_names.dat' in the current
        working directory and deletes that file afterwards.
        @param o_file: file object to receive the content lines
        @type o_file: file object
        @param class_name: name of the attribute used as the class
        @type class_name: string
        @param meta_attributes: accepted for interface compatibility; not
        used by this handler
        @param compact_mode: if true, the text of the sentences is not collected
        @type compact_mode: boolean
        @param filter_attributes: attribute name -> value; a parallel
        sentence whose attribute equals the given value is skipped
        @type filter_attributes: dict
        @param hidden_attributes: attributes left out of the output
        @type hidden_attributes: collection of strings
        @param class_discretize: if not false, step to which the value of
        the class attribute is rounded
        """
        ContentHandler.__init__(self)
        # fresh containers per instance instead of shared mutable defaults
        self.filter_attributes = {} if filter_attributes is None else filter_attributes
        self.hidden_attributes = [] if hidden_attributes is None else hidden_attributes
        self.compact_mode = compact_mode
        self.o_file = o_file
        self.is_simple_sentence = False
        self.class_name = class_name
        self.set_tags()
        self.class_discretize = class_discretize
        # reading a temp file with attribute names for class SaxJcmlOrangeContent
        with open('attribute_names.dat', 'r') as names_file:
            # empty lines are dropped, so an empty file gives no columns
            self.attribute_names = [line for line in names_file.read().strip().split('\n') if line]
        os.remove('attribute_names.dat')

    def set_tags(self):
        """
        Handles the basic tags used for reading the simple XML format.
        As tags are prone to changes, this can be done by changing values here,
        or overriding accordingly. Also resets the parsing state.
        """
        self.TAG_DOC = 'jcml'
        self.TAG_SENT = 'judgedsentence'
        self.TAG_SRC = 'src'
        self.TAG_TGT = 'tgt'

        self.src = None
        self.tgt = []
        self.ref = None
        self.ps_list = []

        self.ss_text = []
        self.ss_attributes = {}
        self.ps_attributes = {}

    def startElement(self, name, attrs):
        """
        Signals the start of an element (simplesentence or parallelsentence)
        and stores its XML attributes.
        @param name: the name of the element
        @type name: string
        @param attrs: object of the Attributes interface containing the
        attributes of the element
        @type attrs: attributes
        """
        if name in [self.TAG_SRC, self.TAG_TGT]:
            self.ss_text = []
            self.ss_attributes = dict((att_name, attrs.getValue(att_name)) for att_name in attrs.getNames())
            self.is_simple_sentence = True
        elif name == self.TAG_SENT:
            self.tgt = []
            self.ps_attributes = dict((att_name, attrs.getValue(att_name)) for att_name in attrs.getNames())

    def characters(self, ch):
        """
        The Parser will call this method to report each chunk of character data.
        We use it to collect the string of the simplesentence, unless
        compact mode is on.
        @param ch: chunk of characters being parsed
        @type ch: str
        """
        # The parser may deliver the text of one element in several chunks,
        # so the flag stays set until the element ends (see endElement);
        # clearing it here would keep only the first chunk.
        if self.is_simple_sentence and not self.compact_mode:
            self.ss_text.append(ch)

    @staticmethod
    def _sanitize_value(value):
        """
        Replace a value that is exactly a non-finite number ("inf", "+inf",
        "-inf", "nan") by its numeric substitute; return any other value
        unchanged.
        @param value: stripped attribute value
        @type value: string
        @rtype: string
        """
        return SaxJcmlOrangeContent._NON_FINITE_VALUES.get(value, value)

    def endElement(self, name):
        """
        Saves the data from an element that is currently ending. When a
        parallel sentence ends, its line is written to the output file.
        @param name: the name of the element
        @type name: string
        """
        if name in [self.TAG_SRC, self.TAG_TGT]:
            sentence = SimpleSentence("".join(self.ss_text), self.ss_attributes)
            if name == self.TAG_SRC:
                self.src = sentence
            else:
                self.tgt.append(sentence)
            # ss_text always stays a list, so a later characters() call can
            # never try to append to a string
            self.ss_text = []
            self.is_simple_sentence = False
            return

        if name != self.TAG_SENT:
            return

        ps = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)
        self.src = u''
        self.tgt = []
        self.ref = u''
        self.ps_attributes = {}

        #skip totally lines that have a certain value for a particular att
        for fatt in self.filter_attributes:
            if ps.get_attribute(fatt) == self.filter_attributes[fatt]:
                return

        # fetched once per sentence instead of once per attribute
        ps_nested_attributes = ps.get_nested_attributes()
        output = []
        for attribute_name in self.attribute_names:
            if attribute_name in self.hidden_attributes:
                continue
            if attribute_name == self.class_name and self.class_discretize:
                # round the class value to the nearest multiple of the step
                attvalue = float(ps_nested_attributes[attribute_name].strip())
                attvalue = round(attvalue / self.class_discretize) * self.class_discretize
                output.append(str(attvalue))
            elif attribute_name in ps_nested_attributes:
                output.append(self._sanitize_value(ps_nested_attributes[attribute_name].strip()))
            # even if attribute value exists or not, we have to tab
            output.append("\t")

        # print source sentence
        output.append(ps.get_source().get_string())
        output.append("\t")
        # print target sentences
        for tgt in ps.get_translations():
            output.append(tgt.get_string())
            output.append('\t')
        # split parallel sentences by an additional tab and by a newline
        output.append('\t\n')
        self.o_file.write("".join(output))
#meta_attributes = set(["testset", "judgment-id", "langsrc", "langtgt", "ps1_judgement_id",
#                       "ps1_id", "ps2_id", "tgt-1_score" , "tgt-2_score", "tgt-1_system" , "tgt-2_system", "tgt-2_berkeley-tree", "tgt-1_berkeley-tree", "src-1_berkeley-tree", "src-2_berkeley-tree",
#                       ])
#SaxJcml2Orange("/home/Eleftherios Avramidis/taraxu_data/selection-mechanism/wmt12qe/app/1/trainset.coupled.jcml", "rank", [], meta_attributes, "/home/Eleftherios Avramidis/taraxu_data/selection-mechanism/wmt12qe/app/1/trainset.coupled.utf8.tab")