1
2
3 '''
4 Created on Jul 21, 2011
5
6 @author: jogin, Eleftherios Avramidis
7 '''
8
9 import codecs
10 import os
11 import sys
12 import tempfile
13 import shutil
14 from xml.sax import make_parser
15 from xml.sax.handler import ContentHandler
16 from sentence.sentence import SimpleSentence
17 from sentence.parallelsentence import ParallelSentence
18 from dataprocessor.input.xmlreader import XmlReader
19
20
"""
Converts a jcml (XML) file into the tab-separated format read by Orange.
The result is first written to a temporary file and then moved to the
output file requested by the caller.
"""
def __init__(self, input_xml_filename, class_name, desired_attributes, meta_attributes, output_file, **kwargs):
    """
    Convert the given jcml file into an Orange tab file.

    The header is produced by SaxJcmlOrangeHeader and the rows by
    SaxJcmlOrangeContent; both write into a temporary file which is moved
    to output_file once both passes have finished.

    @param input_xml_filename: name of input jcml file
    @type input_xml_filename: string
    @param class_name: name of class
    @type class_name: string
    @param desired_attributes: desired attributes
    @type desired_attributes: list of strings
    @param meta_attributes: meta attributes
    @type meta_attributes: list of strings
    @param output_file: name of the tab file to be created
    @type output_file: string
    @keyword compact_mode: passed on to the content handler (default False)
    @keyword discrete_attributes: attributes to be declared as discrete
    @keyword hidden_attributes: attributes to be left out of the output
    @keyword get_nested_attributes: passed on to the header handler (default False)
    @keyword filter_attributes: dict of attribute name -> value; passed on to the content handler
    @keyword class_type: type code written for the class column (default "d")
    @keyword class_discretize: passed on to both handlers (default False)
    @keyword dir: directory for the temporary file (default ".")
    """
    self.get_nested_attributes = kwargs.get("get_nested_attributes", False)
    self.compact_mode = kwargs.get("compact_mode", False)
    self.discrete_attributes = set(kwargs.get("discrete_attributes", ()))
    self.hidden_attributes = set(kwargs.get("hidden_attributes", ()))
    self.filter_attributes = kwargs.get("filter_attributes", {})
    self.class_type = kwargs.get("class_type", "d")
    self.class_discretize = kwargs.get("class_discretize", False)
    self.dir = kwargs.get("dir", ".")

    self.input_filename = input_xml_filename
    self.class_name = class_name
    self.desired_attributes = set(desired_attributes)
    self.meta_attributes = set(meta_attributes)
    self.orange_filename = output_file

    # mkstemp creates the file atomically; mktemp only returns a name that
    # another process could claim before we open it.
    descriptor, self.temporary_filename = tempfile.mkstemp(dir=self.dir, suffix='.tab')
    os.close(descriptor)
    # mkstemp always uses mode 0600; restore the permissions a plain open()
    # would have given so the final file stays readable as before.
    current_umask = os.umask(0)
    os.umask(current_umask)
    os.chmod(self.temporary_filename, 0o666 & ~current_umask)

    self.object_file = codecs.open(self.temporary_filename, encoding='utf-8', mode='w')
    finished = False
    try:
        # first pass over the input writes the header lines
        self.get_orange_header()
        # second pass over the input writes one row per sentence
        self.get_orange_content()
        finished = True
    finally:
        self.object_file.close()
        # do not leave a half-written temporary file behind on failure
        if not finished and os.path.exists(self.temporary_filename):
            os.remove(self.temporary_filename)

    shutil.move(self.temporary_filename, self.orange_filename)
    print('Orange file %s created!' % self.orange_filename)
90
91
92
93
94
def get_orange_header(self):
    """
    Parse the input file with a SaxJcmlOrangeHeader handler, which receives
    self.object_file as the file object to write the header to.
    """
    # NOTE(review): the def line was missing from the reviewed text; the name
    # and the empty parameter list are taken from the call in __init__.
    parser = make_parser()
    header_handler = SaxJcmlOrangeHeader(
        self.object_file,
        self.class_name,
        self.desired_attributes,
        self.meta_attributes,
        self.discrete_attributes,
        self.get_nested_attributes,
        self.class_type,
        self.hidden_attributes,
        self.class_discretize,
    )
    parser.setContentHandler(header_handler)
    # close the input handle deterministically instead of leaking it
    with open(self.input_filename, 'r') as source:
        parser.parse(source)
103
104
def get_orange_content(self):
    """
    Parse the input file with a SaxJcmlOrangeContent handler, which receives
    self.object_file as the file object to write the rows to.
    """
    # NOTE(review): the def line was missing from the reviewed text; the name
    # and the empty parameter list are taken from the call in __init__.
    parser = make_parser()
    content_handler = SaxJcmlOrangeContent(
        self.object_file,
        self.class_name,
        self.meta_attributes,
        self.compact_mode,
        self.filter_attributes,
        self.hidden_attributes,
        self.class_discretize,
    )
    parser.setContentHandler(content_handler)
    # close the input handle deterministically instead of leaking it
    with open(self.input_filename, 'r') as source:
        parser.parse(source)
113
114
123
124
126
127
class SaxJcmlOrangeHeader(ContentHandler):
    """
    SAX handler that collects the attribute names found in a jcml document
    and, when the document ends, writes three tab-separated header lines
    (attribute names, type codes, flags) to the given file object.
    """

    # NOTE(review): the class line and the def lines of this class were
    # missing from the reviewed text. The class name and the order of the
    # constructor arguments follow the call in get_orange_header; the SAX
    # callback signatures follow SaxJcmlOrangeContent. The two defaults below
    # mirror SaxJcmlOrangeContent and the name endDocument is inferred --
    # confirm both against the original file.

    def __init__(self, o_file, class_name, desired_attributes, meta_attributes,
                 discrete_attributes, get_nested_attributes, class_type,
                 hidden_attributes=None, class_discretize=False):
        """
        @param o_file: file object to receive the header lines
        @type o_file: file object
        @param class_name: name of the class attribute
        @type class_name: string
        @param desired_attributes: attributes to be written as features;
        if empty, all non-meta attributes found in the input are used
        @type desired_attributes: set of strings
        @param meta_attributes: attributes to be flagged as meta
        @type meta_attributes: set of strings
        @param discrete_attributes: features to be typed "d" instead of "c"
        @type discrete_attributes: set of strings
        @param get_nested_attributes: if true, attribute names are taken from
        ParallelSentence.get_nested_attributes() instead of only the
        attributes of the sentence element
        @type get_nested_attributes: boolean
        @param class_type: type code written for the class column
        @type class_type: string
        @param hidden_attributes: attributes to be left out of the header
        @type hidden_attributes: set of strings
        @param class_discretize: if true, the class type is forced to 'd'
        """
        self.o_file = o_file
        self.desired_attributes = desired_attributes
        self.meta_attributes = meta_attributes
        self.discrete_attributes = discrete_attributes
        self.hidden_attributes = hidden_attributes if hidden_attributes is not None else set()
        self.class_name = class_name
        self.get_nested_attributes = get_nested_attributes
        self.class_type = 'd' if class_discretize else class_type

        self.attribute_names = set()
        self.number_of_targets = 0

        self.TAG_SENT = 'judgedsentence'
        self.TAG_SRC = 'src'
        self.TAG_TGT = 'tgt'
        self.TAG_DOC = 'jcml'

        self.src = None
        self.tgt = []
        self.ref = None
        self.ps_list = []
        self.is_simple_sentence = False

        self.ss_text = []
        self.ss_attributes = {}
        self.ps_attributes = {}

    def startElement(self, name, attrs):
        """
        Signals the start of an element (simplesentence or parallelsentence)
        @param name: the name of the element
        @type name: string
        @param attrs: object of the Attributes interface containing the
        attributes of the element
        @type attrs: attributes
        """
        if name in (self.TAG_SRC, self.TAG_TGT):
            self.ss_text = []
            self.ss_attributes = dict(
                (att_name, attrs.getValue(att_name)) for att_name in attrs.getNames())
            self.is_simple_sentence = True
        elif name == self.TAG_SENT:
            self.ps_attributes = dict(
                (att_name, attrs.getValue(att_name)) for att_name in attrs.getNames())
            self.tgt = []

    def characters(self, ch):
        """
        The Parser will call this method to report each chunk of character data.
        We use it to store the string of the simplesentence
        @param ch: chunk of character data being parsed
        @type ch: str
        """
        # A parser may deliver the text of one element in several chunks, so
        # the flag stays set until the element ends (see endElement); clearing
        # it here would keep only the first chunk.
        if self.is_simple_sentence:
            self.ss_text.append(ch)

    def endElement(self, name):
        """
        Stores the sentence that is currently ending and, at the end of a
        parallel sentence, records its attribute names and target count.
        @param name: the name of the element
        @type name: string
        """
        if name in (self.TAG_SRC, self.TAG_TGT):
            sentence = SimpleSentence("".join(self.ss_text), self.ss_attributes)
            if name == self.TAG_SRC:
                self.src = sentence
            else:
                self.tgt.append(sentence)
            self.ss_text = []
            self.is_simple_sentence = False
        elif name == self.TAG_SENT:
            # the widest sentence decides how many tgt columns the header gets
            self.number_of_targets = max(self.number_of_targets, len(self.tgt))
            ps = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)
            self.src = u''
            self.tgt = []
            self.ref = u''
            if self.get_nested_attributes:
                found = ps.get_nested_attributes()
            else:
                found = self.ps_attributes
            self.attribute_names.update(str(attribute) for attribute in found)
            self.ps_attributes = {}

    def endDocument(self):
        """
        Writes the three header lines to the output file object and stores
        the collected attribute names, one per line, in the file
        'attribute_names.dat' in the current directory, from where
        SaxJcmlOrangeContent reads them.
        """
        notfound = set(self.desired_attributes) - self.attribute_names
        if notfound:
            sys.stderr.write('Warning: Following desired attributes were not found in input file:\n{0}\n'.format(notfound))

        # no explicit selection: every non-meta attribute becomes a feature
        if not self.desired_attributes:
            self.desired_attributes = self.attribute_names - set(self.meta_attributes)

        names = []
        types = []
        flags = []
        for attribute_name in self.attribute_names:
            if attribute_name in self.hidden_attributes:
                continue
            names.append(attribute_name)

            if attribute_name == self.class_name:
                types.append("%s" % self.class_type)
                flags.append("c")
            elif (attribute_name in self.desired_attributes
                  and attribute_name not in self.meta_attributes):
                types.append("d" if attribute_name in self.discrete_attributes else "c")
                if "id" in attribute_name:
                    sys.stderr.write('One of the given features, {} seems to be a unique identifier\n'.format(attribute_name))
                # ordinary features carry no flag
                flags.append("")
            else:
                types.append("s")
                flags.append("m")

        # the sentence strings follow the attributes as meta string columns
        text_columns = ["src"]
        text_columns.extend("tgt-" + str(i + 1) for i in range(self.number_of_targets))
        text_columns.append("ref")
        names.extend(text_columns)
        types.extend(["string"] * len(text_columns))
        flags.extend(["m"] * len(text_columns))

        # every cell, including the last one, is followed by a tab
        self.o_file.write("".join(
            "\t".join(cells) + "\t\n" for cells in (names, types, flags)))

        # hidden attributes are written here too; the content handler skips them itself
        with open('attribute_names.dat', 'w') as f:
            for attribute_name in self.attribute_names:
                f.write(attribute_name + '\n')
300
301
class SaxJcmlOrangeContent(ContentHandler):
    """
    SAX handler that writes one tab-separated row per parallel sentence of a
    jcml document to the given file object. The column order is read from
    the file 'attribute_names.dat' in the current directory.
    """

    def __init__(self, o_file, class_name, meta_attributes, compact_mode=False, filter_attributes=None, hidden_attributes=None, class_discretize=False):
        """
        @param o_file: file object to receive the rows
        @type o_file: file object
        @param class_name: name of the class attribute
        @type class_name: string
        @param meta_attributes: meta attributes (accepted but not used by this handler)
        @type meta_attributes: list of strings
        @param compact_mode: if true, the sentence text is not collected
        @type compact_mode: boolean
        @param filter_attributes: attribute name -> value; sentences whose
        attribute equals the given value are not written
        @type filter_attributes: dict
        @param hidden_attributes: attributes to be left out of the rows
        @type hidden_attributes: list of strings
        @param class_discretize: if set, the class value is rounded to the
        nearest multiple of this number
        """
        # None defaults avoid sharing one mutable object between instances
        self.filter_attributes = filter_attributes if filter_attributes is not None else {}
        self.compact_mode = compact_mode
        self.o_file = o_file
        self.is_simple_sentence = False
        self.class_name = class_name
        self.set_tags()
        self.hidden_attributes = hidden_attributes if hidden_attributes is not None else []
        self.class_discretize = class_discretize

        # NOTE(review): fixed file name in the current directory; an empty
        # file must give no columns, not a single column with an empty name
        with open('attribute_names.dat', 'r') as f:
            self.attribute_names = [line for line in f.read().strip().split('\n') if line]
        os.remove('attribute_names.dat')

    def set_tags(self):
        """
        Handles the basic tags used for reading the simple XML format.
        As tags are prone to changes, this can be done by changing values here,
        or overriding accordingly
        """
        self.TAG_DOC = 'jcml'
        self.TAG_SENT = 'judgedsentence'
        self.TAG_SRC = 'src'
        self.TAG_TGT = 'tgt'

        self.src = None
        self.tgt = []
        self.ref = None
        self.ps_list = []

        self.ss_text = []
        self.ss_attributes = {}
        self.ps_attributes = {}

    def startElement(self, name, attrs):
        """
        Signals the start of an element (simplesentence or parallelsentence)
        @param name: the name of the element
        @type name: string
        @param attrs: object of the Attributes interface containing the
        attributes of the element
        @type attrs: attributes
        """
        if name in (self.TAG_SRC, self.TAG_TGT):
            self.ss_text = []
            self.ss_attributes = dict(
                (att_name, attrs.getValue(att_name)) for att_name in attrs.getNames())
            self.is_simple_sentence = True
        elif name == self.TAG_SENT:
            self.ps_attributes = dict(
                (att_name, attrs.getValue(att_name)) for att_name in attrs.getNames())
            self.tgt = []

    def characters(self, ch):
        """
        The Parser will call this method to report each chunk of character data.
        We use it to store the string of the simplesentence
        @param ch: chunk of character data being parsed
        @type ch: str
        """
        # A parser may deliver the text of one element in several chunks, so
        # the flag stays set until the element ends (see endElement); clearing
        # it here would keep only the first chunk.
        if self.is_simple_sentence and not self.compact_mode:
            self.ss_text.append(ch)

    @staticmethod
    def _sanitize_value(value):
        """
        Strips an attribute value and replaces the special values inf/nan by
        plain numbers. Only whole values are replaced, so a text value that
        merely contains "inf" or "nan" is left untouched.
        """
        value = value.strip()
        special = {"inf": "99999999", "+inf": "99999999", "-inf": "-99999999", "nan": "0"}
        return special.get(value.lower(), value)

    @staticmethod
    def _flatten_text(text):
        """
        Replaces tabs and line breaks by spaces so that a sentence cannot
        break the row/column layout of the tab file.
        """
        for separator in ("\t", "\r", "\n"):
            text = text.replace(separator, " ")
        return text

    def endElement(self, name):
        """
        Saves the data from an element that is currently ending.
        @param name: the name of the element
        @type name: string
        """
        if name in (self.TAG_SRC, self.TAG_TGT):
            sentence = SimpleSentence("".join(self.ss_text), self.ss_attributes)
            if name == self.TAG_SRC:
                self.src = sentence
            else:
                self.tgt.append(sentence)
            self.ss_text = []
            self.is_simple_sentence = False
            return
        if name != self.TAG_SENT:
            return

        ps = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)
        self.src = u''
        self.tgt = []
        self.ref = u''
        self.ps_attributes = {}

        # sentences matching any of the filters are not written
        for fatt in self.filter_attributes:
            if ps.get_attribute(fatt) == self.filter_attributes[fatt]:
                return

        # fetched once per sentence rather than once per attribute
        ps_nested_attributes = ps.get_nested_attributes()
        cells = []
        for attribute_name in self.attribute_names:
            if attribute_name in self.hidden_attributes:
                continue
            if attribute_name not in ps_nested_attributes:
                # keep the column, leave the cell empty
                cells.append("")
            elif attribute_name == self.class_name and self.class_discretize:
                attvalue = float(ps_nested_attributes[attribute_name].strip())
                attvalue = round(attvalue / self.class_discretize) * self.class_discretize
                cells.append(str(attvalue))
            else:
                cells.append(self._sanitize_value(ps_nested_attributes[attribute_name]))

        cells.append(self._flatten_text(ps.get_source().get_string()))
        for tgt in ps.get_translations():
            cells.append(self._flatten_text(tgt.get_string()))
        # the reference column is left empty
        cells.append("")

        # every cell, including the last one, is followed by a tab
        self.o_file.write("\t".join(cells) + "\t\n")
444
445
446
447
448
449