1 '''
2 Created on 26 Jun 2012
3
4 @author: Eleftherios Avramidis
5 '''
6
7 import codecs
8 import sys
9 import tempfile
10 import shutil
11 import numpy as np
12 import logging as log
13
14 from collections import defaultdict, OrderedDict
15 from xml.etree.cElementTree import iterparse
16 from sentence.sentence import SimpleSentence
17 from sentence.parallelsentence import ParallelSentence
18 from sentence.dataset import DataSet
19
20
22 """
23 This class converts jcml format to tab format (orange format).
24 The output file is saved to the same folder where input file is.
25 """
def __init__(self, input_xml_filename, **kwargs):
    """
    Remember the input path, the JCML element names and the attribute
    filters. Nothing is opened or parsed by the constructor.

    @param input_xml_filename: name of input jcml file
    @type input_xml_filename: string
    @keyword desired_general: attribute names of interest on the
        sentence element
        (default: rank, langsrc, langtgt, id, judgement_id)
    @type desired_general: list of strings
    @keyword desired_source: attribute names of interest on the source
        element (default: empty list)
    @type desired_source: list of strings
    @keyword desired_target: attribute names of interest on the target
        elements (default: system, rank)
    @type desired_target: list of strings
    @keyword all_general: flag stored as C{self.all_general}
        (default: False)
    @type all_general: boolean
    @keyword all_target: flag stored as C{self.all_target}
        (default: False)
    @type all_target: boolean
    """
    self.input_filename = input_xml_filename

    # Names of the XML elements that make up a JCML document.
    self.TAG_DOC = 'jcml'
    self.TAG_SENT = 'judgedsentence'
    self.TAG_SRC = 'src'
    self.TAG_TGT = 'tgt'

    # Option name -> fallback used when the caller does not pass it.
    # The table is rebuilt on every call, so the list defaults are never
    # shared between instances.
    fallbacks = (
        ('desired_general', ["rank", "langsrc", "langtgt", "id", "judgement_id"]),
        ('desired_source', []),
        ('desired_target', ["system", "rank"]),
        ('all_general', False),
        ('all_target', False),
    )
    for option, fallback in fallbacks:
        setattr(self, option, kwargs.get(option, fallback))
52
54 parallelsentences = []
55 source_xml_file = open(self.input_filename, "r")
56
57 context = iterparse(source_xml_file, events=("start", "end"))
58
59 context = iter(context)
60
61 event, root = context.next()
62
63 attributes = []
64 target_id = 0
65
66
67
68
69
70 for event, elem in context:
71
72 if event == "start" and elem.tag == self.TAG_SENT:
73 if not self.all_general:
74 attributes = dict([(key, value) for key, value in elem.attrib.iteritems() if (key in self.desired_general or self.all_general)])
75
76
77
78
79
80 elif event == "start" and elem.tag == self.TAG_TGT:
81 target_id += 1
82 target_attributes = dict([(key, value) for key, value in elem.attrib.iteritems() if (key in self.desired_target or self.all_target)])
83 targets.append(SimpleSentence("", target_attributes))
84
85
86
87
88
89
90
91 elif event == "end" and elem.tag in self.TAG_SENT:
92 source = SimpleSentence("",{})
93 parallelsentence = ParallelSentence(source,targets,None,attributes)
94 parallelsentences.append(parallelsentence)
95
96 root.clear()
97
98
99 return DataSet(parallelsentences)
100
102 parallelsentences = []
103 source_file = open(self.input_filename, "r")
104
105 context = iterparse(source_file, events=("start", "end"))
106
107 context = iter(context)
108
109 event, root = context.next()
110
111 attributes = []
112 target_id = 0
113
114 src_text = ""
115 tgt_text = ""
116
117
118 for event, elem in context:
119
120 if event == "start" and elem.tag == self.TAG_SENT:
121 attributes = dict([(key, value) for key, value in elem.attrib.iteritems() if key in self.desired_general or self.all_general])
122 targets = []
123
124 elif event == "start" and elem.tag == self.TAG_SRC:
125 source_attributes = dict([(key, value) for key, value in elem.attrib.iteritems() if key in self.desired_source or self.all_target])
126
127
128 elif event == "start" and elem.tag == self.TAG_TGT:
129 target_id += 1
130 target_attributes = dict([(key, value) for key, value in elem.attrib.iteritems() if key in self.desired_target or self.all_target])
131
132 elif not compact and event == "end" and elem.tag == self.TAG_SRC and elem.text:
133 src_text = elem.text
134
135 elif event == "end" and elem.tag == self.TAG_TGT:
136 if not compact and elem.text:
137 tgt_text = elem.text
138 else:
139 tgt_text = ""
140 targets.append(SimpleSentence(tgt_text, target_attributes))
141
142 elif event == "end" and elem.tag in self.TAG_SENT:
143 source = SimpleSentence(src_text, source_attributes)
144 parallelsentence = ParallelSentence(source, targets, None, attributes)
145 log.debug("cejml.py: Just process sentence {}".format(parallelsentence.get_attribute("judgement_id")))
146 yield parallelsentence
147 root.clear()
148 source_file.close()
149
150
def fix(self, value):
    """
    Replace non-finite number spellings inside an attribute value.

    When the instance flag C{remove_infinite} is true, every occurrence of
    the substring "inf" is replaced by "9999999" and every occurrence of
    "nan" by "0". This is plain substring replacement, so any text that
    merely contains those letters is rewritten as well. When the flag is
    false or has never been set on the instance, the value is returned
    unchanged.

    @param value: attribute value to clean
    @type value: string
    @return: the cleaned value, or the original one if the flag is off
    @rtype: string
    """
    # The constructor visible in this file never assigns remove_infinite,
    # so a missing flag is treated as "off" rather than raising
    # AttributeError.
    # NOTE(review): the nesting of both replacements under the flag is
    # reconstructed from a copy of the file that lost its indentation --
    # confirm that "nan" should only be rewritten when the flag is set.
    if getattr(self, "remove_infinite", False):
        value = value.replace("inf", "9999999").replace("nan", "0")
    return value
156
157
158
159
160
161
163 """calculates statistics about specified attributes on an annotated JCML corpus. Low memory load"""
164
def __init__(self, input_xml_filename, **kwargs):
    """
    Remember the input path, the JCML element names and the requested
    attribute lists. Nothing is opened or parsed by the constructor.

    @param input_xml_filename: name of input jcml file
    @type input_xml_filename: string
    @keyword desired_general: attribute names of interest on the
        sentence element (default: empty list)
    @type desired_general: list of strings
    @keyword desired_source: attribute names of interest on the source
        element (default: empty list)
    @type desired_source: list of strings
    @keyword desired_target: attribute names of interest on the target
        elements (default: empty list)
    @type desired_target: list of strings
    @keyword desired_ref: attribute names of interest on the reference
        element (default: empty list)
    @type desired_ref: list of strings
    """
    # Names of the XML elements that make up a JCML document.
    self.TAG_SENT = 'judgedsentence'
    self.TAG_SRC = 'src'
    self.TAG_TGT = 'tgt'
    self.TAG_DOC = 'jcml'
    # The attribute-vector extraction compares element tags against
    # self.TAG_REF, which was never defined and raised AttributeError on
    # the first element that is not a sentence, source or target.
    # A value defined elsewhere (e.g. on the class) is left untouched.
    # NOTE(review): 'ref' follows the src/tgt naming -- confirm against
    # the JCML schema.
    if not hasattr(self, 'TAG_REF'):
        self.TAG_REF = 'ref'

    self.input_filename = input_xml_filename
    self.desired_general = kwargs.setdefault("desired_general", [])
    self.desired_source = kwargs.setdefault("desired_source", [])
    self.desired_target = kwargs.setdefault("desired_target", [])
    self.desired_ref = kwargs.setdefault("desired_ref", [])
177
178
180 try:
181 values = np.asarray([float(v) for v in values])
182 print "{}\t{:5.3f}\t{:5.3f}\t{:5.3f}\t{:5.3f}".format(key,
183 np.average(values),
184 np.std(values),
185 np.min(values),
186 np.max(values)
187 )
188 except ValueError:
189 print "[{}] : distinct values ".format(key)
190
191
193 general_attributes, source_attributes, target_attributes, ref_attributes = self.get_attribute_vectors()
194
195 print "Source:"
196
197 print '"{}"'.format('","'.join([key for key in source_attributes.iterkeys() if not key.endswith("_ratio") and not key.startswith("q_")]))
198
199 print "\n Target:"
200
201 target_attributes = OrderedDict(sorted(target_attributes.iteritems(), key=lambda t: t[0]))
202 print '"{}"'.format('","'.join([key for key in target_attributes.iterkeys() if not key.endswith("_ratio") and not key.startswith("q_")]))
203
204 print
205
206 for key, value in general_attributes.iteritems():
207 print "General attributes:\n"
208 self._print_statistics(key, value)
209
210 for key, value in source_attributes.iteritems():
211 print "Source attributes:\n"
212 self._print_statistics(key, value)
213
214 for key, value in target_attributes.iteritems():
215 print "Target attributes:\n"
216 self._print_statistics(key, value)
217
218
219
220
222 """
223 Extract a list of values for each attribute
224 """
225
226 source_xml_file = open(self.input_filename, "r")
227
228 context = iterparse(source_xml_file, events=("start", "end"))
229
230 context = iter(context)
231
232 event, root = context.next()
233
234 general_attributes = defaultdict(list)
235 source_attributes = defaultdict(list)
236 target_attributes = defaultdict(list)
237 ref_attributes = defaultdict(list)
238
239 for event, elem in context:
240
241 if event == "start" and elem.tag == self.TAG_SENT:
242 for key, value in elem.attrib.iteritems():
243
244 general_attributes[key].append(value)
245
246
247 elif event == "start" and elem.tag == self.TAG_SRC:
248 for key, value in elem.attrib.iteritems():
249
250 source_attributes[key].append(value)
251
252
253 elif event == "start" and elem.tag == self.TAG_TGT:
254 for key, value in elem.attrib.iteritems():
255
256 target_attributes[key].append(value)
257
258 elif event == "start" and elem.tag == self.TAG_REF:
259 for key, value in elem.attrib.iteritems():
260
261 ref_attributes[key].append(value)
262
263 root.clear()
264
265 source_xml_file.close()
266
267 return general_attributes, source_attributes, target_attributes, ref_attributes
268