1 '''
2 Created on 26 Jun 2012
3
4 @author: Eleftherios Avramidis
5 '''
6
import codecs
import os
import shutil
import sys
import tempfile

try:
    from xml.etree.cElementTree import iterparse
except ImportError:
    # xml.etree.cElementTree was removed in Python 3.9
    from xml.etree.ElementTree import iterparse
12
14 """
This class converts the jcml format to the tab format (Orange format).
The output is first written to a temporary file and then moved to the given output file.
17 """
def __init__(self, input_xml_filename, class_name, desired_attributes, meta_attributes, output_file, **kwargs):
    """
    Convert the given jcml file into an Orange tab file.

    The header and the content are produced by get_orange_header() and
    get_orange_content(). Both write into a temporary file inside C{dir},
    which is moved to C{output_file} once both have finished.

    @param input_xml_filename: name of input jcml file
    @type input_xml_filename: string
    @param class_name: name of class
    @type class_name: string
    @param desired_attributes: desired attributes
    @type desired_attributes: list of strings
    @param meta_attributes: meta attributes
    @type meta_attributes: list of strings
    @param output_file: name of the tab file to be created
    @type output_file: string
    @keyword compact_mode: write only the desired attributes and the class,
    without the source and target text columns (default False)
    @keyword discrete_attributes: desired attributes that get type "d"
    instead of "c" (default [])
    @keyword hidden_attributes: attributes left out of the output (default [])
    @keyword filter_attributes: attribute name -> value; sentences whose
    attribute equals the value are not written (default {})
    @keyword class_type: type written for the class column (default 'd')
    @keyword class_discretize: if set, the class value is rounded to the
    nearest multiple of this number (default False)
    @keyword dir: directory for the temporary file (default '.')
    @keyword remove_infinite: replace "inf" by "99999999" and "nan" by "0"
    in attribute values (default False)
    @keyword nullimputation: write "0" for attributes a sentence does not
    have (default False)
    """
    self.TAG_SENT = 'judgedsentence'
    self.TAG_SRC = 'src'
    self.TAG_TGT = 'tgt'
    self.TAG_DOC = 'jcml'

    self.compact_mode = kwargs.get('compact_mode', False)
    self.discrete_attributes = kwargs.get('discrete_attributes', [])
    self.hidden_attributes = kwargs.get('hidden_attributes', [])
    self.filter_attributes = kwargs.get('filter_attributes', {})
    self.class_type = kwargs.get('class_type', 'd')
    self.class_discretize = kwargs.get('class_discretize', False)
    self.dir = kwargs.get('dir', '.')
    self.remove_infinite = kwargs.get('remove_infinite', False)
    self.nullimputation = kwargs.get('nullimputation', False)
    sys.stderr.write("Imputation {}\n".format(self.nullimputation))

    self.input_filename = input_xml_filename
    self.class_name = class_name
    self.desired_attributes = set(desired_attributes)
    self.meta_attributes = set(meta_attributes)
    self.orange_filename = output_file

    # NOTE(review): tempfile.mktemp is deprecated because another process
    # can claim the name before the file is opened. It is kept because
    # mkstemp creates the file with mode 0600, which the moved output file
    # would then inherit -- confirm whether that is acceptable before switching.
    self.temporary_filename = tempfile.mktemp(dir=self.dir, suffix='.tab')
    self.object_file = codecs.open(self.temporary_filename, encoding='utf-8', mode='w')

    finished = False
    try:
        self.get_orange_header()
        self.get_orange_content()
        finished = True
    finally:
        # always release the handle; a half-written temporary file is of
        # no use to anybody, so drop it when the conversion failed
        self.object_file.close()
        if not finished:
            try:
                os.remove(self.temporary_filename)
            except OSError:
                # best-effort cleanup, the original error is the one to report
                pass

    shutil.move(self.temporary_filename, self.orange_filename)
    sys.stdout.write('Orange file %s created!\n' % self.orange_filename)
69
70
71
72
def get_orange_header(self):
    """
    Collect the attribute names of the input file and write the header.

    Stores the result of _get_attribute_names() in C{self.attribute_names}
    and C{self.number_of_targets}, then writes the text returned by
    _get_header_text() to C{self.object_file}.
    """
    # the header text is built from the two values stored here, so they
    # have to be in place before _get_header_text() is called
    self.attribute_names, self.number_of_targets = self._get_attribute_names()
    self.object_file.write(self._get_header_text())
79
80
def _get_attribute_names(self):
    '''
    Parse the input XML file once and collect the names of all attributes.

    Attributes of a sentence element keep their name, attributes of the
    source element are prefixed with "src_" and attributes of the n-th
    target element of a sentence (counting from 1) with "tgt-n_".

    @return: the attribute names and the highest number of target elements
    found in a single sentence
    @rtype: tuple of (set of strings, int)
    '''
    attribute_names = set()
    number_of_targets = 0
    target_id = 0

    # binary mode lets the parser honour the encoding declared in the XML;
    # the with-block closes the file even when parsing fails
    with open(self.input_filename, "rb") as source_xml_file:
        context = iter(iterparse(source_xml_file, events=("start", "end")))

        # the first event is the start of the root element; keep a handle
        # on it so that processed sentences can be released from memory
        _, root = next(context)

        for event, elem in context:
            if event == "start":
                if elem.tag == self.TAG_SENT:
                    attribute_names.update(elem.attrib.keys())
                    target_id = 0
                elif elem.tag == self.TAG_SRC:
                    attribute_names.update("src_{}".format(key) for key in elem.attrib.keys())
                elif elem.tag == self.TAG_TGT:
                    target_id += 1
                    attribute_names.update("tgt-{0}_{1}".format(target_id, key) for key in elem.attrib.keys())
            elif elem.tag == self.TAG_SENT:
                # end of a sentence: target_id now holds its number of targets
                number_of_targets = max(number_of_targets, target_id)
                root.clear()

    return attribute_names, number_of_targets
116
117
119
def _get_header_text(self):
    """
    Build the three header lines of the tab file.

    The first line holds the column names, the second the column types
    and the third the column flags. Hidden attributes get no column; in
    compact mode only the desired attributes and the class get one.
    Outside compact mode a "src" column and one "tgt-n" column per target
    are appended for the sentence text.

    If no desired attributes were given, C{self.desired_attributes} is set
    to all attributes that are not meta attributes.

    @return: the header text, terminated by a newline
    @rtype: string
    @raise IndexError: if a desired attribute does not occur in the input file
    """
    missing = set(self.desired_attributes) - self.attribute_names
    if missing:
        errortext = 'Error: Following desired attributes weren\'t found in input file:\n\t{0}'.format("\n\t".join(sorted(missing)))
        sys.stderr.write(errortext + "\n")
        raise IndexError(errortext)

    # no explicit selection: everything that is not meta becomes a feature
    if not self.desired_attributes:
        self.desired_attributes = self.attribute_names - self.meta_attributes

    names = []
    types = []
    flags = []

    for attribute_name in self.attribute_names:
        if attribute_name in self.hidden_attributes:
            continue

        is_class = attribute_name == self.class_name
        is_desired = attribute_name in self.desired_attributes
        if self.compact_mode and not is_desired and not is_class:
            continue

        names.append(attribute_name)

        if is_class:
            types.append(self.class_type)
            flags.append("c")
        elif is_desired and attribute_name not in self.meta_attributes:
            # a feature: discrete ("d") if declared so, otherwise continuous ("c")
            types.append("d" if attribute_name in self.discrete_attributes else "c")
            if attribute_name == "id" or any(mark in attribute_name for mark in ("_id", "-id", ".id")):
                sys.stderr.write('Warning: One of the given features, {} seems to be a unique identifier\n'.format(attribute_name))
            flags.append("")
        else:
            # everything else is kept as a string-typed meta column
            types.append("s")
            flags.append("m")

    if not self.compact_mode:
        names.append("src")
        types.append("string")
        flags.append("m")

        for position in range(1, self.number_of_targets + 1):
            names.append("tgt-{0}".format(position))
            types.append("string")
            flags.append("m")

    return "\n".join("\t".join(line) for line in (names, types, flags)) + "\n"
195
196
198
def get_orange_content(self):
    """
    Parse the input XML file and write one line per sentence.

    For every sentence element the attributes of the sentence, of its
    source element (prefixed with "src_") and of its target elements
    (prefixed with "tgt-n_", n counting from 1) are gathered in one
    dictionary. At the end of the sentence the dictionary is handed to
    _write_orange_line() together with the source text and the list of
    target texts; an element without text contributes an empty string.
    """
    attributes = {}
    src_text = ""
    tgt_text = []
    target_id = 0

    # binary mode lets the parser honour the encoding declared in the XML;
    # the with-block makes sure the file is closed again
    with open(self.input_filename, "rb") as source_xml_file:
        context = iter(iterparse(source_xml_file, events=("start", "end")))

        # the first event is the start of the root element; keep a handle
        # on it so that written sentences can be released from memory
        _, root = next(context)

        for event, elem in context:
            tag = elem.tag
            if event == "start":
                if tag == self.TAG_SENT:
                    # copy, so the element itself is not modified, and
                    # start from scratch so that nothing of the previous
                    # sentence leaks into this one
                    attributes = dict(elem.attrib)
                    src_text = ""
                    tgt_text = []
                    target_id = 0
                elif tag == self.TAG_SRC:
                    for key, value in elem.attrib.items():
                        attributes["src_{}".format(key)] = value
                elif tag == self.TAG_TGT:
                    target_id += 1
                    for key, value in elem.attrib.items():
                        attributes["tgt-{0}_{1}".format(target_id, key)] = value
            elif tag == self.TAG_SRC:
                # the text is only complete at the end event
                src_text = elem.text or ""
            elif tag == self.TAG_TGT:
                tgt_text.append(elem.text or "")
            elif tag == self.TAG_SENT:
                # compare with ==: a substring test would also fire for
                # any tag that happens to be part of the sentence tag
                self._write_orange_line(attributes, src_text, tgt_text)
                root.clear()
238
239
240
242
def _write_orange_line(self, ps_nested_attributes, src_text, tgt_text):
    """
    Write the line of one sentence to the output file.

    Nothing is written if one of the filter attributes has the filtered
    value. Otherwise one cell per attribute name is written, following
    the same selection as the header (hidden attributes are left out, in
    compact mode only the desired attributes and the class are kept).
    Outside compact mode the source text and the target texts follow.
    Every cell is followed by a tab and the line ends with a newline.

    @param ps_nested_attributes: the attributes of the sentence
    @type ps_nested_attributes: dict of string -> string
    @param src_text: text of the source element, None counts as empty
    @type src_text: string
    @param tgt_text: texts of the target elements, None counts as empty
    @type tgt_text: list of strings
    """
    for fatt, filtered_value in self.filter_attributes.items():
        # a sentence that lacks the attribute cannot match the filter
        if fatt in ps_nested_attributes and ps_nested_attributes[fatt] == filtered_value:
            return

    cells = []
    for attribute_name in self.attribute_names:
        if attribute_name in self.hidden_attributes:
            continue
        if self.compact_mode and attribute_name not in self.desired_attributes and attribute_name != self.class_name:
            continue

        if attribute_name not in ps_nested_attributes:
            # missing value: an empty cell, or "0" when imputation is on
            cells.append('0' if self.nullimputation else '')
            continue

        attvalue = ps_nested_attributes[attribute_name].strip()
        if attribute_name == self.class_name and self.class_discretize:
            # snap the class value to the nearest multiple of the step;
            # float() keeps the written form the same where round()
            # returns an integer
            steps = float(round(float(attvalue) / self.class_discretize))
            attvalue = str(steps * self.class_discretize)
        elif self.remove_infinite:
            # NOTE(review): this replaces every occurrence inside the
            # value, so string values that merely contain "inf" or "nan"
            # are altered as well -- confirm this is intended
            attvalue = attvalue.replace("inf", "99999999")
            attvalue = attvalue.replace("nan", "0")
        cells.append(attvalue)

    if not self.compact_mode:
        cells.append(src_text if src_text is not None else "")
        cells.extend(tgt if tgt is not None else "" for tgt in tgt_text)

    line = "".join(["{0}\t".format(cell) if False else cell + "\t" for cell in cells]) + "\n"
    self.object_file.write(line)
286