1 '''
2 Created on 27 Aug 2012
3
4 @author: Eleftherios Avramidis
5 '''
6 from xml.etree.cElementTree import iterparse
7
# Element names looked up while streaming through the XML file.
# Each sentence element starts a new instance group (new qid).
TAG_SENT = 'judgedsentence'
# Attributes of source elements are exposed with a "src_" prefix.
TAG_SRC = 'src'
# Attributes of target elements are exposed with a "tgt_" prefix; every
# target element yields one instance.
TAG_TGT = 'tgt'
# Presumably the document root element; not referenced by the code visible
# in this module -- TODO confirm.
TAG_DOC = 'jcml'
12 import sys
13 import math
14
26
# NOTE(review): the `def` line that owns this body is missing from this
# listing, so the function name is unknown here. The body reads two names it
# does not define, `parallelsentence` and `attribute_names`, which are
# presumably its parameters -- confirm against the original file.
#
# Purpose: turn one parallel sentence into a `(label, features)` pair, where
# `label` is the integer value of the "rank" attribute and `features` is a
# list of `(attribute_position, float_value)` tuples.
28 current_attributes = parallelsentence.get_nested_attributes()
29 label = current_attributes["rank"]
# The label is removed so that it is not emitted as a feature as well.
# NOTE(review): this mutates the mapping returned by get_nested_attributes();
# whether that is a private copy cannot be told from here.
30 del(current_attributes["rank"])
31 new_attributes = []
# NOTE(review): iteritems() exists only on Python 2 dictionaries.
32 for att, value in current_attributes.iteritems():
# The feature id is the 0-based position of the name in `attribute_names`,
# stored as a float. list.index raises ValueError for a name that is not in
# the list, and that error is not caught here.
33 att_id = 1.0 * float(attribute_names.index(att))
34 try:
35 value = float(value)
# Values that cannot be converted to float are left out of the feature list.
36 except:
37 continue
38 new_attributes.append((att_id, 1.0 * value))
39 instance = (int(label), new_attributes)
40 return instance
41
42
def get_attribute_names(input_xml_filename):
    '''
    Parse once the given XML file and return a set with the attribute names.

    Attributes found on sentence elements (TAG_SENT) are reported under their
    own name; attributes found on source (TAG_SRC) and target (TAG_TGT)
    elements are reported with a "src_" and a "tgt_" prefix respectively.
    Attributes of the root element itself are not collected.

    @param input_xml_filename: The XML file to be parsed
    @return: set with the attribute names seen in the file
    '''
    attribute_names = set()
    # Binary mode leaves decoding to the XML parser, and the context manager
    # closes the file even when parsing fails half-way through.
    with open(input_xml_filename, "rb") as source_xml_file:
        context = iter(iterparse(source_xml_file, events=("start", "end")))
        # The first event is the start of the root element; keep a handle on
        # it so that sentences can be released once they have been read.
        # The next() builtin works on Python 2.6+ as well as Python 3.
        event, root = next(context)
        for event, elem in context:
            if event == "start":
                # element attributes are already defined on "start" events
                if elem.tag == TAG_SENT:
                    attribute_names.update(elem.attrib)
                elif elem.tag == TAG_SRC:
                    attribute_names.update(
                        "src_{}".format(key) for key in elem.attrib)
                elif elem.tag == TAG_TGT:
                    attribute_names.update(
                        "tgt_{}".format(key) for key in elem.attrib)
            elif elem.tag == TAG_SENT:
                # the sentence is complete: drop what has been parsed so far
                # so that memory use does not grow with the size of the file
                root.clear()
    return attribute_names
78
79
81
def read_file_incremental(input_xml_filename, **kwargs):
    '''
    Read the given XML file sentence by sentence and convert every target
    element into a labelled sparse feature vector.

    Each target (TAG_TGT) element yields one instance. Its candidate features
    are the attributes of the enclosing sentence (TAG_SENT) element, the
    attributes of the source (TAG_SRC) element of the same sentence prefixed
    with "src_", and its own attributes prefixed with "tgt_". Only attributes
    whose name is in the list of attribute names in use become features; the
    feature id is the 1-based position of the name in that list.

    @param input_xml_filename: The XML file to be parsed
    @keyword desired_attributes: names of the attributes to be used as
    features. If empty, or if none of them occurs in the file, all attributes
    of the file minus meta_attributes are used, in alphabetical order
    @keyword meta_attributes: names to be excluded when falling back to all
    attributes of the file (default: none)
    @keyword class_name: name of the attribute holding the label
    (default "tgt_rank")
    @keyword group_test: if True, return one list of instances per sentence
    instead of one flat list (default False)
    @keyword id_start: the first sentence gets the id id_start + 1 (default 0)
    @keyword impute: if True, non-numeric values become 0.0, otherwise they
    are left out (default True)
    @keyword remove_inf: if True, NaN becomes a zero and infinite values
    become +/-99999 (default True)
    @return: list of (label, [(feature_id, value), ...], sentence_id) tuples,
    or a list of such lists if group_test is set
    @raise KeyError: if a target has no attribute named class_name
    @raise ValueError: if a label cannot be converted to int
    '''
    desired_attributes = kwargs.get("desired_attributes", [])
    class_name = kwargs.get("class_name", "tgt_rank")
    group_test = kwargs.get("group_test", False)
    id_start = kwargs.get("id_start", 0)
    impute = kwargs.get("impute", True)
    remove_inf = kwargs.get("remove_inf", True)

    existing_attribute_names = get_attribute_names(input_xml_filename)

    usable_attribute_names = set()
    if desired_attributes:
        requested_names = set(desired_attributes)
        missing_attribute_names = requested_names - existing_attribute_names
        usable_attribute_names = requested_names & existing_attribute_names
        if missing_attribute_names:
            sys.stderr.write("could not find attributes {}".format("\n\t".join(missing_attribute_names)))
        # NOTE(review): the order given by the caller is kept as it is; only
        # the fallback list below is sorted -- confirm that this is the
        # intended feature numbering.
        attribute_names = desired_attributes

    if not usable_attribute_names:
        # nothing requested, or nothing of what was requested is in the file:
        # fall back to everything the file offers, minus the meta attributes
        meta_attributes = kwargs.get("meta_attributes", [])
        attribute_names = sorted(existing_attribute_names - set(meta_attributes))

    # One dictionary lookup per attribute instead of a list scan. setdefault
    # keeps the first position of a repeated name, exactly as list.index does.
    feature_ids = {}
    for position, name in enumerate(attribute_names, 1):
        feature_ids.setdefault(name, position)

    instances = []
    instancegroups = []
    general_attributes = {}
    source_attributes = []
    i = id_start

    # Binary mode leaves decoding to the XML parser, and the context manager
    # closes the file even when an error interrupts the conversion.
    with open(input_xml_filename, "rb") as source_xml_file:
        context = iter(iterparse(source_xml_file, events=("start", "end")))
        # The first event is the start of the root element.
        # The next() builtin works on Python 2.6+ as well as Python 3.
        event, root = next(context)

        for event, elem in context:
            if event == "start":
                if elem.tag == TAG_SENT:
                    # new sentence: new id, nothing inherited from the last one
                    general_attributes = dict(elem.attrib)
                    source_attributes = []
                    i += 1

                elif elem.tag == TAG_SRC:
                    source_attributes = [("src_{}".format(key), value)
                                         for key, value in elem.attrib.items()]

                elif elem.tag == TAG_TGT:
                    attributes = dict(source_attributes)
                    attributes.update(("tgt_{}".format(key), value)
                                      for key, value in elem.attrib.items())
                    # sentence-level attributes win over equally named ones
                    attributes.update(general_attributes)
                    # the label must not show up among the features
                    label = attributes.pop(class_name)

                    new_attributes = []
                    for att, value in attributes.items():
                        att_id = feature_ids.get(att)
                        if att_id is None:
                            # not one of the attributes in use
                            continue
                        try:
                            value = float(value)
                        except (TypeError, ValueError):
                            if not impute:
                                continue
                            value = 0
                        if remove_inf:
                            if math.isnan(value):
                                value = math.copysign(0, value)
                            elif math.isinf(value):
                                value = math.copysign(99999, value)
                        new_attributes.append((att_id, 1.0 * value))
                    instances.append((int(label), new_attributes, i))

            # Equality, not "in": TAG_SENT is a string, so "in" would also
            # match every tag that happens to be a substring of it.
            elif elem.tag == TAG_SENT:
                if group_test:
                    instancegroups.append(instances)
                    instances = []
                # the sentence is complete: drop what has been parsed so far
                # so that memory use does not grow with the size of the file
                root.clear()

    if group_test:
        return instancegroups
    return instances
178
# NOTE(review): the `def` line that owns this body is missing from this
# listing, so the function name is unknown here. The body reads
# `jcml_filename`, `dat_filename` and `kwargs` without defining them, so they
# are presumably its parameters -- confirm against the original file.
#
# Purpose: convert the instances of a JCML file into a text file with one
# line per instance, "<label> qid:<sentence id> <feature id>:<value> ...".
# This looks like the SVM-light ranking input format -- TODO confirm.
#
# NOTE(review): with group_test=True in kwargs, read_file_incremental returns
# a list of per-sentence lists, which the three-way unpacking in the loop
# below does not appear to handle.
180 instances = read_file_incremental(jcml_filename, **kwargs)
# NOTE(review): the file is not closed if an error occurs while writing.
181 dat = open(dat_filename, 'w')
182 for label, features, qid in instances:
# sorted() orders the (feature id, value) pairs by ascending feature id
183 featurestring = " ".join("{}:{}".format(name, value) for name, value in sorted(features))
184 line = "{} qid:{} {}".format(label, qid, featurestring)
185 dat.write("{}\n".format(line))
186 dat.close()
187