1
2
3 '''
4 Created on Jun 27, 2011
5
6 @author: jogin
7 '''
8
9 import string
10 from xml.dom.minidom import parse
11 from sentence.parallelsentence import ParallelSentence
12 from sentence.sentence import SimpleSentence
13 from sentence.dataset import DataSet
14 from xml.sax.saxutils import unescape
15 from dataprocessor.input.genericxmlreader import GenericXmlReader
16
18 """
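19 Loads an XML file of 'trans-unit' elements with xml.dom.minidom and converts it into ParallelSentence / DataSet objects.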
20 """
21
22
23
24
26 """
27 Loads the data of the file into memory. It is useful if the class has
28 been asked not to load the file upon initialization
29 """
30 self.xmlObject = parse(self.input_filename)
31
32
34 """
35 Returns the contents of the XML file as an object structure, which is
36 represented by the DataSet object
37 Note that this will cause all the data of the XML file to be loaded
38 into system memory at once.
39 For big data sets this may not be optimal, so consider
40 sentence-by-sentence reading with SAX (saxjcml.py)
41 @rtype: sentence.dataset.DataSet
42 @return: A data set containing all the data of the XML file
43 """
44 return DataSet(self.get_parallelsentences())
45
46
48 return len(self.xmlObject.getElementsByTagName('trans-unit'))
49
50
def get_weights(self, tool_id):
    """
    Finds the global weights declared for a particular tool ID in the file.
    Used by get_parallelsentence().

    Every 'tool' element of the loaded document whose 'tool-id' attribute
    equals tool_id is inspected, and each 'metanet:weight' element below it
    contributes one entry.

    @param tool_id: value of the 'tool-id' attribute to look for
    @type tool_id: string
    @return: (name, value) pairs, where name is 'global-<type>-<tool_id>'
    and value is the raw 'value' attribute of the weight element; empty if
    no tool matches
    @rtype: list of tuples
    """
    weights = []
    for tool in self.xmlObject.getElementsByTagName('tool'):
        if tool.getAttribute('tool-id') != tool_id:
            continue
        for elem in tool.getElementsByTagName('metanet:weight'):
            name = 'global-%s-%s' % (elem.getAttribute('type'), tool_id)
            weights.append((name, elem.getAttribute('value')))
    return weights
68
69
71 """
72 Finds a system name of given tool id
73 @param tool_id: tool-id
74 @type tool_id: string
75 @return system_name: name of system
76 @type system_name: string
77 """
78 system_name = ''
79 tools = self.xmlObject.getElementsByTagName('tool')
80 for tool in tools:
81 if tool.getAttribute('tool-id') == tool_id:
82 system_name = tool.getAttribute('tool-name')
83 break
84 return system_name
85
86
def get_parallelsentence(self, transUnit):
    """
    Builds one ParallelSentence out of a single 'trans-unit' DOM element.

    The source sentence is taken from the first direct 'source' child of the
    trans-unit and the reference from its first direct 'target' child; when
    either is absent an empty string is passed on instead. Every 'alt-trans'
    element below the trans-unit that has a direct 'target' child becomes one
    translation, carrying these attributes:
      - 'system': the 'tool-id' of the alt-trans
      - the global weights returned by get_weights() for that tool
      - 'sc_<tool>-<type>': scores of 'metanet:scores' direct children
      - 'an_<tool>-<derivation>-<type>': annotations that are direct
        children of a direct-child 'metanet:derivation'
      - '<derivation>-cat-<label>': how often each "cat" label occurs among
        the deeper annotations of that derivation
      - 'phrases_count' and 'ds_<tool>-<type>-<n>': number of
        'metanet:phrase' elements and the scores of the n-th phrase
      - '<tool>-OOV_count': sum of the oov/OOV annotation values
      - '<tool>-d<d>-token-count': number of 'metanet:token' elements per
        derivation marker found in the token id
    An alt-trans without a 'target' child is skipped, since there is no
    sentence to attach attributes to.

    @param transUnit: the 'trans-unit' element to convert
    @type transUnit: xml.dom.minidom.Element
    @return: the parallel sentence; its attribute dictionary holds the 'id'
    attribute of the trans-unit
    @rtype: sentence.parallelsentence.ParallelSentence
    """

    def child_sentence(parent, tag_name):
        """
        Wraps the text of the first direct child named tag_name in a
        SimpleSentence; returns None when there is no such child.
        """
        for child in parent.childNodes:
            if child.nodeName == tag_name:
                first = child.firstChild
                # An empty element (e.g. <target/>) has no text node at all
                text = first.nodeValue if first is not None else ''
                return SimpleSentence(unescape(text or ''))
        return None

    sentence_id = transUnit.getAttribute("id")

    src = child_sentence(transUnit, 'source')
    if src is None:
        src = ''

    tgt_list = []
    for altTrans in transUnit.getElementsByTagName('alt-trans'):
        tgt = child_sentence(altTrans, 'target')
        if tgt is None:
            # Nothing to decorate: the attributes below need a sentence object
            continue

        tool_id = altTrans.getAttribute('tool-id')
        tgt.add_attribute('system', tool_id)

        # Global weights declared for the tool that produced this translation
        for weight_name, weight_value in self.get_weights(tool_id):
            tgt.add_attribute(weight_name, weight_value)

        # Sentence-level scores; getElementsByTagName also returns the score
        # sets nested inside phrases, so keep direct children only
        for scores in altTrans.getElementsByTagName("metanet:scores"):
            if scores not in altTrans.childNodes:
                continue
            for elem in scores.getElementsByTagName("metanet:score"):
                score_type = elem.getAttribute('type').replace(' ', '-')
                tgt.add_attribute('sc_%s-%s' % (tool_id, score_type),
                                  elem.getAttribute('value'))

        # Derivation annotations
        for derivation in altTrans.getElementsByTagName("metanet:derivation"):
            if derivation not in altTrans.childNodes:
                continue
            derivation_id = derivation.getAttribute("id")
            labels_count = {}
            for elem in derivation.getElementsByTagName("metanet:annotation"):
                ann_type = elem.getAttribute('type').replace(' ', '-')
                value = elem.getAttribute('value').replace("$", "SS")
                if elem in derivation.childNodes:
                    # Annotation of the derivation itself: stored verbatim
                    tgt.add_attribute('an_%s-%s-%s' % (tool_id, derivation_id, ann_type), value)
                elif ann_type == "cat":
                    # Deeper "cat" annotations are only counted per label
                    labels_count[value] = labels_count.get(value, 0) + 1
            for label in labels_count:
                tgt.add_attribute("%s-cat-%s" % (derivation_id, label), labels_count[label])

        # Phrase-level scores, numbered by 1-based phrase position
        phrases = altTrans.getElementsByTagName("metanet:phrase")
        if phrases:
            tgt.add_attribute("phrases_count", str(len(phrases)))
        for phrase_id, phrase in enumerate(phrases, 1):
            scoresets = phrase.getElementsByTagName("metanet:scores")
            if not scoresets:
                continue
            # Only the first score set of a phrase is read
            for score in scoresets[0].getElementsByTagName("metanet:score"):
                value = score.getAttribute('value')
                if not value:
                    # Fall back to the element text when there is no attribute
                    value = score.firstChild.nodeValue.strip("\n ")
                score_type = score.getAttribute('type').replace(' ', '-')
                tgt.add_attribute('ds_%s-%s-%d' % (tool_id, score_type, phrase_id), value)

        # Out-of-vocabulary count, summed over all matching annotations
        OOV_count = 0
        for annotation in altTrans.getElementsByTagName("metanet:annotation"):
            if annotation.getAttribute('type') in ('oov', 'OOV'):
                OOV_count += int(annotation.getAttribute('value'))
        # NOTE(review): written once per alt-trans, even when the sum is 0 -
        # confirm this matches the intended nesting of the original statement
        tgt.add_attribute('%s-%s' % (tool_id, 'OOV_count'), str(OOV_count))

        # Token counts, grouped by the text between the first "_d" and the
        # next "_" of each token id
        token_count = {}
        for token in altTrans.getElementsByTagName('metanet:token'):
            d = token.getAttribute('id').partition('_d')[2].partition('_')[0]
            token_count[d] = token_count.get(d, 0) + 1
        for d_count in token_count:
            tgt.add_attribute('%s-%s%s-%s' % (tool_id, 'd', d_count, 'token-count'),
                              token_count[d_count])

        tgt_list.append(tgt)

    ref = child_sentence(transUnit, 'target')
    if ref is None:
        ref = ''

    ps = ParallelSentence(src, tgt_list, ref, {"id": sentence_id})
    # Progress indicator; the parenthesised form runs on Python 2 and 3 alike
    print(".")
    return ps
220
221
def get_parallelsentences(self):
    """
    Converts every 'trans-unit' element of the loaded document, in document
    order, by calling get_parallelsentence() on it.

    @return: a list of ParallelSentence objects
    @rtype: list
    """
    trans_units = self.xmlObject.getElementsByTagName('trans-unit')
    return [self.get_parallelsentence(transUnit) for transUnit in trans_units]
230
231
232
233
234
236 self.xmlObject.unlink()
237
238
240 """
241 @return: a dictionary of the attributes of the current sentence (name:value)
242 """
243 attributes = {}
244 attributeKeys = xmlEntry.attributes.keys()
245 for attributeKey in attributeKeys:
246 attributes[attributeKey] = unescape(xmlEntry.attributes[attributeKey].value)
247 return attributes
248