1
2
3
4 """
5 @author: Eleftherios Avramidis
6 """
7
8
9 from xml.sax.saxutils import XMLGenerator
10 from sentence.sentence import SimpleSentence
11 from sentence.parallelsentence import ParallelSentence
12
14 """
15 Handles the generation of features over an XML object formatted as JCML.
16 It does processing every time a parallel sentence including its contents has been declared.
17 Processing of any other XML type should follow this example.
18 """
19
def __init__(self, out, feature_generators, tab_filename, metric_name, lang_pair, test_set):
    """
    Prepares the parsing state, the XML output and the tab-separated output.
    @param out: file object to receive processed changes
    @type out: file
    @param feature_generators: list of feature generators to be applied
    @type feature_generators: list
    @param tab_filename: path of the tab-separated file, opened here for writing
    @type tab_filename: str
    @param metric_name: value written in the first column of every tab entry
    @param lang_pair: value written in the second column of every tab entry
    @param test_set: value written in the third column of every tab entry
    """
    # Initialise the base generator on this instance. Assigning _out/_encoding
    # on the XMLGenerator class instead would redirect every other generator
    # in the process to this output and skip the base class' own setup.
    XMLGenerator.__init__(self, out, "utf-8")

    # NOTE(review): no close() for this file is visible in the reviewed part
    # of the class -- confirm it is released elsewhere (e.g. at document end).
    self.tab_file = open(tab_filename, 'w')
    self.metric_name = metric_name
    self.lang_pair = lang_pair
    self.test_set = test_set

    # Parser position flags
    self.is_parallelsentence = False
    self.is_simplesentence = False
    self.passed_head = False

    # Attribute buffers for the parallel sentence / simple sentence being read
    self.ps_attributes = {}
    self.ss_attributes = {}

    # Sentence objects collected for the parallel sentence being read
    self.src = None
    self.tgt = []
    self.ref = None
    self.annotations = []

    # Text buffer of the simple sentence being read
    self.ss_text = ""

    self.set_tags()

    self.feature_generators = feature_generators
54
67
71
75
"""
Signals the start of an element (simplesentence or parallelsentence)
@param name: the name of the element
@type name: str
@param attrs: the attributes of the element, as an object of the Attributes interface
@type attrs: Attributes
"""
# NOTE(review): the nesting below was reconstructed from a copy that had lost
# its indentation -- confirm it against the original file.
if name == self.TAG_SENT:
    # A new parallel sentence starts: clear what was buffered for the previous one
    self.ss_text = u""
    self.ps_attributes = {}
    self.tgt = []
    # Copy the XML attributes into a plain dict
    for att_name in attrs.getNames():
        self.ps_attributes[att_name] = attrs.getValue(att_name)
    self.is_parallelsentence = True

    # NOTE(review): this opens the annotations element on every parallel
    # sentence and no matching endElement is visible here -- confirm intended.
    XMLGenerator.startElement(self, self.TAG_ANNOTATIONS, {})
    if not self.passed_head:
        for featuregenerator in self.feature_generators:
            # NOTE(review): atts is built but not used by any visible statement
            atts = {"name" : featuregenerator.get_annotation_name()}
        # From here on, annotation declarations are rejected (see below)
        self.passed_head = True

if name == self.TAG_ANNOTATION:
    if not self.passed_head:
        # Record the declared annotation name and copy the element to the output
        self.annotations.append(attrs.getValue("name"))
        XMLGenerator.startElement(self, name, attrs)
    else:
        print "Format error. Annotation must be declared in the beginning of the document"

elif name in [self.TAG_SRC, self.TAG_TGT, self.TAG_REF]:
    # A simple sentence starts: reset its text and attribute buffers
    self.ss_text = u""
    self.ss_attributes = {}
    for att_name in attrs.getNames():
        self.ss_attributes[att_name] = attrs.getValue(att_name)
    # Makes the character-data handler start collecting text
    self.is_simplesentence = True
119
120
"""
Called by the parser for each chunk of character data.
While a simplesentence is being read, the chunk is appended to the
buffered sentence text; otherwise it is ignored.
@param ch: chunk of character data reported by the parser
@type ch: str
"""
if self.is_simplesentence:
    self.ss_text = u"".join((self.ss_text, ch))
130
131
"""
Signals the end of an element.
The text and attributes buffered since the matching start tag are turned
into sentence objects. When a parallel sentence closes, the feature
generators are applied to it and the result is written to the XML output
and, one line per translation, to the tab file.
@param name: the name of the element
@type name: str
"""
# Drop the whitespace surrounding the buffered sentence text
self.ss_text = self.ss_text.strip()

if name == self.TAG_SRC:
    self.src = SimpleSentence(self.ss_text, self.ss_attributes)
    self.ss_text = u""
elif name == self.TAG_TGT:
    self.tgt.append(SimpleSentence(self.ss_text, self.ss_attributes))
    self.ss_text = u""
elif name == self.TAG_SENT:
    # NOTE(review): no visible statement assigns self.ref when a reference
    # element closes, so the value set at construction is passed on -- confirm.
    parallelsentence = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)

    # Each generator receives the sentence returned by the previous one
    for fg in self.feature_generators:
        parallelsentence = fg.add_features_parallelsentence(parallelsentence)

    # The source is written from the buffered object, not read back from parallelsentence
    src = self.src

    # Layout whitespace goes through the public ignorableWhitespace() hook.
    # The private _write is bound per instance by XMLGenerator.__init__ in
    # newer standard-library releases and cannot be reached through the class.
    XMLGenerator.ignorableWhitespace(self, "\n\t")
    XMLGenerator.startElement(self, name, parallelsentence.get_attributes())

    XMLGenerator.ignorableWhitespace(self, "\n\t\t")
    XMLGenerator.startElement(self, self.TAG_SRC, src.get_attributes())
    XMLGenerator.characters(self, src.get_string())
    XMLGenerator.endElement(self, self.TAG_SRC)

    for tgt in parallelsentence.get_translations():
        XMLGenerator.ignorableWhitespace(self, "\n\t\t")
        XMLGenerator.startElement(self, self.TAG_TGT, tgt.get_attributes())
        XMLGenerator.characters(self, tgt.get_string())
        XMLGenerator.endElement(self, self.TAG_TGT)

        # One tab-separated line per translation:
        # metric, language pair, test set, system, sentence id, rank
        tab_entry = "%s\t%s\t%s\t%s\t%s\t%s\n" % (self.metric_name, self.lang_pair, self.test_set, tgt.get_attribute("system"), parallelsentence.get_attribute("id"), tgt.get_attribute("rank"))
        self.tab_file.write(tab_entry)

    XMLGenerator.ignorableWhitespace(self, "\n\t")
    XMLGenerator.endElement(self, name)
196