1
2
3
4 """
5 @author: Eleftherios Avramidis
6 """
7
8
9 from xml.sax.saxutils import XMLGenerator
10 from xml import sax
11 from sentence.sentence import SimpleSentence
12 from sentence.parallelsentence import ParallelSentence
13 import shutil
14 import codecs
15 import sys
16
17
"""
Run a JCML file through a list of feature generators in the SAX way
and write the augmented sentences directly to a target JCML file.

The result is first written to "<output_file>.tmp" and is only moved onto
output_file once parsing has finished without an error.

@param input_file: filename of the XML-formatted data used as input
@type input_file: string
@param output_file: filename for the result of the feature generators, to be generated
@type output_file: string
@param generators: list of generators to be applied on each of the parallel sentences contained in the XML
@type generators: list
"""
tmpfile = "%s.tmp" % output_file

# The context managers guarantee that both files are closed even when the
# parser or one of the feature generators raises.
with open(input_file, 'r') as input_file_object:
    with open(tmpfile, 'w') as output_file_object:
        saxhandler = SaxJCMLProcessor(output_file_object, generators)
        sax.parse(input_file_object, saxhandler)

# Both files are closed at this point, so the finished temporary file can
# safely take its final name.
shutil.move(tmpfile, output_file)
38
"""
Handles the generation of features over an XML document formatted as JCML.
Processing is triggered every time a parallel sentence, including all of its contents, has been read completely.
Processing of any other XML type should follow this example.
"""
45
def __init__(self, out, feature_generators=None):
    """
    Initialise the parsing state and bind the XML writer to the output stream.

    @param out: file object to receive processed changes
    @type out: file
    @param feature_generators: list of feature generators to be applied;
     when omitted, no generators are applied
    @type feature_generators: list
    """
    # flags that show which kind of element is currently being read
    self.is_parallelsentence = False
    self.is_simplesentence = False
    self.passed_head = False

    # attributes of the parallelsentence / simplesentence being read
    self.ps_attributes = {}
    self.ss_attributes = {}

    # sentences collected for the current parallelsentence
    self.src = None
    self.tgt = []
    self.ref = None
    self.annotations = []

    # chunks of character data of the current simplesentence
    self.ss_text = []

    self.set_tags()

    # A fresh list per instance; a mutable default argument would be one
    # list object shared by every processor created without generators.
    if feature_generators is None:
        feature_generators = []
    self.feature_generators = feature_generators

    # Set the writer state (output stream and encoding) on this instance.
    # Assigning _out/_encoding on the XMLGenerator class itself would make
    # all processors share the stream of the most recently created one.
    XMLGenerator.__init__(self, out, "utf-8")
76
89
93
97
"""
Signals the start of an element (simplesentence or parallelsentence)
and resets the state that is collected for that element.
@param name: the name of the element
@type name: str
@param attrs: object of the Attributes interface containing the attributes of the element
@type attrs: Attributes
"""
if name == self.TAG_SENT:
    # A new parallel sentence starts: forget what was collected before.
    self.ss_text = []
    self.ps_attributes = {}
    self.tgt = []
    # The reference is optional (the writer tolerates its absence), so it
    # must not be inherited from the previous parallel sentence.
    self.ref = None
    for att_name in attrs.getNames():
        self.ps_attributes[att_name] = attrs.getValue(att_name)
    self.is_parallelsentence = True

elif name in (self.TAG_SRC, self.TAG_TGT, self.TAG_REF):
    # A simple sentence starts: collect its text and attributes afresh.
    self.ss_text = []
    self.ss_attributes = {}
    for att_name in attrs.getNames():
        self.ss_attributes[att_name] = attrs.getValue(att_name)
    self.is_simplesentence = True
141
142
"""
Called by the parser for each chunk of character data.
A chunk is stored only while a simplesentence is being read; the stored
chunks make up the string of that simplesentence.
@param ch: chunk of character data reported by the parser
@type ch: str
"""
collecting_text = self.is_simplesentence
if collecting_text:
    self.ss_text.append(ch)
152
153
154
"""
Signals the end of an element.
The data collected in the instance variables is turned into sentence
objects; when a parallelsentence ends, the feature generators are applied
to it and the result is written to the output.
@param name: the name of the element
@type name: str
"""
parsed_text = "".join(self.ss_text).strip()

if name == self.TAG_SRC:
    self.src = SimpleSentence(parsed_text, self.ss_attributes)
    self.ss_text = []
    self.is_simplesentence = False
elif name == self.TAG_REF:
    self.ref = SimpleSentence(parsed_text, self.ss_attributes)
    self.ss_text = []
    self.is_simplesentence = False
elif name == self.TAG_TGT:
    self.tgt.append(SimpleSentence(parsed_text, self.ss_attributes))
    self.ss_text = []
    self.is_simplesentence = False
elif name == self.TAG_SENT:
    # The parallel sentence is complete: build it and run the generators.
    parallelsentence = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)
    self.is_parallelsentence = False

    # "\" and "/" on stderr mark the start and end of feature generation
    # for one sentence (progress indicator).
    sys.stderr.write("\\")
    for fg in self.feature_generators:
        parallelsentence = fg.add_features_parallelsentence(parallelsentence)
    sys.stderr.write("/")

    src = parallelsentence.get_source()

    XMLGenerator._write(self, "\n\t")
    XMLGenerator.startElement(self, name, parallelsentence.get_attributes())

    XMLGenerator._write(self, "\n\t\t")
    XMLGenerator.startElement(self, self.TAG_SRC, src.get_attributes())
    XMLGenerator.characters(self, src.get_string())
    XMLGenerator.endElement(self, self.TAG_SRC)

    for tgt in parallelsentence.get_translations():
        XMLGenerator._write(self, "\n\t\t")
        XMLGenerator.startElement(self, self.TAG_TGT, tgt.get_attributes())
        XMLGenerator.characters(self, tgt.get_string())
        XMLGenerator.endElement(self, self.TAG_TGT)

    ref = parallelsentence.get_reference()

    # The indentation is written whether or not a reference follows,
    # which keeps the layout of the output unchanged.
    XMLGenerator._write(self, "\n\t\t")
    # A parallel sentence may have no reference; test for that explicitly
    # instead of swallowing every exception raised while writing it.
    if ref is not None:
        XMLGenerator.startElement(self, self.TAG_REF, ref.get_attributes())
        XMLGenerator.characters(self, ref.get_string())
        XMLGenerator.endElement(self, self.TAG_REF)
        XMLGenerator._write(self, "\n\t")

    XMLGenerator.endElement(self, name)
228