1
2
3
4 """
5 @author: Eleftherios Avramidis
6 """
7
8
9 from xml.sax.saxutils import XMLGenerator
10 from sentence.sentence import SimpleSentence
11 from sentence.parallelsentence import ParallelSentence
12
14 """
15 Handles the generation of features over an XML object formatted as JCML.
16 It does processing every time a parallel sentence including its contents has been declared.
17 Processing of any other XML type should follow this example.
18 """
19
def __init__(self, out, feature_generators, tab_filename):
    """
    Set up the handler: open the tab file, reset all parsing state and
    initialise the underlying XMLGenerator on the given output stream.

    @param out: file object to receive processed changes
    @type out: file
    @param feature_generators: list of feature generators to be applied
    @type feature_generators: list
    @param tab_filename: path of the tab file, opened here for writing
    @type tab_filename: str
    """
    # Stays open after this method returns; nothing in here closes it.
    self.tab_file = open(tab_filename, 'w')

    # flags maintained by the element callbacks
    self.is_parallelsentence = False
    self.is_simplesentence = False
    self.passed_head = False

    # attributes of the parallel / simple sentence currently being read
    self.ps_attributes = {}
    self.ss_attributes = {}

    # contents collected for the parallel sentence currently being read
    self.src = None
    self.tgt = []
    self.ref = None
    self.annotations = []

    # character data collected for the simple sentence currently being read
    self.ss_text = ""

    self.set_tags()

    self.feature_generators = feature_generators

    # Initialise the generator through its own constructor so that the
    # output stream and the encoding are stored on this instance only.
    # Assigning XMLGenerator._out / XMLGenerator._encoding on the class
    # would point every other XMLGenerator in the process at this stream
    # and would leave the rest of the generator's internal state unset.
    XMLGenerator.__init__(self, out, "utf-8")
52
65
69
74
def startElement(self, name, attrs):
    """
    Signals the start of an element (simplesentence or parallelsentence)
    @param name: the name of the element
    @type name: str
    @param attrs: the attributes of the element, as an object of the SAX Attributes interface
    @type attrs: Attributes
    """
    # NOTE(review): the def line and the indentation of this method were
    # missing from the reviewed copy. The signature follows the SAX
    # ContentHandler callback; the nesting below is reconstructed -- confirm.
    if name == self.TAG_SENT:
        # A new parallel sentence starts: forget what the previous one left.
        self.ss_text = u""
        self.ps_attributes = {}
        self.tgt = []
        for att_name in attrs.getNames():
            self.ps_attributes[att_name] = attrs.getValue(att_name)
        self.is_parallelsentence = True

        XMLGenerator.startElement(self, self.TAG_ANNOTATIONS, {})
        if not self.passed_head:
            for featuregenerator in self.feature_generators:
                # NOTE(review): built but never written out; kept so that
                # get_annotation_name() is still called once per generator.
                atts = {"name": featuregenerator.get_annotation_name()}
            self.passed_head = True

    if name == self.TAG_ANNOTATION:
        if not self.passed_head:
            # Annotation declared before the first sentence: record its
            # name and copy the element to the output.
            self.annotations.append(attrs.getValue("name"))
            XMLGenerator.startElement(self, name, attrs)
        else:
            # Parenthesised form prints the same text on Python 2 and 3.
            print("Format error. Annotation must be declared in the beginning of the document")

    elif name in [self.TAG_SRC, self.TAG_TGT, self.TAG_REF]:
        # A simple sentence starts: collect its attributes and begin
        # gathering its text from scratch.
        self.ss_text = u""
        self.ss_attributes = {}
        for att_name in attrs.getNames():
            self.ss_attributes[att_name] = attrs.getValue(att_name)
        self.is_simplesentence = True
118
119
def characters(self, ch):
    """
    The Parser will call this method to report each chunk of character data.
    We use it to store the string of the simplesentence
    @param ch: chunk of character data being parsed
    @type ch: str
    """
    # NOTE(review): the def line was missing from the reviewed copy; the
    # signature follows the SAX ContentHandler callback -- confirm.
    if self.is_simplesentence:
        # Text may arrive in several chunks, so append to what is there.
        self.ss_text = u"%s%s" % (self.ss_text, ch)
129
130
def endElement(self, name):
    """
    Signals the end of an element.
    Data stored in the attributes of the instance, time to create our objects and fire their processing
    @param name: the name of the element
    @type name: str
    """
    # NOTE(review): the def line and the indentation of this method were
    # missing from the reviewed copy. The signature follows the SAX
    # ContentHandler callback; the nesting below is reconstructed -- confirm.
    self.ss_text = self.ss_text.strip()

    # NOTE(review): there is no branch for TAG_REF, so self.ref is never
    # assigned in this method -- confirm that this is intended.
    if name == self.TAG_SRC:
        self.src = SimpleSentence(self.ss_text, self.ss_attributes)
        self.ss_text = u""
    elif name == self.TAG_TGT:
        self.tgt.append(SimpleSentence(self.ss_text, self.ss_attributes))
        self.ss_text = u""
    elif name == self.TAG_SENT:
        # The parallel sentence is complete: build it and let every
        # feature generator process it in turn.
        parallelsentence = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)
        for fg in self.feature_generators:
            parallelsentence = fg.add_features_parallelsentence(parallelsentence)

        src = parallelsentence.get_source()

        # self._write works whether _write is a method of XMLGenerator
        # (older standard library) or bound per instance by its constructor
        # (newer standard library); XMLGenerator._write(self, ...) only
        # works in the former case.
        self._write("\n\t")
        XMLGenerator.startElement(self, name, parallelsentence.get_attributes())

        self._write("\n\t\t")
        XMLGenerator.startElement(self, self.TAG_SRC, src.get_attributes())
        XMLGenerator.characters(self, src.get_string())
        XMLGenerator.endElement(self, self.TAG_SRC)

        # The tab file receives the string of the first translation whose
        # "rank" attribute equals 1, or an empty line when there is none.
        found_best = False
        tab_entry = "\n"

        for tgt in parallelsentence.get_translations():
            self._write("\n\t\t")
            XMLGenerator.startElement(self, self.TAG_TGT, tgt.get_attributes())
            XMLGenerator.characters(self, tgt.get_string())
            XMLGenerator.endElement(self, self.TAG_TGT)

            if int(tgt.get_attribute("rank")) == 1 and not found_best:
                best_string = tgt.get_string()
                tab_entry = "%s\n" % best_string
                found_best = True

        if not found_best:
            # Parenthesised form prints the same text on Python 2 and 3.
            print("ERROR: didn't find best ranked sentence")
        self._write("\n\t")
        XMLGenerator.endElement(self, name)
        self.tab_file.write(tab_entry)
206