1
2
3
4 """
5 @author: Eleftherios Avramidis
6 """
7
8
9 from xml.sax.saxutils import XMLGenerator
10 from sentence.sentence import SimpleSentence
11 from sentence.parallelsentence import ParallelSentence
12
14 """
15 Handles the generation of features over an XML object formatted as JCML.
16 It does processing every time a parallel sentence including its contents has been declared.
17 Processing of any other XML type should follow this example.
18 """
19
def __init__(self, out, feature_generators=None):
    """
    Set up the parsing state and bind the output stream.

    Reads self.inputformat and self.outputformat (neither is assigned here,
    so both must already be available on the object) to obtain the tag names
    used for reading and writing, and calls self.set_tags().

    @param out: file object to receive processed changes
    @type out: file
    @param feature_generators: list of feature generators to be applied;
        None (the default) means no generators
    @type feature_generators: list
    """
    # A fresh list per instance: a literal [] default would be created once
    # and shared by every processor constructed without this argument.
    if feature_generators is None:
        feature_generators = []

    # Flags showing what kind of element is currently being parsed.
    self.is_parallelsentence = False
    self.is_simplesentence = False
    self.passed_head = False

    # Attributes collected for the current parallel / simple sentence.
    self.ps_attributes = {}
    self.ss_attributes = {}

    self.IN_TAG = self.inputformat.get_tags()
    self.OUT_TAG = self.outputformat.get_tags()

    # Parts of the parallel sentence currently being assembled.
    self.src = None
    self.tgt = []
    self.ref = None
    self.annotations = []

    # Chunks of character data of the current simple sentence.
    self.ss_text = []

    self.set_tags()

    self.feature_generators = feature_generators
    self._encoding = "utf-8"
    # Bind the stream on the instance so that several processors writing to
    # different streams do not overwrite each other's output target.
    self._out = out
    # NOTE(review): the two assignments below modify the XMLGenerator class
    # itself and are therefore visible to every XMLGenerator in the process.
    # They are kept for backward compatibility -- confirm whether anything
    # still depends on them.
    XMLGenerator._encoding = "utf-8"
    XMLGenerator._out = out
54
55
59
63
def startElement(self, name, attrs):
    """
    Signals the start of an element (simplesentence or parallelsentence).

    For the parallel-sentence tag the text buffer, the sentence attributes
    and the list of translations are reset and the element's attributes are
    stored. For a source, target or reference tag the text buffer and the
    simple-sentence attributes are reset and the element's attributes are
    stored. Any other element is ignored.

    @param name: the name of the element
    @type name: str
    @param attrs: the attributes of the element
    @type attrs: Attributes
    """
    # NOTE(review): the "def" line of this method was missing from the
    # reviewed text; the signature follows the docstring and the SAX
    # ContentHandler interface -- confirm against the original file.
    tags = self.IN_TAG

    if name == tags["sent"]:
        # A new parallel sentence begins: forget text and translations
        # gathered for the previous one.
        self.ss_text = []
        self.ps_attributes = {}
        self.tgt = []
        for att_name in attrs.getNames():
            self.ps_attributes[att_name] = attrs.getValue(att_name)
        self.is_parallelsentence = True

    elif name in [tags["src"], tags["tgt"], tags["ref"]]:
        # A new simple sentence begins: start a fresh text buffer.
        self.ss_text = []
        self.ss_attributes = {}
        for att_name in attrs.getNames():
            self.ss_attributes[att_name] = attrs.getValue(att_name)
        self.is_simplesentence = True
107
108
def characters(self, ch):
    """
    The Parser will call this method to report each chunk of character data.
    We use it to store the string of the simplesentence.

    @param ch: chunk of character data being parsed
    @type ch: str
    """
    # NOTE(review): the "def" line of this method was missing from the
    # reviewed text; the signature follows the docstring and the SAX
    # ContentHandler interface -- confirm against the original file.
    if not self.is_simplesentence:
        return

    # The end-of-element handling joins the buffer into a single string, so
    # the buffer may not be a list at this point; wrap it so that appending
    # keeps working and the already collected text is not lost.
    if not isinstance(self.ss_text, list):
        self.ss_text = [self.ss_text]

    self.ss_text.append(ch)
119
120
def endElement(self, name):
    """
    Signals the end of an element.
    The data collected in the instance variables is turned into sentence
    objects here; when a parallel sentence ends, the feature generators are
    applied to it and the result is written to the output.

    @param name: the name of the element
    @type name: str
    """
    # NOTE(review): the "def" line of this method was missing from the
    # reviewed text; the signature follows the docstring and the SAX
    # ContentHandler interface -- confirm against the original file.

    # Collapse the collected chunks and drop leading/trailing whitespace.
    text = "".join(self.ss_text).strip()

    # Keep the buffer a list in every case, because characters() appends to
    # it. Text collected so far is preserved for elements not handled below.
    self.ss_text = [text]

    if name == self.IN_TAG["src"]:
        self.src = SimpleSentence(text, self.ss_attributes)
        self.ss_text = []
    elif name == self.IN_TAG["tgt"]:
        self.tgt.append(SimpleSentence(text, self.ss_attributes))
        self.ss_text = []
    elif name == self.IN_TAG["sent"]:
        # NOTE(review): no branch stores a reference sentence, so self.ref
        # keeps whatever value it already had; self.src is likewise carried
        # over if a sentence has no source element -- confirm intended.
        parallelsentence = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)

        # Each generator receives the sentence and returns the one to use
        # from then on.
        for fg in self.feature_generators:
            parallelsentence = fg.add_features_parallelsentence(parallelsentence)

        # The source is written from the object parsed here, not from the
        # processed parallel sentence.
        src = self.src

        # NOTE(review): the sentence element is written under the incoming
        # tag name, while its children use the output tag names -- confirm.
        # _write is a private XMLGenerator member.
        XMLGenerator._write(self, "\n\t")
        XMLGenerator.startElement(self, name, parallelsentence.get_attributes())

        XMLGenerator._write(self, "\n\t\t")
        XMLGenerator.startElement(self, self.OUT_TAG["src"], src.get_attributes())
        XMLGenerator.characters(self, src.get_string())
        XMLGenerator.endElement(self, self.OUT_TAG["src"])

        for tgt in parallelsentence.get_translations():
            XMLGenerator._write(self, "\n\t\t")
            XMLGenerator.startElement(self, self.OUT_TAG["tgt"], tgt.get_attributes())
            XMLGenerator.characters(self, tgt.get_string())
            XMLGenerator.endElement(self, self.OUT_TAG["tgt"])

        XMLGenerator._write(self, "\n\t")
        XMLGenerator.endElement(self, name)
181