1
2
3
4 """
5 @author: Eleftherios Avramidis
6 """
7
8
9 from xml.sax.saxutils import XMLGenerator
10 from xml.sax import ContentHandler
11 from dataprocessor.input.genericreader import GenericReader
12 from dataprocessor.output.xmlwriter import GenericWriter
13 from sentence.sentence import SimpleSentence
14 from sentence.parallelsentence import ParallelSentence
15 import StringIO
16 from copy import deepcopy
17
18
"""
Abstract SAX wrapper to facilitate use of older minidom processors.

Every occurrence of the focus element is re-serialised into an in-memory
XML buffer, handed to the configured reader as a string, and the first
parallel sentence the reader returns is collected. The collected
sentences are finally passed to the configured writer.
"""
# Counter of focus elements processed so far; the end-of-element handling
# increments it through self.i and prints the new value.
i = 0
24
"""
Store the processing configuration and reset the parsing state.

@param element_focus: name of the element that is buffered and handed to the reader, one occurrence at a time
@param reader: callable invoked as reader(xml_string, True, True); its result must provide get_parallelsentences()
@param writer: callable invoked with the list of collected parallel sentences; its result must provide write_to_file()
@param filename_out: value passed on to write_to_file() once all sentences have been collected
"""
# --- configuration supplied by the caller ---
self.element_focus = element_focus
self.filename_out = filename_out
self.reader = reader
self.writer = writer

# --- parsing state ---
# True while the parser is inside a focus element
self.recording = False
# character data gathered since the last closing tag
self.nodetext = u""
# in-memory buffer and the XML generator writing into it; both are
# created when a focus element opens
self.stringbuffer = None
self.buffer_generator = None
# (name, attrs) pairs of the elements currently open outside a focus element
self.tobeparsedonce = []
# sentences obtained from the reader so far
self.parallelsentences = []
39
40
41
42
"""
Signals the start of an element (simplesentence or parallelsentence).

When the focus element opens, a fresh in-memory buffer is created and the
start tags of all elements that are currently open around it are replayed
into that buffer. While recording, every start tag is copied into the
buffer; otherwise the element is remembered as an open enclosing element.

@param name: the name of the element
@type name: str
@param attrs: the attributes of the element
@type attrs: Attributes
"""
entering_focus = (name == self.element_focus)

if entering_focus:
    self.recording = True
    # every focus element gets its own buffer and generator
    self.stringbuffer = StringIO.StringIO()
    self.buffer_generator = XMLGenerator(self.stringbuffer, encoding="utf-8")
    # wrap the fragment in the same enclosing elements as in the input
    for (outer_name, outer_attrs) in self.tobeparsedonce:
        self.buffer_generator.startElement(outer_name, outer_attrs)

if not self.recording:
    # outside a focus element: keep track of what is currently open
    self.tobeparsedonce.append((name, attrs))
else:
    self.buffer_generator.startElement(name, attrs)
63
64
65
66
"""
Called by the parser for each chunk of character data.

The chunk is appended to the pending text, but only while a focus
element is being recorded; text outside a focus element is ignored.

@param ch: chunk of character data being parsed
@type ch: str
"""
if self.recording:
    pending_and_new = (self.nodetext, ch)
    self.nodetext = u"%s%s" % pending_and_new
79
80
"""
Signals the end of an element.

While recording, the pending text and the closing tag are written to the
buffer. When the focus element closes, the replayed enclosing elements
are closed as well, the buffered XML is handed to the reader and the
first parallel sentence it returns is appended to the collected ones.

NOTE(review): pending text is only written here, not when a child element
starts, so character data that precedes a child element ends up inside
that child. Confirm the input never relies on mixed content.

@param name: the name of the element
@type name: str
"""
if self.recording:
    # text gathered since the previous closing tag belongs before this one
    self.buffer_generator.characters(self.nodetext)
    self.buffer_generator.endElement(name)
    self.nodetext = u""
else:
    # an enclosing element (opened outside a focus element) is closing
    self.tobeparsedonce.pop()

if name == self.element_focus:
    self.recording = False
    try:
        # close the replayed enclosing elements, innermost first; only the
        # names are needed, so no copy of the stack is made
        for (element_name, _element_attrs) in reversed(self.tobeparsedonce):
            self.buffer_generator.endElement(element_name)
        self.stringbuffer.flush()
        string = self.stringbuffer.getvalue()
    finally:
        # the buffer content has been extracted (or extraction failed):
        # release it either way so no stale buffer survives an error
        if self.stringbuffer is not None:
            self.stringbuffer.close()
        self.stringbuffer = None

    reader = self.reader(string, True, True)
    parallelsentence = reader.get_parallelsentences()[0]

    # progress counter, echoed to standard output as before; the
    # parenthesised form prints the same on Python 2 and parses on Python 3
    self.i += 1
    print(self.i)
    self.parallelsentences.append(parallelsentence)
119
120
"""
Hand all collected parallel sentences to the writer and let it write
them to the configured output file.
"""
output_writer = self.writer(self.parallelsentences)
output_writer.write_to_file(self.filename_out)
123
124
125