1 '''
2 Created on 14 Dec 2011
3
4 @author: Eleftherios Avramidis
5 '''
6
7 import shutil
8 import sys
9 import os
10 import re
11 import tempfile
12 from random import shuffle
13 from xml.sax.saxutils import XMLGenerator
14 from xml.sax.xmlreader import AttributesImpl
15 from dataprocessor.dataformat.jcmlformat import JcmlFormat
16 from sentence.sentence import SimpleSentence
17 from sentence.dataset import DataSet
18
19
20
# Characters this module treats as not XML-compliant: the control characters
# below U+0020 except tab, line feed and carriage return, the surrogate range
# U+D800-U+DFFF, and the two code points U+FFFE and U+FFFF.
illegal_xml_chars_RE = re.compile(u'[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]')


# NOTE(review): the ``def`` line of this function was missing from the
# reviewed listing; the name and parameter are taken from the call sites
# ``c(...get_string())`` and from the body's use of ``string`` -- confirm.
def c(string):
    """
    Kills extended unicode characters that are not allowed in a proper XML

    Every character matched by ``illegal_xml_chars_RE`` is removed. If at
    least one character had to be removed, a one-line notice with the number
    of removed characters is written to standard error.

    @param string: the text that is going to be written as XML content
    @return: the text without the illegal characters and without leading or
    trailing whitespace
    """
    clean_string, rep = illegal_xml_chars_RE.subn('', string)
    if rep > 0:
        sys.stderr.write("I had to kill {0} unicode characters because they were not XML-compliant\n".format(rep))

    return clean_string.strip()
33
34
35
# NOTE(review): the ``class`` and ``def`` lines of this block were missing
# from the reviewed listing. The class name, the method names and the order
# and defaults of the constructor parameters are reconstructed from what the
# bodies read (``filename``, ``xmlformat``, ``parallelsentence``) -- confirm
# them against the callers before merging.
class IncrementalJcml(object):
    """
    Write line by line incrementally on an XML file, without loading anything in the memory.
    Don't forget the close function. Object sentences cannot be edited after written

    The XML is streamed into a temporary file that is created in the current
    directory; close() terminates the document and moves that temporary file
    to the requested filename.
    """

    def __init__(self, filename, xmlformat=None):
        """
        Open the temporary output file and start the XML document

        @param filename: path where the finished file is moved by close()
        @param xmlformat: object whose TAG dictionary provides the element
        names for the keys "doc", "sent", "src", "tgt" and "ref". When
        omitted, JcmlFormat is used (NOTE(review): presumed default, since
        JcmlFormat is imported by this module -- confirm).
        """
        if xmlformat is None:
            xmlformat = JcmlFormat
        self.TAG = xmlformat.TAG
        self.filename = filename
        # delete=False: the file has to survive file.close() so that close()
        # can move it to its final location
        self.file = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.jcml', prefix='tmp_', dir='.')
        self.tempfilename = self.file.name

        self.generator = XMLGenerator(self.file, "utf-8")
        self.generator.startDocument()
        self.generator.startElement(self.TAG["doc"], {})

    @staticmethod
    def _string_attributes(element):
        """
        Return the attributes of the given element as a dictionary whose
        values have been converted with str(), as the XML generator needs
        """
        return dict((key, str(val)) for key, val in element.get_attributes().items())

    def _write_sentence(self, tag_key, sentence):
        """
        Write one simple sentence as an indented child element

        @param tag_key: key of self.TAG that names the element ("src", "tgt" or "ref")
        @param sentence: object providing get_attributes() and get_string()
        """
        tag = self.TAG[tag_key]
        # characters() leaves this whitespace untouched, so the output is the
        # same as with the private XMLGenerator._write used previously
        self.generator.characters("\n\t\t")
        self.generator.startElement(tag, self._string_attributes(sentence))
        self.generator.characters(c(sentence.get_string()))
        self.generator.endElement(tag)

    def add_parallelsentence(self, parallelsentence):
        """
        Append one parallel sentence to the document: its source sentence(s),
        all of its translations and, if it has a non-empty one, its reference

        @param parallelsentence: object providing get_attributes(),
        get_source(), get_translations() and get_reference()
        """
        self.generator.characters("\n\t")
        self.generator.startElement(self.TAG["sent"], self._string_attributes(parallelsentence))

        src = parallelsentence.get_source()
        if isinstance(src, SimpleSentence):
            self._write_sentence("src", src)
        elif isinstance(src, tuple):
            # several source sentences: one element each, with its own attributes
            for single_src in src:
                self._write_sentence("src", single_src)

        for tgt in parallelsentence.get_translations():
            self._write_sentence("tgt", tgt)

        ref = parallelsentence.get_reference()
        if ref and ref.get_string() != "":
            self._write_sentence("ref", ref)

        self.generator.characters("\n\t")
        self.generator.endElement(self.TAG["sent"])

    # NOTE(review): no close method was visible in the reviewed listing,
    # although the class docstring requires one and the constructor stores
    # filename/tempfilename for it. This one follows the finishing sequence
    # used by the non-incremental writer of this module.
    def close(self):
        """
        Terminate the XML document, close the temporary file and move it to
        the filename given to the constructor
        """
        self.generator.characters("\n")
        self.generator.endElement(self.TAG["doc"])
        self.generator.characters("\n")
        self.generator.endDocument()
        self.file.close()
        shutil.move(self.tempfilename, self.filename)
92
93
101
102
# NOTE(review): the ``class`` and ``def`` lines of this block were missing
# from the reviewed listing. The class name, the method names and the order
# and defaults of the parameters are reconstructed from what the bodies read
# (``parallelsentences``, ``format``, ``kwargs``, ``filename``) -- confirm
# them against the callers before merging.
class Parallelsentence2Jcml(object):
    '''
    This is a helper class which is meant to produce quickly an XML file
    given a list of parallel sentences, without loading a new heavy XML object
    into the memory
    '''

    def __init__(self, parallelsentences, format=None, **kwargs):
        '''
        Provide a list of parallel sentences

        @param parallelsentences: either a DataSet, whose parallel sentences
        are taken with get_parallelsentences(), or directly an iterable of
        parallel sentences
        @param format: object whose TAG dictionary provides the element names
        for the keys "doc", "sent", "src", "tgt" and "ref". When omitted, a
        JcmlFormat is used (NOTE(review): presumed default, since JcmlFormat
        is imported by this module -- confirm).
        @keyword shuffle_translations: if True, the translations of every
        parallel sentence are written in random order (default False)
        @keyword sort_attribute: if set, the translations are written sorted
        by the value of this attribute (default None)
        '''
        self.shuffle_translations = kwargs.get("shuffle_translations", False)
        self.sort_attribute = kwargs.get("sort_attribute", None)

        if isinstance(parallelsentences, DataSet):
            self.parallelsentences = parallelsentences.get_parallelsentences()
        else:
            self.parallelsentences = parallelsentences

        if format is None:
            format = JcmlFormat()
        self.TAG = format.TAG

    @staticmethod
    def _string_attributes(element):
        '''
        Return the attributes of the given element as a dictionary whose
        values have been converted with str(), as the XML generator needs
        '''
        return dict((k, str(v)) for k, v in element.get_attributes().items())

    def _write_sentence(self, generator, tag_key, sentence):
        '''
        Write one simple sentence as an indented child element

        @param generator: the XMLGenerator that receives the output
        @param tag_key: key of self.TAG that names the element ("src", "tgt" or "ref")
        @param sentence: object providing get_attributes() and get_string()
        '''
        tag = self.TAG[tag_key]
        # characters() leaves this whitespace untouched, so the output is the
        # same as with the private XMLGenerator._write used previously
        generator.characters("\n\t\t")
        generator.startElement(tag, self._string_attributes(sentence))
        generator.characters(c(sentence.get_string()))
        generator.endElement(tag)

    def _write_parallelsentence(self, generator, parallelsentence):
        '''
        Write one parallel sentence element with its source sentence(s), its
        translations and, if it has a non-empty one, its reference
        '''
        generator.characters("\n\t")
        generator.startElement(self.TAG["sent"], self._string_attributes(parallelsentence))

        src = parallelsentence.get_source()
        # The attributes are read per sentence and only after the type check:
        # a tuple of sources has no get_attributes() of its own, and every
        # source of the tuple must be written with its own attributes.
        if isinstance(src, SimpleSentence):
            self._write_sentence(generator, "src", src)
        elif isinstance(src, tuple):
            for single_src in src:
                self._write_sentence(generator, "src", single_src)

        # work on a copy, so that shuffling does not reorder the list held by
        # the parallel sentence itself
        translations = list(parallelsentence.get_translations())

        if self.shuffle_translations:
            shuffle(translations)

        if self.sort_attribute:
            translations = sorted(translations, key=lambda tgt: tgt.get_attribute(self.sort_attribute))

        for tgt in translations:
            self._write_sentence(generator, "tgt", tgt)

        ref = parallelsentence.get_reference()
        if ref and ref.get_string() != "":
            self._write_sentence(generator, "ref", ref)

        generator.characters("\n\t")
        generator.endElement(self.TAG["sent"])

    def write_to_file(self, filename):
        '''
        XML output is written to the desired file

        The document is first written to "<filename>.tmp" and moved to
        filename only after it has been written completely.

        @param filename: path of the file to be produced
        '''
        tempfilename = "%s.tmp" % filename
        # the with-block closes the file even if the serialisation fails
        with open(tempfilename, 'w') as f:
            generator = XMLGenerator(f, "utf-8")
            generator.startDocument()
            generator.startElement(self.TAG["doc"], {})

            for parallelsentence in self.parallelsentences:
                self._write_parallelsentence(generator, parallelsentence)

            generator.characters("\n")
            generator.endElement(self.TAG["doc"])
            generator.characters("\n")
            generator.endDocument()
        shutil.move(tempfilename, filename)
195