Package dataprocessor :: Package sax :: Module saxjcml
[hide private]
[frames | no frames]

Source Code for Module dataprocessor.sax.saxjcml

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """ 
  5  @author: Eleftherios Avramidis 
  6  """ 
  7   
  8   
  9  from xml.sax.saxutils import XMLGenerator 
 10  from xml import sax 
 11  from sentence.sentence import SimpleSentence 
 12  from sentence.parallelsentence import ParallelSentence 
 13  import shutil 
 14  import codecs 
 15  import sys 
 16   
 17   
def run_features_generator(input_file, output_file, generators, encode=False):
    """
    Function that runs a jcml file through a list of featuregenerators in the SAX way
    and adds the features directly on a target jcml file.

    The result is first written to "<output_file>.tmp" and only moved over
    output_file once parsing has finished without raising, so a failed run
    never replaces an existing output_file with a partial document.

    @param input_file: Filename for the XML-formatted data used as input
    @type input_file: string
    @param output_file: Filename for the result of the featuregenerator, to be generated
    @type output_file: string
    @param generators: List of generators to be applied on each of the parallelsentences contained in the XMLs
    @type generators: list
    @param encode: not used by this function; kept so existing callers keep working
    @type encode: bool
    """
    tmpfile = "%s.tmp" % output_file

    # Context managers guarantee that both handles are closed even when the
    # parser or one of the feature generators raises.
    with open(input_file, 'r') as input_file_object:
        with open(tmpfile, 'w') as output_file_object:
            saxhandler = SaxJCMLProcessor(output_file_object, generators)
            sax.parse(input_file_object, saxhandler)

    # Reached only after a successful parse, with the temporary file closed.
    shutil.move(tmpfile, output_file)
38
class SaxJCMLProcessor(XMLGenerator):
    """
    Handles the generation of features over an XML object formatted as JCML.
    It does processing every time a parallel sentence including its contents has been declared.
    Processing of any other XML type should follow this example.

    The handler buffers the source, target and reference sentences of each
    parallel sentence while they are parsed. When the parallel sentence element
    is closed, the buffered data is wrapped in a ParallelSentence, passed
    through every feature generator and written to the output.
    """

    def __init__(self, out, feature_generators=None):
        """
        @param out: file object to receive processed changes
        @type out: file
        @param feature_generators: list of feature generators to be applied
            (defaults to no generators)
        @type feature_generators: list
        """
        # Initialise the base generator on *this instance*. Assigning
        # XMLGenerator._out / XMLGenerator._encoding on the class would make
        # every generator in the process share the same output.
        XMLGenerator.__init__(self, out, "utf-8")

        # flags that show the current focus of the parsing
        self.is_parallelsentence = False
        self.is_simplesentence = False
        # annotations declaration can only be done before any sentence has been declared
        self.passed_head = False

        # the following variables function as a buffer, that gets filled as the
        # elements are being parsed; when elements are ended, objects are created
        self.ps_attributes = {}  # attributes of the parallel sentence
        self.ss_attributes = {}  # attributes of a simple sentence

        self.src = None
        self.tgt = []
        self.ref = None
        self.annotations = []

        self.ss_text = []

        self.set_tags()

        # A fresh list per instance: a mutable default argument would be shared
        # by every processor created without an explicit list.
        if feature_generators is None:
            feature_generators = []
        self.feature_generators = feature_generators

    def set_tags(self):
        """
        Handles the basic tags used for reading the simple XML format.
        As tags are prone to changes, this can be done by changing values here, or overriding accordingly
        """
        self.TAG_DOC = "jcml"
        self.TAG_SENT = "judgedsentence"
        self.TAG_SRC = "src"
        self.TAG_TGT = "tgt"
        self.TAG_REF = "ref"
        self.TAG_ANNOTATIONS = "annotations"
        self.TAG_ANNOTATION = "annotation"

    def startDocument(self):
        """
        Writes the XML declaration followed by the opening document tag.
        """
        XMLGenerator.startDocument(self)
        XMLGenerator.startElement(self, self.TAG_DOC, {})

    def endDocument(self):
        """
        Closes the document tag and finishes the output document.
        """
        XMLGenerator.endElement(self, self.TAG_DOC)
        XMLGenerator.endDocument(self)

    def startElement(self, name, attrs=None):
        """
        Signals the start of an element (simplesentence or parallelsentence).
        Elements with any other name are ignored.
        @param name: the name of the element
        @type name: str
        @param attrs: the attributes of the element; may be omitted or empty
        @type attrs: Attributes
        """
        # A missing or empty attribute set is treated as "no attributes".
        attributes = dict(attrs.items()) if attrs else {}

        if name == self.TAG_SENT:
            # empty up string and attribute buffers; src and ref are cleared as
            # well, so that a sentence lacking one of them does not inherit it
            # from the previously parsed parallel sentence
            self.ss_text = []
            self.src = None
            self.tgt = []
            self.ref = None
            self.ps_attributes = attributes
            self.is_parallelsentence = True

        elif name in [self.TAG_SRC, self.TAG_TGT, self.TAG_REF]:
            # empty up string and attribute buffer
            self.ss_text = []
            self.ss_attributes = attributes
            self.is_simplesentence = True

    def characters(self, ch):
        """
        The Parser will call this method to report each chunk of character data.
        We use it to store the string of the simplesentence
        @param ch: chunk of character data being parsed
        @type ch: str
        """
        if self.is_simplesentence:
            self.ss_text.append(ch)

    def endElement(self, name):
        """
        Signals the end of an element.
        Data stored in the buffers of the instance, time to create our objects and fire their processing
        @param name: the name of the element
        @type name: str
        """
        if name in [self.TAG_SRC, self.TAG_REF, self.TAG_TGT]:
            # get rid of annoying leading and trailing spaces
            parsed_text = "".join(self.ss_text).strip()
            sentence = SimpleSentence(parsed_text, self.ss_attributes)
            if name == self.TAG_SRC:
                self.src = sentence
            elif name == self.TAG_REF:
                self.ref = sentence
            else:
                self.tgt.append(sentence)
            self.ss_text = []
            # stop buffering, so text between elements is not collected
            self.is_simplesentence = False

        elif name == self.TAG_SENT:
            self.is_parallelsentence = False
            self._process_parallelsentence(name)

    def _process_parallelsentence(self, name):
        """
        Builds the ParallelSentence out of the buffered data, applies the
        feature generators on it and writes the result to the output.
        @param name: the name of the parallel sentence element being closed
        @type name: str
        """
        # when the judged sentence gets closed, all previously inserted data
        # have to be converted to objects
        parallelsentence = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)

        # progress markers on stderr around the feature generation
        sys.stderr.write("\\")
        for fg in self.feature_generators:
            parallelsentence = fg.add_features_parallelsentence(parallelsentence)
        sys.stderr.write("/")

        # display modifications on output file
        self._write_layout("\n\t")
        XMLGenerator.startElement(self, name, parallelsentence.get_attributes())

        src = parallelsentence.get_source()
        # NOTE(review): src is None when the input sentence had no source
        # element - confirm that skipping it is the desired output.
        if src is not None:
            self._write_layout("\n\t\t")
            self._write_simplesentence(self.TAG_SRC, src)

        for tgt in parallelsentence.get_translations():
            self._write_layout("\n\t\t")
            self._write_simplesentence(self.TAG_TGT, tgt)

        ref = parallelsentence.get_reference()
        # The indentation is written whether or not a reference exists, which
        # keeps the output identical to what earlier versions produced.
        self._write_layout("\n\t\t")
        if ref is not None:
            self._write_simplesentence(self.TAG_REF, ref)
            self._write_layout("\n\t")

        XMLGenerator.endElement(self, name)

    def _write_simplesentence(self, tag, sentence):
        """
        Writes one simple sentence as an element with its attributes and text.
        @param tag: name of the element to be written
        @type tag: str
        @param sentence: object providing get_attributes() and get_string()
        """
        XMLGenerator.startElement(self, tag, sentence.get_attributes())
        XMLGenerator.characters(self, sentence.get_string())
        XMLGenerator.endElement(self, tag)

    def _write_layout(self, whitespace):
        """
        Writes unescaped layout whitespace between elements, through the
        public ignorableWhitespace() of XMLGenerator instead of its private
        writer.
        @param whitespace: the whitespace to be written
        @type whitespace: str
        """
        XMLGenerator.ignorableWhitespace(self, whitespace)
228