Package dataprocessor :: Package sax :: Module saxprocessor
[hide private]
[frames | no frames]

Source Code for Module dataprocessor.sax.saxprocessor

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """ 
  5  @author: Eleftherios Avramidis 
  6  """ 
  7   
  8   
  9  from xml.sax.saxutils import XMLGenerator 
 10  from sentence.sentence import SimpleSentence 
 11  from sentence.parallelsentence import ParallelSentence 
 12   
class SaxProcessor(XMLGenerator):
    """
    Handles the generation of features over an XML object formatted as JCML.
    It does processing every time a parallel sentence including its contents has been declared.
    Processing of any other XML type should follow this example.

    The object acts as the SAX content handler for the input document and,
    through the inherited XMLGenerator, as the writer of the output document.
    Because startElement/endElement/characters are overridden here to consume
    the input, output is always produced by calling the XMLGenerator
    implementations explicitly (XMLGenerator.startElement(self, ...)).

    NOTE(review): __init__ reads self.inputformat and self.outputformat and
    calls self.set_tags(); none of them is defined in this class, so they are
    presumably supplied by a subclass -- confirm.
    """

    def __init__(self, out, feature_generators=None):
        """
        @param out: file object to receive processed changes
        @type out: file
        @param feature_generators: list of feature generators to be applied;
        each one must offer add_features_parallelsentence(parallelsentence)
        and return the (augmented) parallel sentence. Defaults to no generators.
        @type feature_generators: list
        """
        # Initialise the generator properly, so that output stream and encoding
        # are per-instance state instead of attributes of the XMLGenerator class
        XMLGenerator.__init__(self, out, "utf-8")
        # previously stored on the XMLGenerator class; kept on the instance
        # for any code that still reads it
        self._out = out

        # flags that show the current focus of the parsing
        self.is_parallelsentence = False
        self.is_simplesentence = False
        # annotations declaration can only be done before any sentence has been
        # declared (emitting an annotation header is not implemented here)
        self.passed_head = False

        # the following variables function as a buffer, that gets filled as the
        # elements are being parsed; when elements are ended, objects are created
        self.ps_attributes = {}  # attributes of the parallel sentence
        self.ss_attributes = {}  # attributes of a simple sentence

        # tag-name mappings; this class reads the keys "sent", "src", "tgt" and
        # "ref" from IN_TAG and "doc", "src" and "tgt" from OUT_TAG
        self.IN_TAG = self.inputformat.get_tags()
        self.OUT_TAG = self.outputformat.get_tags()

        self.src = None
        self.tgt = []
        self.ref = None
        self.annotations = []

        # chunks of character data of the simple sentence being parsed;
        # always a list, joined only when the element gets closed
        self.ss_text = []

        self.set_tags()

        # a fresh list per instance, never a shared default
        if feature_generators is None:
            feature_generators = []
        self.feature_generators = feature_generators

    def startDocument(self):
        """
        Writes the XML declaration and opens the root element of the output.
        """
        XMLGenerator.startDocument(self)
        XMLGenerator.startElement(self, self.OUT_TAG["doc"], {})

    def endDocument(self):
        """
        Closes the root element of the output and ends the output document.
        """
        XMLGenerator.endElement(self, self.OUT_TAG["doc"])
        XMLGenerator.endDocument(self)

    def startElement(self, name, attrs=None):
        """
        Signals the start of an element (simplesentence or parallelsentence)
        and resets the buffers that belong to it. Other elements are ignored.
        @param name: the name of the element
        @type name: str
        @param attrs: the attributes of the element; may be omitted or empty
        @type attrs: Attributes
        """
        tags = self.IN_TAG
        if name == tags["sent"]:
            # empty up string and attribute buffer
            self.ss_text = []
            self.tgt = []
            self.ps_attributes = self._attributes_as_dict(attrs)
            self.is_parallelsentence = True
        elif name in (tags["src"], tags["tgt"], tags["ref"]):
            # empty up string and attribute buffer
            self.ss_text = []
            self.ss_attributes = self._attributes_as_dict(attrs)
            self.is_simplesentence = True

    def characters(self, ch):
        """
        The Parser will call this method to report each chunk of character data.
        We use it to store the string of the simplesentence
        @param ch: chunk of character data being parsed
        @type ch: str
        """
        if self.is_simplesentence:
            self.ss_text.append(ch)

    def endElement(self, name):
        """
        Signals the end of an element.
        Data stored in the buffers of the object, time to create our objects
        and fire their processing. Closing tags of other elements are ignored
        and leave the buffers untouched.
        @param name: the name of the element
        @type name: str
        """
        tags = self.IN_TAG

        if name == tags["sent"]:
            # when the judged sentence gets closed, all previously inserted
            # data have to be converted to objects and written out
            self._write_parallelsentence(name)
            self.is_parallelsentence = False
            return

        if name not in (tags["src"], tags["tgt"], tags["ref"]):
            return

        # join into a local so that the buffer itself always stays a list,
        # and get rid of annoying leading/trailing spaces
        text = "".join(self.ss_text).strip()
        self.ss_text = []
        # character data outside simple sentences must not be buffered
        self.is_simplesentence = False

        if name == tags["src"]:
            self.src = SimpleSentence(text, self.ss_attributes)
        elif name == tags["tgt"]:
            self.tgt.append(SimpleSentence(text, self.ss_attributes))
        # NOTE(review): the text of a "ref" element is parsed but not stored,
        # so self.ref keeps the value given in __init__ -- confirm intended.

    def _attributes_as_dict(self, attrs):
        """
        Copies a SAX Attributes object into a plain dict.
        @param attrs: the attributes of an element, or None/empty
        @type attrs: Attributes
        @return: attribute values keyed by attribute name
        @rtype: dict
        """
        if not attrs:
            return {}
        return dict((att_name, attrs.getValue(att_name))
                    for att_name in attrs.getNames())

    def _write_parallelsentence(self, name):
        """
        Builds the parallel sentence out of the buffered data, applies the
        feature generators and writes the result to the output.
        @param name: the element name used for the written parallel sentence
        @type name: str
        """
        parallelsentence = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)

        # apply feature generators
        for fg in self.feature_generators:
            parallelsentence = fg.add_features_parallelsentence(parallelsentence)

        # NOTE(review): the source is written from the buffered self.src, not
        # read back from the processed parallel sentence -- confirm intended.
        src = self.src

        # display modifications on output file
        # NOTE(review): the sentence element reuses the input tag name, whereas
        # its children use OUT_TAG names -- confirm intended.
        XMLGenerator.ignorableWhitespace(self, "\n\t")
        XMLGenerator.startElement(self, name, parallelsentence.get_attributes())

        XMLGenerator.ignorableWhitespace(self, "\n\t\t")
        XMLGenerator.startElement(self, self.OUT_TAG["src"], src.get_attributes())
        XMLGenerator.characters(self, src.get_string())
        XMLGenerator.endElement(self, self.OUT_TAG["src"])

        for tgt in parallelsentence.get_translations():
            XMLGenerator.ignorableWhitespace(self, "\n\t\t")
            XMLGenerator.startElement(self, self.OUT_TAG["tgt"], tgt.get_attributes())
            XMLGenerator.characters(self, tgt.get_string())
            XMLGenerator.endElement(self, self.OUT_TAG["tgt"])

        XMLGenerator.ignorableWhitespace(self, "\n\t")
        XMLGenerator.endElement(self, name)