Package dataprocessor :: Package sax :: Module sax_bestrank2simplefile
[hide private]
[frames] | no frames]

Source Code for Module dataprocessor.sax.sax_bestrank2simplefile

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """ 
  5  @author: Eleftherios Avramidis 
  6  """ 
  7   
  8   
  9  from xml.sax.saxutils import XMLGenerator 
 10  from sentence.sentence import SimpleSentence 
 11  from sentence.parallelsentence import ParallelSentence 
 12   
class SaxBestRank2SimpleFile(XMLGenerator):
    """
    Reads an XML document formatted as JCML through SAX events, applies the
    given feature generators to every parallel sentence, re-emits the sentence
    as XML and writes the best-ranked translation of each sentence (the first
    translation whose "rank" attribute equals 1) as one line of a text file.
    Processing takes place every time a parallel sentence, including its
    contents, has been completely declared.
    """

    def __init__(self, out, feature_generators, tab_filename):
        """
        @param out: file object to receive the generated XML
        @type out: file
        @param feature_generators: list of feature generators to be applied
        @type feature_generators: list
        @param tab_filename: path of the text file that receives one line per parallel sentence
        @type tab_filename: str
        """
        # imported locally so that the import block of the module stays untouched
        import io

        # Initialise the base class for this instance only. Assigning _out and
        # _encoding on the XMLGenerator class itself would leak into every other
        # generator and would leave the rest of the base state uninitialised.
        XMLGenerator.__init__(self, out, "utf-8")

        # SAX delivers unicode text, so the text file needs an explicit encoding
        self.tab_file = io.open(tab_filename, 'w', encoding="utf-8")

        # flags that show the current focus of the parsing
        self.is_parallelsentence = False
        self.is_simplesentence = False
        # annotations declaration can only be done before any sentence has been declared
        self.passed_head = False

        # the following variables function as a buffer, that gets filled as the
        # elements are being parsed; when elements are ended, objects are created
        self.ps_attributes = {}  # attributes of the parallel sentence
        self.ss_attributes = {}  # attributes of a simple sentence

        self.src = None
        self.tgt = []
        self.ref = None
        self.annotations = []

        self.ss_text = u""

        self.set_tags()

        self.feature_generators = feature_generators

    def set_tags(self):
        """
        Handles the basic tags used for reading the simple XML format.
        As tags are prone to changes, this can be done by changing values here,
        or overriding accordingly
        """
        self.TAG_DOC = "jcml"
        self.TAG_SENT = "judgedsentence"
        self.TAG_SRC = "src"
        self.TAG_TGT = "tgt"
        self.TAG_REF = "ref"
        self.TAG_ANNOTATIONS = "annotations"
        self.TAG_ANNOTATION = "annotation"

    def startDocument(self):
        """
        Writes the XML declaration and opens the root element of the output.
        """
        XMLGenerator.startDocument(self)
        XMLGenerator.startElement(self, self.TAG_DOC, {})

    def endDocument(self):
        """
        Closes the root element of the output and the text file.
        """
        try:
            XMLGenerator.endElement(self, self.TAG_DOC)
            XMLGenerator.endDocument(self)
        finally:
            # the text file must not stay open, even if writing the XML fails
            self.tab_file.close()

    def startElement(self, name, attrs=None):
        """
        Signals the start of an element (simplesentence or parallelsentence)
        @param name: the name of the element
        @type name: str
        @param attrs: the attributes of the element; may be omitted for an element without attributes
        @type attrs: Attributes
        """
        attributes = self._attributes_to_dict(attrs)

        if name == self.TAG_SENT:
            # empty up string and attribute buffer
            self.ss_text = u""
            self.ps_attributes = attributes
            self.tgt = []
            self.is_parallelsentence = True

            # add the newly produced feature generators to the heading of the
            # generated file; this happens once, before the first sentence
            if not self.passed_head:
                self._write_annotations_header()
                self.passed_head = True

        elif name == self.TAG_ANNOTATION:
            if not self.passed_head:
                self.annotations.append(attributes["name"])
                # endElement does not echo annotations, so the element is
                # closed right away to keep the output well-formed
                XMLGenerator.startElement(self, name, attributes)
                XMLGenerator.endElement(self, name)
            else:
                print("Format error. Annotation must be declared in the beginning of the document")

        elif name in (self.TAG_SRC, self.TAG_TGT, self.TAG_REF):
            # empty up string and attribute buffer
            self.ss_text = u""
            self.ss_attributes = attributes
            self.is_simplesentence = True

    def characters(self, ch):
        """
        The Parser will call this method to report each chunk of character data.
        We use it to store the string of the simplesentence
        @param ch: chunk of character data being parsed
        @type ch: str
        """
        if self.is_simplesentence:
            self.ss_text = u"%s%s" % (self.ss_text, ch)

    def endElement(self, name):
        """
        Signals the end of an element.
        Data stored in the buffers of the object, time to create our objects
        and fire their processing
        @param name: the name of the element
        @type name: str
        """
        # get rid of annoying leading spaces
        self.ss_text = self.ss_text.strip()

        # all of the elements have to be declared here
        # for each element, create the objects and clear "buffers"
        if name == self.TAG_SRC:
            self.src = SimpleSentence(self.ss_text, self.ss_attributes)
            self._leave_simplesentence()
        elif name == self.TAG_TGT:
            self.tgt.append(SimpleSentence(self.ss_text, self.ss_attributes))
            self._leave_simplesentence()
        elif name == self.TAG_REF:
            # NOTE(review): the reference text is collected but never stored in
            # self.ref, so every ParallelSentence gets ref=None -- confirm intended
            self._leave_simplesentence()
        elif name == self.TAG_SENT:
            # when the judged sentence gets closed, all previously inserted
            # data have to be converted to objects
            self._process_parallelsentence(name)

    @staticmethod
    def _attributes_to_dict(attrs):
        """
        Copies SAX attributes into a plain dictionary; nothing given means no attributes.
        """
        if not attrs:
            return {}
        return dict(attrs.items())

    def _leave_simplesentence(self):
        """
        Clears the text buffer and stops collecting characters until the next simple sentence.
        """
        self.ss_text = u""
        self.is_simplesentence = False

    def _write_whitespace(self, text):
        """
        Writes formatting whitespace to the output through the public handler interface.
        """
        XMLGenerator.ignorableWhitespace(self, text)

    def _write_annotations_header(self):
        """
        Writes one closed annotations element declaring every feature generator.
        """
        XMLGenerator.startElement(self, self.TAG_ANNOTATIONS, {})
        for featuregenerator in self.feature_generators:
            atts = {"name": featuregenerator.get_annotation_name()}
            XMLGenerator.startElement(self, self.TAG_ANNOTATION, atts)
            XMLGenerator.endElement(self, self.TAG_ANNOTATION)
        XMLGenerator.endElement(self, self.TAG_ANNOTATIONS)

    def _write_simplesentence(self, tag, sentence):
        """
        Writes one simple sentence as an indented element with its attributes and text.
        """
        self._write_whitespace("\n\t\t")
        XMLGenerator.startElement(self, tag, sentence.get_attributes())
        XMLGenerator.characters(self, sentence.get_string())
        XMLGenerator.endElement(self, tag)

    def _to_text(self, value):
        """
        Decodes byte strings with the output encoding; any other value is returned unchanged.
        """
        if isinstance(value, bytes):
            return value.decode(self._encoding)
        return value

    def _process_parallelsentence(self, name):
        """
        Builds the parallel sentence from the buffers, applies the feature
        generators, writes the sentence to the XML output and its best-ranked
        translation to the text file.
        """
        parallelsentence = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)

        # apply feature generators
        for fg in self.feature_generators:
            parallelsentence = fg.add_features_parallelsentence(parallelsentence)

        src = parallelsentence.get_source()

        # display modifications on output file
        self._write_whitespace("\n\t")
        XMLGenerator.startElement(self, name, parallelsentence.get_attributes())
        self._write_simplesentence(self.TAG_SRC, src)

        found_best = False
        # a sentence without a best-ranked translation still yields an (empty)
        # line, so that the lines stay aligned with the sentences
        tab_entry = u"\n"

        for tgt in parallelsentence.get_translations():
            self._write_simplesentence(self.TAG_TGT, tgt)

            # only the first translation with rank 1 is kept
            if int(tgt.get_attribute("rank")) == 1 and not found_best:
                tab_entry = u"%s\n" % self._to_text(tgt.get_string())
                found_best = True

        if not found_best:
            print("ERROR: didn't find best ranked sentence")

        self._write_whitespace("\n\t")
        XMLGenerator.endElement(self, name)
        self.tab_file.write(tab_entry)
206