Package dataprocessor :: Package sax :: Module saxwmt11eval
[hide private]
[frames | no frames]

Source Code for Module dataprocessor.sax.saxwmt11eval

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """ 
  5  @author: Eleftherios Avramidis 
  6  """ 
  7   
  8   
  9  from xml.sax.saxutils import XMLGenerator 
 10  from sentence.sentence import SimpleSentence 
 11  from sentence.parallelsentence import ParallelSentence 
 12   
class SaxWMTexporter(XMLGenerator):
    """
    SAX content handler that reads a JCML document, applies feature generators
    to every parallel sentence and writes two outputs:

      - the (re-generated) JCML document, written to C{out};
      - one tab-separated row per translation, written to C{tab_filename}.

    Processing is fired every time a parallel sentence, including its contents,
    has been completely read. Processing of any other XML type should follow
    this example.

    Calls that must reach the XML writer are spelled
    C{XMLGenerator.method(self, ...)} on purpose: this class overrides
    C{startElement}, C{endElement} and C{characters} for the reading side, so
    C{self.startElement(...)} would re-enter the parsing logic instead of
    writing.
    """

    def __init__(self, out, feature_generators, tab_filename, metric_name, lang_pair, test_set):
        """
        @param out: file object to receive the generated XML
        @type out: file
        @param feature_generators: list of feature generators to be applied
        @type feature_generators: list
        @param tab_filename: path of the tab-separated file; it is opened for
            writing (truncated) here and closed in L{endDocument}
        @type tab_filename: str
        @param metric_name: value written in the first column of every tab row
        @type metric_name: str
        @param lang_pair: value written in the second column of every tab row
        @type lang_pair: str
        @param test_set: value written in the third column of every tab row
        @type test_set: str
        """
        # Initialise the writer per instance. Assigning _out/_encoding on the
        # XMLGenerator class (as was done before) made all generators in the
        # process share the last output stream and skipped the rest of the
        # base-class set-up.
        XMLGenerator.__init__(self, out, "utf-8")

        self.tab_file = open(tab_filename, 'w')
        self.metric_name = metric_name
        self.lang_pair = lang_pair
        self.test_set = test_set

        # flags that show the current focus of the parsing
        self.is_parallelsentence = False
        self.is_simplesentence = False
        # annotations can only be declared before any sentence has been declared
        self.passed_head = False

        # the following variables function as a buffer that gets filled while
        # the elements are being parsed; objects are created when elements end
        self.ps_attributes = {}  # attributes of the parallel sentence
        self.ss_attributes = {}  # attributes of a simple sentence

        self.src = None
        self.tgt = []
        self.ref = None
        self.annotations = []  # names of the annotations declared by the input

        self.ss_text = u""

        self.set_tags()

        self.feature_generators = feature_generators

    def set_tags(self):
        """
        Handles the basic tags used for reading the simple XML format.
        As tags are prone to changes, this can be done by changing values here,
        or overriding accordingly
        """
        self.TAG_DOC = "jcml"
        self.TAG_SENT = "judgedsentence"
        self.TAG_SRC = "src"
        self.TAG_TGT = "tgt"
        self.TAG_REF = "ref"
        self.TAG_ANNOTATIONS = "annotations"
        self.TAG_ANNOTATION = "annotation"

    def startDocument(self):
        """
        Writes the XML declaration and opens the document element.
        """
        XMLGenerator.startDocument(self)
        XMLGenerator.startElement(self, self.TAG_DOC, {})

    def endDocument(self):
        """
        Closes the document element, finishes the XML output and closes the
        tab-separated file.
        """
        # a document without sentences must still get its annotations header
        if not self.passed_head:
            self._write_annotations_header()
        XMLGenerator.endElement(self, self.TAG_DOC)
        XMLGenerator.endDocument(self)
        # the tab file is owned by this object (opened in __init__)
        self.tab_file.close()

    def startElement(self, name, attrs=None):
        """
        Signals the start of an element (simplesentence or parallelsentence)
        @param name: the name of the element
        @type name: str
        @param attrs: the attributes of the element; C{None} or an empty
            container is treated as "no attributes"
        @type attrs: Attributes
        """
        if name == self.TAG_SENT:
            # empty up string and attribute buffers of the previous sentence
            self.ss_text = u""
            self.ps_attributes = self._attrs_to_dict(attrs)
            self.tgt = []
            self.ref = None
            self.is_parallelsentence = True

            # the heading is written exactly once, before the first sentence
            if not self.passed_head:
                self._write_annotations_header()

        elif name == self.TAG_ANNOTATION:
            if not self.passed_head:
                # only remembered here; it is written, properly closed, as
                # part of the annotations header
                self.annotations.append(self._attrs_to_dict(attrs)["name"])
            else:
                # single-argument print(...) behaves the same on Python 2 and 3
                print("Format error. Annotation must be declared in the beginning of the document")

        elif name in (self.TAG_SRC, self.TAG_TGT, self.TAG_REF):
            # empty up string and attribute buffer
            self.ss_text = u""
            self.ss_attributes = self._attrs_to_dict(attrs)
            self.is_simplesentence = True

    def characters(self, ch):
        """
        The Parser will call this method to report each chunk of character data.
        We use it to store the string of the simplesentence
        @param ch: chunk of character data being parsed
        @type ch: str
        """
        if self.is_simplesentence:
            self.ss_text = u"%s%s" % (self.ss_text, ch)

    def endElement(self, name):
        """
        Signals the end of an element.
        Data stored in the buffers of the object, time to create our objects
        and fire their processing
        @param name: the name of the element
        @type name: str
        """
        # get rid of annoying leading and trailing spaces
        self.ss_text = self.ss_text.strip()

        if name in (self.TAG_SRC, self.TAG_TGT, self.TAG_REF):
            sentence = SimpleSentence(self.ss_text, self.ss_attributes)
            if name == self.TAG_SRC:
                self.src = sentence
            elif name == self.TAG_TGT:
                self.tgt.append(sentence)
            else:
                # the reference used to be dropped here, which left self.ref
                # at None for every parallel sentence
                self.ref = sentence
            self.ss_text = u""
            self.is_simplesentence = False

        elif name == self.TAG_SENT:
            # when the judged sentence gets closed, all previously inserted
            # data have to be converted to objects
            self._export_parallelsentence()
            self.is_parallelsentence = False

    def _attrs_to_dict(self, attrs):
        """
        Copies a SAX attributes object (or a plain dict) into a new dict.
        @param attrs: attributes of an element, may be C{None} or empty
        @return: attribute names mapped to their values
        @rtype: dict
        """
        if not attrs:
            return {}
        return dict(attrs.items())

    def _write_annotations_header(self):
        """
        Writes the well-formed annotations heading: the annotations declared by
        the input, followed by the ones added by the feature generators.
        """
        names = list(self.annotations)
        for featuregenerator in self.feature_generators:
            # NOTE(review): assumes get_annotation_name() returns a string or
            # something false when there is nothing to declare - confirm
            annotation_name = featuregenerator.get_annotation_name()
            if annotation_name:
                names.append(annotation_name)

        XMLGenerator.startElement(self, self.TAG_ANNOTATIONS, {})
        for annotation_name in names:
            XMLGenerator.startElement(self, self.TAG_ANNOTATION, {"name": annotation_name})
            XMLGenerator.endElement(self, self.TAG_ANNOTATION)
        XMLGenerator.endElement(self, self.TAG_ANNOTATIONS)
        self.passed_head = True

    def _export_parallelsentence(self):
        """
        Builds the parallel sentence out of the buffers, applies the feature
        generators and writes the result to the XML output and the tab file.
        """
        parallelsentence = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)

        # apply feature generators
        for fg in self.feature_generators:
            parallelsentence = fg.add_features_parallelsentence(parallelsentence)

        # the source is written as it was read, not as returned by the generators
        src = self.src

        # display modifications on output file; ignorableWhitespace is the
        # public way to emit raw layout whitespace (no private _write access)
        XMLGenerator.ignorableWhitespace(self, "\n\t")
        XMLGenerator.startElement(self, self.TAG_SENT, parallelsentence.get_attributes())

        XMLGenerator.ignorableWhitespace(self, "\n\t\t")
        XMLGenerator.startElement(self, self.TAG_SRC, src.get_attributes())
        XMLGenerator.characters(self, src.get_string())
        XMLGenerator.endElement(self, self.TAG_SRC)

        for tgt in parallelsentence.get_translations():
            XMLGenerator.ignorableWhitespace(self, "\n\t\t")
            XMLGenerator.startElement(self, self.TAG_TGT, tgt.get_attributes())
            XMLGenerator.characters(self, tgt.get_string())
            XMLGenerator.endElement(self, self.TAG_TGT)

            # columns: metric, language pair, test set, system, sentence id, rank
            tab_entry = "%s\t%s\t%s\t%s\t%s\t%s\n" % (
                self.metric_name,
                self.lang_pair,
                self.test_set,
                tgt.get_attribute("system"),
                parallelsentence.get_attribute("id"),
                tgt.get_attribute("rank"),
            )
            self.tab_file.write(tab_entry)

        XMLGenerator.ignorableWhitespace(self, "\n\t")
        XMLGenerator.endElement(self, self.TAG_SENT)
196