Package dataprocessor :: Package sax :: Module saxscoring
[hide private]
[frames | no frames]

Source Code for Module dataprocessor.sax.saxscoring

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """ 
  5  @author: Eleftherios Avramidis 
  6  """ 
  7   
  8   
  9  from sentence.sentence import SimpleSentence 
 10  from sentence.parallelsentence import ParallelSentence 
 11  from xml.sax import handler 
 12   
class SaxSystemScoring(handler.ContentHandler):
    """
    SAX content handler that scores translation systems over a document of
    judged sentences.

    While parsing, it counts for every system the number of judged sentences
    in which that system holds the lowest rank value (ties credit every tied
    system). When the document ends, each count is divided by the number of
    judged sentences and the result is written to a tab-separated file.
    """

    def __init__(self, rank_attribute_name, outfilename, testset):
        """
        @param rank_attribute_name: name of the attribute of a target sentence that holds its rank
        @type rank_attribute_name: str
        @param outfilename: path of the file that receives the scores when the document ends
        @type outfilename: str
        @param testset: label of the test set, written into every output line
        @type testset: str
        """
        handler.ContentHandler.__init__(self)

        self.outfilename = outfilename
        self.rank_attribute_name = rank_attribute_name
        self.testset = testset

        # flags that show the current focus of the parsing
        self.is_parallelsentence = False
        self.is_simplesentence = False
        # annotations declaration can only be done before any sentence has been declared
        # NOTE(review): nothing in this class ever sets this flag to True -- confirm intent
        self.passed_head = False

        # the following variables function as a buffer that gets filled as the
        # elements are being parsed; when elements are ended, objects are created
        self.ps_attributes = {}  # attributes of the parallel sentence
        self.ss_attributes = {}  # attributes of a simple sentence

        self.src = None
        self.tgt = []
        self.ref = None
        self.annotations = []

        self.ss_text = ""

        self.set_tags()

        self._encoding = "utf-8"

        # system name -> number of sentences won (turned into a ratio by endDocument)
        self.systems_performance = {}
        # number of judged sentences closed so far
        self.parallelsentences = 0

    def set_tags(self):
        """
        Handles the basic tags used for reading the simple XML format.
        As tags are prone to changes, this can be done by changing values here, or overriding accordingly
        """
        self.TAG_DOC = "jcml"
        self.TAG_SENT = "judgedsentence"
        self.TAG_SRC = "src"
        self.TAG_TGT = "tgt"
        self.TAG_REF = "ref"
        self.TAG_ANNOTATIONS = "annotations"
        self.TAG_ANNOTATION = "annotation"

    def startDocument(self):
        """
        Nothing has to be prepared when the document starts.
        """
        pass

    def endDocument(self):
        """
        Converts the per-system win counts into ratios over the number of judged
        sentences and writes one tab-separated line per system to the output file.
        The ratios replace the counts in self.systems_performance.
        """
        # the context manager closes the file even if formatting or writing fails
        with open(self.outfilename, 'w') as outfile:
            for system in self.systems_performance:
                ratio = 1.00 * self.systems_performance[system] / self.parallelsentences
                self.systems_performance[system] = ratio
                entry = "dfki_parseconf\tde-en\t%s\t%s\t%01.4f\n" % (self.testset, system, ratio)
                outfile.write(entry)

    def startElement(self, name, attrs=None):
        """
        Signals the start of an element (simplesentence or parallelsentence)
        @param name: the name of the element
        @type name: str
        @param attrs: object of the Attributes interface containing the attributes of the element;
        None (or an empty container) is treated as "no attributes"
        @type attrs: Attributes
        """
        if name == self.TAG_SENT:
            # empty up string and attribute buffers for the new judged sentence
            self.ss_text = u""
            self.tgt = []
            self.ps_attributes = self._copy_attributes(attrs)
            self.is_parallelsentence = True

        if name == self.TAG_ANNOTATION:
            if not self.passed_head:
                self.annotations.append(attrs.getValue("name"))
            else:
                # single-argument call form prints identically on Python 2 and 3
                print("Format error. Annotation must be declared in the beginning of the document")

        elif name in (self.TAG_SRC, self.TAG_TGT, self.TAG_REF):
            # empty up string and attribute buffer
            self.ss_text = u""
            self.ss_attributes = self._copy_attributes(attrs)
            self.is_simplesentence = True

    def characters(self, ch):
        """
        The Parser will call this method to report each chunk of character data.
        We use it to store the string of the simplesentence
        @param ch: chunk of character data being parsed
        @type ch: str
        """
        if self.is_simplesentence:
            self.ss_text = u"%s%s" % (self.ss_text, ch)

    def endElement(self, name):
        """
        Signals the end of an element.
        Data stored in the buffers of the object, time to create our objects and fire their processing
        @param name: the name of the element
        @type name: str
        """
        # get rid of annoying leading spaces
        self.ss_text = self.ss_text.strip()

        # all of the elements have to be declared here
        # for each element, create the objects and clear "buffers"
        if name == self.TAG_SRC:
            self.src = SimpleSentence(self.ss_text, self.ss_attributes)
            self.ss_text = u""
            self.is_simplesentence = False
        elif name == self.TAG_TGT:
            self.tgt.append(SimpleSentence(self.ss_text, self.ss_attributes))
            self.ss_text = u""
            self.is_simplesentence = False
        elif name == self.TAG_REF:
            # the reference text is not turned into an object here, so self.ref
            # keeps its previous value; only stop buffering characters
            self.is_simplesentence = False
        elif name == self.TAG_SENT:
            # when the judged sentence gets closed, all previously inserted data have to be converted to objects
            parallelsentence = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)
            self.parallelsentences += 1
            self.is_parallelsentence = False

            # first sort the ranks by system
            rank_per_system = {}
            for target in parallelsentence.get_translations():
                system = target.get_attribute("system")
                rank = int(float(target.get_attribute(self.rank_attribute_name)))
                rank_per_system[system] = rank

            # then count the times a system performs as best
            self._credit_best_systems(rank_per_system)

    @staticmethod
    def _copy_attributes(attrs):
        """
        Copies the attributes of an element into a plain dictionary.
        @param attrs: attributes of the element, possibly None or empty
        @type attrs: Attributes
        @return: attribute names mapped to their values
        @rtype: dict
        """
        if not attrs:
            return {}
        return dict((att_name, attrs.getValue(att_name)) for att_name in attrs.getNames())

    def _credit_best_systems(self, rank_per_system):
        """
        Adds one win to every system that holds the lowest rank value of a sentence.
        @param rank_per_system: rank of every system for one judged sentence
        @type rank_per_system: dict
        """
        if not rank_per_system:
            return
        # computed once per sentence instead of once per system
        best_rank = min(rank_per_system.values())
        for system, rank in rank_per_system.items():
            if rank == best_rank:
                self.systems_performance[system] = self.systems_performance.get(system, 0) + 1
167