
Source Code for Module dataprocessor.input.xliffreader

#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on Jun 27, 2011

@author: jogin
'''

from xml.dom.minidom import parse
from sentence.parallelsentence import ParallelSentence
from sentence.sentence import SimpleSentence
from sentence.dataset import DataSet
from xml.sax.saxutils import unescape
from dataprocessor.input.genericxmlreader import GenericXmlReader

class XliffReader(GenericXmlReader):
    """
    Reader that converts the trans-unit entries of an XLIFF file into
    ParallelSentence objects.
    """
    def load(self):
        """
        Loads the data of the file into memory. It is useful if the class has
        been asked not to load the file upon initialization.
        """
        self.xmlObject = parse(self.input_filename)

    def get_dataset(self):
        """
        Returns the contents of the XML file as an object structure,
        represented by a DataSet object.
        Note that this will cause all the data of the XML file to be loaded
        into system memory at once.
        For big data sets this may not be optimal, so consider
        sentence-by-sentence reading with SAX (saxjcml.py).
        @rtype: sentence.dataset.DataSet
        @return: A data set containing all the data of the XML file
        """
        return DataSet(self.get_parallelsentences())

    def length(self):
        return len(self.xmlObject.getElementsByTagName('trans-unit'))

    def get_weights(self, tool_id):
        """
        Finds the global weights for a particular tool ID in the file. Used by
        get_parallelsentences().
        @param tool_id: tool id
        @type tool_id: string
        @return: global weights as (name, value) tuples
        @rtype: list
        """
        weights = []
        tools = self.xmlObject.getElementsByTagName('tool')
        for tool in tools:
            if tool.getAttribute('tool-id') == tool_id:
                for elem in tool.getElementsByTagName('metanet:weight'):
                    weights.append(('%s-%s-%s' % ('global', elem.getAttribute('type'), tool_id),
                                    elem.getAttribute('value')))
        return weights

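Note: for illustration, the markup that get_weights() walks looks roughly like the fragment below. The element and attribute names are taken from the calls above, while the tool id, weight values and the metanet namespace URI are placeholders. A minimal sketch using the same minidom calls:

    from xml.dom.minidom import parseString

    # Hypothetical header fragment; ids, values and the namespace URI are invented.
    doc = parseString(
        '<header xmlns:metanet="http://example.com/metanet">'
        '<tool tool-id="moses-1" tool-name="Moses">'
        '<metanet:weight type="lm" value="0.5"/>'
        '<metanet:weight type="tm" value="0.3"/>'
        '</tool>'
        '</header>')

    weights = []
    for tool in doc.getElementsByTagName('tool'):
        if tool.getAttribute('tool-id') == 'moses-1':
            for elem in tool.getElementsByTagName('metanet:weight'):
                weights.append(('global-%s-%s' % (elem.getAttribute('type'), 'moses-1'),
                                elem.getAttribute('value')))
    # weights == [('global-lm-moses-1', '0.5'), ('global-tm-moses-1', '0.3')]
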
    def get_system_name(self, tool_id):
        """
        Finds the system name for a given tool id.
        @param tool_id: tool-id
        @type tool_id: string
        @return: name of the system
        @rtype: string
        """
        system_name = ''
        tools = self.xmlObject.getElementsByTagName('tool')
        for tool in tools:
            if tool.getAttribute('tool-id') == tool_id:
                system_name = tool.getAttribute('tool-name')
                break
        return system_name

    def get_parallelsentence(self, transUnit):
        """
        Converts a single trans-unit element into a parallel sentence.
        @param transUnit: a trans-unit DOM element
        @rtype: sentence.parallelsentence.ParallelSentence
        @return: a parallel sentence holding the source, all alternative
        translations and the reference of this trans-unit
        """

        # get a nodeList of alt-trans elements
        altTranss = transUnit.getElementsByTagName('alt-trans')
        sentence_id = transUnit.getAttribute("id")

        # trans-unit source
        src = ''
        for transunit_src in transUnit.childNodes:
            if transunit_src.nodeName == 'source':
                src = SimpleSentence(unescape(transunit_src.childNodes[0].nodeValue))
                break

        # save attributes from desired alt-trans nodes into tgt_list
        tgt_list = []
        for altTrans in altTranss:
            # alt-trans target
            tgt = ''
            for transunit_tgt in altTrans.childNodes:
                if transunit_tgt.nodeName == 'target':
                    tgt = SimpleSentence(unescape(transunit_tgt.childNodes[0].nodeValue))
                    break

            # alt-trans tool-id parsing
            tool_id = altTrans.getAttribute('tool-id')
            #tgt.add_attribute('tool_id', tool_id)

            # system name
            #system_name = self.get_system_name(tool_id)
            #tgt.add_attribute('system', system_name)
            tgt.add_attribute('system', tool_id)

            # add global weights for the particular tool id
            for weight in self.get_weights(tool_id):
                tgt.add_attribute(weight[0], weight[1])

            # alt-trans score parsing
            alttrans_scores = altTrans.getElementsByTagName("metanet:scores")
            for alttrans_score in alttrans_scores:
                if alttrans_score in altTrans.childNodes:
                    for elem in alttrans_score.getElementsByTagName("metanet:score"):
                        tgt.add_attribute('sc_%s-%s' % (tool_id, elem.getAttribute('type').replace(' ', '-')),
                                          elem.getAttribute('value'))

            # alt-trans annotation parsing
            alttrans_derivations = altTrans.getElementsByTagName("metanet:derivation")
            for alttrans_derivation in alttrans_derivations:
                derivation_id = alttrans_derivation.getAttribute("id")
                # tgt.add_attribute("an_%s-tokens" % derivation_id, len(alttrans_derivation.getElementsByTagName("metanet:token")))
                labels_count = {}
                if alttrans_derivation in altTrans.childNodes:
                    for elem in alttrans_derivation.getElementsByTagName("metanet:annotation"):
                        ann_type = elem.getAttribute('type').replace(' ', '-')
                        value = elem.getAttribute('value').replace("$", "SS")
                        if elem in alttrans_derivation.childNodes:
                            tgt.add_attribute('an_%s-%s-%s' % (tool_id, derivation_id, ann_type), value)

                        # count node types from the Lucy parser
                        elif ann_type == "cat":
                            if value in labels_count:
                                labels_count[value] += 1
                            else:
                                labels_count[value] = 1
                # label counts collected
                for label in labels_count:
                    att_name = "%s-cat-%s" % (derivation_id, label)
                    tgt.add_attribute(att_name, labels_count[label])

            phrases = altTrans.getElementsByTagName("metanet:phrase")

            if phrases:
                tgt.add_attribute("phrases_count", str(len(phrases)))

            phrase_id = 0
            for phrase in phrases:
                scoresets = phrase.getElementsByTagName("metanet:scores")
                phrase_id += 1
                if scoresets:
                    scoreset = scoresets[0]
                    for score in scoreset.getElementsByTagName("metanet:score"):
                        if score.getAttribute('value'):
                            value = score.getAttribute('value')
                        else:
                            value = score.firstChild.nodeValue.strip("\n ")
                        tgt.add_attribute('ds_%s-%s-%d' % (tool_id, score.getAttribute('type').replace(' ', '-'), phrase_id), value)

            # alt-trans OOV words
            alttrans_annotations = altTrans.getElementsByTagName("metanet:annotation")
            OOV_count = 0
            for alttrans_annotation in alttrans_annotations:
                if alttrans_annotation.getAttribute('type') in ('oov', 'OOV'):
                    OOV_count += int(alttrans_annotation.getAttribute('value'))
            tgt.add_attribute('%s-%s' % (tool_id, 'OOV_count'), str(OOV_count))

            # alt-trans token_count parsing
            token_count = {}
            tokens = altTrans.getElementsByTagName('metanet:token')

            for token in tokens:
                d = token.getAttribute('id').partition('_d')[2].partition('_')[0]
                if d not in token_count:
                    token_count[d] = 1
                else:
                    token_count[d] = int(token_count[d]) + 1
            for d_count in token_count:
                tgt.add_attribute('%s-%s%s-%s' % (tool_id, 'd', d_count,
                                                  'token-count'), token_count[d_count])

            # add a target with new attributes to the list
            tgt_list.append(tgt)

        # trans-unit reference
        ref = ''
        for transunit_ref in transUnit.childNodes:
            if transunit_ref.nodeName == 'target':
                ref = SimpleSentence(unescape(transunit_ref.childNodes[0].nodeValue))
                break
        # create an object of parallel sentence
        ps = ParallelSentence(src, tgt_list, ref, {"id": sentence_id})
        print(".")
        return ps

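Note: the token-count block above groups tokens by the derivation number embedded in their id attribute. A minimal sketch of what the partition() chain extracts; the token id format shown is an assumption made for illustration, inferred from that chain:

    # Hypothetical token id: partition('_d')[2] keeps everything after the first
    # '_d', and partition('_')[0] keeps the derivation number before the next '_'.
    token_id = 't5_d2_3'    # assumed format, for illustration only
    d = token_id.partition('_d')[2].partition('_')[0]
    # d == '2'  -> counted towards the attribute '<tool_id>-d2-token-count'
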
    def get_parallelsentences(self):
        """
        @return: a list of ParallelSentence objects
        """
        xmlObject = self.xmlObject

        # get a nodeList of trans-unit elements
        return [self.get_parallelsentence(transUnit) for transUnit in xmlObject.getElementsByTagName('trans-unit')]

        #xmlObject.unlink() # deallocate memory

    def unload(self):
        self.xmlObject.unlink()

    def __read_attributes__(self, xmlEntry):
        """
        @param xmlEntry: a DOM element whose attributes should be read
        @return: a dictionary of the attributes of the current sentence (name: value)
        """
        attributes = {}
        attributeKeys = xmlEntry.attributes.keys()
        for attributeKey in attributeKeys:
            attributes[attributeKey] = unescape(xmlEntry.attributes[attributeKey].value)
        return attributes
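
A minimal usage sketch, assuming GenericXmlReader's constructor accepts the input filename (per the load() docstring it may also take an option to defer parsing); the filename is a placeholder:

    from dataprocessor.input.xliffreader import XliffReader

    reader = XliffReader('translations.xlf')   # hypothetical file name
    reader.load()                              # parse the XLIFF file with minidom
    dataset = reader.get_dataset()             # DataSet with all parallel sentences
    print(reader.length())                     # number of trans-unit elements
    reader.unload()                            # free the DOM tree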