Package dataprocessor :: Package input :: Module posteditingreader
[hide private]
[frames | no frames]

Source Code for Module dataprocessor.input.posteditingreader

 1  #!/usr/bin/python 
 2  # -*- coding: utf-8 -*- 
 3  ''' 
 4  Created on Jun 27, 2011 
 5   
 6  @author: jogin 
 7  ''' 
 8   
 9   
10  from xml.dom.minidom import parse 
11  from sentence.parallelsentence import ParallelSentence 
12  from sentence.sentence import SimpleSentence 
13  from xml.sax.saxutils import unescape 
14  from dataprocessor.input.genericreader import GenericReader 
15    
16   
class PosteditingReader(GenericReader):
    """
    Reader for post-editing XML files (taraxu 1st evaluation round, as
    exported by cfedermann). Every C{editing-item} element of the file is
    converted into one ParallelSentence.
    """

    def __init__(self, input_xml_filename, load=True):
        """
        Constructor. Remembers the file name and, optionally, parses the file.
        @param input_xml_filename: the name of XML file
        @type input_xml_filename: string
        @param load: by turning this option to false, the instance will be
        initialized without loading everything into memory
        @type load: boolean
        """
        self.input_filename = input_xml_filename
        # Always define the attribute so that unload() and
        # get_parallelsentences() can test for it safely.
        self.xmlObject = None
        self.loaded = False
        if load:
            self.load()

    def load(self):
        """
        Loads the data of the file into memory. It is useful if the class has
        been asked not to load the file upon initialization.
        """
        self.xmlObject = parse(self.input_filename)
        # Keep the flag in sync with the real state, also after a deferred load.
        self.loaded = True

    def unload(self):
        """
        Releases the parsed DOM tree. Safe to call when nothing is loaded.
        """
        if getattr(self, 'xmlObject', None) is not None:
            # unlink() breaks the internal cycles of the DOM tree so that it
            # can be garbage-collected promptly.
            self.xmlObject.unlink()
            self.xmlObject = None
        self.loaded = False

    @staticmethod
    def _first_child_text(node):
        """
        Returns the value of the first child node of the given element, or an
        empty string if the element is empty or its first child carries no
        text value (e.g. a nested element).
        @param node: a DOM element
        @return: the text found, possibly empty
        @rtype: string
        """
        if not node.childNodes:
            return ''
        value = node.childNodes[0].nodeValue
        if value is None:
            return ''
        return value

    def get_parallelsentences(self):
        """
        Parses the C{editing-item} elements of the XML file and returns them
        as parallel sentence objects. The file is loaded first if it has not
        been loaded yet.

        For every item, the C{source} child becomes the source sentence, the
        C{post-edited} child becomes a target tagged with system
        'post-edited', and every other child element becomes a target that
        carries its own XML attributes plus 'system' (taken from its C{name}
        attribute) and 'origin' = 'post-editing'.
        @return: one ParallelSentence per editing-item, each carrying the
        'sentence_id' and 'id' attributes of the item
        @rtype: list of ParallelSentence
        """
        if getattr(self, 'xmlObject', None) is None:
            self.load()

        ps_list = []
        for r_item in self.xmlObject.getElementsByTagName('editing-item'):
            # Stays an empty string if the item has no <source> child.
            src = ''
            tgt_list = []
            for rank_child in r_item.childNodes:
                # Only elements hold sentences; whitespace text, comments,
                # CDATA sections and processing instructions are skipped.
                if rank_child.nodeType != rank_child.ELEMENT_NODE:
                    continue

                # NOTE(review): minidom already decodes entities once; the
                # extra unescape presumably handles double-escaped exports.
                text = unescape(self._first_child_text(rank_child))
                sentence = SimpleSentence(text)

                if rank_child.nodeName == 'source':
                    src = sentence
                elif rank_child.nodeName == 'post-edited':
                    sentence.add_attribute('system', 'post-edited')
                    tgt_list.append(sentence)
                else:
                    for attribute_name in rank_child.attributes.keys():
                        attribute_value = rank_child.getAttribute(attribute_name)
                        sentence.add_attribute(attribute_name, attribute_value)
                    # Added after the copied attributes, so these two values
                    # win over same-named XML attributes.
                    sentence.add_attribute('system', rank_child.getAttribute('name'))
                    sentence.add_attribute('origin', 'post-editing')
                    tgt_list.append(sentence)

            # Only 'sentence_id' and 'id' are attached to the parallel
            # sentence; other attributes of the item are not propagated.
            ps = ParallelSentence(src, tgt_list)
            ps.add_attributes({'sentence_id': r_item.getAttribute('sentence_id')})
            ps.add_attributes({'id': r_item.getAttribute('id')})
            ps_list.append(ps)
        return ps_list
90