dataprocessor.input.rankreader

1 #!/usr/bin/python 2 # -*- coding: utf-8 -*- 3 ''' 4 Created on Jun 27, 2011 5 6 @author: Lukas Poustka, Eleftherios Avramidis 7 ''' 8 9 10 from xml.dom.minidom import parse 11 from sentence.parallelsentence import ParallelSentence 12 from sentence.sentence import SimpleSentence 13 from xml.sax.saxutils import unescape 14 from dataprocessor.input.genericreader import GenericReader 15 16

class RankReader(GenericReader):
    """
    Reader able to parse the ranking results from taraxu 1st evaluation round,
    as exported by cfedermann
    """

    def __init__(self, input_xml_filename, load = True):
        """
        Constructor. Creates an XML object that handles ranking file data
        @param input_xml_filename: the name of XML file
        @type input_xml_filename: string
        @param load: by turning this option to false, the instance will be
        initialized without loading everything into memory
        @type load: boolean
        """
        self.input_filename = input_xml_filename
        # Always define both attributes so that unload() and
        # get_parallelsentences() are safe on a not-yet-loaded instance.
        self.xmlObject = None
        self.loaded = False
        if load:
            self.load()

    def load(self):
        """
        Loads the data of the file into memory. It is useful if the class has
        been asked not to load the file upon initialization
        """
        self.xmlObject = parse(self.input_filename)
        # Keep the flag in sync with the real state of the reader
        self.loaded = True

    def unload(self):
        """
        Releases the parsed XML document. Calling it on an instance that has
        not been loaded (or has already been unloaded) does nothing.
        """
        xml_object = getattr(self, 'xmlObject', None)
        if xml_object is not None:
            # unlink() breaks the internal reference cycles of the DOM tree
            xml_object.unlink()
        self.xmlObject = None
        self.loaded = False

    @staticmethod
    def _element_text(element):
        """
        Returns the unescaped value of the first child node of the given
        element, or an empty string if the element has no children or its
        first child carries no value (e.g. it is a nested element)
        @param element: a DOM element
        @return: the unescaped text
        @rtype: string
        """
        children = element.childNodes
        if not children:
            return ''
        value = children[0].nodeValue
        if value is None:
            return ''
        return unescape(value)

    def get_parallelsentences(self):
        """
        This function parses a ranking xml file and returns a list of parallel
        sentence objects. If the file has not been loaded yet, it is loaded
        first.
        @return: one ParallelSentence per 'ranking-item' element, carrying the
        attribute 'sentence_id' of that element
        @rtype: list of ParallelSentence
        """
        if getattr(self, 'xmlObject', None) is None:
            self.load()

        ps_list = []
        for r_item in self.xmlObject.getElementsByTagName('ranking-item'):
            stc_id = r_item.getAttribute('sentence_id')
            # Empty sentence object (instead of a bare string) as the default
            # source, consistent with R2RankReader
            src = SimpleSentence('')
            tgt_list = []

            for rank_child in r_item.childNodes:
                # Only elements are sentences; whitespace text, comments,
                # CDATA sections etc. have neither attributes nor text
                # children and must be skipped
                if rank_child.nodeType != rank_child.ELEMENT_NODE:
                    continue

                if rank_child.nodeName == 'source':
                    src = SimpleSentence(self._element_text(rank_child))
                else:
                    # Every other element is one system's translation
                    tgt = SimpleSentence(self._element_text(rank_child))
                    for attribute_name in rank_child.attributes.keys():
                        attribute_value = rank_child.getAttribute(attribute_name)
                        tgt.add_attribute(attribute_name, attribute_value)
                    # The system name is exported in the 'name' attribute
                    tgt.add_attribute('system', rank_child.getAttribute('name'))
                    tgt_list.append(tgt)

            ps = ParallelSentence(src, tgt_list)
            # The key 'sentence_id' is used rather than 'id', because the
            # latter was overlapping with other features
            ps.add_attributes({'sentence_id': stc_id})
            ps_list.append(ps)
        return ps_list

82 83 84

class R2RankReader(GenericReader):
    """
    Reader able to parse the ranking results from taraxu 2nd evaluation round,
    as exported by cfedermann
    """

    # Class-level defaults, so that the language pair is available even on
    # instances whose constructor has been bypassed or overridden
    langsrc = 'de'
    langtgt = 'en'

    def __init__(self, input_xml_filename, load = True, langsrc = 'de', langtgt = 'en'):
        """
        Constructor. Creates an XML object that handles ranking file data
        @param input_xml_filename: the name of XML file
        @type input_xml_filename: string
        @param load: by turning this option to false, the instance will be
        initialized without loading everything into memory
        @type load: boolean
        @param langsrc: value of the 'langsrc' attribute given to every
        parallel sentence (default 'de')
        @type langsrc: string
        @param langtgt: value of the 'langtgt' attribute given to every
        parallel sentence (default 'en')
        @type langtgt: string
        """
        self.input_filename = input_xml_filename
        self.langsrc = langsrc
        self.langtgt = langtgt
        # Always define both attributes so that unload() and
        # get_parallelsentences() are safe on a not-yet-loaded instance.
        self.xmlObject = None
        self.loaded = False
        if load:
            self.load()

    def load(self):
        """
        Loads the data of the file into memory. It is useful if the class has
        been asked not to load the file upon initialization
        """
        self.xmlObject = parse(self.input_filename)
        # Keep the flag in sync with the real state of the reader
        self.loaded = True

    def unload(self):
        """
        Releases the parsed XML document. Calling it on an instance that has
        not been loaded (or has already been unloaded) does nothing.
        """
        xml_object = getattr(self, 'xmlObject', None)
        if xml_object is not None:
            # unlink() breaks the internal reference cycles of the DOM tree
            xml_object.unlink()
        self.xmlObject = None
        self.loaded = False

    @staticmethod
    def _element_text(element):
        """
        Returns the unescaped value of the first child node of the given
        element, or an empty string if the element has no children or its
        first child carries no value (e.g. it is a nested element)
        @param element: a DOM element
        @return: the unescaped text
        @rtype: string
        """
        children = element.childNodes
        if not children:
            return ''
        value = children[0].nodeValue
        if value is None:
            return ''
        return unescape(value)

    def get_parallelsentences(self):
        """
        This function parses a ranking xml file and returns a list of parallel
        sentence objects. If the file has not been loaded yet, it is loaded
        first.
        @return: one ParallelSentence per 'ranking-item' element, carrying the
        attributes 'sentence_id', 'id', 'langsrc' and 'langtgt'
        @rtype: list of ParallelSentence
        """
        if getattr(self, 'xmlObject', None) is None:
            self.load()

        ps_list = []
        for r_item in self.xmlObject.getElementsByTagName('ranking-item'):
            stc_id = r_item.getAttribute('id')
            src = SimpleSentence('')
            tgt_list = []

            for rank_child in r_item.childNodes:
                if rank_child.nodeName == 'source':
                    src = SimpleSentence(self._element_text(rank_child))
                elif rank_child.nodeName == 'translation':
                    # Only the attributes of a translation are kept; its text
                    # is deliberately left empty, as in the original reader
                    # NOTE(review): presumably the 2nd round export carries
                    # no translation text -- confirm against the data
                    tgt = SimpleSentence('')
                    for attribute_name in rank_child.attributes.keys():
                        attribute_value = rank_child.getAttribute(attribute_name)
                        tgt.add_attribute(attribute_name, attribute_value)
                    tgt_list.append(tgt)

            # The id of the ranking item is stored under both keys
            attributes = {'sentence_id': stc_id,
                          'id': stc_id,
                          'langsrc': self.langsrc,
                          'langtgt': self.langtgt
                          }

            ps = ParallelSentence(src, tgt_list, None, attributes)
            ps_list.append(ps)
        return ps_list

157

Source Code for Module dataprocessor.input.rankreader