1
2
3 '''
4 Created on Jun 27, 2011
5
6 @author: jogin
7 '''
8
9
10 from xml.dom.minidom import parse
11 from sentence.parallelsentence import ParallelSentence
12 from sentence.sentence import SimpleSentence
13 from xml.sax.saxutils import unescape
14 from dataprocessor.input.genericreader import GenericReader
15
16
class PosteditingReader(GenericReader):
    """
    Reader able to parse the post-editing results from taraxu 1st evaluation
    round, as exported by cfedermann. Every 'editing-item' element of the XML
    file is turned into one ParallelSentence.
    """

    def __init__(self, input_xml_filename, load=True):
        """
        Constructor. Stores the name of the XML file and, unless told
        otherwise, parses it into memory right away.
        @param input_xml_filename: the name of XML file
        @type input_xml_filename: string
        @param load: by turning this option to false, the instance will be
         initialized without loading everything into memory
        @type load: boolean
        """
        self.input_filename = input_xml_filename
        # load() flips this flag once the document has really been parsed
        self.loaded = False
        if load:
            self.load()

    def load(self):
        """
        Loads the data of the file into memory. It is useful if the class has
        been asked not to load the file upon initialization
        """
        self.xmlObject = parse(self.input_filename)
        self.loaded = True

    # NOTE(review): the 'def' lines of load, unload and get_parallelsentences
    # were missing from the reviewed text. 'load' is confirmed by the call in
    # __init__; the other two names are assumed -- confirm against GenericReader.
    def unload(self):
        """
        Releases the parsed XML document by calling unlink() on it
        """
        self.xmlObject.unlink()
        self.loaded = False

    @staticmethod
    def _get_text(element):
        """
        Returns the unescaped value of the first child node of the given
        element, or an empty string if the element has no children at all
        @param element: a DOM element wrapping a sentence
        @return: the text of the element
        @rtype: string
        """
        if not element.childNodes:
            # e.g. <source/> or an empty post-edited sentence
            return ''
        return unescape(element.childNodes[0].nodeValue)

    def get_parallelsentences(self):
        """
        This function parses the loaded xml document and returns a list of
        parallel sentence objects. For every 'editing-item' the 'source' child
        becomes the source sentence, whereas every other child element becomes
        a target sentence: 'post-edited' children are tagged with the attribute
        system='post-edited'; all other children receive their own XML
        attributes plus 'system' (taken from their 'name' attribute) and
        origin='post-editing'. The 'sentence_id' and 'id' of the item are
        attached to the parallel sentence.
        @return: one parallel sentence per editing item, in document order
        @rtype: list of ParallelSentence
        """
        parallelsentences = []
        for item in self.xmlObject.getElementsByTagName('editing-item'):
            # stays an empty string if the item has no 'source' child
            source = ''
            translations = []
            for child in item.childNodes:
                # whitespace text, comments etc. between the elements carry
                # no sentence and have no attributes, so they are skipped
                if child.nodeType != child.ELEMENT_NODE:
                    continue

                sentence = SimpleSentence(self._get_text(child))
                if child.nodeName == 'source':
                    source = sentence
                    continue

                if child.nodeName == 'post-edited':
                    sentence.add_attribute('system', 'post-edited')
                else:
                    for attribute_name in child.attributes.keys():
                        sentence.add_attribute(attribute_name, child.getAttribute(attribute_name))
                    # added after the copied attributes, so they win on a name clash
                    sentence.add_attribute('system', child.getAttribute('name'))
                    sentence.add_attribute('origin', 'post-editing')
                translations.append(sentence)

            parallelsentence = ParallelSentence(source, translations)
            parallelsentence.add_attributes({'sentence_id': item.getAttribute('sentence_id')})
            parallelsentence.add_attributes({'id': item.getAttribute('id')})
            parallelsentences.append(parallelsentence)
        return parallelsentences
90