1
2
3 '''
4 Created on Jun 27, 2011
5
6 @author: Lukas Poustka, Eleftherios Avramidis
7 '''
8
9
10 from xml.dom.minidom import parse
11 from sentence.parallelsentence import ParallelSentence
12 from sentence.sentence import SimpleSentence
13 from xml.sax.saxutils import unescape
14 from dataprocessor.input.genericreader import GenericReader
15
16
18 """
19 Reader able to parse the ranking results from taraxu 1st evaluation round, as exported by cfedermann
20 """
21
def __init__(self, input_xml_filename, load=True):
    """
    Constructor. Remembers the name of the ranking XML file and, unless
    told otherwise, parses it straight away.

    @param input_xml_filename: the name of the XML file
    @type input_xml_filename: string
    @param load: by turning this option to False, the instance will be
    initialized without loading everything into memory
    @type load: boolean
    """
    self.input_filename = input_xml_filename
    # Records whether the file content was parsed during construction.
    self.loaded = load
    if load:
        self.load()
35
def load(self):
    """
    Loads the data of the file into memory. It is useful if the class has
    been asked not to load the file upon initialization.
    """
    # NOTE(review): the `def` line of this method is missing from the source
    # as received; the name `load` is taken from the `self.load()` call in
    # the constructor -- confirm against the original file.
    self.xmlObject = parse(self.input_filename)
    # Keep the flag truthful when loading happens after construction.
    self.loaded = True
42
def unload(self):
    """
    Releases the parsed XML document held by this reader. Does nothing if
    no document has been loaded.
    """
    # NOTE(review): the `def` line of this method is missing from the source
    # as received; the name `unload` is assumed as the counterpart of
    # load() -- confirm against the original file.
    xml_object = getattr(self, 'xmlObject', None)
    if xml_object is not None:
        # unlink() breaks the internal references of the DOM tree so that
        # it can be garbage collected.
        xml_object.unlink()
45
46
47
def get_parallelsentences(self):
    """
    Parses the loaded ranking XML document and returns a list of parallel
    sentence objects, one for every 'ranking-item' element.

    @return ps_list: the parallel sentences in document order; each one
    carries the attribute 'sentence_id' of its ranking item
    @type ps_list: list of ParallelSentence
    """
    # NOTE(review): the `def` line of this method is missing from the source
    # as received; the name `get_parallelsentences` is assumed from the
    # GenericReader interface -- confirm against the original file.

    def node_text(node):
        # Text of the first child node, or '' when the element is empty or
        # its first child carries no value; avoids an IndexError on
        # elements such as <source/>.
        first = node.firstChild
        if first is None or first.nodeValue is None:
            return ''
        return unescape(first.nodeValue)

    ps_list = []
    for r_item in self.xmlObject.getElementsByTagName('ranking-item'):
        stc_id = r_item.getAttribute('sentence_id')
        # Placeholder of the same type as a real source sentence, used when
        # the ranking item has no <source> child.
        src = SimpleSentence('')
        tgt_list = []

        for rank_child in r_item.childNodes:
            # Only element nodes hold sentences; whitespace text, comments
            # and other node types are skipped.
            if rank_child.nodeType != rank_child.ELEMENT_NODE:
                continue
            if rank_child.nodeName == 'source':
                src = SimpleSentence(node_text(rank_child))
                continue

            # Any other element is treated as a target sentence.
            tgt = SimpleSentence(node_text(rank_child))
            for attribute_name in rank_child.attributes.keys():
                attribute_value = rank_child.getAttribute(attribute_name)
                tgt.add_attribute(attribute_name, attribute_value)
            # The 'name' attribute is additionally stored as 'system'.
            tgt.add_attribute('system', rank_child.getAttribute('name'))
            tgt_list.append(tgt)

        ps = ParallelSentence(src, tgt_list)
        ps.add_attributes({'sentence_id': stc_id})
        ps_list.append(ps)
    return ps_list
82
83
84
86 """
87 Reader able to parse the ranking results from taraxu 2nd evaluation round, as exported by cfedermann
88 """
89
def __init__(self, input_xml_filename, load=True):
    """
    Constructor. Remembers the name of the ranking XML file and, unless
    told otherwise, parses it straight away.

    @param input_xml_filename: the name of the XML file
    @type input_xml_filename: string
    @param load: by turning this option to False, the instance will be
    initialized without loading everything into memory
    @type load: boolean
    """
    self.input_filename = input_xml_filename
    # Records whether the file content was parsed during construction.
    self.loaded = load
    if load:
        self.load()
103
def load(self):
    """
    Loads the data of the file into memory. It is useful if the class has
    been asked not to load the file upon initialization.
    """
    # NOTE(review): the `def` line of this method is missing from the source
    # as received; the name `load` is taken from the `self.load()` call in
    # the constructor -- confirm against the original file.
    self.xmlObject = parse(self.input_filename)
    # Keep the flag truthful when loading happens after construction.
    self.loaded = True
110
def unload(self):
    """
    Releases the parsed XML document held by this reader. Does nothing if
    no document has been loaded.
    """
    # NOTE(review): the `def` line of this method is missing from the source
    # as received; the name `unload` is assumed as the counterpart of
    # load() -- confirm against the original file.
    xml_object = getattr(self, 'xmlObject', None)
    if xml_object is not None:
        # unlink() breaks the internal references of the DOM tree so that
        # it can be garbage collected.
        xml_object.unlink()
113
114
115
def get_parallelsentences(self, langsrc='de', langtgt='en'):
    """
    Parses the loaded ranking XML document and returns a list of parallel
    sentence objects, one for every 'ranking-item' element.

    @param langsrc: value stored as the 'langsrc' attribute of every
    parallel sentence
    @type langsrc: string
    @param langtgt: value stored as the 'langtgt' attribute of every
    parallel sentence
    @type langtgt: string
    @return ps_list: the parallel sentences in document order; each one
    carries the ranking item id as both 'sentence_id' and 'id'
    @type ps_list: list of ParallelSentence
    """
    # NOTE(review): the `def` line of this method is missing from the source
    # as received; the name `get_parallelsentences` is assumed from the
    # GenericReader interface -- confirm against the original file.
    ps_list = []
    for r_item in self.xmlObject.getElementsByTagName('ranking-item'):
        stc_id = r_item.getAttribute('id')
        # Placeholder used when the ranking item has no <source> child.
        src = SimpleSentence('')
        tgt_list = []

        for rank_child in r_item.childNodes:
            if rank_child.nodeName == 'source':
                # An empty <source/> has no child node to read from;
                # fall back to an empty string instead of raising.
                first = rank_child.firstChild
                if first is None or first.nodeValue is None:
                    src = SimpleSentence('')
                else:
                    src = SimpleSentence(unescape(first.nodeValue))
            elif rank_child.nodeName == 'translation':
                # Only the attributes of a translation are kept; its
                # sentence text is left empty.
                tgt = SimpleSentence('')
                for attribute_name in rank_child.attributes.keys():
                    attribute_value = rank_child.getAttribute(attribute_name)
                    tgt.add_attribute(attribute_name, attribute_value)
                tgt_list.append(tgt)

        attributes = {
            'sentence_id': stc_id,
            'id': stc_id,
            'langsrc': langsrc,
            'langtgt': langtgt,
        }
        # Third positional argument is passed as None
        # (presumably the reference translation -- TODO confirm).
        ps = ParallelSentence(src, tgt_list, None, attributes)
        ps_list.append(ps)
    return ps_list
157