1
2
3
4
5 """
6 Created on 15 Οκτ 2010
7
8 @author: Eleftherios Avramidis
9 """
10
11
12 import re
13 import math
14 import os
15 from xml.dom import minidom
16 from sentence.parallelsentence import ParallelSentence
17 from sentence.sentence import SimpleSentence
18 from xml.sax.saxutils import unescape
19 from dataprocessor.input.genericreader import GenericReader
20 from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml
21 from collections import OrderedDict
22
24 """
25 classdocs
26 """
27
28
def __init__(self, input_xml_filename, load=True, stringmode=False, **kwargs):
    """
    Set up the reader and, unless asked otherwise, parse the XML right away.

    @param input_xml_filename: the name of the XML file; when C{stringmode}
        is true this same value is what gets passed to C{load_str}
    @type input_xml_filename: string
    @param load: when false, the instance is only initialized and nothing
        is parsed into memory
    @type load: boolean
    @param stringmode: when true (and C{load} is true), C{load_str} is
        called with C{input_xml_filename} instead of C{load}
    @type stringmode: boolean
    @keyword bare: stored as C{self.bare}; False when not supplied
    """
    self.input_filename = input_xml_filename
    self.loaded = load
    self.bare = kwargs.setdefault('bare', False)
    self.TAG = self.get_tags()

    # Nothing more to do when the caller postponed parsing.
    if not load:
        return

    if stringmode:
        self.load_str(input_xml_filename)
    else:
        self.load()
48
49
50
53
54
# Parse the XML text held in `input` with minidom and keep the resulting DOM in self.xmlObject.
56 self.xmlObject = minidom.parseString(input)
57
58
# Parse the file named by self.input_filename with minidom and keep the resulting DOM in self.xmlObject.
60 """
61 Loads the data of the file into memory. It is useful if the Classes has been asked not to load the filename upon initialization
62 """
63 self.xmlObject = minidom.parse(self.input_filename)
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Takes the sentences returned by self.get_parallelsentences() and hands them, in consecutive
# slices of `step` sentences, to Parallelsentence2Jcml(...).write_to_file(), using a file name
# derived from the basename of self.input_filename and a running part number.
93 """
94 Convenience function that splits an XML file into parts and writes them directly to the disk
95 into .part files with similar filenames. The construction of the resulting filenames defined
96 by parameters
97 @param parts
98 Number of parts to split into
99 @type int
100 @param re_split Regular expression which should define two (bracketed) groups upon the filename.
101 The resulting files will have the part number inserted in the filename between these two parts
102 """
103 parallelsentences = self.get_parallelsentences()
# Only the basename is used, so the generated file names carry no directory component.
104 inputfilename = os.path.basename(self.input_filename)
105 length = len(parallelsentences)
# Slice size: sentence count divided by `parts`, rounded up (1.00 forces float division).
# NOTE(review): with no sentences step is 0 and range() rejects a zero step (ValueError);
# parts == 0 raises ZeroDivisionError — confirm callers never hit either case.
106 step = int(math.ceil(1.00 * len(parallelsentences) / parts))
107 partindex = 0
108 for index in range(0, length, step):
109 partindex += 1
110 start = index
# `end` may run past the list length; the slice below simply stops at the last sentence.
111 end = index + step
112 print start, end
113 try:
114 print inputfilename
# First match of re_split on the basename, unpacked as (prefix, suffix).
115 filename_prefix, filename_suffix = re.findall(re_split, inputfilename)[0]
# NOTE(review): "%2.d" pads the part number to width 2 with a space, not a zero
# (1 -> " 1") — confirm whether "%02d" was intended.
116 filename = "%s.%2.d.part.%s" % (filename_prefix, partindex, filename_suffix)
117 Parallelsentence2Jcml(parallelsentences[start:end]).write_to_file(filename)
# IndexError from the [0] above means re_split matched nothing in the basename.
118 except IndexError:
119 print "Please try to not have a dot in the test set name, cause you don't help me with splitting"
120
121
122
123
# Collects the attribute names found on the self.TAG["sent"] elements below the first
# self.TAG["doc"] element of the parsed document.
125 """
126 @return a list of the names of the attributes contained in the XML file
127 """
128 judgedCorpus = self.xmlObject.getElementsByTagName(self.TAG["doc"])
# Only the first self.TAG["doc"] element is inspected.
129 sentenceList = judgedCorpus[0].getElementsByTagName(self.TAG["sent"])
# A set keeps each attribute name once, however many sentence elements carry it.
130 attributesKeySet = set()
131
132 for xml_entry in sentenceList:
133 for attributeKey in xml_entry.attributes.keys():
134 attributesKeySet.add(attributeKey)
# Converted from a set, so the order of the returned names is not defined.
135 return list(attributesKeySet)
136
# Returns the number of self.TAG["sent"] elements found below the first self.TAG["doc"] element.
138 judgedCorpus = self.xmlObject.getElementsByTagName(self.TAG["doc"])
139 return len(judgedCorpus[0].getElementsByTagName(self.TAG["sent"]))
140
141
177
# Converts self.TAG["sent"] elements below the first self.TAG["doc"] element into objects
# via self.get_parallelsentence() and returns them as a list.
179 """
180 @return: a list of ParallelSentence objects
181 """
182 judgedCorpus = self.xmlObject.getElementsByTagName(self.TAG["doc"])
# When both bounds are falsy (e.g. None or 0) every sentence element is used;
# otherwise the element list is sliced with [start:end].
183 if not start and not end:
184 sentenceList = judgedCorpus[0].getElementsByTagName(self.TAG["sent"])
185 else:
186 sentenceList = judgedCorpus[0].getElementsByTagName(self.TAG["sent"])[start:end]
187 newssentences = []
188 for xml_entry in sentenceList:
189 curJudgedSentence = self.get_parallelsentence(xml_entry)
190 newssentences.append(curJudgedSentence)
# Progress report on stdout.
191 print "read {} sentences".format(len(newssentences))
192 return newssentences
193
196
# Returns the unescaped, whitespace-stripped nodeValue of xml_entry's first child node,
# or "" when that value cannot be read.
198 try:
199 return unescape(xml_entry.childNodes[0].nodeValue.strip())
# NOTE(review): the bare except also hides unrelated errors; the expected failures look like
# IndexError (no child nodes) and AttributeError (nodeValue is None) — confirm and narrow.
200 except:
201 return ""
202
203
# Copies every XML attribute of xml_entry into a plain dict, unescaping each value.
205 """
206 @return: a dictionary of the attributes of the current sentence {name:value}
207 """
208 attributes = {}
209 attributeKeys = xml_entry.attributes.keys()
210 for attributeKey in attributeKeys:
# The key is stored unchanged; myAttributeKey is just an alias of attributeKey.
211 myAttributeKey = attributeKey
212 attributes[myAttributeKey] = unescape(xml_entry.attributes[attributeKey].value)
213 return attributes
214