Package dataprocessor :: Package input :: Module genericxmlreader
[hide private]
[frames | no frames]

Source Code for Module dataprocessor.input.genericxmlreader

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3   
  4   
  5  """ 
  6  Created on 15 Oct 2010 
  7   
  8  @author: Eleftherios Avramidis 
  9  """ 
 10   
 11   
 12  import re 
 13  import math 
 14  import os 
 15  from xml.dom import minidom 
 16  from sentence.parallelsentence import ParallelSentence 
 17  from sentence.sentence import SimpleSentence 
 18  from xml.sax.saxutils import unescape 
 19  from dataprocessor.input.genericreader import GenericReader 
 20  from dataprocessor.sax.saxps2jcml import Parallelsentence2Jcml 
 21  from collections import OrderedDict 
 22   
class GenericXmlReader(GenericReader):
    """
    DOM-based reader that turns an XML corpus into ParallelSentence objects.

    Element and attribute names are not hard-coded: they are looked up in
    self.TAG, the mapping returned by get_tags(). The methods of this class
    read the keys "doc", "sent", "src", "tgt", "ref", "langsrc", "langtgt",
    "default_langsrc" and "default_langtgt". get_tags() returns an empty
    mapping here, so subclasses are presumably expected to override it.

    The whole document is parsed into memory with minidom. An earlier comment
    in this module suggested sentence-by-sentence SAX reading (saxjcml.py)
    for big data sets.
    """

    def __init__(self, input_xml_filename, load=True, stringmode=False, **kwargs):
        """
        Constructor. Creates an XML object that handles ranking file data
        @param input_xml_filename: the name of the XML file, or the XML
        content itself when stringmode is set
        @type input_xml_filename: string
        @param load: by turning this option to false, the instance will be
        initialized without loading everything into memory
        @type load: boolean
        @param stringmode: if true, input_xml_filename is parsed as XML text
        via load_str() instead of being opened as a file
        @type stringmode: boolean
        @keyword bare: stored as self.bare (default False); not used by the
        methods of this class
        """
        self.bare = kwargs.get('bare', False)
        self.input_filename = input_xml_filename
        self.loaded = load
        self.TAG = self.get_tags()
        if load:
            if stringmode:
                self.load_str(input_xml_filename)
            else:
                self.load()

    def get_tags(self):
        """
        @return: the mapping of logical names to XML tag/attribute names that
        is stored in self.TAG; empty in this class
        @rtype: OrderedDict
        """
        return OrderedDict()

    def load_str(self, input):
        """
        Parses XML given as a string and keeps the resulting DOM in memory.
        @param input: the XML content (the parameter name shadows the builtin
        but is kept for backward compatibility with keyword callers)
        @type input: string
        """
        self.xmlObject = minidom.parseString(input)
        # Keep the flag truthful for instances created with load=False.
        self.loaded = True

    def load(self):
        """
        Loads the data of the file into memory. It is useful if the class has
        been asked not to load the file upon initialization
        """
        self.xmlObject = minidom.parse(self.input_filename)
        # Keep the flag truthful for instances created with load=False.
        self.loaded = True

    def split_and_write(self, parts, re_split):
        """
        Convenience function that splits an XML file into parts and writes
        them directly to the disk into .part files with similar filenames.
        The construction of the resulting filenames is defined by the
        parameters. Nothing is written when the corpus contains no sentences
        or when re_split does not match the input filename.
        @param parts: number of parts to split into
        @type parts: int
        @param re_split: regular expression which should define two
        (bracketed) groups upon the filename. The resulting files will have
        the part number inserted in the filename between these two parts
        @type re_split: string
        @raise ValueError: if parts is smaller than 1
        """
        if parts < 1:
            raise ValueError("parts must be a positive integer, got %r" % (parts,))

        parallelsentences = self.get_parallelsentences()
        length = len(parallelsentences)
        if not length:
            # A zero step would make range() raise ValueError below.
            return

        # Only the base name is used, so the part files get relative names.
        inputfilename = os.path.basename(self.input_filename)
        print(inputfilename)
        try:
            # Loop-invariant, so it is resolved once; an empty result list
            # means that the pattern did not match the filename.
            filename_prefix, filename_suffix = re.findall(re_split, inputfilename)[0]
        except IndexError:
            print("Please try to not have a dot in the test set name, cause you don't help me with splitting")
            return

        # Ceiling, so that at most `parts` chunks cover every sentence.
        step = int(math.ceil(1.00 * length / parts))
        for partindex, start in enumerate(range(0, length, step), 1):
            end = start + step
            print("%s %s" % (start, end))
            # NOTE(review): "%2.d" pads the part number with a space to
            # width 2 (e.g. "name. 1.part.xml"); "%02d" was possibly intended.
            # Left unchanged because other tools may rely on these names.
            filename = "%s.%2.d.part.%s" % (filename_prefix, partindex, filename_suffix)
            Parallelsentence2Jcml(parallelsentences[start:end]).write_to_file(filename)

    def get_attributes(self):
        """
        @return: a list of the names of the attributes found on the sentence
        elements of the XML file, without duplicates and in no particular
        order
        @rtype: list
        """
        corpus = self.xmlObject.getElementsByTagName(self.TAG["doc"])
        sentence_entries = corpus[0].getElementsByTagName(self.TAG["sent"])
        attribute_names = set()
        for sentence_entry in sentence_entries:
            attribute_names.update(sentence_entry.attributes.keys())
        return list(attribute_names)

    def length(self):
        """
        @return: the number of sentence elements inside the first document
        element
        @rtype: int
        """
        corpus = self.xmlObject.getElementsByTagName(self.TAG["doc"])
        return len(corpus[0].getElementsByTagName(self.TAG["sent"]))

    def get_parallelsentence(self, xml_entry):
        """
        Builds a ParallelSentence out of one sentence element.
        @param xml_entry: the DOM element of the sentence
        @return: the sentence with its source(s), targets, reference and
        attributes
        @rtype: ParallelSentence
        """
        src_entries = xml_entry.getElementsByTagName(self.TAG["src"])
        tgt_entries = xml_entry.getElementsByTagName(self.TAG["tgt"])
        ref_entries = xml_entry.getElementsByTagName(self.TAG["ref"])

        # One source gives a single SimpleSentence, several give a list.
        if len(src_entries) == 1:
            src = self._read_simplesentence(src_entries[0])
        elif src_entries:
            src = [self._read_simplesentence(entry) for entry in src_entries]
        else:
            # No source element: fall back to an empty sentence, as is done
            # for a missing reference, instead of failing on an unbound name.
            src = SimpleSentence()

        # Create a list of SimpleSentence objects out of the targets
        tgt = [self._read_simplesentence(entry) for entry in tgt_entries]

        # Only the first reference is used; an empty one stands in if absent.
        if ref_entries:
            ref = self._read_simplesentence(ref_entries[0])
        else:
            ref = SimpleSentence()

        # Extract the XML features and attach them to the ParallelSentence
        attributes = self._read_attributes(xml_entry)

        # TODO: fix this language by getting from other parts of the sentence
        # The defaults are looked up only when needed, so that a TAG mapping
        # without default entries still works for fully annotated input.
        if self.TAG["langsrc"] not in attributes:
            attributes[self.TAG["langsrc"]] = self.TAG["default_langsrc"]
        if self.TAG["langtgt"] not in attributes:
            attributes[self.TAG["langtgt"]] = self.TAG["default_langtgt"]

        return ParallelSentence(src, tgt, ref, attributes)

    def get_parallelsentences(self, start=None, end=None):
        """
        @param start: index of the first sentence to read; None starts from
        the beginning
        @param end: index after the last sentence to read; None reads up to
        the end. An explicit 0 is an ordinary slice bound and yields an empty
        result.
        @return: a list of ParallelSentence objects
        @rtype: list
        """
        corpus = self.xmlObject.getElementsByTagName(self.TAG["doc"])
        # Slicing with None bounds returns everything, so no special case is
        # needed for the unbounded call.
        sentence_entries = corpus[0].getElementsByTagName(self.TAG["sent"])[start:end]
        sentences = [self.get_parallelsentence(entry) for entry in sentence_entries]
        print("read {} sentences".format(len(sentences)))
        return sentences

    def _read_simplesentence(self, xml_entry):
        """
        @return: a SimpleSentence built from the text and the attributes of
        the given element
        """
        return SimpleSentence(self._read_string(xml_entry), self._read_attributes(xml_entry))

    def _read_string(self, xml_entry):
        """
        @return: the stripped and XML-unescaped value of the first child node
        of the element, or an empty string if there is no child or the child
        has no value
        @rtype: string
        """
        try:
            return unescape(xml_entry.childNodes[0].nodeValue.strip())
        except (IndexError, AttributeError):
            # IndexError: no child nodes; AttributeError: nodeValue is None.
            return ""

    def _read_attributes(self, xml_entry):
        """
        @return: a dictionary of the attributes of the current sentence
        {name:value}, with the values XML-unescaped
        @rtype: dict
        """
        xml_attributes = xml_entry.attributes
        return {key: unescape(xml_attributes[key].value) for key in xml_attributes.keys()}
214