Package dataprocessor :: Package input :: Module taraxureader
[hide private]
[frames | no frames]

Source Code for Module dataprocessor.input.taraxureader

 1  #!/usr/bin/python 
 2  # -*- coding: utf-8 -*- 
 3   
 4   
 5  """ 
 6  Created on 15 Οκτ 2010 
 7   
 8  @author: Eleftherios Avramidis 
 9  """ 
10   
11   
12  from xml.dom.minidom import parse 
13  from sentence.parallelsentence import ParallelSentence 
14  from sentence.sentence import SimpleSentence 
15  from xml.sax.saxutils import unescape 
16  from xmlreader import XmlReader 
17   
class TaraXUReader(XmlReader):
    """
    Reader for taraXU XML files.

    The input is parsed once, at construction time, with
    C{xml.dom.minidom.parse}. L{get_parallelsentences} then converts the
    sentence entries found under the first document element into
    ParallelSentence objects.
    """

    def __init__(self, inputFilename):
        """
        Constructor. Parses the XML input and keeps the resulting DOM.

        @param inputFilename: value handed directly to
            C{xml.dom.minidom.parse}
        """
        # Element names used by this format.
        self.TAG_DOC = "doc"
        self.TAG_SENT = "sentence"
        self.TAG_SRC = "source"
        self.TAG_TGT = "target"
        self.TAG_REF = "reference"
        self.TAG_ANNOTATIONS = "annotations"
        self.TAG_ANNOTATION = "annotation"
        self.xmlObject = parse(inputFilename)

    def _make_sentence(self, element):
        """
        Build a SimpleSentence from one source/target/reference element.

        The text is the stripped, XML-unescaped value of the element's first
        child node; the attributes come from C{self.__read_attributes__},
        which is not defined in this class (presumably inherited from
        XmlReader).

        @param element: a DOM element holding the sentence text
        @return: the corresponding SimpleSentence
        @raise IndexError: if the element has no child nodes
        """
        text = unescape(element.childNodes[0].nodeValue.strip())
        return SimpleSentence(text, self.__read_attributes__(element))

    def get_parallelsentences(self, start=None, end=None):
        """
        Convert the sentence entries of the first document element.

        @param start: optional slice start into the list of sentence entries
        @param end: optional slice end into the list of sentence entries
        @return: a list of ParallelSentence objects
        @raise IndexError: if there is no document element, or a sentence
            entry has no source element, or a source/target element is empty
        """
        judgedCorpus = self.xmlObject.getElementsByTagName(self.TAG_DOC)
        document = judgedCorpus[0]
        langsrc = document.attributes["source_language"].value
        langtgt = document.attributes["target_language"].value

        sentenceList = document.getElementsByTagName(self.TAG_SENT)
        # Compare against None explicitly: 0 is a legitimate bound, and a
        # plain truthiness test would make end=0 return the whole corpus.
        if start is not None or end is not None:
            sentenceList = sentenceList[start:end]

        newssentences = []
        for xmlEntry in sentenceList:
            srcXML = xmlEntry.getElementsByTagName(self.TAG_SRC)
            tgtXML = xmlEntry.getElementsByTagName(self.TAG_TGT)
            refXML = xmlEntry.getElementsByTagName(self.TAG_REF)

            src = self._make_sentence(srcXML[0])

            # A real list (not map()): on Python 3 map() is a one-shot
            # iterator, which would be exhausted after a single pass.
            tgt = [self._make_sentence(node) for node in tgtXML]

            # The reference is optional: a missing or empty reference
            # element yields an empty SimpleSentence.
            try:
                ref = self._make_sentence(refXML[0])
            except LookupError:
                ref = SimpleSentence()

            # Extract the XML attributes of the sentence entry and add the
            # corpus-level language pair to them.
            attributes = self.__read_attributes__(xmlEntry)
            attributes["langsrc"] = langsrc
            attributes["langtgt"] = langtgt

            newssentences.append(ParallelSentence(src, tgt, ref, attributes))
        return newssentences
77