1
2
3
4
5 """
6 Created on 15 Oct 2010
7
8 @author: Eleftherios Avramidis
9 """
10
11
12 from xml.dom.minidom import parse
13 from sentence.parallelsentence import ParallelSentence
14 from sentence.sentence import SimpleSentence
15 from xml.sax.saxutils import unescape
16 from xmlreader import XmlReader
17
19 """
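20 Reads an XML corpus file with xml.dom.minidom and converts its sentence entries into ParallelSentence objects.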
21 """
22
24 """
25 Constructor. Stores the XML tag names used by this reader as instance attributes and parses the file given by inputFilename with xml.dom.minidom.parse, keeping the resulting DOM document in self.xmlObject.
26 """
27 self.TAG_DOC = "doc"
28 self.TAG_SENT = "sentence"
29 self.TAG_SRC = "source"
30 self.TAG_TGT = "target"
31 self.TAG_REF = "reference"
32 self.TAG_ANNOTATIONS = "annotations"
33 self.TAG_ANNOTATION = "annotation"
34 self.xmlObject = parse(inputFilename)
35
36
37
39 """
40 Reads the sentence elements of the first self.TAG_DOC element (sliced as [start:end] unless both start and end are falsy) and wraps each one in a ParallelSentence built from its source, its targets, an optional reference and the entry attributes extended with langsrc/langtgt. @return: a list of ParallelSentence objects
41 """
42 judgedCorpus = self.xmlObject.getElementsByTagName(self.TAG_DOC)
43 langsrc = judgedCorpus[0].attributes["source_language"].value
44 langtgt = judgedCorpus[0].attributes["target_language"].value
45 if not start and not end:  # NOTE(review): start=0 together with end=0 also lands here and returns every sentence
46 sentenceList = judgedCorpus[0].getElementsByTagName('sentence')  # NOTE(review): literal tag names are used below instead of the self.TAG_* attributes
47 else:
48 sentenceList = judgedCorpus[0].getElementsByTagName('sentence')[start:end]
49 newssentences = []
50 for xmlEntry in sentenceList:
51 srcXML = xmlEntry.getElementsByTagName('source')
52 tgtXML = xmlEntry.getElementsByTagName('target')
53 refXML = xmlEntry.getElementsByTagName('reference')
54
55 src = SimpleSentence (unescape(srcXML[0].childNodes[0].nodeValue.strip()) , self.__read_attributes__(srcXML[0]) )
56
57
58 tgt = map( lambda x: SimpleSentence(unescape(x.childNodes[0].nodeValue.strip()), self.__read_attributes__(x) ) , tgtXML )  # NOTE(review): on Python 3 map() yields a lazy iterator, not a list - confirm what ParallelSentence expects
59
60 ref = SimpleSentence()
61 try:
62 ref = SimpleSentence (unescape(refXML[0].childNodes[0].nodeValue.strip()) , self.__read_attributes__(refXML[0]))
63 except LookupError:  # no reference element, or one without child nodes: keep the empty SimpleSentence created above
64 pass
65
66
67 attributes = self.__read_attributes__(xmlEntry)
68 attributes["langsrc"] = langsrc
69 attributes["langtgt"] = langtgt
70
71
72 curJudgedSentence = ParallelSentence(src, tgt, ref, attributes)
73
74
75 newssentences.append(curJudgedSentence)
76 return newssentences
77