Package dataprocessor :: Package output :: Module xmlwriter
[hide private]
[frames | no frames]

Source Code for Module dataprocessor.output.xmlwriter

  1  """ 
  2  @author: Eleftherios Avramidis 
  3  """ 
  4   
  5  from xml.dom import minidom 
  6  from sentence.parallelsentence import ParallelSentence 
  7  from xml.sax.saxutils import escape 
  8  from sentence.dataset import DataSet 
  9   
 10   
class GenericWriter(object):
    """
    Abstract interface shared by the writers of this module.

    None of the methods is implemented here: each one raises
    NotImplementedError, so a concrete writer has to override all of them
    (including the constructor).
    """

    def __init__(self, data):
        """
        @param data: the content the writer should handle (unused here)
        @raise NotImplementedError: always
        """
        raise NotImplementedError("Should have implemented this")

    def write_to_file(self, filename):
        """
        @param filename: the file the content should be written to (unused here)
        @raise NotImplementedError: always
        """
        raise NotImplementedError("Should have implemented this")

    def get_parallelsentence_string(self, ps):
        """
        @param ps: the parallel sentence to be serialised (unused here)
        @raise NotImplementedError: always
        """
        raise NotImplementedError("Should have implemented this")
20 21
class GenericXMLWriter(GenericWriter):
    """
    Base class for writers that hold their content as a minidom XML document
    (kept in self.object_xml) and serialise it as tab-indented UTF-8 XML.

    Subclasses have to override get_parallelsentence_xml.
    """

    def __init__(self, data=None):
        """
        Constructor

        @param data: a minidom.Document (used as is), a list of parallel
            sentences or a DataSet (both converted through get_document_xml).
            For any other value, including None, no document is built and
            self.object_xml stays None.
        """
        # Define the attribute for every kind of input, so that it always exists.
        self.object_xml = None
        if isinstance(data, minidom.Document):
            self.object_xml = data
        elif isinstance(data, list):
            self.object_xml = self.get_document_xml(data)
        elif isinstance(data, DataSet):
            self.object_xml = self.get_document_xml(data.get_parallelsentences())

    def get_parallelsentence_xml(self, ps, doc_xml=None):
        """
        Build the XML node for one parallel sentence; to be overridden.

        @param ps: the parallel sentence to be converted
        @param doc_xml: the document the node is created for. It is accepted
            here because get_document_xml passes it to this method.
        @raise NotImplementedError: always, in this base class
        """
        raise NotImplementedError("Should have implemented this")

    def get_parallelsentence_string(self, ps):
        """
        @param ps: the parallel sentence to be serialised
        @return: the result of pretty-printing (tab indent, UTF-8) the XML
            node that get_parallelsentence_xml gives for the sentence
        """
        return self.get_parallelsentence_xml(ps).toprettyxml("\t", "\n", "utf-8")

    def get_document_xml(self, parallelsentences):
        """
        Creates an XML document and populates it with the (parallel)
        sentences of the given list, under a single "jcml" root element.

        @param parallelsentences: a list of ParallelSentence objects
        @return: the populated minidom.Document
        """
        doc_xml = minidom.Document()
        jcml = doc_xml.createElement("jcml")

        for ps in parallelsentences:
            jcml.appendChild(self.get_parallelsentence_xml(ps, doc_xml))

        doc_xml.appendChild(jcml)
        return doc_xml

    def write_to_file(self, filename):
        """
        Writes the XML document of this writer into a file.

        @param filename: path of the file to be (over)written
        """
        # Serialise first: if this fails, the target file is left untouched
        # instead of being truncated.
        prettyxml = self.object_xml.toprettyxml("\t", "\n", "utf-8")
        # The output is already encoded as UTF-8, so it is written through a
        # binary handle; the with-statement closes the file even on errors.
        with open(filename, 'wb') as file_object:
            file_object.write(prettyxml)
74 75 76 77
class XmlWriter(GenericXMLWriter):
    """
    Writer that represents every parallel sentence as a "judgedsentence"
    element with one "src" child, one "tgt" child per translation and, if a
    reference exists, one "ref" child.
    """

    def get_parallelsentence_xml(self, ps, doc_xml=None):
        """
        Build the "judgedsentence" XML element of one parallel sentence.

        @param ps: the parallel sentence; its attributes become XML
            attributes, its source, translations and reference become children
        @param doc_xml: the minidom.Document used for creating the nodes.
            When omitted, a new document is created for this call.
        @return: the populated "judgedsentence" element (it is not appended
            to doc_xml here)
        """
        # A Document instance as default value would be created only once and
        # shared by all calls, so the default is built per call instead.
        if doc_xml is None:
            doc_xml = minidom.Document()

        parallelsentence_xml = doc_xml.createElement("judgedsentence")

        # add attributes of parallel sentence
        for attribute_key in ps.get_attributes().keys():
            parallelsentence_xml.setAttribute(attribute_key, ps.get_attribute(attribute_key))

        # add source as a child of parallel sentence
        src_xml = self._create_xml_sentence(doc_xml, ps.get_source(), "src")
        parallelsentence_xml.appendChild(src_xml)

        # add translations
        for tgt in ps.get_translations():
            tgt_xml = self._create_xml_sentence(doc_xml, tgt, "tgt")
            parallelsentence_xml.appendChild(tgt_xml)

        # add reference as a child of parallel sentence, if there is one
        reference = ps.get_reference()
        if reference:
            ref_xml = self._create_xml_sentence(doc_xml, reference, "ref")
            parallelsentence_xml.appendChild(ref_xml)

        return parallelsentence_xml

    def _create_xml_sentence(self, doc_xml, sentence, tag):
        """
        Helper function that fetches the text and the attributes of a sentence
        and wraps them up into a minidom XML object.

        @param doc_xml: the minidom.Document used for creating the nodes
        @param sentence: the sentence providing the attributes and the string
        @param tag: the name of the element to be created (e.g. "src")
        @return: the element, holding the attributes and one text node
        """
        sentence_xml = doc_xml.createElement(tag)

        # NOTE(review): minidom appears to escape attribute values and text
        # again on serialisation, so the escape() calls below probably lead to
        # double-escaped output (e.g. "&amp;amp;"). Left unchanged, since
        # existing readers may depend on this format -- confirm before removing.
        for attribute_key in sentence.get_attributes().keys():
            sentence_xml.setAttribute(attribute_key, escape(sentence.get_attribute(attribute_key)))

        textnode = escape(sentence.get_string().strip())
        sentence_xml.appendChild(doc_xml.createTextNode(textnode))

        return sentence_xml
143