Package featuregenerator :: Package iq :: Module acrolinxclient
[hide private]
[frames] | no frames]

Source Code for Module featuregenerator.iq.acrolinxclient

  1  ''' 
  2  Created on Feb 13, 2012 
  3   
  4  @author: jogin 
  5  ''' 
  6   
  7  #from pysimplesoap.client import SoapClient 
  8  # 
  9  #url = "http://msv-3231.sb.dfki.de:8031/acrolinx/services/core-no-mtom?wsdl" 
 10  #client = SoapClient(wsdl=url, trace=False) 
 11  #a = client.ping("") 
 12  #print a 
 13   
 14   
 15   
 16  from suds.client import Client 
 17  import base64 
 18  import re 
 19  import urllib 
 20  import sys 
 21  from urllib2 import URLError 
 22  from xml.etree import ElementTree as ET 
 23  from featuregenerator.languagefeaturegenerator import LanguageFeatureGenerator 
 24  import os 
 25  import time 
 26   
class IQFeatureGenerator(LanguageFeatureGenerator):
    """
    Handles communication with an Acrolinx IQ server

    @ivar lang: language code sent as C{text_lang} with every check
    @ivar host: host (and port) of the SOAP server; also substituted into report urls
    @ivar user_id: user id used for client registration and sent with every check
    @ivar settings: check settings sent to the server instead of the built-in defaults
    @ivar soap_client: suds client created from the WSDL url
    @ivar license_data_filename: path of the file in which the license string is cached
    @ivar license_data_str: license string most recently read from file or received
    @ivar sessionIdStr: client session id returned by C{requestClientSession}
    """

    # A request is abandoned (the error is re-raised) once it has failed more
    # often than this.
    _MAX_FAILED_TRIES = 5
    # Seconds to wait before repeating a failed check request.
    _RETRY_DELAY = 20
    # Seconds to wait before repeating a failed session request.
    _SESSION_RETRY_DELAY = 5

    def __init__(self, lang, settings=None, user_id='dfkitaraxu',
                 host="msv-3231.sb.dfki.de:8031",
                 wsdl_path="/acrolinx/services/core-no-mtom?wsdl",
                 protocol="http", license_file="license.dat"):
        """
        @param lang: abrev. code for the language that this generator will be responsible for
        @type lang: str
        @param settings: check settings to send instead of the built-in defaults;
            None or an empty dict selects the defaults
        @type settings: {str: str}
        @param user_id: the user id to register and log in with
        @type user_id: str
        @param host: the hostname (and the port) of the SOAP server
        @type host: str
        @param wsdl_path: the wsdl path of the MTOM service, that needs to be appended to the end of the request url
        @type wsdl_path: str
        @param protocol: the protocol, default value http
        @type protocol: str
        @param license_file: currently ignored, the license file is always named after the user id
        @type license_file: str
        """
        self.lang = lang
        self.host = host
        url = "{0}://{1}{2}".format(protocol, host, wsdl_path)
        self.soap_client = Client(url, timeout=60)
        # NOTE: the license_file argument is deliberately overridden here, as it
        # always has been; honouring it would change which file existing
        # installations read their license from.
        license_file = "{}.dat".format(user_id)
        # keep license file in the directory of this module for the moment
        path = os.path.dirname(__file__)
        self.license_data_filename = os.path.join(path, license_file)

        # if license doesn't work, delete the license file and change user id OR remove access id
        self.user_id = user_id
        # A fresh dict per instance instead of a shared mutable default argument.
        self.settings = {} if settings is None else settings
        print("proceeding with IQ session initialization")
        self._initialize_session()

    def _get_property(self, response, key):
        """
        Performs a search into the response of the server,
        and returns the value of a SOAPproperty given its key
        @param response: iterable of properties, each indexable by 'key' and 'value'
        @param key: the key of the property to look for
        @type key: str
        @return: the value of the first property whose key matches
        @raise KeyError: if no property has the given key
        """
        for soap_property in response:
            if soap_property['key'] == key:
                return soap_property['value']
        raise KeyError(key)

    def _update_license(self, response):
        """
        Function to call every time we get a response that may change the license.
        It extracts the license string from the response and updates the text file.
        Then it returns the license string.
        This should be called after registerUser or checkDocument
        @param response: server response containing a "license.data" property
        @return: the license string
        @rtype: str
        @raise KeyError: if the response has no "license.data" property
        """
        self.license_data_str = self._get_property(response, "license.data")
        with open(self.license_data_filename, 'w') as license_data_file:
            license_data_file.write(self.license_data_str)
        return self.license_data_str

    @staticmethod
    def _find_lang_flags(tree, section_path):
        """
        Returns the langFlag elements listed under one section of the report
        @param tree: the parsed report
        @param section_path: path of the section, e.g. 'body/results/grammar'
        @type section_path: str
        @return: the langFlag elements, or an empty list if the section
            or its listOfLangFlags is missing
        """
        section = tree.find(section_path)
        if section is None:
            return []
        flag_list = section.find('listOfLangFlags')
        if flag_list is None:
            return []
        return flag_list.findall('langFlag')

    @staticmethod
    def _normalize_error_name(description):
        """
        Turns a flag description into a string usable inside a feature name,
        by replacing spaces and colons with underscores
        """
        return description.replace(" ", "_").replace(":", "_")

    @staticmethod
    def _add_flag_counts(atts, prefix, error_name, lang_flag):
        """
        Adds the counts of one langFlag to the running totals in atts
        @param atts: the dict of features that gets updated in place
        @param prefix: 'grammar' or 'style'
        @param error_name: normalized name of the error
        @param lang_flag: the langFlag element
        """
        key = '%s_%s' % (prefix, error_name)
        matches = lang_flag.findall('match')

        # No. of particular errors
        atts[key] = atts.get(key, 0) + 1

        # No. of matches for particular error
        matches_key = '%s_matches' % key
        atts[matches_key] = atts.get(matches_key, 0) + len(matches)

        # No. of chars influenced by particular error: from the earliest begin
        # to the latest end. A flag without matches covers no characters.
        if matches:
            begin = min(int(match.get('begin')) for match in matches)
            end = max(int(match.get('end')) for match in matches)
            diff = end - begin
        else:
            diff = 0
        chars_key = '%s_chars' % key
        atts[chars_key] = atts.get(chars_key, 0) + diff

    def _read_report_url(self, report_url):
        """
        Fetches the xml report by Acrolinx IQ and returns a dict of features extracted from it:
        the attributes of the result statistics, and per grammar/style error
        the number of flags, of matches and of characters covered
        @param report_url: the url of the report
        @type report_url: str
        @return: a dict with the feature names and their values as strings
        @rtype: {str: str}
        """
        feed = urllib.urlopen(report_url)
        try:
            tree = ET.parse(feed)
        finally:
            feed.close()

        atts = {}

        # checking stats
        res_stats = tree.find('body/statistics/checkingStats/resultStats')
        if res_stats is not None:
            for name, value in res_stats.attrib.items():
                atts['%s_%s' % (res_stats.tag, name)] = value
            for stat in res_stats:
                for name, value in stat.attrib.items():
                    atts['%s_%s' % (stat.tag, name)] = value

        # grammar
        for lang_flag in self._find_lang_flags(tree, 'body/results/grammar'):
            error_name = self._normalize_error_name(lang_flag.find('description').text)
            self._add_flag_counts(atts, 'grammar', error_name, lang_flag)

        # style
        for lang_flag in self._find_lang_flags(tree, 'body/results/style'):
            description = lang_flag.find('description').text
            if description.startswith("Sentence too long"):
                # The description carries a number after the colon; keep it as a
                # feature of its own when it is present.
                found = re.search(r"Sentence too long\: (\d*)", description)
                if found:
                    atts['style_too_long'] = found.group(1)
                # The counts below therefore end up under 'style_style_too_long...'
                error_name = "style_too_long"
            else:
                error_name = self._normalize_error_name(description)
            self._add_flag_counts(atts, 'style', error_name, lang_flag)

        # make strings from ints
        return {name: str(value) for name, value in atts.items()}

    def _attributes2soapproperties(self, attributes=None):
        """
        Converts a normal python dict to a list of SoapProperty instances
        @param attributes: a dict containing soap properties
        @type attributes: {str: str}
        @return: a list of SoapProperty instances that can be sent to Soap
        @rtype: [SoapProperty, ...]
        """
        soap_properties = []
        for key, value in (attributes or {}).items():
            soap_property = self.soap_client.factory.create('soapProperty')
            soap_property['key'] = key
            soap_property['value'] = value
            soap_properties.append(soap_property)
        return soap_properties

    def _initialize_session(self):
        """
        Loads the stored license, or registers the client to obtain a new one
        if the license file cannot be read, and then requests a client session.
        The session request is repeated every few seconds for as long as it
        fails with a URLError.
        """
        # register only once
        try:
            with open(self.license_data_filename, 'r') as license_data_file:
                print("reusing stored license")
                self.license_data_str = license_data_file.readline().strip()
        except IOError:
            print("probably new user, obtaining new license")
            # create soapProperty object with user id
            userId = self.soap_client.factory.create('soapProperty')
            userId['key'] = 'user_id'
            userId['value'] = self.user_id

            # get licence data string
            print("trying to register client with userid {0}".format(userId))
            register_client_response = self.soap_client.service.registerClient([userId])
            self._update_license(register_client_response)

        print(self.license_data_str)

        # log in with the license data and the user id
        log_in_parameters = {
            'license.data': self.license_data_str,
            'license.user_id': self.user_id,
        }
        log_in_soap_properties = self._attributes2soapproperties(log_in_parameters)

        # get session id
        print("trying get session by giving properties  {0}".format(log_in_soap_properties))

        connected = False
        while not connected:
            try:
                self.sessionIdStr = self.soap_client.service.requestClientSession(log_in_soap_properties)
                connected = True
            except URLError:
                sys.stderr.write("UrlError on initializing suds client. Trying again\n")
                time.sleep(self._SESSION_RETRY_DELAY)

    def _start_new_check(self):
        """
        Asks the server for a new check id and assembles the properties of the check
        @return: the check id and the list of SoapProperty instances to send along with the document
        @rtype: (check id, [SoapProperty, ...])
        """
        # get check id
        check_id = self.soap_client.service.getCheckId()

        if self.settings:
            # Work on a copy, so that the per-check entries added below do not
            # leak into the settings dict owned by the caller.
            soap_attributes = dict(self.settings)
        else:
            soap_attributes = dict(
                text_type='MT-preediting-DE-EN-T1',
                check_spelling='true',
                check_grammar='true',
                check_style='true',
                check_terms='MT-preediting-DE-EN-T1.modules.terms',
            )
        with open(self.license_data_filename, 'r') as license_data_file:
            license_data_str = license_data_file.read()
        soap_attributes["text_lang"] = self.lang
        soap_attributes["client_session_id"] = self.sessionIdStr
        soap_attributes["license.data"] = license_data_str
        soap_attributes["user.id"] = self.user_id

        soap_properties = self._attributes2soapproperties(soap_attributes)
        return check_id, soap_properties

    def _request_check(self):
        """
        Calls L{_start_new_check} until it delivers a check id, waiting between
        attempts; gives up by re-raising the error after too many failures
        @return: the check id and the soap properties of the new check
        """
        tries = 0
        check_id = None
        while not check_id:
            try:
                check_id, soap_properties = self._start_new_check()
            except Exception as inst:
                check_id = None
                sys.stderr.write("\nWhile getting check ID, server reported error: {}\n".format(inst))
                tries += 1
                if tries > self._MAX_FAILED_TRIES:
                    # bare raise keeps the original traceback
                    raise
                time.sleep(self._RETRY_DELAY)
                sys.stderr.write("retrying...")
        return check_id, soap_properties

    def get_features_string(self, text):
        """
        Receives a text and returns a dict with numerical quality observation features
        @param text: text to be evaluated
        @type text: str
        @return: a dict with attributes retrieved from the quality analysis
        @rtype: {str: str}
        """
        # The document is declared as utf-8 below, so text that is not already
        # a byte string gets encoded accordingly. Encoding happens only once,
        # as it does not change between retries.
        payload = text if isinstance(text, bytes) else text.encode("utf-8")
        text64 = base64.standard_b64encode(payload)

        tries_resp = 0
        resp = None
        while not resp:
            # every attempt needs a check id of its own
            check_id, soap_properties = self._request_check()
            try:
                resp = self.soap_client.service.checkDocumentMtom(soap_properties, text64, "utf-8", check_id)
            except Exception as inst:
                resp = None
                sys.stderr.write("\nWhen submitted sentence, server reported error: {}\n".format(inst))
                sys.stderr.write("original sentence: {}\n".format(payload))
                sys.stderr.write("b64 encoded sentence: {}\n".format(text64))
                tries_resp += 1
                if tries_resp > self._MAX_FAILED_TRIES:
                    # bare raise keeps the original traceback
                    raise
                time.sleep(self._RETRY_DELAY)
                sys.stderr.write("retrying...")

        self._update_license(resp)

        # The document score is looked up (a response without it raises
        # KeyError) but it is not part of the returned features.
        self._get_property(resp, "document_score")
        # get url of the report xml
        report_url = self._get_property(resp, "report_url")
        # fix the host part of the url
        report_url = re.sub("://[^/]*/", "://{0}/".format(self.host), report_url)

        attributes = self._read_report_url(report_url)
        # progress marker, one per processed text
        sys.stdout.write(". ")
        return attributes

    def get_language_options(self):
        """
        @return: the response of the server's getLanguageOptions call for the language of this generator
        """
        return self.soap_client.service.getLanguageOptions(self.lang)

    def __del__(self):
        """
        Releases the client session in any case. This is best effort: errors
        are ignored, e.g. when the session was never established.
        """
        try:
            self.soap_client.service.releaseClientSession(self.sessionIdStr)
        except Exception:
            pass
349 # # release client session in any case 350 351 352 # 353 # 354 #if __name__ == '__main__': 355 356 # text = 'This break every possibility. Dear clients, we would like to informm you that during the latest commerccial update we recieved marvelous products, which wwe can offers in really good prices. Please keeps in touch for further notice. This break every possibility.' 357 # ac = IQFeatureGenerator("en") 358 # from dataprocessor.sax import saxjcml 359 # # 360 # saxjcml.run_features_generator("/home/Eleftherios Avramidis/taraxu_data/wmt12/qe/training_set/training.jcml", 361 # "/home/Eleftherios Avramidis/taraxu_data/wmt12/qe/training_set/training.iq.jcml", [ac]) 362 # print ac.process(text) 363