1 '''
2 Created on Feb 13, 2012
3
4 @author: jogin
5 '''
6
7
8
9
10
11
12
13
14
15
16 from suds.client import Client
17 import base64
18 import re
19 import urllib
20 import sys
21 from urllib2 import URLError
22 from xml.etree import ElementTree as ET
23 from featuregenerator.languagefeaturegenerator import LanguageFeatureGenerator
24 import os
25 import time
26
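"""
Handles communication with an Acrolinx IQ server
@ivar lang: abbrev. code of the language that is sent to the server as text_lang
@ivar host: the hostname (and the port) of the SOAP server
@ivar user_id: the user id sent to the server; the license file is named after it
@ivar license_data_str: the license string last read from the license file or received from the server
"""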
36
37
def __init__(self, lang, settings=None, user_id='dfkitaraxu',
             host="msv-3231.sb.dfki.de:8031",
             wsdl_path="/acrolinx/services/core-no-mtom?wsdl",
             protocol="http", license_file="license.dat"):
    """
    Creates the SOAP client for the Acrolinx IQ server and opens a session
    @param lang: abrev. code for the language that this generator will be responsible for
    @type lang: str
    @param settings: check settings that are sent to the server with every check;
    when empty or omitted, the built-in defaults of the check request are used
    @type settings: dict
    @param user_id: the user id sent to the server; the license is stored in "<user_id>.dat"
    @type user_id: str
    @param host: the hostname (and the port) of the SOAP server
    @type host: str
    @param wsdl_path: the wsdl path of the MTOM service, that needs to be appended to the end of the request url
    @type wsdl_path: str
    @param protocol: the protocol, default value http
    @type protocol: str
    @param license_file: currently ignored, the file name is always derived from user_id
    @type license_file: str
    """
    # NOTE(review): the enclosing class header is not visible in this view;
    # this function has to be indented under it.
    self.lang = lang
    self.host = host
    url = "{0}://{1}{2}".format(protocol, host, wsdl_path)
    self.soap_client = Client(url, timeout=60)

    # NOTE(review): the license_file argument is overwritten here, so every
    # user id gets its own license file next to this module. Kept as is
    # because existing license files are named this way.
    license_file = "{}.dat".format(user_id)
    module_dir = os.path.dirname(__file__)
    self.license_data_filename = os.path.join(module_dir, license_file)

    self.user_id = user_id
    # A fresh dict per instance instead of one shared mutable default
    self.settings = {} if settings is None else settings
    print("proceeding with IQ session initialization")
    self._initialize_session()
61
63 """
64 Performs a search into the response of the server,
65 and returns the value of a SOAPproperty given its key
66 @param response:
67 """
68 for soap_property in response:
69 if soap_property['key'] == key:
70 return soap_property['value']
71 raise KeyError
72
74 """
75 Function to call every time we get a response that may change the license.
76 It extracts the license string from the response and updates the text file.
77 Then it returns the license string.
78 This should be called after registerUser or checkDocument
79 @return: the license string
80 @rtype: str
81 """
82 self.license_data_str = self._get_property(response, "license.data")
83 license_data_file = open(self.license_data_filename, 'w')
84 license_data_file.write(self.license_data_str)
85 license_data_file.close()
86 return self.license_data_str
87
88
90 """
91 Processes the xml report by Acrolinx IQ and returns a dict of the corrections suggested
92 @param report_xml: the content of the report
93 @type report_xml: str
94 @return: a dict containing the correction suggested
95 @rtype: {str: str}
96 @todo: write the function that reads the XML
97 """
98 feed = urllib.urlopen(report_url)
99 tree = ET.parse(feed)
100
101 atts = {}
102
103
104 resStats = tree.find('body/statistics/checkingStats/resultStats')
105
106 for item in resStats.attrib.items():
107 atts['%s_%s' % (resStats.tag, item[0])] = item[1]
108
109 for stat in resStats.getchildren():
110 for item in stat.attrib.items():
111 atts['%s_%s' % (stat.tag, item[0])] = item[1]
112
113
114 grammar = tree.find('body/results/grammar')
115 gLangFlags = grammar.find('listOfLangFlags')
116 for gLf in gLangFlags.findall('langFlag'):
117 errorName = gLf.find('description').text
118 errorName = errorName.replace(" ", "_")
119 errorName = errorName.replace(":", "_")
120
121
122 if not 'grammar_%s' % errorName in atts:
123 atts['grammar_%s' % errorName] = 1
124 else:
125 atts['grammar_%s' % errorName] += 1
126
127
128 if not 'grammar_%s_matches' % errorName in atts:
129 atts['grammar_%s_matches' % errorName] = len(gLf.findall('match'))
130 else:
131 atts['grammar_%s_matches' % errorName] += len(gLf.findall('match'))
132
133
134 begin = 999999
135 end = 0
136 for match in gLf.findall('match'):
137 if int(match.get('begin')) < begin: begin = int(match.get('begin'))
138 if int(match.get('end')) > end: end = int(match.get('end'))
139 diff = end - begin
140 if not 'grammar_%s_chars' % errorName in atts:
141 atts['grammar_%s_chars' % errorName] = diff
142 else:
143 atts['grammar_%s_chars' % errorName] += diff
144
145
146 style = tree.find('body/results/style')
147 sLangFlags = style.find('listOfLangFlags')
148 for sLf in sLangFlags.findall('langFlag'):
149 errorName = sLf.find('description').text
150
151 if errorName.startswith("Sentence too long"):
152 too_long = re.findall("Sentence too long\: (\d*)", errorName)[0]
153 errorName = "style_too_long"
154 atts['style_too_long'] = too_long
155 else:
156 errorName = errorName.replace(" ", "_")
157 errorName = errorName.replace(":", "_")
158
159 if not 'style_%s' % errorName in atts:
160 atts['style_%s' % errorName] = 1
161 else:
162 atts['style_%s' % errorName] += 1
163
164
165 if not 'style_%s_matches' % errorName in atts:
166 atts['style_%s_matches' % errorName] = len(sLf.findall('match'))
167 else:
168 atts['style_%s_matches' % errorName] += len(sLf.findall('match'))
169
170
171 begin = 999999
172 end = 0
173 for match in sLf.findall('match'):
174 if int(match.get('begin')) < begin: begin = int(match.get('begin'))
175 if int(match.get('end')) > end: end = int(match.get('end'))
176 diff = end - begin
177 if not 'style_%s_chars' % errorName in atts:
178 atts['style_%s_chars' % errorName] = diff
179 else:
180 atts['style_%s_chars' % errorName] += diff
181
182
183 for item in atts.items():
184 atts[item[0]] = str(item[1])
185
186 return atts
187
188
190 """
191 Converts a normal python dict to a list of SoapProperty instances
192 @param attrib utes: a dict containing soap properties
193 @type attributes: {str, str}
194 @return: a list of SoapProperty instances that can be sent to Soap
195 @rtype: [SoapProperty, ...]
196 @todo: replace the handwritten soapProperties with
197 """
198 soap_properties = []
199 for key, value in attributes.iteritems():
200 soap_property = self.soap_client.factory.create('soapProperty')
201 soap_property['key'] = key
202 soap_property['value'] = value
203 soap_properties.append(soap_property)
204 return soap_properties
205
207 settings = self.settings
208
209 try:
210 license_data_file = open(self.license_data_filename, 'r')
211 print "reusing stored license"
212 self.license_data_str = license_data_file.readline().strip()
213 license_data_file.close()
214
215 except IOError:
216
217 print "probably new user, obtaining new license"
218
219 userId = self.soap_client.factory.create('soapProperty')
220 userId['key'] = 'user_id'
221 userId['value'] = self.user_id
222
223
224 print "trying to register client with userid", userId
225 register_client_response = self.soap_client.service.registerClient([userId])
226 self._update_license(register_client_response)
227
228 print self.license_data_str
229
230
231
232 log_in_parameters = {'license.data' : self.license_data_str
233 , 'license.user_id' : self.user_id }
234
235 log_in_soap_properties = self._attributes2soapproperties(log_in_parameters)
236
237
238 print "trying get session by giving properties ", log_in_soap_properties
239
240 connected = False
241 while not connected:
242 try:
243 self.sessionIdStr = self.soap_client.service.requestClientSession(log_in_soap_properties)
244 connected = True
245 except URLError:
246 sys.stderr.write("UrlError on initializing suds client. Trying again\n")
247 time.sleep(5)
248
249
250
251
252
254
255 settings = self.settings
256
257 check_id = self.soap_client.service.getCheckId()
258
259 if settings:
260 soap_attributes = settings
261 else:
262 soap_attributes = dict(
263 text_type = 'MT-preediting-DE-EN-T1',
264 check_spelling = 'true',
265 check_grammar = 'true',
266 check_style = 'true',
267 check_terms = 'MT-preediting-DE-EN-T1.modules.terms',
268 )
269 license_data_str = open(self.license_data_filename, 'r').read()
270 soap_attributes["text_lang"] = self.lang
271 soap_attributes["client_session_id"] = self.sessionIdStr
272 soap_attributes["license.data"] = license_data_str
273 soap_attributes["user.id"] = self.user_id
274
275 soap_properties = self._attributes2soapproperties(soap_attributes)
276 return check_id, soap_properties
277
278
279
def get_features_string(self, text):
    """
    Receives a text and returns a dict with numerical quality observation features
    @param text: text to be evaluated
    @type text: str
    @return: a dict with attributes retrieved from the quality analysis
    @rtype: {str: str}
    @raise Exception: the last server error, once getting a check id or
    submitting the text has failed more than five times
    """
    # NOTE(review): the def line is missing from this view. The parameter
    # name comes from the docstring and the body; the method name is assumed
    # to be the hook of LanguageFeatureGenerator - TODO confirm.

    # The payload is the same for every attempt, so it is encoded only once
    text64 = base64.standard_b64encode(text)

    submit_failures = 0
    resp = None
    # NOTE(review): an empty response without an exception restarts the
    # loop immediately and without limit - confirm that the server never
    # answers that way.
    while not resp:
        check_failures = 0
        check_id = None
        while not check_id:
            try:
                check_id, soap_properties = self._start_new_check()
            except Exception as inst:
                check_id = None
                sys.stderr.write("\nWhile getting check ID, server reported error: {}\n".format(inst))
                check_failures += 1
                if check_failures > 5:
                    # A bare raise keeps the traceback of the server error
                    raise
                time.sleep(20)
                sys.stderr.write("retrying...")

        try:
            resp = self.soap_client.service.checkDocumentMtom(soap_properties, text64, "utf-8", check_id)
        except Exception as inst:
            resp = None
            sys.stderr.write("\nWhen submitted sentence, server reported error: {}\n".format(inst))
            sys.stderr.write("original sentence: {}\n".format(text))
            sys.stderr.write("b64 encoded sentence: {}\n".format(text64))
            submit_failures += 1
            if submit_failures > 5:
                raise
            time.sleep(20)
            sys.stderr.write("retrying...")

    # The response of a check may carry a changed license
    self._update_license(resp)

    report_url = self._get_property(resp, "report_url")
    # Replace the host of the report url with the host this client connects to
    report_url = re.sub("://[^/]*/", "://{0}/".format(self.host), report_url)

    attributes = self._read_report_url(report_url)
    # Progress mark, one per processed text
    sys.stdout.write(". ")
    return attributes
339
340
def get_language_options(self):
    """
    Asks the server for the language options of the language of this generator
    @return: the answer of the getLanguageOptions service call, as it is
    """
    # NOTE(review): the def line is missing from this view and nothing in
    # the visible code calls this function; its name is assumed - TODO confirm.
    return self.soap_client.service.getLanguageOptions(self.lang)
343
345 try:
346 self.soap_client.service.releaseClientSession(self.sessionIdStr)
347 except:
348 pass
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363