import sys
import xmlrpclib

from numpy import average, std
from nltk.tokenize.punkt import PunktWordTokenizer

from featuregenerator.languagefeaturegenerator import LanguageFeatureGenerator
from util.freqcaser import FreqCaser


class SRILMngramGenerator(LanguageFeatureGenerator):
    '''
    Passes all the words of a sentence through a SRILM language model served
    over XML-RPC and counts how many of them are unknown (unigram
    log-probability of -99), alongside n-gram probability statistics.
    '''
    # NOTE: the original class statement was lost; the name above is a
    # reconstruction consistent with the module's SRILM-based feature role.

    def __init__(self, url, lang="en", lowercase=True, tokenize=True, freqcase_file=False):
        '''
        Open the connection to the XML-RPC language model server.
        '''
        self.server = xmlrpclib.Server(url)
        self.lang = lang
        self.lowercase = lowercase
        self.tokenize = tokenize
        self.freqcaser = None
        if freqcase_file:
            self.freqcaser = FreqCaser(freqcase_file)


    def _prepare_sentence(self, simplesentence):
        '''
        Normalize and tokenize a sentence according to the generator settings
        and return a (token_list, sentence_string) tuple. (Def line
        reconstructed from the call in get_features_simplesentence.)
        '''
        sent_string = simplesentence.get_string().replace('-', ' ').strip()
        if self.freqcaser:
            tokenized_string = self.freqcaser.freqcase(sent_string)
        else:
            if self.lowercase:
                sent_string = sent_string.lower()
            if self.tokenize:
                # detach the percent sign so it is counted as a separate token
                sent_string = sent_string.replace('%', ' %')
                tokenized_string = PunktWordTokenizer().tokenize(sent_string)
                sent_string = ' '.join(tokenized_string)
            else:
                tokenized_string = [tok.strip() for tok in sent_string.split(' ') if tok.strip()]

        return (tokenized_string, sent_string)
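
    # For example, with the default settings "Low-cost flights rose 5%" is
    # dash-split, lowercased and percent-detached to "low cost flights rose
    # 5 %" before tokenization (exact tokens depend on PunktWordTokenizer).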


    def _prepare_sentence_string(self, simplesentence):
        '''
        Same preprocessing as _prepare_sentence, but return only the processed
        sentence as a unicode string. (Method name reconstructed; the original
        def line was not preserved.)
        '''
        sent_string = simplesentence.get_string().replace('-', ' ').strip()
        if self.freqcaser:
            tokenized_string = self.freqcaser.freqcase(sent_string)
        else:
            if self.lowercase:
                sent_string = sent_string.lower()
            if self.tokenize:
                sent_string = sent_string.replace('%', ' %')
                tokenized_string = PunktWordTokenizer().tokenize(sent_string)
                sent_string = ' '.join(tokenized_string)
            else:
                tokenized_string = sent_string.split(' ')

        # join before casting: unicode() applied to the token list would yield
        # its repr rather than the sentence itself
        return unicode(' '.join(tokenized_string))

    def _standouts(self, vector, sign):
        '''
        Count the values that deviate from the vector average by more than one
        standard deviation: sign=+1 counts unusually high values, sign=-1
        unusually low ones.
        '''
        std_value = std(vector)
        avg_value = average(vector)
        standout = 0

        for value in vector:
            # sign folds both directions into one comparison; for sign=-1 this
            # reads value < avg_value - std_value
            if sign * value > sign * avg_value + std_value:
                standout += 1

        return standout
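
    # Worked example: for vector = [-1.0, -1.2, -6.0, -0.8] and sign = -1,
    # average = -2.25 and std ~ 2.17, so only -6.0 lies more than one standard
    # deviation below the average and the count returned is 1.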

    def _standout_pos(self, vector, sign):
        '''
        Like _standouts, but return the 1-indexed positions of the deviating
        values instead of their count.
        '''
        std_value = std(vector)
        avg_value = average(vector)
        standout = []

        for pos, value in enumerate(vector, start=1):
            if sign * value > sign * avg_value + std_value:
                standout.append(pos)

        return standout
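
    # With the example vector above, _standout_pos(vector, -1) returns [3]:
    # the unusually improbable value sits at (1-indexed) position 3.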


    def get_features_simplesentence(self, simplesentence):
        '''
        Query the language model for unigram, bigram, trigram and full-sentence
        probabilities and derive the feature dictionary. (Method name
        reconstructed from the toolkit's feature-generator interface.)
        '''
        (tokens, sent_string) = self._prepare_sentence(simplesentence)
        unk_count = 0
        # accumulators for the summed n-gram log-probabilities
        uni_probs = 1
        bi_probs = 1
        tri_probs = 1
        unk_tokens = []

        prob = self._get_sentence_probability(sent_string)

        pos = 0
        unk_pos = []
        uni_probs_vector = []
        bi_probs_vector = []
        tri_probs_vector = []

        # unigram pass: gather log-probabilities and record unknown words,
        # which SRILM reports with a unigram log-probability of -99
        for token in tokens:
            pos += 1
            uni_prob = self.server.getUnigramProb(token)

            if uni_prob == -99:
                unk_count += 1
                unk_pos.append(pos)
                unk_tokens.append(token)
                sys.stderr.write("Unknown word: %s of len %d\n" % (token, len(token)))
            else:
                uni_probs_vector.append(uni_prob)
                uni_probs += uni_prob
        # bigram pass: skip any bigram containing an unknown word
        for pos in range(len(tokens) - 1):
            token = tokens[pos:pos + 2]
            if (token[0] not in unk_tokens) and (token[1] not in unk_tokens):
                bi_prob = self.server.getBigramProb(' '.join(token))
                bi_probs += bi_prob
                bi_probs_vector.append(bi_prob)
        # trigram pass: skip any trigram containing an unknown word
        for pos in range(len(tokens) - 2):
            token = tokens[pos:pos + 3]
            if (token[0] not in unk_tokens) and (token[1] not in unk_tokens) and (token[2] not in unk_tokens):
                tri_prob = self.server.getTrigramProb(' '.join(token))
                tri_probs += tri_prob
                tri_probs_vector.append(tri_prob)
        unk_rel_pos = [(unk_pos_item * 1.00) / len(tokens) for unk_pos_item in unk_pos]
        unk_len = sum([len(token) for token in unk_tokens])

        # fall back to [0] so that min/max/average below get a non-empty vector
        if len(unk_pos) == 0:
            unk_pos = [0]
            unk_rel_pos = [0]
        # positions of unusually improbable n-grams; fall back to [0] so that
        # average/std never see an empty vector
        uni_low_pos = self._standout_pos(uni_probs_vector, -1) or [0]
        bi_low_pos = self._standout_pos(bi_probs_vector, -1) or [0]
        tri_low_pos = self._standout_pos(tri_probs_vector, -1) or [0]

        attributes = {'lm_unk_pos_abs_avg': str(average(unk_pos)),
                      'lm_unk_pos_abs_std': str(std(unk_pos)),
                      'lm_unk_pos_abs_min': str(min(unk_pos)),
                      'lm_unk_pos_abs_max': str(max(unk_pos)),
                      'lm_unk_pos_rel_avg': str(average(unk_rel_pos)),
                      'lm_unk_pos_rel_std': str(std(unk_rel_pos)),
                      'lm_unk_pos_rel_min': str(min(unk_rel_pos)),
                      'lm_unk_pos_rel_max': str(max(unk_rel_pos)),
                      'lm_unk': str(unk_count),
                      'lm_unk_len': str(unk_len),

                      'lm_uni-prob': str(uni_probs),
                      'lm_uni-prob_avg': str(average(uni_probs_vector)),
                      'lm_uni-prob_std': str(std(uni_probs_vector)),
                      'lm_uni-prob_low': str(self._standouts(uni_probs_vector, -1)),
                      'lm_uni-prob_high': str(self._standouts(uni_probs_vector, +1)),
                      'lm_uni-prob_low_pos_avg': str(average(uni_low_pos)),
                      'lm_uni-prob_low_pos_std': str(std(uni_low_pos)),

                      'lm_bi-prob': str(bi_probs),
                      'lm_bi-prob_avg': str(average(bi_probs_vector)),
                      'lm_bi-prob_std': str(std(bi_probs_vector)),
                      'lm_bi-prob_low': str(self._standouts(bi_probs_vector, -1)),
                      'lm_bi-prob_high': str(self._standouts(bi_probs_vector, +1)),
                      'lm_bi-prob_low_pos_avg': str(average(bi_low_pos)),
                      'lm_bi-prob_low_pos_std': str(std(bi_low_pos)),

                      'lm_tri-prob': str(tri_probs),
                      'lm_tri-prob_avg': str(average(tri_probs_vector)),
                      'lm_tri-prob_std': str(std(tri_probs_vector)),
                      'lm_tri-prob_low': str(self._standouts(tri_probs_vector, -1)),
                      'lm_tri-prob_high': str(self._standouts(tri_probs_vector, +1)),
                      'lm_tri-prob_low_pos_avg': str(average(tri_low_pos)),
                      'lm_tri-prob_low_pos_std': str(std(tri_low_pos)),
                      'lm_prob': str(prob)}

        return attributes


    def _get_sentence_probability(self, sent_string):
        '''
        Ask the server for the log-probability of the whole sentence, passing
        the whitespace token count along with the string. (Def line
        reconstructed from the call in get_features_simplesentence.)
        '''
        l = len(sent_string.split(" "))
        return str(self.server.getSentenceProb(sent_string, l))
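
    # e.g. for "this is a test" the server is asked getSentenceProb(sent, 4),
    # where 4 is the number of whitespace-separated tokens.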

    def get_ngram_features_batch(self, batch):
        '''Forward a batch of sentences to the server in one request (method name reconstructed).'''
        return self.server.getNgramFeatures_batch(batch)
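

# A minimal usage sketch, not part of the original module. It assumes a SRILM
# XML-RPC server exposing getUnigramProb/getBigramProb/getTrigramProb and
# getSentenceProb at the given URL, and a SimpleSentence class from the
# surrounding toolkit; names and URL are illustrative.
#
#   from sentence.sentence import SimpleSentence
#
#   lmgen = SRILMngramGenerator("http://localhost:8080/RPC2", lang="en")
#   features = lmgen.get_features_simplesentence(SimpleSentence("This is a test ."))
#   print features["lm_unk"], features["lm_prob"]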