'''
@author: Eleftherios Avramidis
@note: Modified copy from Hieu Hoang's code for the Moses project

Provides:
cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
score_cooked(alltest, n=4): Score a list of cooked test sentences.

score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.

The BLEU computation is broken into the three phases cook_refs(), cook_test() and score_cooked() so that the caller can calculate BLEU scores for multiple test sets against the same references as efficiently as possible.
'''
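
# Typical use of the three-phase interface described above (the sentences here
# are made up, for illustration only):
#
#     cooked_refs = cook_refs(["the cat sat on the mat .", "a cat was on the mat ."])
#     cooked_test = cook_test("the cat was on the mat .", cooked_refs)
#     bleu = score_cooked([cooked_test])
#
# cook_refs() is run once per segment; its result can then be reused to cook and
# score any number of candidate translations of that segment.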

import optparse
import sys, math, re, xml.sax.saxutils
sys.path.append('/fs/clip-mteval/Programs/hiero')


nonorm = 0

preserve_case = False
eff_ref_len = "shortest"

normalize1 = [
    ('<skipped>', ''),       # strip "skipped" tags
    (r'-\n', ''),            # strip end-of-line hyphenation and join lines
    (r'\n', ' '),            # join lines
]
normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]

normalize2 = [
    (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', r' \1 '),  # tokenize punctuation
    (r'([^0-9])([\.,])', r'\1 \2 '),               # tokenize period and comma unless preceded by a digit
    (r'([\.,])([^0-9])', r' \1 \2'),               # tokenize period and comma unless followed by a digit
    (r'([0-9])(-)', r'\1 \2 ')                     # tokenize dash when preceded by a digit
]
normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]
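
# normalize() is referenced by cook_refs()/cook_test() below but its body did not
# survive in this copy. The version here is a minimal sketch, assuming the usual
# NIST mteval-style preprocessing that normalize1/normalize2 above are built for;
# treat it as illustrative rather than as the original implementation.
def normalize(s):
    '''Normalizes and tokenizes a sentence string (or list of tokens).'''
    if nonorm:
        return s.split()
    if type(s) is not str:
        s = " ".join(s)
    # language-independent part:
    for (pattern, replace) in normalize1:
        s = re.sub(pattern, replace, s)
    s = xml.sax.saxutils.unescape(s, {'&quot;': '"'})
    # language-dependent part (assuming Western languages):
    s = " %s " % s
    if not preserve_case:
        s = s.lower()
    for (pattern, replace) in normalize2:
        s = re.sub(pattern, replace, s)
    return s.split()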


def count_ngrams(words, n=4):
    '''Counts all n-grams up to length n in a tokenized sentence.'''
    counts = {}
    for k in xrange(1, n+1):
        for i in xrange(len(words)-k+1):
            ngram = tuple(words[i:i+k])
            counts[ngram] = counts.get(ngram, 0)+1
    return counts
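
# Illustrative example: count_ngrams("the cat the".split(), 2) returns
#     {('the',): 2, ('cat',): 1, ('the', 'cat'): 1, ('cat', 'the'): 1}
# i.e. unigram and bigram counts keyed by token tuples.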


def cook_refs(refs, n=4):
    '''Takes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
    needs to know about them.'''

    refs = [normalize(ref) for ref in refs]
    maxcounts = {}
    for ref in refs:
        counts = count_ngrams(ref, n)
        for (ngram, count) in counts.iteritems():
            maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)
    return ([len(ref) for ref in refs], maxcounts)
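
# The cooked references are a pair: the token length of every reference, and a
# dict mapping each n-gram to the maximum number of times it appears in any
# single reference (the clipping counts). Illustrative example (assuming the
# normalization above): cook_refs(["the cat", "the big cat"], 2) returns
#     ([2, 3], {('the',): 1, ('cat',): 1, ('big',): 1,
#               ('the', 'cat'): 1, ('the', 'big'): 1, ('big', 'cat'): 1})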


def cook_test(test, (reflens, refmaxcounts), n=4):
    '''Takes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it.'''

    test = normalize(test)
    result = {}
    result["testlen"] = len(test)

    # Calculate the effective reference sentence length.
    if eff_ref_len == "shortest":
        result["reflen"] = min(reflens)
    elif eff_ref_len == "average":
        result["reflen"] = float(sum(reflens))/len(reflens)
    elif eff_ref_len == "closest":
        min_diff = None
        for reflen in reflens:
            if min_diff is None or abs(reflen-len(test)) < min_diff:
                min_diff = abs(reflen-len(test))
                result['reflen'] = reflen

    result["guess"] = [max(len(test)-k+1, 0) for k in xrange(1, n+1)]

    result['correct'] = [0]*n
    counts = count_ngrams(test, n)
    for (ngram, count) in counts.iteritems():
        result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram, 0), count)

    return result
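
# Shape of the cooked test entry (hypothetical numbers): for a 5-token hypothesis
# with n=4 one gets something like
#     {'testlen': 5, 'reflen': 6, 'guess': [5, 4, 3, 2], 'correct': [4, 2, 1, 0]}
# where guess[k-1] is the number of k-grams in the hypothesis and correct[k-1] is
# the clipped count of those k-grams that also occur in the references.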


def score_cooked(allcomps, n=4):
    '''Scores a list of cooked test sentences (corpus-level BLEU).'''
    totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0]*n, 'correct': [0]*n}
    for comps in allcomps:
        for key in ['testlen', 'reflen']:
            totalcomps[key] += comps[key]
        for key in ['guess', 'correct']:
            for k in xrange(n):
                totalcomps[key][k] += comps[key][k]
    logbleu = 0.0
    for k in xrange(n):
        if totalcomps['correct'][k] == 0:
            return 0.0
        logbleu += math.log(totalcomps['correct'][k]) - math.log(totalcomps['guess'][k])
    logbleu /= float(n)
    # brevity penalty
    logbleu += min(0, 1 - float(totalcomps['reflen'])/totalcomps['testlen'])
    return math.exp(logbleu)
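
# Written out, this computes BLEU = BP * exp((1/n) * sum_k log(correct_k/guess_k))
# with BP = min(1, exp(1 - reflen/testlen)). Worked example with made-up totals
# correct=[4, 2], guess=[5, 4], testlen=5, reflen=6 and n=2:
#     exp((log(4/5) + log(2/4))/2 + (1 - 6/5)) ~= 0.632 * 0.819 ~= 0.52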


def smoothed_score_cooked(allcomps, n=4):
    '''Scores a list of cooked test sentences like score_cooked(), but with
    add-one smoothing of the n-gram match counts (correct+1) and the average
    taken over n+1 terms.'''
    totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0]*n, 'correct': [0]*n}
    for comps in allcomps:
        for key in ['testlen', 'reflen']:
            totalcomps[key] += comps[key]
        for key in ['guess', 'correct']:
            for k in xrange(n):
                totalcomps[key][k] += comps[key][k]
    logbleu = 0.0
    for k in xrange(n):
        if totalcomps['correct'][k] == 0:
            return 0.0
        logbleu += math.log(totalcomps['correct'][k]+1) - math.log(totalcomps['guess'][k])
    logbleu /= float(n+1)
    # brevity penalty
    logbleu += min(0, 1 - float(totalcomps['reflen'])/totalcomps['testlen'])
    return math.exp(logbleu)
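
# Compared to score_cooked(), each numerator is incremented by one and the mean
# is taken over n+1 terms. With the same made-up totals as above (correct=[4, 2],
# guess=[5, 4], testlen=5, reflen=6, n=2):
#     exp((log(5/5) + log(3/4))/3 + (1 - 6/5)) ~= 0.909 * 0.819 ~= 0.74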


# NOTE: the names of the wrapper functions from here on are assumed; their
# original def lines are missing from this copy.
def smoothed_score_sentence(translation, references, n=4):
    """
    Provides the smoothed single-sentence BLEU score for one translation, given its references
    @param translation: Translation text that needs to be evaluated
    @type translation: str
    @param references: List of reference translations to be used for the evaluation
    @type references: [str, ...]
    """
    r = len(references)
    if r == 0:
        return 0.00
    references = cook_refs(references, n)
    test_set = cook_test(translation, references, n)
    return smoothed_score_cooked([test_set], n)


def score_sentence(translation, references, n=4):
    """
    Provides the (unsmoothed) single-sentence BLEU score for one translation, given its references
    @param translation: Translation text that needs to be evaluated
    @type translation: str
    @param references: List of reference translations to be used for the evaluation
    @type references: [str, ...]
    """
    r = len(references)
    if r == 0:
        return 0.00
    references = cook_refs(references, n)
    test_set = cook_test(translation, references, n)
    return score_cooked([test_set], n)


def score_sentences(sentence_tuples, n=4):
    """
    Provides BLEU calculation for many sentences.
    @param sentence_tuples: a list of tuples generated out of the translated sentences. Each
    tuple should contain one translated sentence and its list of references.
    @type sentence_tuples: [tuple(str(translation), [str(reference), ...]), ...]
    """
    cooked_tests = []

    for translation, references in sentence_tuples:
        r = len(references)
        if r == 0:
            continue
        cooked_references = cook_refs(references, n)
        cooked_tests.append(cook_test(translation, cooked_references, n))
    return score_cooked(cooked_tests, n)


def score_multitarget_sentences(sentence_tuples, n=4):
    """
    Provides BLEU calculation for many sentences when each source sentence comes
    with several candidate translations: the n-gram statistics of the candidates
    are collapsed element-wise (taking the minimum) into one cooked entry per
    sentence before corpus-level scoring.
    @param sentence_tuples: a list of tuples, each containing the list of candidate
    translations of one sentence and its list of references.
    @type sentence_tuples: [tuple([str(translation), ...], [str(reference), ...]), ...]
    """
    import numpy as np
    cooked_tests = []

    for translations, references in sentence_tuples:
        r = len(references)
        if r == 0:
            continue
        cooked_references = cook_refs(references, n)

        guess = {}
        correct = {}
        cooked_translations = []

        for translation in translations:
            cooked_translation = cook_test(translation, cooked_references, n)
            cooked_translations.append(cooked_translation)

            # collect, per n-gram order, the statistics of every candidate
            for i, value in enumerate(cooked_translation['correct']):
                correct.setdefault(i, []).append(value)

            for i, value in enumerate(cooked_translation['guess']):
                guess.setdefault(i, []).append(value)

        # collapse the candidates into a single cooked entry for this sentence
        avg_translation = {
            'guess': [min(values) for key, values in guess.iteritems()],
            'testlen': min([t['testlen'] for t in cooked_translations]),
            'reflen': cooked_translation['reflen'],
            'correct': [min(values) for key, values in correct.iteritems()],
        }

        cooked_tests.append(avg_translation)
    return score_cooked(cooked_tests, n)
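

# A tiny, self-contained usage demo (the sentences are made up, for illustration
# only); running the module directly prints the scores of the wrappers above.
if __name__ == '__main__':
    refs = ["The cat sat on the mat .", "There was a cat on the mat ."]
    hyp = "The cat was on the mat ."
    print "sentence BLEU          :", score_sentence(hyp, refs)
    print "smoothed sentence BLEU :", smoothed_score_sentence(hyp, refs)
    print "corpus BLEU            :", score_sentences([(hyp, refs), ("A dog sat there .", refs)])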