from languagefeaturegenerator import LanguageFeatureGenerator
from preprocessor import Tokenizer
from util.treetaggerwrapper import TreeTagger
import logging
import os

sent = False

TAGDIR = "~/taraxu_tools/treetager/"

class Hjerson(LanguageFeatureGenerator):
    """
    This is a class that wraps the Hjerson functionality on a sentence level.
    """

    def __init__(self, **kwargs):
        """
        By initializing Hjerson, we maintain a tokenizer (if needed) and a TreeTagger object
        so that they are available for sentence-level calls
        @keyword tokenize: specify whether the tokenizer should be run by Hjerson; false if tokenization has already happened
        @type tokenize: boolean
        @keyword lang: specify the language of the content, using its 2-letter ISO code
        @type lang: str
        @keyword tagdir: specify the directory where the TreeTagger bin folder exists
        @type tagdir: str
        """
        self.tokenize = kwargs.setdefault('tokenize', True)
        self.lang = kwargs.setdefault('lang', 'en')
        tagdir = kwargs.setdefault('tagdir', os.path.expanduser(TAGDIR))

        if self.tokenize:
            self.tokenizer = Tokenizer(self.lang)

        self.treetager = TreeTagger(TAGLANG=self.lang,
                                    TAGDIR=tagdir,
                                    )

        self.totalHypLength = 0.0
        self.totalWerRefLength = 0.0

        self.totalWerCount = 0.0
        self.totalRperCount = 0.0
        self.totalHperCount = 0.0

        self.totalInflRperCount = 0.0
        self.totalInflHperCount = 0.0
        self.totalMissCount = 0.0
        self.totalExtCount = 0.0
        self.totalRefLexCount = 0.0
        self.totalHypLexCount = 0.0
        self.totalRefReordCount = 0.0
        self.totalHypReordCount = 0.0

        self.totalBlockInflRperCount = 0.0
        self.totalBlockInflHperCount = 0.0
        self.totalBlockMissCount = 0.0
        self.totalBlockExtCount = 0.0
        self.totalRefBlockLexCount = 0.0
        self.totalHypBlockLexCount = 0.0
        self.totalRefBlockReordCount = 0.0
        self.totalHypBlockReordCount = 0.0
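
    # Hedged construction sketch (illustration only): the keyword arguments
    # documented in the __init__ docstring are the only configuration knobs
    # assumed here, and the tagdir value below is a made-up path.
    #
    #   hjerson = Hjerson(lang="de", tokenize=True, tagdir="/opt/treetagger/")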

    def get_features_simplesentence(self, simplesentence, parallelsentence):
        """
        Override the language feature generator function in order to return sentence-level error classes
        @param simplesentence: a simple sentence object, containing the target sentence
        @type simplesentence: L{sentence.sentence.SimpleSentence}
        @param parallelsentence: a parallel sentence object, which is needed to derive the reference
        @type parallelsentence: L{sentence.parallelsentence.ParallelSentence}
        @return: a dictionary with the attributes retrieved
        @rtype: {str: object, ... }
        """
        target_string = simplesentence.get_string()
        ref_string = parallelsentence.ref.get_string()

        return self.get_features_strings(target_string, [ref_string])

    def _tag(self, string):
        strings_tagged = self.treetager.TagText(string, encoding='utf-8')
        tokens = []
        tags = []
        bases = []
        for string_tagged in strings_tagged:
            try:
                token, tag, base = string_tagged.split("\t")
            except ValueError:
                try:
                    token, tag = string_tagged.split("\t")
                    base = token
                except ValueError:
                    token = string_tagged
                    base = token
                    tag = "NaN"
            tokens.append(token)
            tags.append(tag)
            bases.append(base)

        results = (" ".join(tokens), " ".join(tags), " ".join(bases))
        if (len(results[0].split()) != len(results[1].split())
                or len(results[1].split()) != len(results[2].split())
                or len(results[0].split()) != len(results[2].split())):
            logging.debug("{}".format(results))
        return results
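
    # Hedged note on the tagger output parsed above (an assumption about the
    # wrapped TreeTagger tool, not asserted by this module): TagText() is
    # expected to yield one "token<TAB>POS-tag<TAB>lemma" string per token, so
    # for an English input such as "cats sleep" the triple returned by _tag()
    # would look roughly like ("cats sleep", "NNS VVP", "cat sleep"), i.e.
    # space-joined tokens, tags and base forms of equal length. The fallback
    # branches cover lines where the tag or the lemma is missing.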

    def get_features_strings(self, target_string, references):
        """
        Process one sentence, given the translated sentence (hypothesis) and the corresponding reference
        @param target_string: the translation hypothesis produced by the system
        @type target_string: str
        @param references: a list of strings, containing the correct translations
        @type references: list(str)
        """
        if self.tokenize:
            target_string = self.tokenizer.process_string(target_string)
            references = [self.tokenizer.process_string(reference) for reference in references]

        target_string, target_tag, target_base = self._tag(target_string)

        reference_tuples = [self._tag(reference) for reference in references]
        reference_strings = [r[0] for r in reference_tuples]
        reference_tags = [r[1] for r in reference_tuples]
        reference_bases = [r[2] for r in reference_tuples]

        return self.analyze(target_string, target_base, target_tag, reference_strings, reference_bases, reference_tags)
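
    # Hedged usage sketch (mirrors the __main__ block at the bottom of this
    # file; the key list is taken from the dictionary built in analyze()):
    #
    #   h = Hjerson(lang="en")
    #   scores = h.get_features_strings("the cats sleeps", ["the cat sleeps"])
    #   # scores is a flat dict of sentence-level error rates and counts with
    #   # keys such as 'wer', 'hper', 'rper', 'iHper', 'iRper', 'missErr',
    #   # 'extErr', 'rLexErr', 'hLexErr', 'rRer', 'hRer', their block variants
    #   # and 'refLength'.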

    def analyze(self, hline, basehline, addhline, refs, baserefs, addrefs):
        """
        This function hosts the core sentence-level functionality of Hjerson, as written originally
        by Maja Popovic. It operates after all sentence-level strings have been retrieved and passed as
        parameters.
        @param hline: the tokenized translation hypothesis
        @param basehline: the base forms (lemmas) of the hypothesis tokens
        @param addhline: the POS tags of the hypothesis tokens
        @param refs: the tokenized reference translations
        @param baserefs: the base forms (lemmas) of each reference
        @param addrefs: the POS tags of each reference
        """
        p = (0, 0)

        Q = {}
        Q[p] = 0

        B = {}
        B[p] = levNode(0, 0, 0)

        minSentWer = 1000
        bestWerRefLength = 0.0
        bestWerRefIndex = -1
        bestWerRefErrors = []
        bestWerHypErrors = []
        bestWerRefWords = []
        bestWerHypWords = []
        bestWerRefAdd = []
        bestWerHypAdd = []

        bestSentWer = 0.0

        maxLength = []

        hypWords = hline.split()
        addhypWords = addhline.split()
        if len(addhypWords) < len(hypWords):
            addhypWords = [""] * len(hypWords)
        baseHypWords = basehline.split()

        self.totalHypLength += len(hypWords)

        hyp = {}
        addhyp = {}

        # map the hypothesis words and their tags to 1-based index dictionaries
        adjust_indices(hypWords, hyp, addhypWords, addhyp)

        nref = 0

        for reference in refs:
            ir = refs.index(reference)
            refWords = reference.split()
            addrefWords = addrefs[ir].split()
            if len(addrefWords) < len(refWords):
                addrefWords = [""] * len(refWords)
            baseRefWords = baserefs[ir].split()

            ref = {}
            addref = {}

            adjust_indices(refWords, ref, addrefWords, addref)

            if len(refWords) > len(hypWords):
                maxLength.append(len(refWords))
            else:
                maxLength.append(len(hypWords))

            for nh in range(0, len(hyp)+1):
                p = (0, nh)
                Q[p] = nh
                B[p] = levNode(0, nh-1, 3)

            for nr in range(0, len(ref)+1):
                p = (nr, 0)
                Q[p] = nr
                B[p] = levNode(nr-1, 0, 2)

            p = (0, 0)
            B[p] = levNode(-1, -1, -1)

            p = (1, 0)
            B[p] = levNode(0, 0, 2)

            p = (0, 1)
            B[p] = levNode(0, 0, 3)

            for r in ref.keys():
                for h in hyp.keys():
                    minQ = 1000
                    p = (r, h)
                    dp = (r-1, h)
                    ip = (r, h-1)
                    sp = (r-1, h-1)

                    s = 0
                    if hyp[h] != ref[r]:
                        s = 1
                    else:
                        s = 0

                    if Q[sp]+s < minQ:
                        minQ = Q[sp]+s
                        B[p] = levNode(r-1, h-1, s)

                    if Q[dp]+1 < minQ:
                        minQ = Q[dp]+1
                        B[p] = levNode(r-1, h, 2)

                    if Q[ip]+1 < minQ:
                        minQ = Q[ip]+1
                        B[p] = levNode(r, h-1, 3)

                    Q[p] = minQ

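            # Hedged worked example of the recurrence above (illustration only):
            # for refWords = ["a", "b"] and hypWords = ["a", "c"], the final
            # cell Q[(2, 2)] ends up as 1, i.e. one substitution ("b" -> "c"),
            # and the backpointers in B reproduce that path when walked below.
            # Error codes stored in levNode.error: 0 = match, 1 = substitution,
            # 2 = deletion, 3 = insertion.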

            sentWerCount = 0.0
            sentSubCount = 0.0
            sentDelCount = 0.0
            sentInsCount = 0.0

            l = maxLength[nref]
            werRefWords = []
            werHypWords = []
            werRefErrors = []
            werHypErrors = []
            werRefAdd = []
            werHypAdd = []

            p = (len(refWords), len(hypWords))

            err = B[p].error

            if err != 0:
                if err == 1:
                    wer_errors(len(refWords), werRefWords, werRefAdd, werRefErrors, ref, addref, "sub")
                    wer_errors(len(hypWords), werHypWords, werHypAdd, werHypErrors, hyp, addhyp, "sub")
                    sentSubCount += 1
                elif err == 2:
                    wer_errors(len(refWords), werRefWords, werRefAdd, werRefErrors, ref, addref, "del")
                    sentDelCount += 1
                elif err == 3:
                    wer_errors(len(hypWords), werHypWords, werHypAdd, werHypErrors, hyp, addhyp, "ins")
                    sentInsCount += 1
            else:
                wer_errors(len(refWords), werRefWords, werRefAdd, werRefErrors, ref, addref, "x")
                wer_errors(len(hypWords), werHypWords, werHypAdd, werHypErrors, hyp, addhyp, "x")

            rp = B[p].rpos
            hp = B[p].hpos

            while hp >= 0 and rp >= 0:
                p1 = (rp, hp)
                err = B[p1].error

                if err != 0:
                    if err == 1:
                        wer_errors(rp, werRefWords, werRefAdd, werRefErrors, ref, addref, "sub")
                        wer_errors(hp, werHypWords, werHypAdd, werHypErrors, hyp, addhyp, "sub")
                        sentSubCount += 1
                    elif err == 2:
                        wer_errors(rp, werRefWords, werRefAdd, werRefErrors, ref, addref, "del")
                        sentDelCount += 1
                    elif err == 3:
                        wer_errors(hp, werHypWords, werHypAdd, werHypErrors, hyp, addhyp, "ins")
                        sentInsCount += 1
                else:
                    wer_errors(rp, werRefWords, werRefAdd, werRefErrors, ref, addref, "x")
                    wer_errors(hp, werHypWords, werHypAdd, werHypErrors, hyp, addhyp, "x")

                l -= 1

                hp = B[p1].hpos
                rp = B[p1].rpos

            sentWerCount = sentSubCount + sentDelCount + sentInsCount
            try:
                sentWer = sentWerCount/len(refWords)
            except ZeroDivisionError:
                logging.warn("Division by zero when calculating sentence WER")
                sentWer = float("Inf")
            if sentWer < minSentWer:
                minSentWer = sentWer
                bestWerRefIndex = ir
                bestWerRefLength = len(refWords)
                bestWerRefErrors = werRefErrors
                bestWerHypErrors = werHypErrors
                bestWerRefWords = werRefWords
                bestWerBaseRefWords = baseRefWords
                bestWerHypWords = werHypWords
                bestWerRefAdd = werRefAdd
                bestWerHypAdd = werHypAdd
                bestSentWer = sentWerCount

            nref += 1

            Q.clear()
            B.clear()

        self.totalWerRefLength += bestWerRefLength
        self.totalWerCount += bestSentWer

        bestWerRefErrors.reverse()
        bestWerHypErrors.reverse()
        bestWerRefWords.reverse()
        bestWerHypWords.reverse()
        bestWerRefAdd.reverse()
        bestWerHypAdd.reverse()

        refWords = refs[bestWerRefIndex].split()

        baseRefWords = baserefs[bestWerRefIndex].split()

        if len(hypWords) == 0:
            hLen = 0.00000001
        else:
            hLen = len(hypWords)

        hperErrors = []
        sentHperCount = 0.0
        sentInflHperCount = 0.0

        hperErrors, sentHperCount, sentInflHperCount = hyp_ref_errors(refs[bestWerRefIndex], baserefs[bestWerRefIndex], hypWords, baseHypWords, "herr")

        sentHper = sentHperCount/hLen
        sentInflHper = sentInflHperCount/hLen

        rperErrors = []
        sentRperCount = 0.0
        sentInflRperCount = 0.0

        rperErrors, sentRperCount, sentInflRperCount = hyp_ref_errors(hline, basehline, refWords, baseRefWords, "rerr")

        try:
            sentRper = sentRperCount/len(refWords)
            sentInflRper = sentInflRperCount/len(refWords)
        except ZeroDivisionError:
            logging.warn("Division by zero when calculating sentence Rper and sentInflRper")
            sentRper = float("Inf")
            sentInflRper = float("Inf")

        self.totalHperCount += sentHperCount
        self.totalRperCount += sentRperCount
        self.totalInflRperCount += sentInflRperCount
        self.totalInflHperCount += sentInflHperCount

        refErrorCats = []
        hypErrorCats = []

        sentMissCount = 0.0
        sentExtCount = 0.0
        sentRefLexCount = 0.0
        sentHypLexCount = 0.0
        sentRefReordCount = 0.0
        sentHypReordCount = 0.0

        sentBlockInflRperCount = 0.0
        sentBlockInflHperCount = 0.0
        sentBlockMissCount = 0.0
        sentBlockExtCount = 0.0
        sentRefBlockLexCount = 0.0
        sentHypBlockLexCount = 0.0
        sentRefBlockReordCount = 0.0
        sentHypBlockReordCount = 0.0

        refErrorCats, sentMissCount, sentRefLexCount = miss_ext_lex(bestWerRefErrors, bestWerRefWords, rperErrors, refErrorCats, sentMissCount, sentRefLexCount, "miss")

        hypErrorCats, sentExtCount, sentHypLexCount = miss_ext_lex(bestWerHypErrors, bestWerHypWords, hperErrors, hypErrorCats, sentExtCount, sentHypLexCount, "ext")

        hypErrorCats, sentHypReordCount = reord(bestWerRefErrors, bestWerRefWords, bestWerHypErrors, bestWerHypWords, hypErrorCats, sentHypReordCount)

        refErrorCats, sentRefReordCount = reord(bestWerHypErrors, bestWerHypWords, bestWerRefErrors, bestWerRefWords, refErrorCats, sentRefReordCount)

        sentBlockInflRperCount = block_count(refErrorCats, "infl", sentBlockInflRperCount)
        sentBlockInflHperCount = block_count(hypErrorCats, "infl", sentBlockInflHperCount)
        sentBlockMissCount = block_count(refErrorCats, "miss", sentBlockMissCount)
        sentBlockExtCount = block_count(hypErrorCats, "ext", sentBlockExtCount)
        sentRefBlockReordCount = block_count(refErrorCats, "reord", sentRefBlockReordCount)
        sentHypBlockReordCount = block_count(hypErrorCats, "reord", sentHypBlockReordCount)
        sentRefBlockLexCount = block_count(refErrorCats, "lex", sentRefBlockLexCount)
        sentHypBlockLexCount = block_count(hypErrorCats, "lex", sentHypBlockLexCount)

        self.totalMissCount += sentMissCount
        self.totalExtCount += sentExtCount
        self.totalRefLexCount += sentRefLexCount
        self.totalHypLexCount += sentHypLexCount
        self.totalRefReordCount += sentRefReordCount
        self.totalHypReordCount += sentHypReordCount

        self.totalBlockInflRperCount += sentBlockInflRperCount
        self.totalBlockInflHperCount += sentBlockInflHperCount
        self.totalBlockMissCount += sentBlockMissCount
        self.totalBlockExtCount += sentBlockExtCount
        self.totalRefBlockReordCount += sentRefBlockReordCount
        self.totalHypBlockReordCount += sentHypBlockReordCount
        self.totalRefBlockLexCount += sentRefBlockLexCount
        self.totalHypBlockLexCount += sentHypBlockLexCount

        res = {}

        res['wer'] = 100*minSentWer
        res['hper'] = 100*sentHper
        res['rper'] = 100*sentRper

        res['iHper'] = 100*sentInflHper
        res['iRper'] = 100*sentInflRper

        try:
            res['missErr'] = 100*sentMissCount/bestWerRefLength
            res['rLexErr'] = 100*sentRefLexCount/bestWerRefLength
            res['rRer'] = 100*sentRefReordCount/bestWerRefLength
            res['biRper'] = 100*sentBlockInflRperCount/bestWerRefLength
            res['rbRer'] = 100*sentRefBlockReordCount/bestWerRefLength
            res['bmissErr'] = 100*sentBlockMissCount/bestWerRefLength
            res['rbLexErr'] = 100*sentRefBlockLexCount/bestWerRefLength
        except ZeroDivisionError:
            logging.warn("Division by zero when calculating missErr, rLexErr, rRer, biRper, rbRer, bmissErr, rbLexErr")
            for metricname in ['missErr', 'rLexErr', 'rRer', 'biRper', 'rbRer', 'bmissErr', 'rbLexErr']:
                res[metricname] = float("Inf")

        try:
            res['extErr'] = 100*sentExtCount/hLen
            res['hLexErr'] = 100*sentHypLexCount/hLen
            res['hRer'] = 100*sentHypReordCount/hLen
            res['biHper'] = 100*sentBlockInflHperCount/hLen
            res['hbRer'] = 100*sentHypBlockReordCount/hLen
            res['bextErr'] = 100*sentBlockExtCount/hLen
            res['hbLexErr'] = 100*sentHypBlockLexCount/hLen
        except ZeroDivisionError:
            logging.warn("Division by zero when calculating extErr, hLexErr, hRer, biHper, hbRer, bextErr, hbLexErr")
            for metricname in ['extErr', 'hLexErr', 'hRer', 'biHper', 'hbRer', 'bextErr', 'hbLexErr']:
                res[metricname] = float("Inf")

        res['aMissErr'] = sentMissCount
        res['aExtErr'] = sentExtCount
        res['arLexErr'] = sentRefLexCount
        res['arRer'] = sentRefReordCount

        res["refLength"] = bestWerRefLength
        res['TER'] = (sentMissCount + sentExtCount + sentRefLexCount + sentRefReordCount)*1.00/bestWerRefLength
        return res

    def get_total_scores(self):
        # method name assumed (the original def line is missing from this
        # copy); aggregates corpus-level error rates from the running totals
        # accumulated by analyze()
        self.totalWer = 100*self.totalWerCount/self.totalWerRefLength
        self.totalHper = 100*self.totalHperCount/self.totalHypLength
        self.totalRper = 100*self.totalRperCount/self.totalWerRefLength

        self.totalInflHper = 100*self.totalInflHperCount/self.totalHypLength
        self.totalInflRper = 100*self.totalInflRperCount/self.totalWerRefLength
        self.totalMissErr = 100*self.totalMissCount/self.totalWerRefLength
        self.totalExtErr = 100*self.totalExtCount/self.totalHypLength
        self.totalrLexErr = 100*self.totalRefLexCount/self.totalWerRefLength
        self.totalhLexErr = 100*self.totalHypLexCount/self.totalHypLength
        self.totalrRer = 100*self.totalRefReordCount/self.totalWerRefLength
        self.totalhRer = 100*self.totalHypReordCount/self.totalHypLength

        self.totalbiHper = 100*self.totalBlockInflHperCount/self.totalHypLength
        self.totalbiRper = 100*self.totalBlockInflRperCount/self.totalWerRefLength
        self.totalrbRer = 100*self.totalRefBlockReordCount/self.totalWerRefLength
        self.totalhbRer = 100*self.totalHypBlockReordCount/self.totalHypLength
        self.totalbmissErr = 100*self.totalBlockMissCount/self.totalWerRefLength
        self.totalbextErr = 100*self.totalBlockExtCount/self.totalHypLength
        self.totalrbLexErr = 100*self.totalRefBlockLexCount/self.totalWerRefLength
        self.totalhbLexErr = 100*self.totalHypBlockLexCount/self.totalHypLength

class BinaryHjerson(Hjerson):
    def analyze(self, hline, basehline, addhline, refs, baserefs, addrefs):
        features = super(BinaryHjerson, self).analyze(hline, basehline, addhline, refs, baserefs, addrefs)
        newfeatures = {}
        for name, value in features.iteritems():
            if value > 0:
                newfeatures[name] = 1
            else:
                newfeatures[name] = 0
        return newfeatures
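
    # Hedged illustration (not from the original source): analyze() above
    # collapses the numeric rates returned by Hjerson.analyze() into binary
    # indicators, e.g. {'wer': 25.0, 'missErr': 0.0} -> {'wer': 1, 'missErr': 0}.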


class levNode:
    # backpointer node of the Levenshtein matrix: position in the reference
    # (rpos), position in the hypothesis (hpos) and the error code of the step
    def __init__(self, rpos=0, hpos=0, error=0):
        self.rpos = rpos
        self.hpos = hpos
        self.error = error


def get_addwords(addtext, addline, words):
    # function name assumed (the original def line is missing from this copy);
    # returns the additional (POS) tokens of addline, or empty strings when
    # there is no additional text
    if addtext:
        addwords = addline.split()
    else:
        addwords = ["" for x in range(len(words))]
    return addwords

def adjust_indices(words, adjwords, addwords, adjaddwords):
    # fill the adjwords/adjaddwords dictionaries with 1-based indices
    i = 1
    while i <= len(words):
        adjwords[i] = words[i-1]
        adjaddwords[i] = addwords[i-1]
        i += 1
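
# Hedged usage sketch (illustration only, matching how analyze() calls it):
#   hyp, addhyp = {}, {}
#   adjust_indices(["the", "cat"], hyp, ["DT", "NN"], addhyp)
#   # hyp    == {1: "the", 2: "cat"}
#   # addhyp == {1: "DT", 2: "NN"}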


def wer_errors(index, werwords, weradd, wererr, words, add, error):
    werwords.append(words[index])
    weradd.append(add[index])
    wererr.append(error)

def hyp_ref_errors(rline, rbaseline, hwords, hbases, error):
    rwords = rline.split()
    logging.debug("{}\t{}".format(len(hwords), hwords))
    logging.debug("{}\t{}".format(len(hbases), hbases))
    rbases = rbaseline.split()
    errors = []
    errorcount = 0.0
    inflerrorcount = 0.0

    for ihw, hw in enumerate(hwords):
        if hw in rwords:
            errors.append("x")
            n = rwords.index(hw)
            del rwords[n]
            del rbases[n]
        else:
            errors.append(error)
            errorcount += 1

    for ihb, hb in enumerate(hbases):
        if hb in rbases:
            if errors[ihb] == error:
                errors[ihb] = "i"+error
                n = rbases.index(hb)
                del rbases[n]
                inflerrorcount += 1

    return errors, errorcount, inflerrorcount
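
# Hedged worked example (illustration only): called with error="herr",
# hwords=["cats", "sat"], hbases=["cat", "sit"], rline="the cat sat" and
# rbaseline="the cat sit", the surface form "cats" is missing from the
# reference and is first labelled "herr"; because its base form "cat" does
# match a reference base, the label is relaxed to the inflectional error
# "iherr", and the function returns (["iherr", "x"], 1.0, 1.0).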


def miss_ext_lex(wererrors, werwords, pererrors, errcats, misextcount, lexcount, misext):
    i = 0
    while i < len(wererrors):
        refWerWord = werwords[i]
        refWerError = wererrors[i]
        rperError = pererrors[i]
        if rperError == "irerr" or rperError == "iherr":
            errcats.append("infl")
        elif rperError == "rerr" or rperError == "herr":
            if refWerError == "del" or refWerError == "ins":
                errcats.append(misext)
                misextcount += 1
            elif refWerError == "sub":
                errcats.append("lex")
                lexcount += 1
            else:
                errcats.append("x")
        else:
            errcats.append("x")
        i += 1

    return errcats, misextcount, lexcount

def reord(werreferrors, werrefwords, werhyperrors, werhypwords, hyperrcats, hypcount):
    referr = []
    i = 0
    while i < len(werreferrors):
        if werreferrors[i] != "x":
            referr.append(werrefwords[i])
        i += 1

    i = 0
    while i < len(werhyperrors):
        hypWerWord = werhypwords[i]
        hypWerError = werhyperrors[i]
        if hypWerError == "ins" or hypWerError == "del" or hypWerError == "sub":
            if hypWerWord in referr:
                hyperrcats[i] = "reord"
                hypcount += 1
                n = referr.index(hypWerWord)
                del referr[n]
        i += 1

    return hyperrcats, hypcount

def block_count(errcats, errcat, blockcount):
    i = 0
    newblock = True
    while i < len(errcats):
        cat = errcats[i]
        if cat == errcat:
            if newblock == True:
                blockcount += 1
                newblock = False
        else:
            newblock = True

        i += 1

    return blockcount
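
# Hedged worked example (illustration only): consecutive occurrences of the
# same category count as a single block, so
#   block_count(["lex", "lex", "x", "lex"], "lex", 0)  ->  2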


def write_error_rates(text, errorname, errorcount, errorrate):
    # function name assumed (the original def line is missing from this copy)
    text.write(errorname+"\t"+str("%.0f" % errorcount)+"\t"+str("%.2f" % errorrate)+"\n")

def write_error_words(text, title, words, add, errors, addtext):
    # function name and parameter order assumed (the original def line is
    # missing from this copy)
    text.write(title)
    for nr, r in enumerate(errors):
        if addtext:
            text.write(words[nr]+"#"+add[nr]+"~~"+r+" ")
        else:
            text.write(words[nr]+"~~"+r+" ")

    text.write("\n")

if __name__ == '__main__':
    h = Hjerson(lang="en")
    hyp = 'En lugar de ello , es algo tan terrible como " un condenado estrangulado en secreto " .'
    ref = 'En lugar de ello , es terriblemente como " un condenado estrangulados en secreto . "'
    print h.get_features_strings(hyp, [ref])