from languagefeaturegenerator import LanguageFeatureGenerator
from preprocessor import Tokenizer
from util.treetaggerwrapper import TreeTagger
import logging
import os

sent = False

TAGDIR = "~/taraxu_tools/treetager/"

class Hjerson(LanguageFeatureGenerator):
    """
    This is a class that wraps the Hjerson functionality on a sentence level.
    """

    def __init__(self, **kwargs):
        """
        By initializing Hjerson, we maintain a tokenizer (if needed) and a TreeTagger object
        so that they are available for sentence-level calls
        @keyword tokenize: specify whether the tokenizer should be run by Hjerson; false if tokenization has already happened
        @type tokenize: boolean
        @keyword lang: specify the language of the content, using its 2-letter ISO code
        @type lang: str
        @keyword tagdir: specify the directory where the TreeTagger bin folder exists
        @type tagdir: str
        """
        self.tokenize = kwargs.setdefault('tokenize', True)
        self.lang = kwargs.setdefault('lang', 'en')
        tagdir = kwargs.setdefault('tagdir', os.path.expanduser(TAGDIR))

        if self.tokenize:
            self.tokenizer = Tokenizer(self.lang)

        self.treetager = TreeTagger(TAGLANG=self.lang,
                                    TAGDIR=tagdir,
                                    )

        self.totalHypLength = 0.0
        self.totalWerRefLength = 0.0

        self.totalWerCount = 0.0
        self.totalRperCount = 0.0
        self.totalHperCount = 0.0

        self.totalInflRperCount = 0.0
        self.totalInflHperCount = 0.0
        self.totalMissCount = 0.0
        self.totalExtCount = 0.0
        self.totalRefLexCount = 0.0
        self.totalHypLexCount = 0.0
        self.totalRefReordCount = 0.0
        self.totalHypReordCount = 0.0

        self.totalBlockInflRperCount = 0.0
        self.totalBlockInflHperCount = 0.0
        self.totalBlockMissCount = 0.0
        self.totalBlockExtCount = 0.0
        self.totalRefBlockLexCount = 0.0
        self.totalHypBlockLexCount = 0.0
        self.totalRefBlockReordCount = 0.0
        self.totalHypBlockReordCount = 0.0
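
    # Hedged construction sketch (illustration only): the keyword arguments
    # documented in the __init__ docstring are the only configuration knobs
    # assumed here, and the tagdir value below is a made-up path.
    #
    #   hjerson = Hjerson(lang="de", tokenize=True, tagdir="/opt/treetagger/")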

    def get_features_simplesentence(self, simplesentence, parallelsentence):
        """
        Override the language feature generator function in order to return sentence-level error classes
        @param simplesentence: a simple sentence object, containing the target sentence
        @type simplesentence: L{sentence.sentence.SimpleSentence}
        @param parallelsentence: a parallel sentence object, which is needed to derive the reference
        @type parallelsentence: L{sentence.parallelsentence.ParallelSentence}
        @return: a dictionary with the attributes retrieved
        @rtype: {str: object, ... }
        """
        target_string = simplesentence.get_string()
        ref_string = parallelsentence.ref.get_string()

        return self.get_features_strings(target_string, [ref_string])

    def _tag(self, string):
        strings_tagged = self.treetager.TagText(string, encoding='utf-8')
        tokens = []
        tags = []
        bases = []
        for string_tagged in strings_tagged:
            try:
                token, tag, base = string_tagged.split("\t")
            except ValueError:
                try:
                    token, tag = string_tagged.split("\t")
                    base = token
                except ValueError:
                    token = string_tagged
                    base = token
                    tag = "NaN"
            tokens.append(token)
            tags.append(tag)
            bases.append(base)

        results = (" ".join(tokens), " ".join(tags), " ".join(bases))
        if (len(results[0].split()) != len(results[1].split())
                or len(results[1].split()) != len(results[2].split())
                or len(results[0].split()) != len(results[2].split())):
            logging.debug("{}".format(results))
        return results
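
    # Hedged note on the tagger output parsed above (an assumption about the
    # wrapped TreeTagger tool, not asserted by this module): TagText() is
    # expected to yield one "token<TAB>POS-tag<TAB>lemma" string per token, so
    # for an English input such as "cats sleep" the triple returned by _tag()
    # would look roughly like ("cats sleep", "NNS VVP", "cat sleep"), i.e.
    # space-joined tokens, tags and base forms of equal length. The fallback
    # branches cover lines where the tag or the lemma is missing.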

    def get_features_strings(self, target_string, references):
        """
        Process one sentence, given the translated sentence (hypothesis) and the corresponding reference
        @param target_string: the translation hypothesis produced by the system
        @type target_string: str
        @param references: a list of strings, containing the correct translations
        @type references: list(str)
        """
        if self.tokenize:
            target_string = self.tokenizer.process_string(target_string)
            references = [self.tokenizer.process_string(reference) for reference in references]

        target_string, target_tag, target_base = self._tag(target_string)

        reference_tuples = [self._tag(reference) for reference in references]
        reference_strings = [r[0] for r in reference_tuples]
        reference_tags = [r[1] for r in reference_tuples]
        reference_bases = [r[2] for r in reference_tuples]

        return self.analyze(target_string, target_base, target_tag, reference_strings, reference_bases, reference_tags)
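
    # Hedged usage sketch (mirrors the __main__ block at the bottom of this
    # file; the key list is taken from the dictionary built in analyze()):
    #
    #   h = Hjerson(lang="en")
    #   scores = h.get_features_strings("the cats sleeps", ["the cat sleeps"])
    #   # scores is a flat dict of sentence-level error rates and counts with
    #   # keys such as 'wer', 'hper', 'rper', 'iHper', 'iRper', 'missErr',
    #   # 'extErr', 'rLexErr', 'hLexErr', 'rRer', 'hRer', their block variants
    #   # and 'refLength'.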

    def analyze(self, hline, basehline, addhline, refs, baserefs, addrefs):
        """
        This function hosts the core sentence-level functionality of Hjerson, as written originally
        by Maja Popovic. It operates after all sentence-level strings have been retrieved and passed as
        parameters.
        @param hline: the tokenized translation hypothesis
        @param basehline: the base forms (lemmas) of the hypothesis tokens
        @param addhline: the POS tags of the hypothesis tokens
        @param refs: the tokenized reference translations
        @param baserefs: the base forms (lemmas) of each reference
        @param addrefs: the POS tags of each reference
        """
        p = (0, 0)

        Q = {}
        Q[p] = 0

        B = {}
        B[p] = levNode(0, 0, 0)

        minSentWer = 1000
        bestWerRefLength = 0.0
        bestWerRefIndex = -1
        bestWerRefErrors = []
        bestWerHypErrors = []
        bestWerRefWords = []
        bestWerHypWords = []
        bestWerRefAdd = []
        bestWerHypAdd = []

        bestSentWer = 0.0

        maxLength = []

        hypWords = hline.split()
        addhypWords = addhline.split()
        if len(addhypWords) < len(hypWords):
            addhypWords = [""] * len(hypWords)
        baseHypWords = basehline.split()

        self.totalHypLength += len(hypWords)

        hyp = {}
        addhyp = {}

        # map the hypothesis words and their tags to 1-based index dictionaries
        adjust_indices(hypWords, hyp, addhypWords, addhyp)

        nref = 0

        for reference in refs:
            ir = refs.index(reference)
            refWords = reference.split()
            addrefWords = addrefs[ir].split()
            if len(addrefWords) < len(refWords):
                addrefWords = [""] * len(refWords)
            baseRefWords = baserefs[ir].split()

            ref = {}
            addref = {}

            adjust_indices(refWords, ref, addrefWords, addref)

            if len(refWords) > len(hypWords):
                maxLength.append(len(refWords))
            else:
                maxLength.append(len(hypWords))

            for nh in range(0, len(hyp)+1):
                p = (0, nh)
                Q[p] = nh
                B[p] = levNode(0, nh-1, 3)

            for nr in range(0, len(ref)+1):
                p = (nr, 0)
                Q[p] = nr
                B[p] = levNode(nr-1, 0, 2)

            p = (0, 0)
            B[p] = levNode(-1, -1, -1)

            p = (1, 0)
            B[p] = levNode(0, 0, 2)

            p = (0, 1)
            B[p] = levNode(0, 0, 3)

            for r in ref.keys():
                for h in hyp.keys():
                    minQ = 1000
                    p = (r, h)
                    dp = (r-1, h)
                    ip = (r, h-1)
                    sp = (r-1, h-1)

                    s = 0
                    if hyp[h] != ref[r]:
                        s = 1
                    else:
                        s = 0

                    if Q[sp]+s < minQ:
                        minQ = Q[sp]+s
                        B[p] = levNode(r-1, h-1, s)

                    if Q[dp]+1 < minQ:
                        minQ = Q[dp]+1
                        B[p] = levNode(r-1, h, 2)

                    if Q[ip]+1 < minQ:
                        minQ = Q[ip]+1
                        B[p] = levNode(r, h-1, 3)

                    Q[p] = minQ

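            # Hedged worked example of the recurrence above (illustration only):
            # for refWords = ["a", "b"] and hypWords = ["a", "c"], the final
            # cell Q[(2, 2)] ends up as 1, i.e. one substitution ("b" -> "c"),
            # and the backpointers in B reproduce that path when walked below.
            # Error codes stored in levNode.error: 0 = match, 1 = substitution,
            # 2 = deletion, 3 = insertion.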

            sentWerCount = 0.0
            sentSubCount = 0.0
            sentDelCount = 0.0
            sentInsCount = 0.0

            l = maxLength[nref]
            werRefWords = []
            werHypWords = []
            werRefErrors = []
            werHypErrors = []
            werRefAdd = []
            werHypAdd = []

            p = (len(refWords), len(hypWords))

            err = B[p].error

            if err != 0:
                if err == 1:
                    wer_errors(len(refWords), werRefWords, werRefAdd, werRefErrors, ref, addref, "sub")
                    wer_errors(len(hypWords), werHypWords, werHypAdd, werHypErrors, hyp, addhyp, "sub")
                    sentSubCount += 1
                elif err == 2:
                    wer_errors(len(refWords), werRefWords, werRefAdd, werRefErrors, ref, addref, "del")
                    sentDelCount += 1
                elif err == 3:
                    wer_errors(len(hypWords), werHypWords, werHypAdd, werHypErrors, hyp, addhyp, "ins")
                    sentInsCount += 1
            else:
                wer_errors(len(refWords), werRefWords, werRefAdd, werRefErrors, ref, addref, "x")
                wer_errors(len(hypWords), werHypWords, werHypAdd, werHypErrors, hyp, addhyp, "x")

            rp = B[p].rpos
            hp = B[p].hpos

            while hp >= 0 and rp >= 0:
                p1 = (rp, hp)
                err = B[p1].error

                if err != 0:
                    if err == 1:
                        wer_errors(rp, werRefWords, werRefAdd, werRefErrors, ref, addref, "sub")
                        wer_errors(hp, werHypWords, werHypAdd, werHypErrors, hyp, addhyp, "sub")
                        sentSubCount += 1
                    elif err == 2:
                        wer_errors(rp, werRefWords, werRefAdd, werRefErrors, ref, addref, "del")
                        sentDelCount += 1
                    elif err == 3:
                        wer_errors(hp, werHypWords, werHypAdd, werHypErrors, hyp, addhyp, "ins")
                        sentInsCount += 1
                else:
                    wer_errors(rp, werRefWords, werRefAdd, werRefErrors, ref, addref, "x")
                    wer_errors(hp, werHypWords, werHypAdd, werHypErrors, hyp, addhyp, "x")

                l -= 1

                hp = B[p1].hpos
                rp = B[p1].rpos

            sentWerCount = sentSubCount + sentDelCount + sentInsCount
            try:
                sentWer = sentWerCount/len(refWords)
            except ZeroDivisionError:
                logging.warn("Division by zero when calculating sentence WER")
                sentWer = float("Inf")
            if sentWer < minSentWer:
                minSentWer = sentWer
                bestWerRefIndex = ir
                bestWerRefLength = len(refWords)
                bestWerRefErrors = werRefErrors
                bestWerHypErrors = werHypErrors
                bestWerRefWords = werRefWords
                bestWerBaseRefWords = baseRefWords
                bestWerHypWords = werHypWords
                bestWerRefAdd = werRefAdd
                bestWerHypAdd = werHypAdd
                bestSentWer = sentWerCount

            nref += 1

            Q.clear()
            B.clear()

        self.totalWerRefLength += bestWerRefLength
        self.totalWerCount += bestSentWer

        bestWerRefErrors.reverse()
        bestWerHypErrors.reverse()
        bestWerRefWords.reverse()
        bestWerHypWords.reverse()
        bestWerRefAdd.reverse()
        bestWerHypAdd.reverse()

        refWords = refs[bestWerRefIndex].split()

        baseRefWords = baserefs[bestWerRefIndex].split()

        if len(hypWords) == 0:
            hLen = 0.00000001
        else:
            hLen = len(hypWords)

        hperErrors = []
        sentHperCount = 0.0
        sentInflHperCount = 0.0

        hperErrors, sentHperCount, sentInflHperCount = hyp_ref_errors(refs[bestWerRefIndex], baserefs[bestWerRefIndex], hypWords, baseHypWords, "herr")

        sentHper = sentHperCount/hLen
        sentInflHper = sentInflHperCount/hLen

        rperErrors = []
        sentRperCount = 0.0
        sentInflRperCount = 0.0

        rperErrors, sentRperCount, sentInflRperCount = hyp_ref_errors(hline, basehline, refWords, baseRefWords, "rerr")

        try:
            sentRper = sentRperCount/len(refWords)
            sentInflRper = sentInflRperCount/len(refWords)
        except ZeroDivisionError:
            logging.warn("Division by zero when calculating sentence Rper and sentInflRper")
            sentRper = float("Inf")
            sentInflRper = float("Inf")

        self.totalHperCount += sentHperCount
        self.totalRperCount += sentRperCount
        self.totalInflRperCount += sentInflRperCount
        self.totalInflHperCount += sentInflHperCount

        refErrorCats = []
        hypErrorCats = []

        sentMissCount = 0.0
        sentExtCount = 0.0
        sentRefLexCount = 0.0
        sentHypLexCount = 0.0
        sentRefReordCount = 0.0
        sentHypReordCount = 0.0

        sentBlockInflRperCount = 0.0
        sentBlockInflHperCount = 0.0
        sentBlockMissCount = 0.0
        sentBlockExtCount = 0.0
        sentRefBlockLexCount = 0.0
        sentHypBlockLexCount = 0.0
        sentRefBlockReordCount = 0.0
        sentHypBlockReordCount = 0.0

        refErrorCats, sentMissCount, sentRefLexCount = miss_ext_lex(bestWerRefErrors, bestWerRefWords, rperErrors, refErrorCats, sentMissCount, sentRefLexCount, "miss")

        hypErrorCats, sentExtCount, sentHypLexCount = miss_ext_lex(bestWerHypErrors, bestWerHypWords, hperErrors, hypErrorCats, sentExtCount, sentHypLexCount, "ext")

        hypErrorCats, sentHypReordCount = reord(bestWerRefErrors, bestWerRefWords, bestWerHypErrors, bestWerHypWords, hypErrorCats, sentHypReordCount)

        refErrorCats, sentRefReordCount = reord(bestWerHypErrors, bestWerHypWords, bestWerRefErrors, bestWerRefWords, refErrorCats, sentRefReordCount)

        sentBlockInflRperCount = block_count(refErrorCats, "infl", sentBlockInflRperCount)
        sentBlockInflHperCount = block_count(hypErrorCats, "infl", sentBlockInflHperCount)
        sentBlockMissCount = block_count(refErrorCats, "miss", sentBlockMissCount)
        sentBlockExtCount = block_count(hypErrorCats, "ext", sentBlockExtCount)
        sentRefBlockReordCount = block_count(refErrorCats, "reord", sentRefBlockReordCount)
        sentHypBlockReordCount = block_count(hypErrorCats, "reord", sentHypBlockReordCount)
        sentRefBlockLexCount = block_count(refErrorCats, "lex", sentRefBlockLexCount)
        sentHypBlockLexCount = block_count(hypErrorCats, "lex", sentHypBlockLexCount)

        self.totalMissCount += sentMissCount
        self.totalExtCount += sentExtCount
        self.totalRefLexCount += sentRefLexCount
        self.totalHypLexCount += sentHypLexCount
        self.totalRefReordCount += sentRefReordCount
        self.totalHypReordCount += sentHypReordCount

        self.totalBlockInflRperCount += sentBlockInflRperCount
        self.totalBlockInflHperCount += sentBlockInflHperCount
        self.totalBlockMissCount += sentBlockMissCount
        self.totalBlockExtCount += sentBlockExtCount
        self.totalRefBlockReordCount += sentRefBlockReordCount
        self.totalHypBlockReordCount += sentHypBlockReordCount
        self.totalRefBlockLexCount += sentRefBlockLexCount
        self.totalHypBlockLexCount += sentHypBlockLexCount

        res = {}

        res['wer'] = 100*minSentWer
        res['hper'] = 100*sentHper
        res['rper'] = 100*sentRper

        res['iHper'] = 100*sentInflHper
        res['iRper'] = 100*sentInflRper

        try:
            res['missErr'] = 100*sentMissCount/bestWerRefLength
            res['rLexErr'] = 100*sentRefLexCount/bestWerRefLength
            res['rRer'] = 100*sentRefReordCount/bestWerRefLength
            res['biRper'] = 100*sentBlockInflRperCount/bestWerRefLength
            res['rbRer'] = 100*sentRefBlockReordCount/bestWerRefLength
            res['bmissErr'] = 100*sentBlockMissCount/bestWerRefLength
            res['rbLexErr'] = 100*sentRefBlockLexCount/bestWerRefLength
        except ZeroDivisionError:
            logging.warn("Division by zero when calculating missErr, rLexErr, rRer, biRper, rbRer, bmissErr, rbLexErr")
            for metricname in ['missErr', 'rLexErr', 'rRer', 'biRper', 'rbRer', 'bmissErr', 'rbLexErr']:
                res[metricname] = float("Inf")

        try:
            res['extErr'] = 100*sentExtCount/hLen
            res['hLexErr'] = 100*sentHypLexCount/hLen
            res['hRer'] = 100*sentHypReordCount/hLen
            res['biHper'] = 100*sentBlockInflHperCount/hLen
            res['hbRer'] = 100*sentHypBlockReordCount/hLen
            res['bextErr'] = 100*sentBlockExtCount/hLen
            res['hbLexErr'] = 100*sentHypBlockLexCount/hLen
        except ZeroDivisionError:
            logging.warn("Division by zero when calculating extErr, hLexErr, hRer, biHper, hbRer, bextErr, hbLexErr")
            for metricname in ['extErr', 'hLexErr', 'hRer', 'biHper', 'hbRer', 'bextErr', 'hbLexErr']:
                res[metricname] = float("Inf")

        res['aMissErr'] = sentMissCount
        res['aExtErr'] = sentExtCount
        res['arLexErr'] = sentRefLexCount
        res['arRer'] = sentRefReordCount

        res["refLength"] = bestWerRefLength
        res['TER'] = (sentMissCount + sentExtCount + sentRefLexCount + sentRefReordCount)*1.00/bestWerRefLength
        return res

    def get_total_scores(self):
        # method name assumed (the original def line is missing from this
        # copy); aggregates corpus-level error rates from the running totals
        # accumulated by analyze()
        self.totalWer = 100*self.totalWerCount/self.totalWerRefLength
        self.totalHper = 100*self.totalHperCount/self.totalHypLength
        self.totalRper = 100*self.totalRperCount/self.totalWerRefLength

        self.totalInflHper = 100*self.totalInflHperCount/self.totalHypLength
        self.totalInflRper = 100*self.totalInflRperCount/self.totalWerRefLength
        self.totalMissErr = 100*self.totalMissCount/self.totalWerRefLength
        self.totalExtErr = 100*self.totalExtCount/self.totalHypLength
        self.totalrLexErr = 100*self.totalRefLexCount/self.totalWerRefLength
        self.totalhLexErr = 100*self.totalHypLexCount/self.totalHypLength
        self.totalrRer = 100*self.totalRefReordCount/self.totalWerRefLength
        self.totalhRer = 100*self.totalHypReordCount/self.totalHypLength

        self.totalbiHper = 100*self.totalBlockInflHperCount/self.totalHypLength
        self.totalbiRper = 100*self.totalBlockInflRperCount/self.totalWerRefLength
        self.totalrbRer = 100*self.totalRefBlockReordCount/self.totalWerRefLength
        self.totalhbRer = 100*self.totalHypBlockReordCount/self.totalHypLength
        self.totalbmissErr = 100*self.totalBlockMissCount/self.totalWerRefLength
        self.totalbextErr = 100*self.totalBlockExtCount/self.totalHypLength
        self.totalrbLexErr = 100*self.totalRefBlockLexCount/self.totalWerRefLength
        self.totalhbLexErr = 100*self.totalHypBlockLexCount/self.totalHypLength

class BinaryHjerson(Hjerson):
    def analyze(self, hline, basehline, addhline, refs, baserefs, addrefs):
        features = super(BinaryHjerson, self).analyze(hline, basehline, addhline, refs, baserefs, addrefs)
        newfeatures = {}
        for name, value in features.iteritems():
            if value > 0:
                newfeatures[name] = 1
            else:
                newfeatures[name] = 0
        return newfeatures
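
    # Hedged illustration (not from the original source): analyze() above
    # collapses the numeric rates returned by Hjerson.analyze() into binary
    # indicators, e.g. {'wer': 25.0, 'missErr': 0.0} -> {'wer': 1, 'missErr': 0}.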


class levNode:
    # backpointer node of the Levenshtein matrix: position in the reference
    # (rpos), position in the hypothesis (hpos) and the error code of the step
    def __init__(self, rpos=0, hpos=0, error=0):
        self.rpos = rpos
        self.hpos = hpos
        self.error = error


def get_addwords(addtext, addline, words):
    # function name assumed (the original def line is missing from this copy);
    # returns the additional (POS) tokens of addline, or empty strings when
    # there is no additional text
    if addtext:
        addwords = addline.split()
    else:
        addwords = ["" for x in range(len(words))]
    return addwords

def adjust_indices(words, adjwords, addwords, adjaddwords):
    # fill the adjwords/adjaddwords dictionaries with 1-based indices
    i = 1
    while i <= len(words):
        adjwords[i] = words[i-1]
        adjaddwords[i] = addwords[i-1]
        i += 1
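
# Hedged usage sketch (illustration only, matching how analyze() calls it):
#   hyp, addhyp = {}, {}
#   adjust_indices(["the", "cat"], hyp, ["DT", "NN"], addhyp)
#   # hyp    == {1: "the", 2: "cat"}
#   # addhyp == {1: "DT", 2: "NN"}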


def wer_errors(index, werwords, weradd, wererr, words, add, error):
    werwords.append(words[index])
    weradd.append(add[index])
    wererr.append(error)

def hyp_ref_errors(rline, rbaseline, hwords, hbases, error):
    rwords = rline.split()
    logging.debug("{}\t{}".format(len(hwords), hwords))
    logging.debug("{}\t{}".format(len(hbases), hbases))
    rbases = rbaseline.split()
    errors = []
    errorcount = 0.0
    inflerrorcount = 0.0

    for ihw, hw in enumerate(hwords):
        if hw in rwords:
            errors.append("x")
            n = rwords.index(hw)
            del rwords[n]
            del rbases[n]
        else:
            errors.append(error)
            errorcount += 1

    for ihb, hb in enumerate(hbases):
        if hb in rbases:
            if errors[ihb] == error:
                errors[ihb] = "i"+error
                n = rbases.index(hb)
                del rbases[n]
                inflerrorcount += 1

    return errors, errorcount, inflerrorcount
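
# Hedged worked example (illustration only): called with error="herr",
# hwords=["cats", "sat"], hbases=["cat", "sit"], rline="the cat sat" and
# rbaseline="the cat sit", the surface form "cats" is missing from the
# reference and is first labelled "herr"; because its base form "cat" does
# match a reference base, the label is relaxed to the inflectional error
# "iherr", and the function returns (["iherr", "x"], 1.0, 1.0).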


def miss_ext_lex(wererrors, werwords, pererrors, errcats, misextcount, lexcount, misext):
    i = 0
    while i < len(wererrors):
        refWerWord = werwords[i]
        refWerError = wererrors[i]
        rperError = pererrors[i]
        if rperError == "irerr" or rperError == "iherr":
            errcats.append("infl")
        elif rperError == "rerr" or rperError == "herr":
            if refWerError == "del" or refWerError == "ins":
                errcats.append(misext)
                misextcount += 1
            elif refWerError == "sub":
                errcats.append("lex")
                lexcount += 1
            else:
                errcats.append("x")
        else:
            errcats.append("x")
        i += 1

    return errcats, misextcount, lexcount

def reord(werreferrors, werrefwords, werhyperrors, werhypwords, hyperrcats, hypcount):
    referr = []
    i = 0
    while i < len(werreferrors):
        if werreferrors[i] != "x":
            referr.append(werrefwords[i])
        i += 1

    i = 0
    while i < len(werhyperrors):
        hypWerWord = werhypwords[i]
        hypWerError = werhyperrors[i]
        if hypWerError == "ins" or hypWerError == "del" or hypWerError == "sub":
            if hypWerWord in referr:
                hyperrcats[i] = "reord"
                hypcount += 1
                n = referr.index(hypWerWord)
                del referr[n]
        i += 1

    return hyperrcats, hypcount

def block_count(errcats, errcat, blockcount):
    i = 0
    newblock = True
    while i < len(errcats):
        cat = errcats[i]
        if cat == errcat:
            if newblock == True:
                blockcount += 1
                newblock = False
        else:
            newblock = True

        i += 1

    return blockcount
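
# Hedged worked example (illustration only): consecutive occurrences of the
# same category count as a single block, so
#   block_count(["lex", "lex", "x", "lex"], "lex", 0)  ->  2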


def write_error_rates(text, errorname, errorcount, errorrate):
    # function name assumed (the original def line is missing from this copy)
    text.write(errorname+"\t"+str("%.0f" % errorcount)+"\t"+str("%.2f" % errorrate)+"\n")

def write_error_words(text, title, words, add, errors, addtext):
    # function name and parameter order assumed (the original def line is
    # missing from this copy)
    text.write(title)
    for nr, r in enumerate(errors):
        if addtext:
            text.write(words[nr]+"#"+add[nr]+"~~"+r+" ")
        else:
            text.write(words[nr]+"~~"+r+" ")

    text.write("\n")

if __name__ == '__main__':
    h = Hjerson(lang="en")
    hyp = 'En lugar de ello , es algo tan terrible como " un condenado estrangulado en secreto " .'
    ref = 'En lugar de ello , es terriblemente como " un condenado estrangulados en secreto . "'
    print h.get_features_strings(hyp, [ref])