Package featuregenerator :: Package wer :: Module wer
[hide private]
[frames | no frames]

Source Code for Module featuregenerator.wer.wer

  1  """ 
  2  @author Eleftherios Avramidis based on original by Maja Popovic 
  3  """ 
  4   
  5  #!/usr/bin/env python 
  6   
  7  import sys 
  8  import gzip 
  9  from nltk.tokenize.punkt import PunktWordTokenizer 
 10   
 11   
 12  sent = False 
 13   
 14   
class levNode(object):
    """Back-pointer cell of a Levenshtein (edit-distance) alignment table.

    A node stores the coordinates of the predecessor cell on the best path
    together with a code for the edit step that led to the current cell.

    Conventional step codes for word-error-rate alignment:
        0  -- match (no error)
        1  -- substitution
        2  -- deletion  (a reference word with no hypothesis counterpart)
        3  -- insertion (a hypothesis word with no reference counterpart)
        -1 -- start-of-path sentinel

    @param rpos: reference-side index of the predecessor cell
    @param hpos: hypothesis-side index of the predecessor cell
    @param error: code of the edit step (see the list above)
    """

    def __init__(self, rpos=0, hpos=0, error=0):
        # Position of the predecessor cell on the reference axis.
        self.rpos = rpos
        # Position of the predecessor cell on the hypothesis axis.
        self.hpos = hpos
        # Edit-step code; the class itself does not validate it.
        self.error = error

    def __repr__(self):
        return "levNode(rpos=%r, hpos=%r, error=%r)" % (
            self.rpos, self.hpos, self.error)


def _edit_distance(refWords, hypWords):
    """Return the Levenshtein distance between two token sequences.

    Substitutions, deletions and insertions all cost 1.  Only two rows of
    the dynamic-programming table are kept, so memory is linear in the
    hypothesis length and no artificial "infinity" constant is needed.
    """
    # Row 0: turning an empty reference into the first h hypothesis tokens
    # takes h insertions.
    previous = list(range(len(hypWords) + 1))
    for r, refWord in enumerate(refWords, 1):
        # Column 0: r reference tokens against an empty hypothesis means
        # r deletions.
        current = [r]
        for h, hypWord in enumerate(hypWords, 1):
            substitution = previous[h - 1] + (1 if hypWord != refWord else 0)
            deletion = previous[h] + 1
            insertion = current[h - 1] + 1
            current.append(min(substitution, deletion, insertion))
        previous = current
    return previous[len(hypWords)]


def wer(hypWords, refs):
    """Return the word error rate of a hypothesis against its best reference.

    For every reference the Levenshtein distance to the hypothesis is divided
    by the reference length; the lowest ratio over all references is returned.

    @param hypWords: the hypothesis, either a list of tokens or a plain
        C{str}, which is tokenized with C{PunktWordTokenizer}
    @param refs: the references, either a list whose items are token
        sequences (one item per reference) or a plain C{str}, which is
        tokenized and treated as ONE reference
    @return: the lowest per-reference error rate (a float).  The value 1000
        is returned when C{refs} is empty or when no reference scores below
        1000 (e.g. an empty reference paired with a non-empty hypothesis);
        this cap is kept for backward compatibility.
    """
    if isinstance(hypWords, str):
        hypWords = PunktWordTokenizer().tokenize(hypWords)
    if isinstance(refs, str):
        # A string is a single reference sentence.  Wrap its tokens in a
        # list; otherwise the loop below would treat every word as its own
        # reference and compare its characters with hypothesis words.
        refs = [PunktWordTokenizer().tokenize(refs)]

    # Historical upper bound of the score; scores that are not strictly
    # lower never replace it.
    minWer = 1000

    for refWords in refs:
        sentWerCount = float(_edit_distance(refWords, hypWords))

        # An empty reference would divide by zero; use a tiny stand-in
        # length so that an empty hypothesis still scores 0.0.
        rLen = len(refWords) or 0.00000001

        sentWer = sentWerCount / rLen
        if sentWer < minWer:
            minWer = sentWer

    return minWer