Package util :: Module freqcaser
[hide private]
[frames] | no frames]

Source Code for Module util.freqcaser

 1  # -*- coding: utf-8 -*- 
 2  ''' 
 3  Created on Sep 16, 2011 
 4   
 5  @author: David Vilar 
 6  ''' 
 7  import gzip 
 8  import sys 
 9  from nltk.tokenize.punkt import PunktWordTokenizer 
10   
class FreqCaser(object):
    """Restore word casing from the casing frequencies of a reference text.

    A case file (plain or gzip-compressed, UTF-8, whitespace-tokenized) is
    scanned once.  For every word that does not start a sentence, the number
    of occurrences of each cased form is counted; ``caseDict`` then maps the
    lowercased word to its most frequent cased form.  ``freqcase`` applies
    that mapping to the tokens of a string.
    """

    # Tokens that close a sentence; the token after one of these is treated
    # as sentence-initial and is not counted.
    _SENTENCE_END = (u".", u"?", u"!")

    # Types treated as file names by openFile(): ``str`` plus the text type
    # (``unicode`` on Python 2, ``str`` again on Python 3).
    _PATH_TYPES = (str, type(u""))

    def __init__(self, case_file=None):
        """Build ``caseDict`` from *case_file*.

        case_file -- path of the case file, an already open file-like object
                     yielding lines, or None/empty for an empty dictionary.
        """
        # This attribute is read by getCaseFreqs(), so it has to exist before
        # the case file is processed.  These tokens leave the
        # "beginning of sentence" state untouched (u"\xa1" and u"\xbf" are the
        # inverted exclamation and question marks).
        self.dontChangeBeginningOfSentence = [
            u"-", u"(", u"\xa1", u"\xbf", u'"',
        ]

        fp = self.openFile(case_file)
        try:
            self.caseDict = self.getCaseFreqs(fp)
        finally:
            # Close only what was opened here; a stream handed in by the
            # caller stays under the caller's control.
            if fp is not None and fp is not case_file:
                fp.close()

    def isZipFile(self, fname):
        """Return True if the file *fname* starts with the gzip magic bytes."""
        with open(fname, "rb") as fp:
            return fp.read(2) == b"\x1f\x8b"

    def openFile(self, file, allowNone=True):
        """Return a readable file object for *file*.

        file      -- a path (opened in binary mode, through gzip when the file
                     is gzip-compressed) or an already open file-like object,
                     which is returned unchanged.
        allowNone -- when true, a missing/empty *file* yields None; otherwise
                     IOError is raised.
        """
        if not file:
            if allowNone:
                return None
            raise IOError("no file given")
        if isinstance(file, self._PATH_TYPES):
            if self.isZipFile(file):
                return gzip.GzipFile(file)
            # Binary mode, so both branches hand out byte lines and decoding
            # happens in a single place (getCaseFreqs).
            return open(file, "rb")
        # Anything else is taken to be an open file-like object.
        return file

    def getCaseFreqs(self, fp):
        """Return a dict mapping lowercased words to their most frequent form.

        fp -- iterable of lines (UTF-8 encoded bytes or text), or None, in
              which case an empty dict is returned.

        The first word of every line and the word following ".", "?" or "!"
        are not counted.  Tokens listed in
        ``dontChangeBeginningOfSentence`` do not change whether the next
        word is considered sentence-initial.
        """
        if fp is None:
            return {}

        allDict = {}
        for raw in fp:
            line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
            beginningOfSentence = True
            for w in line.split():
                if not beginningOfSentence:
                    forms = allDict.setdefault(w.lower(), {})
                    forms[w] = forms.get(w, 0) + 1
                if w not in self.dontChangeBeginningOfSentence:
                    beginningOfSentence = w in self._SENTENCE_END

        # On equal counts the form that comes first in the dict's iteration
        # order wins.
        caseDict = {}
        for lowered, forms in allDict.items():
            caseDict[lowered] = max(forms.items(), key=lambda item: item[1])[0]
        return caseDict

    def freqcase(self, string):
        """Return *string* with every token replaced by its most frequent casing.

        The string is tokenized with PunktWordTokenizer.  Tokens missing from
        ``caseDict`` are lowercased.  Sentence-initial tokens get no special
        treatment.  The result is a UTF-8 encoded byte string.
        """
        cased_words = []
        for w in PunktWordTokenizer().tokenize(string):
            lowerW = w.lower()
            cased_words.append(u"%s " % self.caseDict.get(lowerW, lowerW))
        # NOTE(review): every token already carries a trailing space, so
        # joining with " " yields two spaces between tokens and one at the
        # end.  Kept as is because callers may rely on the exact output;
        # confirm before normalizing.
        return u" ".join(cased_words).encode("utf-8")
77