1
2 '''
3 Created on Sep 16, 2011
4
5 @author: David Vilar
6 '''
7 import gzip
8 import sys
9 from nltk.tokenize.punkt import PunktWordTokenizer
10
12
13
# NOTE(review): the enclosing `def` line is not visible in this excerpt; these
# statements presumably belong to the constructor, with `case_file` as its
# argument -- confirm against the full file.
# Open `case_file` via openFile and hand the result to getCaseFreqs; the
# returned mapping is kept as the casing lookup used later.
15 self.caseDict = self.getCaseFreqs(self.openFile(case_file))
# Tokens (decoded from UTF-8 into unicode objects) that leave the
# beginning-of-sentence flag unchanged when they are encountered.
16 self.dontChangeBeginningOfSentence = [unicode(s, "utf-8") for s in ["-", "(", "¡", "¿", '"']]
17
# NOTE(review): the enclosing `def` line is not visible in this excerpt;
# presumably this is the body of isZipFile(fname), which openFile calls.
# Read the first two bytes of `fname` in binary mode.
19 fp = open(fname, "rb")
20 magic = fp.read(2)
21 fp.close()
# True when those two bytes are 0x1f 0x8b; openFile uses the result to decide
# whether to open the path with gzip.GzipFile.
22 return magic == '\x1f\x8b'
23
def openFile(self, file, allowNone=True):
    """Return a readable file object for `file`.

    `file` may be a path string or an already-open file-like object.
    Paths are opened with ``gzip.GzipFile`` when ``self.isZipFile``
    reports a gzip file and with the builtin ``open`` otherwise.
    File-like objects are returned unchanged.

    Args:
        file: path string, or any object exposing a ``read`` method.
        allowNone: when true, a false `file` (``None``, ``""``) yields
            ``None`` instead of raising.

    Returns:
        An open file object, or ``None`` if `file` is false and
        `allowNone` is true.

    Raises:
        IOError: `file` is false and `allowNone` is false.
        TypeError: `file` is neither a path string nor file-like.
    """
    if not file:
        if allowNone:
            return None
        raise IOError("no file given")

    if isinstance(file, str):
        if self.isZipFile(file):
            return gzip.GzipFile(file)
        return open(file)

    # The parameter shadows the builtin `file` type, so the former
    # `isinstance(file, file)` check always raised TypeError. Duck-type on
    # `read` instead, which also accepts GzipFile/StringIO objects.
    if hasattr(file, "read"):
        return file

    raise TypeError(
        "expected a path or a file-like object, got %r" % type(file))
38
# NOTE(review): the enclosing `def` line is not visible in this excerpt;
# presumably this is the body of getCaseFreqs(fp). Indentation has also been
# lost, so the nesting described below is inferred from the statements.
# allDict maps a lowercased word to {surface form: occurrence count}.
40 allDict = {}
41 for l in fp:
# Each input line is decoded from UTF-8 and split on whitespace.
42 line = unicode(l, "utf-8")
43 words = line.split()
# The flag starts out True for every line.
44 beginningOfSentence = True
45 for w in words:
# Words seen while the flag is set are not counted.
46 if not beginningOfSentence:
47 thisWordDict = allDict.setdefault(w.lower(), {})
48 thisWordDict[w] = 1 + thisWordDict.get(w, 0)
# Tokens listed in dontChangeBeginningOfSentence leave the flag as it is;
# any other token sets it to True only when it is ".", "?" or "!".
49 if w not in self.dontChangeBeginningOfSentence:
50 beginningOfSentence = (w == ".") or (w == "?") or (w == "!")
51
# Reduce the counts to a single form per lowercased word: sort the
# (form, count) pairs by count, highest first, and keep the first form.
52 caseDict = {}
53 for d in allDict.iteritems():
54 keysWithFreq = d[1].items()
55 keysWithFreq.sort(key = lambda x: x[1], reverse=True)
56 caseDict[d[0]] = keysWithFreq[0][0]
57
58 return caseDict
59
60
61
62
# NOTE(review): the enclosing `def` line (and therefore the method name) is
# not visible in this excerpt; `string` is presumably its argument.
# Tokenize the input with a freshly constructed PunktWordTokenizer.
64 words = PunktWordTokenizer().tokenize(string)
# NOTE(review): beginningOfSentence is assigned below but never read in the
# visible lines, so it has no effect on the output here -- confirm intent.
65 beginningOfSentence = True
66 cased_words = []
67 for w in words:
68
# Look the token up by its lowercase form; use the stored cased form when
# there is one, otherwise fall back to the lowercased token. Either way the
# piece is encoded to UTF-8 and gets a trailing space.
69 lowerW = w.lower()
70 if self.caseDict.has_key(lowerW):
71 cased_words.append ( "%s " % self.caseDict[lowerW].encode("utf-8"))
72 else:
73 cased_words.append("%s " % w.lower().encode("utf-8"))
74 if w not in self.dontChangeBeginningOfSentence:
75 beginningOfSentence = (w == ".") or (w == "?") or (w == "!") or (w == "#")
# NOTE(review): every piece already ends in a space and the pieces are joined
# with another space, so tokens end up separated by two spaces and the result
# keeps a trailing space -- confirm this is intended.
76 return ' '.join(cased_words)
77