#!/usr/bin/env python
"""
This script counts Unicode characters in every file specified via
command line args and writes statistics on non-ASCII characters to
standard output.

USAGE
    [python] [./]analyze_unicode_files.py [--encoding=UTF-8] file1 file2 ...
"""

from __future__ import with_statement

import os
import codecs
import unicodedata
from optparse import OptionParser

__author__ = 'Ulrich Schaefer '
__date__ = '$Date: 2008/11/26 17:10:00 $'
__version__ = '$Revision: 1.0 $'
__project__ = 'HyLaP/MMCI'


class CharInfo(object):
    """Occurrence statistics for a single character.

    Attributes are created per instance; a class-level dict would be
    shared (and silently mutated) across every CharInfo object.
    """

    def __init__(self):
        # Total number of occurrences over all files read so far.
        self.total_occ = 0
        # Maps a file name to the number of occurrences in that file.
        self.occ_per_doc = {}


def pluralize(word, count):
    """Return "<count> <word>", appending a plain "s" unless count is 1."""
    if count == 1:
        return "%s %s" % (count, word)
    return "%s %ss" % (count, word)


def _count_chars(text, name, chardict):
    """Add every character of `text`, read from file `name`, to `chardict`."""
    for char in text:
        info = chardict.get(char)
        if info is None:
            info = chardict[char] = CharInfo()
        info.total_occ += 1
        info.occ_per_doc[name] = info.occ_per_doc.get(name, 0) + 1


def _format_entry(char, info):
    """Build the report line for `char` with its CharInfo statistics."""
    return "%s %06X (%s): %s in %s" % (
        char,
        ord(char),
        unicodedata.name(char, "unknown"),
        pluralize("occurrence", info.total_occ),
        pluralize("file", len(info.occ_per_doc)),
    )


def main():
    """Parse the command line, count characters and print the report.

    Every positional argument is read as a text file in the encoding
    given by -e/--encoding (default UTF-8).  Files that cannot be decoded
    are reported and skipped.  One line is printed for each non-ASCII
    character found, in code point order, followed by "done.".
    """
    op = OptionParser()
    op.add_option("-e", "--encoding", default="UTF-8", help="input encoding")
    options, args = op.parse_args()
    if not args:
        print("No input file given.")
        return

    chardict = {}
    try:
        for name in args:
            try:
                with codecs.open(name, "r", options.encoding) as f:
                    # codecs.open already yields decoded text, and read()
                    # decodes the whole file before any counting starts,
                    # so a broken file never contributes partial counts.
                    text = f.read()
            except UnicodeDecodeError:
                # Report the offending file name (the original printed the
                # builtin `file` instead of the loop variable).
                print("UnicodeDecodeError in file %s" % name)
                continue
            _count_chars(text, name, chardict)

        # Keys are unique, so sorting them orders the report exactly as
        # sorting the (char, info) items would.
        for char in sorted(chardict):
            if ord(char) > 127:
                print(_format_entry(char, chardict[char]))
    except KeyboardInterrupt:
        # Ctrl-C ends the run quietly; "done." is still printed below.
        pass
    finally:
        print("done.")


if __name__ == "__main__":
    main()