Print lines/words/chars stats of files by extension : File Utility « Utility « Python






Print lines/words/chars stats of files by extension

"""

PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
--------------------------------------------

1. This LICENSE AGREEMENT is between the Python Software Foundation
("PSF"), and the Individual or Organization ("Licensee") accessing and
otherwise using this software ("Python") in source or binary form and
its associated documentation.

2. Subject to the terms and conditions of this License Agreement, PSF
hereby grants Licensee a nonexclusive, royalty-free, world-wide
license to reproduce, analyze, test, perform and/or display publicly,
prepare derivative works, distribute, and otherwise use Python
alone or in any derivative version, provided, however, that PSF's
License Agreement and PSF's notice of copyright, i.e., "Copyright (c)
2001, 2002, 2003, 2004 Python Software Foundation; All Rights Reserved"
are retained in Python alone or in any derivative version prepared
by Licensee.

3. In the event Licensee prepares a derivative work that is based on
or incorporates Python or any part thereof, and wants to make
the derivative work available to others as provided herein, then
Licensee hereby agrees to include in any such work a brief summary of
the changes made to Python.

4. PSF is making Python available to Licensee on an "AS IS"
basis.  PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
INFRINGE ANY THIRD PARTY RIGHTS.

5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.

6. This License Agreement will automatically terminate upon a material
breach of its terms and conditions.

7. Nothing in this License Agreement shall be deemed to create any
relationship of agency, partnership, or joint venture between PSF and
Licensee.  This License Agreement does not grant permission to use PSF
trademarks or trade name in a trademark sense to endorse or promote
products or services of Licensee, or any third party.

8. By copying, installing or otherwise using Python, Licensee
agrees to be bound by the terms and conditions of this License
Agreement.
"""
#! /usr/bin/env python

"""Show file statistics by extension."""

import os
import sys

class Stats:
    """Accumulate per-extension file statistics and print them as a table.

    ``self.stats`` maps a category to a dict of ``counter name -> int``.
    A category is either a normalised file extension (e.g. ``".py"``) or one
    of the pseudo-extensions ``"<dir>"``, ``"<lnk>"``, ``"<none>"`` and
    ``"<???>"``.

    The code is written to run unchanged on Python 2.6+ and Python 3.
    """

    def __init__(self):
        self.stats = {}

    def statargs(self, args):
        """Collect statistics for every path in *args*.

        Directories are walked recursively and regular files are analysed.
        Anything else is reported on stderr and counted as ``"unknown"``
        under the ``"<???>"`` category.
        """
        for arg in args:
            if os.path.isdir(arg):
                self.statdir(arg)
            elif os.path.isfile(arg):
                self.statfile(arg)
            else:
                # Name the offending argument in the message.
                sys.stderr.write("Can't find %s\n" % arg)
                self.addstats("<???>", "unknown", 1)

    def statdir(self, dir):
        """Recursively collect statistics for the directory *dir*.

        Each visited directory is counted under ``"<dir>"``.  Symbolic links
        are counted under ``"<lnk>"`` and not followed.  A directory that
        cannot be listed is reported on stderr and counted as
        ``"unlistable"`` under ``"<dir>"``.
        """
        self.addstats("<dir>", "dirs", 1)
        try:
            names = os.listdir(dir)
        except OSError as err:
            sys.stderr.write("Can't list %s: %s\n" % (dir, err))
            self.addstats("<dir>", "unlistable", 1)
            return
        for name in sorted(names):
            if name.startswith(".#"):
                continue  # Skip CVS temp files
            if name.endswith("~"):
                continue  # Skip Emacs backup files
            full = os.path.join(dir, name)
            if os.path.islink(full):
                self.addstats("<lnk>", "links", 1)
            elif os.path.isdir(full):
                self.statdir(full)
            else:
                self.statfile(full)

    def statfile(self, file):
        """Collect statistics for the single file *file*.

        The file is counted under its case-normalised extension, or under
        ``"<none>"`` when it has none.  Its size is always added to
        ``"bytes"``.  A file containing a NUL byte is counted as ``"binary"``
        and gets no line/word counts.  A file that cannot be opened is
        reported on stderr and counted as ``"unopenable"``.
        """
        ext = os.path.splitext(file)[1]
        base = os.path.basename(file)
        if ext == base:
            ext = ""  # E.g. .cvsignore is deemed not to have an extension
        ext = os.path.normcase(ext)
        if not ext:
            ext = "<none>"
        self.addstats(ext, "files", 1)
        try:
            f = open(file, "rb")
        except (IOError, OSError) as err:
            sys.stderr.write("Can't open %s: %s\n" % (file, err))
            self.addstats(ext, "unopenable", 1)
            return
        with f:  # Close the handle even if read() fails
            data = f.read()
        self.addstats(ext, "bytes", len(data))
        # The file was read in binary mode, so compare against a bytes NUL.
        if b"\0" in data:
            self.addstats(ext, "binary", 1)
            return
        if not data:
            self.addstats(ext, "empty", 1)
        self.addstats(ext, "lines", len(data.splitlines()))
        self.addstats(ext, "words", len(data.split()))

    def addstats(self, ext, key, n):
        """Add *n* to counter *key* of category *ext*, creating both on demand."""
        d = self.stats.setdefault(ext, {})
        d[key] = d.get(key, 0) + n

    def report(self):
        """Print the collected statistics to stdout as an aligned table.

        The table has one row per category (sorted), a final ``TOTAL`` row,
        and a header line at both the top and the bottom.  ``self.stats`` is
        not modified, so the report can be printed more than once.
        """
        exts = sorted(self.stats)
        # Column keys: every counter name used by any category.
        cols = sorted(set(key for ext in exts for key in self.stats[ext]))
        minwidth = 6
        # The first column must also fit its own header and the TOTAL label.
        colwidth = {"ext": max(len(label) for label in exts + ["ext", "TOTAL"])}
        totals = {"ext": "TOTAL"}
        for col in cols:
            values = [self.stats[ext][col] for ext in exts
                      if col in self.stats[ext]]
            total = sum(values)
            totals[col] = total
            widths = [minwidth, len(col), len(str(total))]
            widths.extend(len("%d" % value) for value in values)
            colwidth[col] = max(widths)
        columns = ["ext"] + cols

        def format_row(cells):
            # Right-align each cell; a counter missing from a row is blank.
            return " ".join("%*s" % (colwidth[col], cells.get(col, ""))
                            for col in columns)

        header = format_row(dict((col, col) for col in columns))
        lines = [header]
        for ext in exts:
            row = dict(self.stats[ext])
            row["ext"] = ext
            lines.append(format_row(row))
        lines.append(format_row(totals))
        lines.append(header)  # Another header at the bottom
        sys.stdout.write("\n".join(lines) + "\n")

def main():
    """Gather statistics for the command-line paths and print the report.

    When no paths are given on the command line, the current directory
    is scanned instead.
    """
    targets = sys.argv[1:] or [os.curdir]
    collector = Stats()
    collector.statargs(targets)
    collector.report()

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

           
       








Related examples in the same category

1. Print the product of age and size of each file, in suitable units.
2. Copy one file's atime and mtime to another
3. Change CRLF line endings to LF (Windows to Unix)
4. Print a list of files that are mentioned in CVS directories.
5. Print file diffs in context, unified, or ndiff formats
6. Format du(1) output as a tree sorted by size
7. Recursively find symbolic links to a given path prefix
8. Find a program in PATH system Variable
9. Replace tabs with spaces in argument files
10. Convert GNU texinfo files into HTML
11. Reverse grep through a file (useful for big logfiles)
12. Intelligent diff between text files (Tim Peters)
13. Python utility to print MD5 checksums of argument files
14. Make a copy of a directory tree with symbolic links to all files in the original tree
15. Change LF line endings to CRLF (Unix to Windows)