#!/usr/bin/python

import getopt
import hashlib
import md5
import os
import stat
import string
import sys

def usage(status=1):
    """Write the command-line help text to stderr and terminate the process.

    status is the process exit status.  It defaults to a non-zero value
    because main() only calls usage() when the command line is unusable
    (bad option or no directories), which should not look like success to
    a calling shell script.

    Raises SystemExit; this function never returns.
    """
    prog = sys.argv[0]
    lines = [
        prog + ' [-n skip] [-p] [-t time] dir1 dir2 ...dirm',
        '',
        prog + ' will compare every nth file in',
        'directories dir1, dir2 through dirm using md5',
        'hashes.',
        '',
        "-n only compares every skip'th file.  Default: 1000",
        '',
        '-p ignores permissions',
        '',
        '-t time says to ignore files which are newer in the source',
        'collection, within a tolerance of "time" seconds',
        '',
        'm can be as high as you want (limited by VM),',
        'and a meaningful comparison will require at least',
        '2 directories.  If you just want to see if you can',
        'read every nth file, then a dirlist of length 1 is',
        'sort of useful.  To see if you are getting consistent',
        'data from a directory hierarchy, list the same dir',
        'more than once.',
        '',
        'Only files in the first directory hierarchy (dir1) are',
        'sought in the 2..mth directories',
    ]
    sys.stderr.write('\n'.join(lines) + '\n')
    sys.exit(status)

def different(filename):
    """Report on stdout that filename differs between the directories.

    Writes 'DIFFERENT <filename>' on a line of its own and flushes
    immediately so the result is visible while a long run is in progress.
    """
    # sys.stdout.write matches how the other status lines in this file
    # (MISSING, NEWER_...) are emitted, and does not depend on the
    # Python 2-only print statement.
    sys.stdout.write('DIFFERENT %s\n' % (filename,))
    sys.stdout.flush()

def same(filename):
    """Report on stdout that filename matches in every directory.

    Writes 'SAME <filename>' on a line of its own and flushes immediately
    so the result is visible while a long run is in progress.
    """
    # sys.stdout.write matches how the other status lines in this file
    # (MISSING, NEWER_...) are emitted, and does not depend on the
    # Python 2-only print statement.
    sys.stdout.write('SAME %s\n' % (filename,))
    sys.stdout.flush()

def _lstat_all(dirlist, filename):
    """Return the os.lstat() result for filename under each directory, or None if any lstat fails."""
    statbufs = []
    for directory in dirlist:
        try:
            statbufs.append(os.lstat(os.path.join(directory, filename)))
        except OSError:
            return None
    return statbufs


def _stat_fields_to_compare(mode, ignore_perms):
    """Return the stat tuple indexes that must match for a file whose st_mode is mode."""
    if stat.S_ISFIFO(mode):
        # for FIFO's, don't compare at all!
        return []
    # For directories the "length" is ignored: it is usually a function of
    # how many files have been in the directory at any point in the past.
    # Device files are treated the same way (major/minor are not compared).
    size_is_meaningless = (stat.S_ISDIR(mode) or stat.S_ISBLK(mode)
                           or stat.S_ISCHR(mode))
    if ignore_perms:
        if size_is_meaningless:
            return []
        return [stat.ST_SIZE]
    fields = [stat.ST_GID, stat.ST_UID, stat.ST_MODE]
    if not size_is_meaningless:
        fields.append(stat.ST_SIZE)
    return fields


def _md5_digest(path, blocksize):
    """Return the binary md5 digest of the file at path, read blocksize bytes at a time."""
    hasher = hashlib.md5()
    # Binary mode: the hash must cover the raw bytes, with no newline
    # translation on platforms that distinguish text from binary files.
    handle = open(path, 'rb')
    try:
        while 1:
            buf = handle.read(blocksize)
            if not buf:
                break
            hasher.update(buf)
    finally:
        handle.close()
    return hasher.digest()


def filessame(dirlist,filename,ignore_perms,tolerance):
    """Compare filename across every directory in dirlist and report the result.

    dirlist      -- non-empty list of directories; dirlist[0] is the source
    filename     -- path of the file relative to each directory
    ignore_perms -- true to skip the owner, group and mode comparison
    tolerance    -- seconds by which the source copy may be newer than a
                    destination copy before the comparison is abandoned

    Exactly one status line is written to stdout: MISSING, DIFFERENT,
    NEWER_WTIHIN_TOLERANCE or SAME, followed by the filename.  Returns 1
    when SAME was reported and 0 otherwise.

    Only regular files are hashed; every other file type is judged on its
    stat information alone.  An IOError/OSError raised while opening or
    reading a regular file is not caught here.
    """
    blocksize = 2**20

    # Note: the working directory is deliberately left untouched.  The
    # os.path.walk traversal in main() uses paths relative to it, so a
    # chdir here would redirect the rest of the walk.
    statbufs = _lstat_all(dirlist, filename)
    if statbufs is None:
        sys.stdout.write('MISSING '+filename+'\n')
        sys.stdout.flush()
        return 0

    source = statbufs[0]
    source_mode = source[stat.ST_MODE]
    others = statbufs[1:]

    # The file type must agree everywhere, even when permissions are being
    # ignored; otherwise we could end up opening a FIFO or directory below.
    source_type = stat.S_IFMT(source_mode)
    for other in others:
        if stat.S_IFMT(other[stat.ST_MODE]) != source_type:
            different(filename)
            return 0

    if others and not ignore_perms and \
        (stat.S_ISBLK(source_mode) or stat.S_ISCHR(source_mode)):
        sys.stdout.write('Warning: device file found - not comparing major and minor: %s\n' % filename)
        sys.stdout.flush()

    # Every copy has the same type, so one field list serves all of them.
    fields = _stat_fields_to_compare(source_mode, ignore_perms)
    for other in others:
        for field in fields:
            if source[field] != other[field]:
                different(filename)
                return 0

    for other in others:
        if source[stat.ST_MTIME] > other[stat.ST_MTIME] + tolerance:
            # this test must come after the other stat checks:
            # if the source directory's file is newer than any of the
            # the destination directories' file, then don't compare, just
            # say the source is newer
            # (the misspelt token is kept as-is: it is program output that
            # existing consumers may match on)
            sys.stdout.write("NEWER_WTIHIN_TOLERANCE "+filename+'\n')
            sys.stdout.flush()
            return 0

    if not stat.S_ISREG(source_mode):
        # for symlinks, directories, device files, FIFO's and sockets,
        # we only check the stat info
        same(filename)
        return 1

    digests = []
    for directory in dirlist:
        digests.append(_md5_digest(os.path.join(directory, filename), blocksize))
    for digest in digests[1:]:
        if digest != digests[0]:
            different(filename)
            return 0

    # phew, we made it
    same(filename)
    return 1

def shorten(full,prefix):
    """Return full with its leading prefix removed.

    If full does not start with prefix, 'Internal error in shorten()' is
    written to stderr and the process exits with status 1.
    """
    cut = len(prefix)
    if full[:cut] != prefix:
        sys.stderr.write('Internal error in shorten()\n')
        sys.exit(1)
    return full[cut:]
            
def func(extralist, dirname, fnames):
    """Visitor passed to os.path.walk: sample every skip'th directory entry.

    extralist is the mutable state list
    [skipval, countval, dirlist, ignore_perms, tolerance].  Each name in
    fnames advances the counter; when it reaches skipval the entry is
    handed to filessame() (whose return value is not used) and the
    counter restarts from zero.  The final counter value is stored back in
    extralist[1] so the count carries over to the next directory visited.
    """
    skipval = extralist[0]
    seen = extralist[1]
    dirlist = extralist[2]
    ignore_perms = extralist[3]
    tolerance = extralist[4]
    for basename in fnames:
        seen += 1
        if seen != skipval:
            continue
        seen = 0
        filessame(dirlist, os.path.join(dirname, basename), ignore_perms, tolerance)
    # persist the running count for the next invocation
    extralist[1] = seen

def main():
    """Parse the command line and compare a sample of files across directories.

    Options: -n skip (compare every skip'th file, default 1000), -p (ignore
    permissions) and -t time (mtime tolerance in seconds, default 0).  The
    remaining arguments are the directories; the first one is walked and
    each sampled entry is checked against the others by func()/filessame().

    usage() is called (and the process exits) for an unknown option, a
    non-numeric or non-positive -n, a non-numeric -t, or no directories.
    An OSError from os.chdir() on an inaccessible directory is not caught.

    NOTE: os.path.walk is only available on Python 2.
    """
    try:
        (options, dirs) = getopt.getopt(sys.argv[1:], 'n:pt:')
    except getopt.GetoptError:
        usage()
    if not dirs:
        usage()

    skipval = 1000
    ignore_perms = 0
    tolerance = 0
    try:
        for flag, value in options:
            if flag == '-n':
                skipval = int(value)
            elif flag == '-t':
                tolerance = int(value)
            elif flag == '-p':
                ignore_perms = 1
    except ValueError:
        # -n or -t was given something that is not an integer
        usage()
    if skipval < 1:
        # func() counts from 1, so a skip below 1 would never match and the
        # run would silently compare nothing
        usage()

    # Anchor relative directories to the starting directory before any
    # chdir; otherwise each one would be resolved relative to the previous.
    start = os.getcwd()
    dirs = [os.path.join(start, directory) for directory in dirs]

    # Visit every directory once so an inaccessible one fails up front.
    for directory in dirs:
        os.chdir(directory)
    os.chdir(dirs[0])

    count = 0
    os.path.walk('.', func, [skipval, count, dirs, ignore_perms, tolerance])

# Script entry point.  There is no __name__ guard, so this also runs when
# the file is imported as a module.
main()