#!/usr/bin/python
"""Spot-check that directory trees match by comparing every nth file.

Every nth entry found under the first directory is looked up in all of the
listed directories and one result line per sampled entry is written to
stdout: ``SAME``, ``DIFFERENT``, ``MISSING`` or ``NEWER_WTIHIN_TOLERANCE``
followed by the entry's path relative to the tree roots.

The code sticks to constructs that work on both Python 2.6+ and Python 3.
"""

import sys
import hashlib
import getopt
import os
import stat


# Entry types that are compared by stat information only, never by content.
_STAT_ONLY_TESTS = (
    stat.S_ISLNK,
    stat.S_ISFIFO,
    stat.S_ISBLK,
    stat.S_ISCHR,
    stat.S_ISDIR,
)


def usage():
    """Write the command-line help to stderr and exit with status 0."""
    prog = sys.argv[0]
    lines = (
        prog + ' [-n skip] [-p] [-t time] dir1 dir2 ...dirm\n',
        '\n',
        prog + ' will compare every nth file in\n',
        'directories dir1, dir2 through dirm using md5\n',
        'hashes.\n',
        '\n',
        "-n only compares every skip'th file. Default: 1000\n",
        '\n',
        '-p ignores permissions\n',
        '\n',
        '-t time says to ignore which are newer in the source\n',
        'collection, within a tolerence of "time" seconds\n',
        '\n',
        'm can be as high as you want (limited by VM),\n',
        'and a meaningful comparison will require at least\n',
        '2 directories. If you just want to see if you can\n',
        'read every nth file, then a dirlist of length 1 is\n',
        'sort of useful. To see if you are getting consistent\n',
        'data from a directory hierarchy, list the same dir\n',
        'more than once.\n',
        '\n',
        'Only files in the first directory hierarchy (dir1) are\n',
        'sought in the 2..mth directories\n',
    )
    sys.stderr.write(''.join(lines))
    sys.exit(0)


def different(filename):
    """Report on stdout that filename differs between the directories."""
    sys.stdout.write('DIFFERENT ' + filename + '\n')
    sys.stdout.flush()


def same(filename):
    """Report on stdout that filename matches in every directory."""
    sys.stdout.write('SAME ' + filename + '\n')
    sys.stdout.flush()


def filessame(dirlist, filename, ignore_perms, tolerance):
    """Compare one entry across all directories and print the verdict.

    Parameters:
        dirlist: the tree roots; filename is joined onto each of them.
        filename: path of the entry relative to every root.
        ignore_perms: when true, owner, group and mode are not compared
            (only the size of regular files and symlinks is).
        tolerance: seconds by which the first root's copy may be newer
            than another copy before the comparison is abandoned.

    Returns:
        1 if the entry is considered the same everywhere, otherwise 0
        (missing somewhere, stat fields differ, source newer, or md5
        digests differ).

    Raises:
        IOError/OSError if a regular file exists but cannot be read.

    The working directory is left untouched, so a caller that is walking a
    tree with relative paths keeps walking the tree it started in.
    """
    blocksize = 2 ** 20

    statbufs = []
    for directory in dirlist:
        try:
            statbufs.append(os.lstat(os.path.join(directory, filename)))
        except OSError:
            sys.stdout.write('MISSING ' + filename + '\n')
            sys.stdout.flush()
            return 0

    # Decide, pair by pair, which stat fields have to agree.
    for (current, following) in zip(statbufs, statbufs[1:]):
        mode = current[stat.ST_MODE]
        is_fifo = stat.S_ISFIFO(mode)
        is_dir = stat.S_ISDIR(mode)
        is_device = stat.S_ISBLK(mode) or stat.S_ISCHR(mode)
        if ignore_perms:
            if is_fifo or is_dir or is_device:
                fields = []
            else:
                fields = [stat.ST_SIZE]
        elif is_fifo:
            # for FIFO's, don't compare at all!
            fields = []
        elif is_dir:
            # for directories, it's important to ignore the "length",
            # which is usually a function of how many files have been in
            # a directory at any point in the past. UID, GID and
            # permissions bits are still meaningful though
            fields = [stat.ST_GID, stat.ST_UID, stat.ST_MODE]
        elif is_device:
            sys.stdout.write(
                'Warning: device file found - not comparing major and '
                'minor: %s\n' % filename)
            sys.stdout.flush()
            fields = [stat.ST_GID, stat.ST_UID, stat.ST_MODE]
        else:
            fields = [stat.ST_GID, stat.ST_UID, stat.ST_MODE, stat.ST_SIZE]
        for field in fields:
            if current[field] != following[field]:
                different(filename)
                return 0

    # This test must come after the stat comparison: if the source
    # directory's copy is newer than any destination copy, don't compare
    # contents, just say the source is newer.
    source_mtime = statbufs[0][stat.ST_MTIME]
    for other in statbufs[1:]:
        if source_mtime > other[stat.ST_MTIME] + tolerance:
            # The misspelled token is kept as-is: it is program output
            # that existing consumers may be matching on.
            sys.stdout.write("NEWER_WTIHIN_TOLERANCE " + filename + '\n')
            sys.stdout.flush()
            return 0

    # For symlinks, directories, device files and FIFO's only the stat
    # info is checked; the decision is based on the last directory's entry.
    last_mode = statbufs[-1][stat.ST_MODE]
    for is_type in _STAT_ONLY_TESTS:
        if is_type(last_mode):
            same(filename)
            return 1

    digests = []
    for directory in dirlist:
        hasher = hashlib.md5()
        # Binary mode: the digest must cover the raw bytes on disk.
        with open(os.path.join(directory, filename), 'rb') as handle:
            while 1:
                buf = handle.read(blocksize)
                if not buf:
                    break
                hasher.update(buf)
        digests.append(hasher.digest())

    for (left, right) in zip(digests, digests[1:]):
        if left != right:
            different(filename)
            return 0

    same(filename)
    return 1


def shorten(full, prefix):
    """Return full with the leading prefix removed.

    Writes an error to stderr and exits with status 1 when full does not
    start with prefix.
    """
    if full.startswith(prefix):
        return full[len(prefix):]
    sys.stderr.write('Internal error in shorten()\n')
    sys.exit(1)


def func(extralist, dirname, fnames):
    """Visit one directory of the walk, comparing every skip'th entry.

    Parameters:
        extralist: mutable list [skipval, countval, dirlist, ignore_perms,
            tolerance]; the running count at index 1 is updated in place
            so that sampling continues across directories.
        dirname: the directory being visited.
        fnames: names of the entries inside dirname.
    """
    skipval = extralist[0]
    countval = extralist[1]
    dirlist = extralist[2]
    ignore_perms = extralist[3]
    tolerance = extralist[4]
    for basename in fnames:
        countval += 1
        if countval == skipval:
            countval = 0
            filessame(dirlist, os.path.join(dirname, basename),
                      ignore_perms, tolerance)
    # write countval back into the extralist!
    extralist[1] = countval


def _walk_tree(top, visit, arg):
    """Call visit(arg, dirname, names) for top and every directory below.

    Stand-in for Python 2's os.path.walk, which no longer exists in
    Python 3: directories are visited top-down, symbolic links are not
    followed, and unreadable directories are skipped silently.
    """
    try:
        names = os.listdir(top)
    except OSError:
        return
    visit(arg, top, names)
    for name in names:
        path = os.path.join(top, name)
        try:
            info = os.lstat(path)
        except OSError:
            continue
        if stat.S_ISDIR(info.st_mode):
            _walk_tree(path, visit, arg)


def main():
    """Parse the command line and run the sampled comparison."""
    try:
        (options, dirs) = getopt.getopt(sys.argv[1:], 'n:pt:')
    except getopt.GetoptError:
        usage()
    if not dirs:
        usage()

    skipval = 1000
    ignore_perms = 0
    tolerance = 0
    try:
        for (flag, value) in options:
            if flag == '-n':
                skipval = int(value)
            elif flag == '-t':
                tolerance = int(value)
            elif flag == '-p':
                ignore_perms = 1
    except ValueError:
        # A non-numeric -n or -t argument is a usage error, not a crash.
        usage()

    # Resolve every root before the first chdir; otherwise each relative
    # argument would be interpreted relative to the previous directory.
    dirs = [os.path.abspath(directory) for directory in dirs]

    # Fail early if any of the directories cannot be entered.
    for directory in dirs:
        os.chdir(directory)

    # Walk the first tree with relative names so that the same relative
    # path can be joined onto every root.
    os.chdir(dirs[0])
    _walk_tree('.', func, [skipval, 0, dirs, ignore_perms, tolerance])


if __name__ == '__main__':
    main()