#!/usr/bin/env python import os import sys import md5 import stat import string try: import psyco #import nopers if 1 == 1: psyco.full() else: psyco.profile(0.02) except: if os.uname()[4] in [ 'i386', 'i486', 'i586', 'i686' ]: sys.stderr.write('Psyco initialization failed. You appear to be on an x86 system, so you might want to install it to get better performance.\n') else: if not os.uname()[4] in [ 'i386', 'i486', 'i586', 'i686' ]: sys.stderr.write('Psyco initialization succeeded and that is interesting, because I thought it was x86-only :)\n') sys.path.insert(0,os.path.expanduser('~/lib')) sys.path.insert(0,'/usr/local/lib') import readline0 fields=[] verbosity = 0 class stats_class: def __init__(self): self.verbosity = 0 self.number_of_files = 0 self.number_of_stats = 0 self.number_of_prefixes = 0 self.number_of_hashes = 0 self.number_of_comparisons = 0 self.number_of_full_compares = 0 def bump_number_of_files(self): self.number_of_files += 1 if self.verbosity >= 2 or self.verbosity >= 1 and self.number_of_files % 1000 == 0: sys.stderr.write('bumped number_of_files to %d\n' % self.number_of_files) def bump_number_of_stats(self): self.number_of_stats += 1 def bump_number_of_prefixes(self): self.number_of_prefixes += 1 def bump_number_of_hashes(self): self.number_of_hashes += 1 def bump_number_of_full_compares(self): self.number_of_full_compares += 1 def bump_number_of_comparisons(self): self.number_of_comparisons += 1 if self.verbosity >= 2 or self.verbosity >= 1 and self.number_of_comparisons % 1000 == 0: self.report() def report(self): sys.stderr.write('files: %d, comparisons: %d, stats: %d, prefixes: %d, hashes: %d, full compares: %d\n' % \ (self.number_of_files, self.number_of_comparisons, self.number_of_stats, self.number_of_prefixes, self.number_of_hashes, self.number_of_full_compares)) class file_class: max_prefix_length=16 bufsize = 2**18 stats = stats_class() def set_prefix_length(self,length): self.max_prefix_length=length def __init__(self, name, no_full_comparisons): self.name = name self.device = -1 self.inode = -1 self.size = -1 self.prefix = -1 self.hash = '' self.stats.bump_number_of_files() self.no_full_comparisons = no_full_comparisons # print self.stats.number_of_files # if self.stats.verbosity >= 2 or self.stats.verbosity >= 1 and self.stats.number_of_files % 1000 == 0: # sys.stderr.write('Read %d files\n' % self.stats.number_of_files) def gen_hash(self): if self.stats.verbosity >= 1: self.stats.bump_number_of_hashes() if self.stats.verbosity >= 2: sys.stderr.write('Hashing file %s\n' % self.name) file = open(self.name, 'r') m = md5.new() while 1: buf = file.read(self.bufsize) if not buf: break m.update(buf) self.hash = m.digest() file.close() def __cmp__(self,other): # sys.stderr.write('Comparing %s and %s\n' % (self.name, other.name)) if self.stats.verbosity >= 1: self.stats.bump_number_of_comparisons() if self.stats.verbosity >= 2 and self.stats.number_of_comparisons % 1000 == 0: sys.stderr.write('%d comparisons of whatever depth\n' % self.stats.number_of_files) if self.device == -1: if self.stats.verbosity >= 1: self.stats.bump_number_of_stats() if self.stats.verbosity >= 2: sys.stderr.write('Getting stat data for file %s\n' % self.name) result = os.stat(self.name) self.device = result[stat.ST_DEV] self.inode = result[stat.ST_INO] self.size = result[stat.ST_SIZE] if other.device == -1: if self.stats.verbosity >= 1: other.stats.bump_number_of_stats() if self.stats.verbosity >= 2: sys.stderr.write('Getting stat data for file %s\n' % self.name) result = os.stat(other.name) other.device = result[stat.ST_DEV] other.inode = result[stat.ST_INO] other.size = result[stat.ST_SIZE] if self.device == other.device and self.inode == other.inode: # same device, same inode, must be identical files except with a broken webdav client return 0 # sys.stderr.write('foo %s %s\n' % (self.size, other.size)) if self.size < other.size: return -1 elif self.size > other.size: return 1 # if we've reached this point, the files are not hardlinks, and their lengths are identical # so slurp in the prefixes if needed, then compare them if self.prefix == -1: if self.stats.verbosity >= 1: self.stats.bump_number_of_prefixes() if self.stats.verbosity >= 2: sys.stderr.write('Getting prefix for file %s\n' % self.name) file = open(self.name, 'r') self.prefix = file.read(self.max_prefix_length) file.close() if other.prefix == -1: if self.stats.verbosity >= 1: other.stats.bump_number_of_prefixes() if self.stats.verbosity >= 2: sys.stderr.write('Getting prefix for file %s\n' % other.name) file = open(other.name, 'r') other.prefix = file.read(self.max_prefix_length) file.close() if self.prefix < other.prefix: return -1 elif self.prefix > other.prefix: return 1 # if we've reached this point, we know that: # 1) The files are not hardlinks of each other # 2) They have identical sizes # 3) Their prefixes are identical # 4) We haven't yet tried doing a cryptographic hash, so compute them if needed, and compare them if self.hash == '': self.gen_hash() if other.hash == '': other.gen_hash() if self.hash < other.hash: return -1 elif self.hash > other.hash: return 1 # if we were asked not to do full comparisons, just trust the result so far - don't do the ultra-expensive # "compare every single byte" procedure if self.no_full_comparisons: return 0 # if we've reached this point, we know that: # 1) The files are not hardlinks of each other # 2) They have identical sizes # 3) Their prefixes are identical # 4) The cryptographic hashes are identical # 5) All that remains is a "long form" comparison, so bite the bullet and do the I/O if self.stats.verbosity >= 1: self.stats.bump_number_of_full_compares() if self.stats.verbosity >= 2: sys.stderr.write('Doing byte for byte comparison of %s and %s\n' % (self.name, other.name)) self_fp = open(self.name, 'r') other_fp = open(other.name, 'r') while 1: self_buf = self_fp.read(self.bufsize) other_buf = other_fp.read(self.bufsize) if self_buf < other_buf: self_fp.close() other_fp.close() return -1 elif self_buf > other_buf: self_fp.close() other_fp.close() return 1 if not self_buf and not other_buf: self_fp.close() other_fp.close() return 0 if not self_buf: self_fp.close() other_fp.close() return -1 if not other_buf: self_fp.close() other_fp.close() return 1 def __repr_(self): return self.name def usage(retval): sys.stderr.write('Usage: %s [-v] [-s] [-0] [-h] [-f file1 file2 ... filen]\n' % sys.argv[0]) sys.stderr.write('-v says to operate verbosely\n') sys.stderr.write('-s says to get filenames from stdin instead of from the command line\n') sys.stderr.write('-0 says that when getting filenames from stdin, assume null termination, not newline termination\n') sys.stderr.write(' Also changes the default output delimeter to a null byte, because some shells cannot handle that\n') sys.stderr.write('-h says to give this help message\n') sys.stderr.write('-f file1 file2 ... filen says to use the listed files, not files fromstdin\n') sys.stderr.write('-p bytes says to cache this many bytes of each file for faster comparisons (may be 0)\n') sys.stderr.write('-d delim says to use "delim" as the output delimeter character within a line\n') sys.stderr.write('-c says to not do full comparisions - instead trust hash comparisons.\n') sys.stderr.write(' Faster at the expense of some accuracy\n') sys.exit(retval) def main(): global verbosity equivs=[] null_termination=0 stdin=0 filenames=[] prefix_length=16 output_delimeter=' ' no_full_comparisons=0 while sys.argv[1:]: if sys.argv[1] == '-v': verbosity += 1 elif sys.argv[1] == '-c': no_full_comparisons=1 elif sys.argv[1] == '-s': stdin=1 elif sys.argv[1] == '-p': prefix_length=string.atoi(sys.argv[2]) del sys.argv[1] elif sys.argv[1] == '-d': output_delimeter=sys.argv[2] del sys.argv[1] elif sys.argv[1] == '-h': usage(0) elif sys.argv[1] == '-0': null_termination = 1 output_delimeter='\0' elif sys.argv[1] == '-f': del sys.argv[1] filenames=sys.argv[1:] break else: usage(1) del sys.argv[1] if stdin + (len(filenames) != 0) != 1: sys.stderr.write('You must give -s or -f, but not both\n') usage(1) if not stdin and null_termination: sys.stderr.write('Sorry, -0 requires -s\n') usage(1) sys.stderr.write('%s initializing\n' % sys.argv[0]) file_list_to_sort = [] if len(filenames) != 0: for filename in filenames: file_list_to_sort.append(file_class(filename, no_full_comparisons)) elif stdin: if null_termination: for line in readline0.readline0(sys.stdin): # strip off the trailing newline if line[-1:] == '\n': line = line[:-1] file_list_to_sort.append(file_class(line, no_full_comparisons)) else: while 1: line = sys.stdin.readline() if not line: break # strip off the trailing newline if line[-1:] == '\n': line = line[:-1] file_list_to_sort.append(file_class(line, no_full_comparisons)) sys.stderr.write('About to sort %d files\n' % len(file_list_to_sort)) file_list_to_sort.sort() sys.stderr.write('Done sorting %d files, will now compare files to neighbors\n' % len(file_list_to_sort)) # for f in file_list_to_sort: # print f.name if file_list_to_sort[0:]: previous=file_list_to_sort[0] previous.set_prefix_length(prefix_length) del file_list_to_sort[0] sys.stdout.write(previous.name) fileno = 0 for f in file_list_to_sort: fileno += 1 if fileno % 1000 == 0: sys.stderr.write('Wrote %d files\n' % fileno) if previous == f: sys.stdout.write('%s%s' % (output_delimeter, f.name)) else: previous = f sys.stdout.write('\n%s' % f.name) sys.stdout.write('\n') main()