#!/usr/bin/env python # Areas for improvement: # 3) There's little sense in processing 0 length files - they're always equal to each other # 4) We might be able to revive the dev+ino test # 5) b4b_class should raise an exception on premature (inconsistent) EOF for callers to use (perhaps especially modified_merge_sort) import os import sys import stat import time try: import hashlib hash_module = hashlib except ImportError: # Ubuntu 9.04's pypy has no hashlib yet (2009-06-12) import md5 hash_module = md5 sys.path.insert(0, os.path.expanduser('~/lib')) verbosity = 0 prefix_length = 1024 block_size = 2**18 null_delimited = False show_uniques = True show_duplicates = True one_per_duplicate = False use_dev_ino = False use_old_sort_code = False sort_test = False filename_to_devino = {} devino_to_prefix_hash = {} devino_to_full_hash = {} def usage(retval): sys.stderr.write('Usage: %s\n' % sys.argv[0]) sys.stderr.write('\t--verbosity n set verbosity to n\n') sys.stderr.write('\t-v increase verbosity by 1\n') sys.stderr.write('\t-h|--help output this message\n') sys.stderr.write('\t--prefix-length n set prefix hash length to n (default: %d)\n' % prefix_length) sys.stderr.write('\t--block-size n set block size to n (default: %d)\n' % block_size) sys.stderr.write('\t--skip-uniques do not output unique files\n') sys.stderr.write('\t--skip-duplicates do not output duplicate files\n') sys.stderr.write('\t--one-per-duplicate for duplicates, output one filename for a given file content\n') sys.stderr.write('\t--use-dev-ino optimize for lots of hard links. Does not work on Windows!\n') sys.stderr.write('\t--sort-test assume all full hashes are collisions so the double merge sort code is tested\n') sys.stderr.write('\t-0 read filenames null separated, not newline separated\n') sys.exit(retval) while sys.argv[1:]: if sys.argv[1] == '--verbosity': verbosity = int(sys.argv[2]) del sys.argv[1] elif sys.argv[1] == '-h': usage(0) elif sys.argv[1] == '-v': verbosity += 1 elif sys.argv[1] == '--prefix-length': prefix_length = int(sys.argv[2]) del sys.argv[1] elif sys.argv[1] == '--block-size': block_size = int(sys.argv[2]) del sys.argv[1] elif sys.argv[1] == '--skip-uniques': show_uniques = False elif sys.argv[1] == '--skip-duplicates': show_duplicates = False elif sys.argv[1] == '--one-per-duplicate': one_per_duplicate = True elif sys.argv[1] == '--use-dev-ino': use_dev_ino = True elif sys.argv[1] == '--sort-test': sort_test = True elif sys.argv[1] == '-0': null_delimited = True else: sys.stderr.write('Illegal argument: %s\n' % sys.argv[1]) usage(1) del sys.argv[1] try: import psyco psyco.full() except: if verbosity >= 1: sys.stderr.write("Warning: psyco not found. We'll run slower this way, but that's just a microoptimization.\n") else: if verbosity >= 1: sys.stderr.write("Good, we have psyco - we'll run faster this way.\n") if null_delimited: import readline0 unique = [] total_file_count = 0 possible_remaining_dups = 0 def separate_uniques(dict, initial_time, end = False): global unique global possible_remaining_dups if verbosity >= 2: sys.stderr.write('Separating out the unique values...\n') new_uniques = 0 if verbosity >= 2: sys.stderr.write('Got a unique value\n') tuples = dict.items() for key, values in tuples: if values[0:] and not values[1:]: new_uniques += 1 unique.append(values[0]) del dict[key] possible_remaining_dups -= new_uniques if verbosity >= 1: if end: s = 'duplicates' t = '' else: s = 'ambiguous' t = ' - but this stage informs later stages' denominator = time.time() - initial_time if denominator == 0.0: # can't divide by zero of course denominator = 0.000001 uniques_per_second = new_uniques / denominator sys.stderr.write('Got %d new unique values, %d %s remain\n%f uniques found per second%s\n' % \ (new_uniques, possible_remaining_dups, s, uniques_per_second, t)) def by_size(single_iterator, strip_newlines = False): global total_file_count size_dict = {} for filename in single_iterator: if strip_newlines and filename[-1:] == '\n': filename = filename[:-1] total_file_count += 1 if verbosity >= 3: sys.stderr.write('stat()ing %s\n' % filename) try: statbuf = os.stat(filename) except OSError: sys.stderr.write('Error accessing %s - removing from list and continuing\n' % filename) continue length = statbuf.st_size if size_dict.has_key(length): size_dict[length].append(filename) else: size_dict[length] = [filename] filename_to_devino[filename] = (statbuf.st_dev, statbuf.st_ino) return size_dict def get_prefix_hash(filename): if verbosity >= 3: sys.stderr.write('prefix hashing %s\n' % filename) if use_dev_ino: devino = filename_to_devino[filename] if devino_to_prefix_hash.has_key(devino): if verbosity >= 2: sys.stderr.write('Pulled prefix hash from devino_to_prefix_hash\n') return devino_to_prefix_hash[devino] m = hash_module.md5() file = open(filename, 'r') m.update(file.read(prefix_length)) file.close() result = m.hexdigest() if use_dev_ino: devino_to_prefix_hash[devino] = result return result def get_full_hash(filename): if verbosity >= 3: sys.stderr.write('full hashing %s\n' % filename) if use_dev_ino: devino = filename_to_devino[filename] if devino_to_full_hash.has_key(devino): if verbosity >= 2: sys.stderr.write('Pulled prefix hash from devino_to_prefix_hash\n') return devino_to_full_hash[devino] m = hash_module.md5() file = open(filename, 'r') while 1: block = file.read(block_size) if not block: break m.update(block) file.close() result = m.hexdigest() if use_dev_ino: devino_to_full_hash[devino] = result return result def by_prefix_hash(double_iterator): prefix_hash_dict = {} pass_through_count = 0 for original_key, values in double_iterator: if original_key <= prefix_length: # This is a "small" file - there's no sense in prefix hashing it now, and full hashing it later. # So pass it through this stage unchecked. if verbosity >= 2: len_values = len(values) sys.stderr.write('Passing through %d short files of length %d\n' % (len_values, original_key)) prefix_hash_dict['%d/' % original_key] = values pass_through_count += len(values) else: if verbosity >= 2: len_values = len(values) sys.stderr.write('Doing prefix hash of %d items for %s\n' % (len_values, original_key)) for filename in values: try: new_key = '%s/%s' % (original_key, get_prefix_hash(filename)) if prefix_hash_dict.has_key(new_key): prefix_hash_dict[new_key].append(filename) else: prefix_hash_dict[new_key] = [filename] except IOError: sys.stderr.write('Error accessing %s - removing from list and continuing\n' % filename) if verbosity >= 1: sys.stderr.write('Passed through %d small files without prefix hashing\n' % pass_through_count) return prefix_hash_dict def by_full_hash(double_iterator): full_hash_dict = {} pass_through_count = 0 for original_key, values in double_iterator: len_values = len(values) if len_values == 1: sys.stderr.write('Error: Unique vaue in by_full_hash\n') sys.exit(1) elif len_values == 2: # If there are exactly 2 values that may or may not be equal, skip the full hash, as the b4b comparison # that comes after this will take the same or less time, and give greater assurance that the files # are different if verbosity >= 2: sys.stderr.write('Skipping full hash of two items (because there are but two and hashing won\'t add anything)\n') full_hash_dict[original_key+'/'] = values pass_through_count += 1 else: if verbosity >= 2: sys.stderr.write('Doing full hash of %d items for %s\n' % (len_values, original_key)) for filename in values: try: new_key = '%s/%s' % (original_key, get_full_hash(filename)) if full_hash_dict.has_key(new_key): full_hash_dict[new_key].append(filename) else: full_hash_dict[new_key] = [filename] except IOError: sys.stderr.write('Error accessing %s - removing from list and continuing\n' % filename) if verbosity >= 1: sys.stderr.write('Passed through %d pairs without full hashing\n' % pass_through_count) return full_hash_dict # Our lists (passed to double_merge_sort) will normally be of equal values, because (at one time at least) the values all had the # same length, prefix hash and full hash. So instead of incurring a O(nlogn) sort right off the bat, first we check if the elements # are all the same in O(n) time. # # -And- we optimize a little bit as we go - we make this scan for sameness a degenerate first pass through the sort. The algorithm # in this function is to compare the 1..n-1 elements to the 0th element, each in turn (this helps maximize the benefit of the buffer # cache too). # # If the 1..n-1 elements are all the same as the first element, the list is equal already - no need to sort. # # If they're not all the same as the first element, hopefully the first element at least isn't unique, so we get started on setting # up buckets of equal values - which is really what double_merge_sort is doing. # # In short, we get started on setting up buckets of equal values before we start sorting: So if we don't need to sort, we don't, # and if we do need to start, we have a headstart def list_of_lists_from_likely_same(lst): if not lst[1:]: # if there's no second element, the list is either empty, or has a single value. Return the list return lst magic_element = lst[0] sames = [ magic_element ] maybe_differents = [] for consider_element in lst[1:]: comparison_result = cmp(magic_element, consider_element) if magic_element == consider_element: sames.append(consider_element) else: maybe_differents.append(consider_element) lst2 = [ sames ] + [ [x] for x in maybe_differents ] return lst2 # we first check if the list is a bunch of equal values, because frequently, the list will be def double_merge_sort(lst): # convert to a list of lists and then sort recursively lst2 = list_of_lists_from_likely_same(lst) if lst2: # lst2 is not empty if not lst2[1:]: # the list is not empty, but we have no second element, so the values in lst were already all equal: return lst2 return lst2 if lst2[1:] and not lst2[2:]: # we have two buckets but not a third - this too is already "sorted" for our purposes - because # we don't care if a < b or a > b, we only care that a != b return lst2 else: # lst2 is empty - of course it's already sorted return lst2 return double_merge_sort_backend(lst2) def merge_or_start_new(result, bucket): if result[0:] and bucket[0] == result[-1][0]: # bucket has elements that match the end of result, so merge result[-1] += bucket else: # bucket has elements that are different from the end of result, so append, starting a new bucket in result result.append(bucket) # we merge sorted lists, as is traditional, but we additionally merge buckets of same values together # we expect as input something like [ [1], [2], [5], [4], [3], [2] ] and return [ [1], [2, 2], [3], [4], [5] ] # FIXME: # We also should eliminate files that give I/O errors or similar, reporting an error def double_merge_sort_backend(lst): length = len(lst) if length <= 1: return lst midpoint = length/2 list1 = double_merge_sort_backend(lst[:midpoint]) list2 = double_merge_sort_backend(lst[midpoint:]) index1 = 0 index2 = 0 len1 = len(list1) len2 = len(list2) result = [] while index1 < len1 and index2 < len2: # sometimes we do get empty lists - because of the bad file removals if not list1[index1]: index1 += 1 continue if not list2[index2]: index2 += 1 continue try: comparison = cmp(list1[index1][0], list2[index2][0]) except IOError: # One or both of the files we just tried to compare gave an IOError. # Skip the file or files that caused the error. if not good_file(list1[index1][0]): del list1[index1][0] if list1[index1]: index1 += 1 else: del list1[index1] if not good_file(list2[index2][0]): del list2[index2][0] if list2[index2]: index2 += 1 else: del list2[index2] continue if comparison < 0: merge_or_start_new(result, list1[index1]) index1 += 1 elif comparison > 0: merge_or_start_new(result, list2[index2]) index2 += 1 else: # they're equal - collapse #merge_or_start_new(result, list1[index1]) #merge_or_start_new(result, list2[index2]) # these are lists of lists of equal values, so we can concatenate lists of equal values to save some time merge_or_start_new(result, list1[index1] + list2[index2]) index1 += 1 index2 += 1 while index1 < len1: merge_or_start_new(result, list1[index1]) index1 += 1 while index2 < len2: merge_or_start_new(result, list2[index2]) index2 += 1 return result class b4b_class: def __init__(self, filename): self.filename = filename def __str__(self): return self.filename def __cmp__(self, other): left_filename = self.filename right_filename = other.filename if verbosity >= 3: sys.stderr.write('byte for byte comparing %s and %s\n' % (left_filename, right_filename)) if use_dev_ino: left_devino = filename_to_devino[left_filename] right_devino = filename_to_devino[right_filename] if use_dev_ino and left_devino == right_devino: return 0 left_file = open(left_filename, 'r') right_file = open(right_filename, 'r') left_eof = False right_eof = False while 1: left_block = left_file.read(block_size) if not left_block: left_eof = True right_block = right_file.read(block_size) if not right_block: right_eof = True if left_eof: if right_eof: # good, we hit EOF at the same time return 0 else: sys.stderr.write('Warning: EOF on %s but not %s, even though they previously had the same length\n' % (left_filename, right_filename)) return -1 else: if right_eof: sys.stderr.write('Warning: EOF on %s but not %s, even though they previously had the same length\n' % (right_filename, left_filename)) # good, we hit EOF at the same time return 1 else: # We still have more to compare if left_block < right_block: return -1 elif left_block > right_block: return 1 # we should never reach this point sys.stderr.write('This should never happen\n') exit(1) def good_file(filename): try: file = open(filename, 'r') while 1: block = file.read(blocksize) if not block: break file.close() except: return False return True # we should replace the values.sort() with something that will coalesce identical items into a single item! def by_byte_for_byte_comparison(single_iterator): # b4b_dict[distinguisher] perhaps should be a list! It's basically an array with hash overhead b4b_dict = {} distinguisher = 0 for values in single_iterator: len_values = len(values) if len_values == 1: sys.stderr.write('Error: Unique vaue in by_byte_for_byte_comparison\n') sys.exit(1) continue b4b_list = map(b4b_class, values) if verbosity >= 2: sys.stderr.write('Sorting %d values, byte for byte, via double_merge_sort\n' % (len_values)) # Yes, we're sorting, but we're sorting lots of tiny lists - should be very fast. # Also, we only rarely sort - when there's a hash collision or a file's content changes during the run. sorted_buckets = double_merge_sort(b4b_list) for bucket in sorted_buckets: distinguisher += 1 b4b_dict[distinguisher] = bucket return b4b_dict if False: # quick double_merge_sort test lst = [ [1], [2], [3], [2], [5], [2], [2], [2], [3] ] #import random #lst = [ int(random.random()*100) for x in xrange(200) ] print double_merge_sort(lst) # this annoying sleep fixes an even more annoying sys.excepthook error time.sleep(0.1) sys.exit(0) # actually, it'd be a little faster to merge the separate_uniques calls into the various by_* functions, but: # 1) That's not as good software engineering. # 2) It gives the user a warm fuzzy feeling to get some indication of how well a given stage did. if verbosity >= 1: sys.stderr.write('----------\nGetting size_dict: %s\n' % time.ctime()) initial_time = time.time() if null_delimited: iterator = readline0.readline0(sys.stdin) size_dict = by_size(iterator) else: iterator = sys.stdin size_dict = by_size(iterator, strip_newlines=True) if verbosity >= 1: sys.stderr.write('Total file count is %d\n' % total_file_count) possible_remaining_dups = total_file_count separate_uniques(size_dict, initial_time) if verbosity >= 1: sys.stderr.write('----------\nGetting prefix_hash_dict from size_dict: %s\n' % (time.ctime())) initial_time = time.time() prefix_hash_dict = by_prefix_hash(size_dict.items()) separate_uniques(prefix_hash_dict, initial_time) del size_dict if verbosity >= 1: sys.stderr.write('----------\nGetting full_hash_dict from prefix_hash_dict: %s\n' % (time.ctime())) initial_time = time.time() full_hash_dict = by_full_hash(prefix_hash_dict.items()) separate_uniques(full_hash_dict, initial_time) del prefix_hash_dict if verbosity >= 1: sys.stderr.write('----------\nGetting duplicates_only_dict from full_hash_dict: %s\n' % (time.ctime())) initial_time = time.time() duplicates_only_dict = by_byte_for_byte_comparison(full_hash_dict.values()) separate_uniques(duplicates_only_dict, initial_time, end=True) del full_hash_dict if verbosity >= 1: sys.stderr.write('Final result: Got %d unique files and %d distinct duplicates\n' % (len(unique), len(duplicates_only_dict))) # display all the unique files if show_uniques: for u in unique: print u # now display all the duplicates if show_duplicates: for dups in duplicates_only_dict.values(): if one_per_duplicate: print str(dups[0]) else: print ' '.join(map(str, dups))