#!/usr/bin/python3

# Interestingly, pypy (2.x) and pypy3 both run this code much more slowly than cpython 2.7 or cpython 3.3...

# pylint: disable=superfluous-parens
# superfluous-parens: Parentheses are good for clarity and portability

# Pylint note: We're ignoring the hashlib.md5 thing in this-pylint instead of here, so we don't have to turn
# off missing attribute checking globally

"""Divide a group of files into like classes - rapidly."""

# Areas for improvement:
# 3) There's little sense in processing 0 length files - they're always equal to each other
# 4) We might be able to revive the dev+ino test
# 5) B4bClass should raise an exception on premature (inconsistent) EOF for callers to use (perhaps especially modified_merge_sort)

import os
import sys
# import stat
import time
# import pprint
import hashlib
import functools

from typing import List, Dict, Iterable, Tuple, Any, ByteString

sys.path.insert(0, os.path.expanduser('~/lib'))
sys.path.insert(0, '/usr/local/lib')

# pylint: disable=wrong-import-position
import bufsock  # noqa: ignore=E402

try:
    import readline0
except ImportError:
    HAVE_READLINE0 = False
else:
    HAVE_READLINE0 = True


def usage(retval: int) -> None:
    """Output a usage message."""
    sys.stderr.write('Usage: %s\n' % sys.argv[0])
    sys.stderr.write('\t--verbosity n set verbosity to n\n')
    sys.stderr.write('\t-v increase verbosity by 1\n')
    sys.stderr.write('\t-h|--help output this message\n')
    sys.stderr.write('\t--prefix-length n set prefix hash length to n\n')
    sys.stderr.write('\t--block-size n set block size to n\n')
    sys.stderr.write('\t--skip-uniques do not output unique files\n')
    sys.stderr.write('\t--skip-duplicates do not output duplicate files\n')
    sys.stderr.write('\t--one-per-duplicate for duplicates, output one filename for a given file content\n')
    sys.stderr.write('\t--use-dev-ino optimize for lots of hard links. Does not work on Windows!\n')
    sys.stderr.write('\t--delete-newer for a given group of like files, delete all but the oldest\n')
    sys.stderr.write('\t-0 read filenames null separated, not newline separated\n')
    sys.stderr.write('\t--canonicalize Use some extra sorting for the sake of comparison with equivs2\n')
    sys.exit(retval)


class DevinoStuffClass(object):  # pylint: disable=too-few-public-methods
    """Just a container for the device # and inode # stuff."""

    def __init__(self):
        """Initialize."""
        self.filename_to_devino = {}
        self.devino_to_prefix_hash = {}
        self.devino_to_full_hash = {}


class LeftException(Exception):
    """A simple exception for when the left file in a file comparison gives some sort of error during open or read."""

    pass


class RightException(Exception):
    """A simple exception for when the right file in a file comparison gives some sort of error during open or read."""

    pass


class B4bClass(object):
    """Handle byte for byte comparisons of two files."""

    blocksize = 2**20
    devino_stuff = None  # type: DevinoStuffClass
    verbosity = 1

    def __init__(self, filename) -> None:
        """Initialize."""
        self.filename = filename

    def __str__(self) -> str:
        """Convert to string."""
        if isinstance(self.filename, str):
            return 'B4BClass({})'.format(self.filename)
        return 'B4BClass({})'.format(self.filename.decode('ISO-8859-1'))

    __repr__ = __str__

    def bytes(self) -> ByteString:
        """Convert to bytes."""
        if isinstance(self.filename, str):
            return bytes(self.filename, 'latin-1')
        return self.filename

    def __lt__(self, other) -> bool:
        """Return True iff self < other."""
        if isinstance(other, B4bClass):
            cmp_result = self.compare(other)
            return cmp_result < 0
        return NotImplemented

    def __gt__(self, other) -> bool:
        """Return True iff self > other."""
        if isinstance(other, B4bClass):
            cmp_result = self.compare(other)
            return cmp_result > 0
        return NotImplemented

    def __eq__(self, other) -> bool:
        """Return True iff self == other."""
        if isinstance(other, B4bClass):
            cmp_result = self.compare(other)
            return cmp_result == 0
        return NotImplemented

    # def __cmp__(self, other: 'B4bClass') -> int:
    #     """Return -1 for <, 1 for > or 0 for ==."""
    #     return self.compare(other)

    def compare(self, other: 'B4bClass') -> int:  # noqa pylint: disable=too-many-branches
        """Compare two files in a rather strange, but quite effective, way."""
        left_filename = self.filename
        right_filename = other.filename
        log(self.verbosity, 3, 'byte for byte comparing %s and %s\n' % (left_filename, right_filename))
        if self.devino_stuff is not None:
            left_devino = self.devino_stuff.filename_to_devino[left_filename]
            right_devino = self.devino_stuff.filename_to_devino[right_filename]
            if left_devino == right_devino:
                return 0
        try:
            left_file = open(left_filename, 'rb')
        except (IOError, OSError):
            raise LeftException
        try:
            right_file = open(right_filename, 'rb')
        except (IOError, OSError):
            left_file.close()
            raise RightException
        try:
            left_eof = False
            right_eof = False
            while True:
                try:
                    left_block = left_file.read(self.blocksize)
                except (IOError, OSError):
                    raise LeftException
                if not left_block:
                    left_eof = True
                try:
                    right_block = right_file.read(self.blocksize)
                except (IOError, OSError):
                    raise RightException
                if not right_block:
                    right_eof = True
                if left_eof:
                    if right_eof:
                        # good, we hit EOF at the same time
                        return 0
                    sys.stderr.write('Warning: EOF on %s but not %s, even though they previously had the same length\n' % (
                        left_filename,
                        right_filename,
                    ))
                    return -1
                else:
                    if right_eof:
                        sys.stderr.write('Warning: EOF on %s but not %s, even though they previously had the same length\n' % (
                            right_filename,
                            left_filename,
                        ))
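                        # A length mismatch here means a file's size changed after the earlier by-size
                        # pass grouped these two files together.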
                        # right hit EOF before left
                        return 1
                    else:
                        # We still have more to compare
                        # comparison_result = cmp(left_block, right_block)
                        if left_block < right_block:
                            return -1
                        elif left_block > right_block:
                            return 1
        finally:
            left_file.close()
            right_file.close()

        # we should never reach this point
        sys.stderr.write('This should never happen\n')
        sys.exit(1)


def separate_uniques(
        unique: List[str],
        num_possible_remaining_dups: int,
        verbosity: int,
        possible_dups_dict: Dict[str, List[str]],
        initial_time: float,
        end: bool = False,
):  # pylint: disable=R0913
    # R0913: we need lots of arguments
    """Separate out the recently-recognized unique values from the still-possibly-duplicate files."""
    log(verbosity, 2, 'Separating out the unique values...\n')
    new_uniques = 0
    tuples = list(possible_dups_dict.items())
    for key, values in tuples:
        if values[0:] and not values[1:]:
            log(verbosity, 2, 'Got a unique value\n')
            new_uniques += 1
            unique.append(values[0])
            del possible_dups_dict[key]
    num_possible_remaining_dups -= new_uniques
    if verbosity >= 1:
        if end:
            dupli_ambi = 'duplicates'
            informs = ''
        else:
            dupli_ambi = 'ambiguous'
            informs = ' - but this stage informs later stages'
        denominator = time.time() - initial_time
        if denominator == 0.0:
            # can't divide by zero of course
            denominator = 0.000001
        uniques_per_second = new_uniques / denominator
        sys.stderr.write('Got %d new unique values, %d %s remain\n%f uniques found per second%s\n' %
                         (new_uniques, num_possible_remaining_dups, dupli_ambi, uniques_per_second, informs))
    return num_possible_remaining_dups


def by_size(
        verbosity: int,
        devino_stuff: DevinoStuffClass,
        total_file_count: int,
        single_iterator: Iterable[str],
):
    """
    Divide files into possibly-same groups based on their sizes.

    This isn't that accurate, but it's quite fast, and large, expensive files tend to have unique lengths
    """
    size_dict = {}  # type: Dict[int, List[str]]
    for filename in single_iterator:
        total_file_count += 1
        log(verbosity, 3, 'stat()ing %s\n' % filename)
        try:
            statbuf = os.stat(filename)
        except OSError:
            sys.stderr.write('Error accessing %s - removing from list and continuing\n' % filename)
            continue
        length = statbuf.st_size
        if length in size_dict:
            size_dict[length].append(filename)
        else:
            size_dict[length] = [filename]
        if devino_stuff:
            devino_stuff.filename_to_devino[filename] = (statbuf.st_dev, statbuf.st_ino)
    return total_file_count, size_dict


def get_prefix_hash(
        verbosity: int,
        devino_stuff: DevinoStuffClass,
        prefix_length: int,
        filename: str,
):
    """Get a hash from a file's initial prefix."""
    log(verbosity, 3, 'prefix hashing %s\n' % filename)
    if devino_stuff:
        devino = devino_stuff.filename_to_devino[filename]
        if devino in devino_stuff.devino_to_prefix_hash:
            if verbosity >= 2:
                sys.stderr.write('Pulled prefix hash from devino_to_prefix_hash\n')
            return devino_stuff.devino_to_prefix_hash[devino]
    md5_hash = hashlib.md5()
    file_ = open(filename, 'rb')
    md5_hash.update(file_.read(prefix_length))
    file_.close()
    result = md5_hash.hexdigest()
    if devino_stuff:
        devino_stuff.devino_to_prefix_hash[devino] = result
    return result


def get_full_hash(
        verbosity: int,
        devino_stuff: DevinoStuffClass,
        blocksize: int,
        filename: str,
):
    """Get a file's full hash."""
    log(verbosity, 3, 'full hashing %s\n' % filename)
    if devino_stuff:
        devino = devino_stuff.filename_to_devino[filename]
        if devino in devino_stuff.devino_to_full_hash:
            if verbosity >= 2:
                sys.stderr.write('Pulled full hash from devino_to_full_hash\n')
            return devino_stuff.devino_to_full_hash[devino]
    md5_hash = hashlib.md5()
    file_ = open(filename, 'rb')
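    # Hash the file in fixed-size blocks so memory use stays bounded even for very large files.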
    while True:
        block = file_.read(blocksize)
        if not block:
            break
        md5_hash.update(block)
    file_.close()
    result = md5_hash.hexdigest()
    if devino_stuff:
        devino_stuff.devino_to_full_hash[devino] = result
    return result


def by_prefix_hash(
        verbosity: int,
        devino_stuff: DevinoStuffClass,
        prefix_length: int,
        double_iterator: Iterable[Tuple[int, List[str]]],
):  # pylint: disable=too-many-nested-blocks
    """
    Compare files by a hash of an initial prefix of the file.

    If they have the same prefix hash, they might be equal files.
    If they do not have the same prefix hash, they are definitely different files.

    This isn't as thorough as a full hash or byte for byte compare, but it's fast.
    """
    prefix_hash_dict = {}
    small_file_pass_through_count = 0
    pair_pass_through_count = 0
    for original_key, values in double_iterator:
        if original_key <= prefix_length:
            # This is a "small" file - there's no sense in prefix hashing it now, and full hashing it later.
            # So pass it through this stage unchecked.
            if verbosity >= 2:
                len_values = len(values)
                sys.stderr.write('Passing through %d short files of length %d\n' % (len_values, original_key))
            prefix_hash_dict['%d/' % original_key] = values
            small_file_pass_through_count += len(values)
        else:
            if values[1:] and not values[2:]:
                if verbosity >= 2:
                    sys.stderr.write('Skipping prefix hash of two items ')
                    sys.stderr.write('(because there are but two and hashing won\'t add anything)\n')
                new_key = '%s/' % original_key
                prefix_hash_dict[new_key] = values
                pair_pass_through_count += 1
            else:
                if verbosity >= 2:
                    len_values = len(values)
                    sys.stderr.write('Doing prefix hash of %d items for %s\n' % (len_values, original_key))
                for filename in values:
                    try:
                        new_key = '%s/%s' % (original_key, get_prefix_hash(verbosity, devino_stuff, prefix_length, filename))
                        if new_key in prefix_hash_dict:
                            prefix_hash_dict[new_key].append(filename)
                        else:
                            prefix_hash_dict[new_key] = [filename]
                    except (IOError, OSError):
                        sys.stderr.write('Error accessing %s - removing from list and continuing\n' % filename)
    log(verbosity, 1, 'Passed through %d small files without prefix hashing\n' % small_file_pass_through_count)
    log(verbosity, 1, 'Passed through %d pairs without prefix hashing\n' % pair_pass_through_count)
    return prefix_hash_dict


def by_full_hash(
        verbosity: int,
        devino_stuff: DevinoStuffClass,
        blocksize: int,
        double_iterator: Iterable[Tuple[str, List[str]]],
):
    """
    Compare values by their complete hashes.

    If they have the same hash, they're almost certainly equal files.
    If they do not have the same hash, they are definitely different files.

    This isn't as thorough as a byte for byte compare, but it's fast-ish.
    """
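    # Illustrative key shapes at this stage (the hash values below are made up):
    #   incoming key:  '4096/9a0364b9e99bb480dd25e1f0284c8555'             (size/prefix-hash)
    #   outgoing key:  '4096/9a0364b9e99bb480dd25e1f0284c8555/0cc175b9...'  (size/prefix-hash/full-hash)
    # Files that still share an outgoing key go on to the byte for byte stage.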
""" full_hash_dict = {} # type: Dict[str, List[str]] pass_through_count = 0 for original_key, values in double_iterator: len_values = len(values) if len_values == 1: sys.stderr.write('Error: Unique value in by_full_hash\n') sys.exit(1) elif len_values == 2: # If there are exactly 2 values that may or may not be equal, skip the full hash, as the b4b comparison # that comes after this will take the same or less time, and give greater assurance that the files # are different if verbosity >= 2: sys.stderr.write('Skipping full hash of two items (because there are but two and hashing won\'t add anything)\n') full_hash_dict[original_key+'/'] = values pass_through_count += 1 else: if verbosity >= 2: sys.stderr.write('Doing full hash of %d items for %s\n' % (len_values, original_key)) for filename in values: try: new_key = '%s/%s' % (original_key, get_full_hash(verbosity, devino_stuff, blocksize, filename)) if new_key in full_hash_dict: full_hash_dict[new_key].append(filename) else: full_hash_dict[new_key] = [filename] except (IOError, OSError): sys.stderr.write('Error accessing %s - removing from list and continuing\n' % filename) log(verbosity, 1, 'Passed through %d pairs without full hashing\n' % pass_through_count) return full_hash_dict def list_of_lists_from_likely_same( verbosity: int, lst: List[B4bClass], ): """ Optimization for lists that almost-must be the same. Our lists (passed to double_merge_sort) will normally be of equal values, because (at one time at least) the values all had the same length, prefix hash and full hash. So instead of incurring a O(nlogn) sort right off the bat, first we check if the elements are all the same in O(n) time. -And- we optimize a little bit as we go - we make this scan for sameness a degenerate first pass through the sort. The algorithm in this function is to compare the 1..n-1 elements to the 0th element, each in turn (this helps maximize the benefit of the buffer cache too). If the 1..n-1 elements are all the same as the first element, the list is equal already - no need to sort. If they're not all the same as the first element, hopefully the first element at least isn't unique, so we get started on setting up buckets of equal values - which is really what double_merge_sort is doing. In short, we get started on setting up buckets of equal values before we start sorting: So if we don't need to sort, we don't, and if we do need to start, we have a little headstart. """ if not lst[1:]: # if there's no second element, the list is either empty, or has a single value. 
    magic_element = lst[0]
    sames = [magic_element]
    maybe_differents = []
    tail = lst[1:]
    len_tail = len(tail)
    for consider_element_no in range(len_tail-1, -1, -1):
        consider_element = tail[consider_element_no]
        try:
            comparison_result = magic_element.compare(consider_element)
        except LeftException:
            sys.stderr.write('Error accessing %s - skipping linear sameness check and continuing\n' % sames[0])
            del sames[0]
            maybe_differents.extend(tail[:consider_element_no+1])
            break
        except RightException:
            sys.stderr.write('Error accessing %s - removing from list and continuing\n' % consider_element)
            continue
        else:
            if comparison_result == 0:
                sames.append(consider_element)
            else:
                maybe_differents.append(consider_element)
    if sames:
        lst2 = [sames] + [[x] for x in maybe_differents]
    else:
        lst2 = [[x] for x in maybe_differents]
    if maybe_differents:
        log(verbosity, 2, 'One or more files were found different via linear check\n')
    else:
        log(verbosity, 2, 'All files were found same via linear check\n')
    return lst2


# we first check if the list is a bunch of equal values, because frequently, the list will be
def double_merge_sort(
        verbosity: int,
        lst: List[B4bClass],
):
    """Merge sort that compresses things down to a list of sublists, where the sublists have equal values."""
    # convert to a list of lists and then sort recursively
    lst2 = list_of_lists_from_likely_same(verbosity, lst)
    if lst2:
        # lst2 is not empty
        if not lst2[1:]:
            # the list is not empty, but we have no second element, so the values in lst were already all equal:
            # return lst2
            return lst2
        if lst2[1:] and not lst2[2:]:
            # we have two buckets but not a third - this too is already "sorted" for our purposes - because
            # we don't care if a < b or a > b, we only care that a != b
            return lst2
    else:
        # lst2 is empty - of course it's already sorted
        return lst2
    return double_merge_sort_backend(verbosity, lst2)


class InvalidComparison(Exception):
    """
    An exception to raise when one or both of two lists of previously-equal values are no longer legitimate.

    That is, there are no longer any valid values in one or both lists.
    """

    pass


def any_equal(list1: List[Any], list2: List[Any]):
    """
    Return True iff any element of list1 == any element of list2.

    We pretend that all elements are good, even though some may be unusable.
    """
    len_list1 = len(list1)
    len_list2 = len(list2)
    list1_index = 0
    list2_index = 0
    while list1_index < len_list1 and list2_index < len_list2:
        try:
            if list1[list1_index] == list2[list2_index]:
                result = True
                break
            else:
                result = False
                break
        except LeftException:
            # list1[list1_index] has problems.  Skip it and try again.
            list1_index += 1
            continue
        except RightException:
            # list2[list2_index] has problems.  Skip it and try again.
            list2_index += 1
            continue
    else:
        # list1 and/or list2 is/are 100% invalid.  Raise an exception.
        raise InvalidComparison
    return result


def merge_or_start_new(result: List[List[str]], bucket: List[str]):
    """Either start a new bucket, or add to an existing bucket.  Part of our double merge sort."""
    try:
        if result[0:] and any_equal(bucket, result[-1]):
            # bucket has elements that match the end of result, so merge
            result[-1] += bucket
        else:
            # bucket has elements that are different from the end of result, so
            # append, starting a new bucket in result
            result.append(bucket)
    except InvalidComparison:
        sys.stderr.write('{}: One or more files that used to be readable no longer are. Results will be suspect\n'.format(
            sys.argv[0],
        ))
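

# Illustrative behavior of merge_or_start_new, above (hypothetical values; equal letters stand for
# byte-identical files):
#   result == [['a', 'a']], bucket == ['a']  ->  result becomes [['a', 'a', 'a']]
#   result == [['a', 'a']], bucket == ['b']  ->  result becomes [['a', 'a'], ['b']]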


def double_merge_sort_backend(
        verbosity: int,
        lst: List[List[str]],
):  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    """
    We merge sorted lists, as is traditional, but we additionally merge buckets of same values together.

    We expect as input something like [ [1], [2], [5], [4], [3], [2] ] and return [ [1], [2, 2], [3], [4], [5] ].
    """
    length = len(lst)
    if length <= 1:
        return lst
    midpoint = int(length / 2)
    list1 = double_merge_sort_backend(verbosity, lst[:midpoint])
    list2 = double_merge_sort_backend(verbosity, lst[midpoint:])
    index1 = 0
    index2 = 0
    len1 = len(list1)
    len2 = len(list2)
    result = []  # type: List[List[str]]
    while index1 < len1 and index2 < len2:
        # sometimes we do get empty lists - because of the bad file removals
        if not list1[index1:]:
            len1 = len(list1)
            break
        if not list2[index2:]:
            len2 = len(list2)
            break
        if not list1[index1]:
            index1 += 1
            continue
        if not list2[index2]:
            index2 += 1
            continue
        try:
            comparison = list1[index1][0].compare(list2[index2][0])
        except LeftException:
            log(verbosity, 1, 'Error accessing %s - removing from list and continuing\n' % list1[index1][0])
            del list1[index1][0]
            if list1[index1]:
                index1 += 1
            else:
                del list1[index1]
            continue
        except RightException:
            log(verbosity, 2, 'Error accessing %s - removing from list and continuing\n' % list2[index2][0])
            del list2[index2][0]
            if list2[index2]:
                index2 += 1
            else:
                del list2[index2]
            continue
        if comparison < 0:
            merge_or_start_new(result, list1[index1])
            index1 += 1
        elif comparison > 0:
            merge_or_start_new(result, list2[index2])
            index2 += 1
        else:
            # they're equal - collapse
            # these are lists of lists of equal values, so we can concatenate lists of equal values to save some time
            merge_or_start_new(result, list1[index1] + list2[index2])
            index1 += 1
            index2 += 1
    while index1 < len1:
        merge_or_start_new(result, list1[index1])
        index1 += 1
    while index2 < len2:
        merge_or_start_new(result, list2[index2])
        index2 += 1
    return result


# we should replace the values.sort() with something that will coalesce identical items into a single item!
def by_byte_for_byte_comparison(
        verbosity: int,
        double_iterator: Iterable[Tuple[str, List[str]]],
):
    """Compare files by byte for byte comparisons."""
    # b4b_dict[distinguisher] perhaps should be a list!  It's basically an array with hash overhead
    b4b_dict = {}
    distinguisher = 0
    for key, values in double_iterator:
        len_values = len(values)
        if len_values == 1:
            sys.stderr.write('Error: Unique value in by_byte_for_byte_comparison\n')
            sys.exit(1)
        b4b_list = [B4bClass(value) for value in values]
        if verbosity >= 2:
            sys.stderr.write('Sorting %d values, byte for byte, via double_merge_sort for %s\n' % (len_values, key))
        # Yes, we're sorting, but we're sorting lots of tiny lists - should be very fast.
        # Also, we only rarely sort - when there's a hash collision or a file's content changes during the run.
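        # The result is a list of buckets, each bucket holding byte-identical files - for example
        # (hypothetically) [[B4bClass('a'), B4bClass('b')], [B4bClass('c')]] when a and b are identical.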
        sorted_buckets = double_merge_sort(verbosity, b4b_list)
        for bucket in sorted_buckets:
            distinguisher += 1
            b4b_dict[distinguisher] = [b.filename for b in bucket]
    return b4b_dict


def log(
        verbosity: int,
        level: int,
        message: str,
):
    """Log a message to stderr, but only if we're at an appropriate log level."""
    if verbosity >= level:
        sys.stderr.write(message)


def deal_with_command_line_options():  # pylint: disable=too-many-branches
    """Pretty self-explanatory."""
    verbosity = 0
    prefix_length = 1024
    blocksize = 2**18
    null_delimited = False
    show_uniques = True
    show_duplicates = True
    one_per_duplicate = False
    use_dev_ino = False
    # sort_test = False
    delete_newer = False
    canonicalize = False
    while sys.argv[1:]:
        if sys.argv[1] == '--verbosity':
            verbosity = int(sys.argv[2])
            del sys.argv[1]
        elif sys.argv[1] == '-h':
            usage(0)
        elif sys.argv[1] == '-v':
            verbosity += 1
        elif sys.argv[1] == '--prefix-length':
            prefix_length = int(sys.argv[2])
            del sys.argv[1]
        elif sys.argv[1] == '--block-size':
            blocksize = int(sys.argv[2])
            del sys.argv[1]
        elif sys.argv[1] == '--skip-uniques':
            show_uniques = False
        elif sys.argv[1] == '--skip-duplicates':
            show_duplicates = False
        elif sys.argv[1] == '--one-per-duplicate':
            one_per_duplicate = True
        elif sys.argv[1] == '--use-dev-ino':
            use_dev_ino = True
        elif sys.argv[1] == '--delete-newer':
            delete_newer = True
        elif sys.argv[1] == '--canonicalize':
            canonicalize = True
        elif sys.argv[1] == '-0':
            if HAVE_READLINE0:
                null_delimited = True
            else:
                sys.stderr.write('%s: Apologies - we have no readline0 so -0 is unavailable\n' % sys.argv[0])
        else:
            sys.stderr.write('Illegal argument: %s\n' % sys.argv[1])
            usage(1)
        del sys.argv[1]
    return \
        verbosity, \
        prefix_length, \
        blocksize, \
        show_uniques, \
        show_duplicates, \
        one_per_duplicate, \
        use_dev_ino, \
        null_delimited, \
        delete_newer, \
        canonicalize


def newline_delimited_lines() -> Iterable[str]:
    """Yield a series of lines, but strip off any trailing newlines for consistency with readline0.readline0."""
    for filename in sys.stdin:
        if filename[-1:] == '\n':
            filename = filename[:-1]
        yield filename.rstrip('\n')


def get_mtime(filename: str) -> float:
    """Return the modification time of filename."""
    stat_buf = os.stat(filename)
    return stat_buf.st_mtime


@functools.total_ordering
class MtimeSortable(object):  # pylint: disable=too-few-public-methods
    """A class to facilitate sortable filenames by modification time."""

    def __init__(self, filename: str) -> None:
        """Initialize."""
        self.filename = filename
        self.mtime = get_mtime(filename)

    def __lt__(self, other) -> bool:
        """Less than, the 3.x way."""
        return self.mtime < other.mtime

    def __eq__(self, other) -> bool:
        """==, the 3.x way."""
        return self.mtime == other.mtime

    # def __cmp__(self, other) -> int:
    #     """cmp a la 2.x."""
    #     if self.mtime < other.mtime:
    #         return -1
    #     elif self.mtime > other.mtime:
    #         return 1
    #     else:
    #         return 0

    def unlink(self):
        """Remove this file."""
        os.unlink(self.filename)


def main():  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    """Divide files into like groups."""
    verbosity, \
        prefix_length, \
        blocksize, \
        show_uniques, \
        show_duplicates, \
        one_per_duplicate, \
        use_dev_ino, \
        null_delimited, \
        delete_newer, \
        canonicalize = \
        deal_with_command_line_options()

    if use_dev_ino:
        devino_stuff = DevinoStuffClass()
    else:
        devino_stuff = None

    B4bClass.devino_stuff = devino_stuff
    B4bClass.verbosity = verbosity

    unique = []
    total_file_count = 0

    # actually, it'd be a little faster to merge the separate_uniques calls into the various by_* functions, but:
    # 1) That's not as good software engineering.
    # 2) It gives the user a warm fuzzy feeling to get some indication of how well a given stage did.
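    # Pipeline overview: group by size, then by prefix hash, then by full hash, then byte for byte,
    # separating out newly-recognized unique files after each stage so each (more expensive) stage
    # sees fewer candidates than the one before it.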
    log(verbosity, 1, '----------\nGetting size_dict: %s\n' % time.ctime())
    initial_time = time.time()
    if null_delimited:
        iterator = readline0.readline0(bufsock.rawio(handle=0))
    else:
        iterator = newline_delimited_lines()
    total_file_count, size_dict = by_size(verbosity, devino_stuff, total_file_count, iterator)
    # All files are considered possible duplicates at this point, because we haven't separated out the ones
    # that have to be unique based on their sizes yet.
    num_possible_remaining_dups = total_file_count
    log(verbosity, 1, 'Total file count is %d\n' % total_file_count)
    num_possible_remaining_dups = separate_uniques(unique, num_possible_remaining_dups, verbosity, size_dict, initial_time)

    log(verbosity, 1, '----------\nGetting prefix_hash_dict from size_dict: %s\n' % (time.ctime()))
    initial_time = time.time()
    prefix_hash_dict = by_prefix_hash(verbosity, devino_stuff, prefix_length, size_dict.items())
    num_possible_remaining_dups = separate_uniques(unique, num_possible_remaining_dups, verbosity, prefix_hash_dict, initial_time)
    del size_dict

    log(verbosity, 1, '----------\nGetting full_hash_dict from prefix_hash_dict: %s\n' % (time.ctime()))
    initial_time = time.time()
    full_hash_dict = by_full_hash(verbosity, devino_stuff, blocksize, prefix_hash_dict.items())
    num_possible_remaining_dups = separate_uniques(unique, num_possible_remaining_dups, verbosity, full_hash_dict, initial_time)
    del prefix_hash_dict

    log(verbosity, 1, '----------\nGetting duplicates_only_dict from full_hash_dict: %s\n' % (time.ctime()))
    initial_time = time.time()
    duplicates_only_dict = by_byte_for_byte_comparison(verbosity, full_hash_dict.items())
    num_possible_remaining_dups = separate_uniques(
        unique,
        num_possible_remaining_dups,
        verbosity,
        duplicates_only_dict,
        initial_time,
        end=True,
    )
    del full_hash_dict

    log(verbosity, 1, 'Final result: Got %d unique files and %d distinct duplicates\n' % (len(unique), len(duplicates_only_dict)))

    # display all the unique files
    canonical_result = []
    if show_uniques:
        if canonicalize:
            for uniq in unique:
                canonical_result.append([uniq])
        else:
            for unique_filename in unique:
                if isinstance(unique_filename, str):
                    print(unique_filename)
                elif isinstance(unique_filename, bytes):
                    os.write(1, unique_filename)
                    os.write(1, b'\n')
                else:
                    os.write(1, unique_filename.bytes())
                    os.write(1, b'\n')

    # now display all the duplicates, and optionally delete newer duplicates
    if show_duplicates:
        values = list(duplicates_only_dict.values())
        if canonicalize:
            canonical_result.extend(values)
        else:
            for dups in values:
                if one_per_duplicate:
                    if isinstance(dups[0], str):
                        print(dups[0])
                    elif isinstance(dups[0], bytes):
                        os.write(1, dups[0])
                        os.write(1, b'\n')
                    else:
                        os.write(1, dups[0].bytes())
                        os.write(1, b'\n')
                else:
                    if isinstance(dups[0], str):
                        print(' '.join(str(filename) for filename in dups))
                    else:
                        os.write(1, b' '.join(dups) + b'\n')

    if canonicalize:
        # sort each bucket by filename
        for bucket in canonical_result:
            bucket.sort()
        # sort the buckets by their first, second... filename
        canonical_result.sort()
        for bucket in canonical_result:
            if isinstance(bucket[0], str):
                print(' '.join(str(filename) for filename in bucket))
            else:
                os.write(1, b' '.join(bucket) + b'\n')

    if delete_newer:
        buckets_of_equals = duplicates_only_dict.values()
        for bucket_of_equals in buckets_of_equals:
            # print('fred 1: converting bucket_of_equals to MtimeSortable')
            # print(bucket_of_equals)
            mtime_sortables = [MtimeSortable(like_filename) for like_filename in bucket_of_equals]
            mtime_sortables.sort()
            for mtime_sortable in mtime_sortables[1:]:
                log(verbosity, 1, "unlinking %s\n" % mtime_sortable.filename)
                mtime_sortable.unlink()


if __name__ == '__main__':
    main()
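

# Example invocation (illustrative - the script reads filenames on stdin):
#   find /some/tree -type f -print0 | python3 <this script> -0 --skip-uniques
# With --skip-uniques, only groups of byte-identical files are printed, one group per line.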