#!/usr/bin/python3 """Divide a potentially large number of files up into like (equal) groups.""" # Tell pycodestyle that we do not care about late imports. # pylint: disable=E402 import hashlib import os import stat import string import sys sys.path.insert(0, os.path.expanduser('~/lib')) sys.path.insert(0, '/usr/local/lib') import readline0 fields = [] verbosity = 0 class stats_class: """Class to hold some statistics about what we do.""" def __init__(self): """Initialize.""" self.verbosity = 0 self.number_of_files = 0 self.number_of_stats = 0 self.number_of_prefixes = 0 self.number_of_hashes = 0 self.number_of_comparisons = 0 self.number_of_full_compares = 0 def bump_number_of_files(self): """Increase the file count.""" self.number_of_files += 1 if self.verbosity >= 2 or self.verbosity >= 1 and self.number_of_files % 1000 == 0: sys.stderr.write('bumped number_of_files to %d\n' % self.number_of_files) def bump_number_of_stats(self): """Increase the number of stats.""" self.number_of_stats += 1 def bump_number_of_prefixes(self): """Increase the number of prefixes.""" self.number_of_prefixes += 1 def bump_number_of_hashes(self): """Increase the number of hashes.""" self.number_of_hashes += 1 def bump_number_of_full_compares(self): """Increase the number of complete comparisons.""" self.number_of_full_compares += 1 def bump_number_of_comparisons(self): """Increase the number of comparisons.""" self.number_of_comparisons += 1 if self.verbosity >= 2 or self.verbosity >= 1 and self.number_of_comparisons % 1000 == 0: self.report() def report(self): """Output a report summarizing what we did.""" tuple_ = ( self.number_of_files, self.number_of_comparisons, self.number_of_stats, self.number_of_prefixes, self.number_of_hashes, self.number_of_full_compares, ) sys.stderr.write('files: %d, comparisons: %d, stats: %d, prefixes: %d, hashes: %d, full compares: %d\n' % tuple_) class file_class: """A class to facilitate comparing two files rapidly.""" max_prefix_length = 16 bufsize = 2**18 stats = stats_class() def set_prefix_length(self, length): """Set the maximum length of a prefix of a file, used for partial comparison.""" self.max_prefix_length = length def __init__(self, name, no_full_comparisons): """Initialize.""" self.name = name self.device = -1 self.inode = -1 self.size = -1 self.prefix = -1 self.hash = '' self.stats.bump_number_of_files() self.no_full_comparisons = no_full_comparisons def gen_hash(self): """Generate and remember a cryptographic digest the file.""" if verbosity >= 1: self.stats.bump_number_of_hashes() if verbosity >= 2: sys.stderr.write('Hashing file %s\n' % self.name) with open(self.name, 'rb') as file_: m = hashlib.md5() while True: buf = file_.read(self.bufsize) if not buf: break m.update(buf) self.hash = m.digest() def __lt__(self, other): """Return true iff self < other.""" result = self.cmp(other) return result < 0 def __gt__(self, other): """Return true iff self > other.""" result = self.cmp(other) return result > 0 def __eq__(self, other): """Return true iff self == other.""" result = self.cmp(other) return result == 0 def cmp(self, other): """Compare two files.""" if verbosity >= 1: self.stats.bump_number_of_comparisons() if verbosity >= 2 and self.stats.number_of_comparisons % 1000 == 0: sys.stderr.write('%d comparisons of whatever depth\n' % self.stats.number_of_files) if self.device == -1: if verbosity >= 1: self.stats.bump_number_of_stats() if verbosity >= 2: sys.stderr.write('Getting stat data for file %s\n' % self.name) result = os.stat(self.name) self.device = result[stat.ST_DEV] self.inode = result[stat.ST_INO] self.size = result[stat.ST_SIZE] if other.device == -1: if verbosity >= 1: other.stats.bump_number_of_stats() if verbosity >= 2: sys.stderr.write('Getting stat data for file %s\n' % self.name) result = os.stat(other.name) other.device = result[stat.ST_DEV] other.inode = result[stat.ST_INO] other.size = result[stat.ST_SIZE] if self.device == other.device and self.inode == other.inode: # same device, same inode, must be identical files except with a broken webdav client return 0 # sys.stderr.write('foo %s %s\n' % (self.size, other.size)) if self.size < other.size: return -1 elif self.size > other.size: return 1 # if we've reached this point, the files are not hardlinks, and their lengths are identical # so slurp in the prefixes if needed, then compare them if self.prefix == -1: if verbosity >= 1: self.stats.bump_number_of_prefixes() if verbosity >= 2: sys.stderr.write('Getting prefix for file %s\n' % self.name) file = open(self.name, 'rb') self.prefix = file.read(self.max_prefix_length) file.close() if other.prefix == -1: if verbosity >= 1: other.stats.bump_number_of_prefixes() if verbosity >= 2: sys.stderr.write('Getting prefix for file %s\n' % other.name) file = open(other.name, 'rb') other.prefix = file.read(self.max_prefix_length) file.close() if self.prefix < other.prefix: return -1 elif self.prefix > other.prefix: return 1 # if we've reached this point, we know that: # 1) The files are not hardlinks of each other # 2) They have identical sizes # 3) Their prefixes are identical # 4) We haven't yet tried doing a cryptographic hash, so compute them if needed, and compare them if self.hash == '': self.gen_hash() if other.hash == '': other.gen_hash() if self.hash < other.hash: return -1 elif self.hash > other.hash: return 1 # if we were asked not to do full comparisons, just trust the result so far - don't do the ultra-expensive # "compare every single byte" procedure if self.no_full_comparisons: return 0 # if we've reached this point, we know that: # 1) The files are not hardlinks of each other # 2) They have identical sizes # 3) Their prefixes are identical # 4) The cryptographic hashes are identical # 5) All that remains is a "long form" comparison, so bite the bullet and do the I/O if verbosity >= 1: self.stats.bump_number_of_full_compares() if verbosity >= 2: sys.stderr.write('Doing byte for byte comparison of %s and %s\n' % (self.name, other.name)) self_fp = open(self.name, 'rb') other_fp = open(other.name, 'rb') while True: self_buf = self_fp.read(self.bufsize) other_buf = other_fp.read(self.bufsize) if self_buf < other_buf: self_fp.close() other_fp.close() return -1 elif self_buf > other_buf: self_fp.close() other_fp.close() return 1 if not self_buf and not other_buf: self_fp.close() other_fp.close() return 0 if not self_buf: self_fp.close() other_fp.close() return -1 if not other_buf: self_fp.close() other_fp.close() return 1 def __repr_(self): return self.name def usage(retval): """Output a usage message.""" sys.stderr.write('Usage: %s [-v] [-s] [-0] [-h] [-f file1 file2 ... filen]\n' % sys.argv[0]) sys.stderr.write('-v says to operate verbosely\n') sys.stderr.write('-s says to get filenames from stdin instead of from the command line\n') sys.stderr.write('-0 says that when getting filenames from stdin, assume null termination, not newline termination\n') sys.stderr.write(' Also changes the default output delimiter to a null byte, because some shells cannot handle that\n') sys.stderr.write('-h says to give this help message\n') sys.stderr.write('-f file1 file2 ... filen says to use the listed files, not files from stdin\n') sys.stderr.write('-p bytes says to cache this many bytes of each file for faster comparisons (may be 0)\n') sys.stderr.write('-d delim says to use "delim" as the output delimiter character within a line\n') sys.stderr.write('-c says to not do full comparisions - instead trust hash comparisons.\n') sys.stderr.write(' Faster at the expense of some accuracy\n') sys.stderr.write('--canonicalize sort filenames for comparison with other tools\n') sys.exit(retval) def main(): """Divide up te files into like groups.""" global verbosity null_termination = 0 stdin = 0 filenames = [] prefix_length = 16 output_delimiter = b' ' no_full_comparisons = 0 canonicalize = 0 while sys.argv[1:]: if sys.argv[1] == '-v': verbosity += 1 elif sys.argv[1] == '--canonicalize': canonicalize = 1 elif sys.argv[1] == '-c': no_full_comparisons = 1 elif sys.argv[1] == '-s': stdin = 1 elif sys.argv[1] == '-p': prefix_length = string.atoi(sys.argv[2]) del sys.argv[1] elif sys.argv[1] == '-d': output_delimiter = sys.argv[2].encode('UTF-8') del sys.argv[1] elif sys.argv[1] == '-h': usage(0) elif sys.argv[1] == '-0': null_termination = 1 output_delimiter = b'\0' elif sys.argv[1] == '-f': del sys.argv[1] filenames = sys.argv[1:] break else: usage(1) del sys.argv[1] if stdin + (len(filenames) != 0) != 1: sys.stderr.write('You must give -s or -f, but not both\n') usage(1) if not stdin and null_termination: sys.stderr.write('Sorry, -0 requires -s\n') usage(1) file_list_to_sort = [] if len(filenames) != 0: for filename in filenames: file_list_to_sort.append(file_class(filename, no_full_comparisons)) elif stdin: if null_termination: terminator = b'\0' else: terminator = b'\n' for line in readline0.readline0(sys.stdin.buffer, separator=terminator): # strip off the trailing newline if line[-1:] == '\n': line = line[:-1] file_list_to_sort.append(file_class(line, no_full_comparisons)) file_list_to_sort.sort() if canonicalize: file_class_list = [] bucket = [] if file_list_to_sort: # go from a flat, sorted list; to a list of lists previous_file_class_obj = file_list_to_sort[0] bucket.append(previous_file_class_obj) for file_class_obj in file_list_to_sort[1:]: if previous_file_class_obj == file_class_obj: bucket.append(file_class_obj) else: previous_file_class_obj = file_class_obj file_class_list.append(bucket) bucket = [file_class_obj] file_class_list.append(bucket) # covert to a list of lists of filenames filename_list = [] for file_class_bucket in file_class_list: filename_bucket = [file_class_obj.name for file_class_obj in file_class_bucket] filename_bucket.sort() filename_list.append(filename_bucket) # sort and output the result filename_list.sort() for filename_bucket in filename_list: os.write(1, output_delimiter.join(filename_bucket) + b'\n') else: if file_list_to_sort: previous_file_class_obj = file_list_to_sort[0] previous_file_class_obj.set_prefix_length(prefix_length) del file_list_to_sort[0] os.write(1, previous_file_class_obj.name) for file_ in file_list_to_sort: if previous_file_class_obj == file_: os.write(1, b'%s%s' % (output_delimiter, file_.name)) else: previous_file_class_obj = file_ os.write(1, b'\n%s' % file_.name) os.write(1, b'\n') main()