#!/usr/bin/python3

"""Divide a potentially large number of files up into like (equal) groups."""

# Tell pycodestyle that we do not care about late imports.
# pylint: disable=E402

import hashlib
import os
import stat
import string
import sys

sys.path.insert(0, os.path.expanduser('~/lib'))
sys.path.insert(0, '/usr/local/lib')

import readline0

fields = []
verbosity = 0


class stats_class:
    """Class to hold some statistics about what we do."""

    def __init__(self):
        """Initialize."""
        self.verbosity = 0
        self.number_of_files = 0
        self.number_of_stats = 0
        self.number_of_prefixes = 0
        self.number_of_hashes = 0
        self.number_of_comparisons = 0
        self.number_of_full_compares = 0

    def bump_number_of_files(self):
        """Increase the file count."""
        self.number_of_files += 1
        if self.verbosity >= 2 or self.verbosity >= 1 and self.number_of_files % 1000 == 0:
            sys.stderr.write('bumped number_of_files to %d\n' % self.number_of_files)

    def bump_number_of_stats(self):
        """Increase the number of stats."""
        self.number_of_stats += 1

    def bump_number_of_prefixes(self):
        """Increase the number of prefixes."""
        self.number_of_prefixes += 1

    def bump_number_of_hashes(self):
        """Increase the number of hashes."""
        self.number_of_hashes += 1

    def bump_number_of_full_compares(self):
        """Increase the number of complete comparisons."""
        self.number_of_full_compares += 1

    def bump_number_of_comparisons(self):
        """Increase the number of comparisons."""
        self.number_of_comparisons += 1
        if self.verbosity >= 2 or self.verbosity >= 1 and self.number_of_comparisons % 1000 == 0:
            self.report()

    def report(self):
        """Output a report summarizing what we did."""
        tuple_ = (
            self.number_of_files,
            self.number_of_comparisons,
            self.number_of_stats,
            self.number_of_prefixes,
            self.number_of_hashes,
            self.number_of_full_compares,
        )
        sys.stderr.write('files: %d, comparisons: %d, stats: %d, prefixes: %d, hashes: %d, full compares: %d\n' % tuple_)


class file_class:
    """A class to facilitate comparing two files rapidly."""

    max_prefix_length = 16
    bufsize = 2**18
    stats = stats_class()

    def set_prefix_length(self, length):
        """Set the maximum length of a prefix of a file, used for partial comparison."""
        self.max_prefix_length = length

    def __init__(self, name, no_full_comparisons):
        """Initialize."""
        self.name = name
        self.device = -1
        self.inode = -1
        self.size = -1
        self.prefix = -1
        self.hash = ''
        self.stats.bump_number_of_files()
        self.no_full_comparisons = no_full_comparisons

    def gen_hash(self):
        """Generate and remember a cryptographic digest the file."""
        if verbosity >= 1:
            self.stats.bump_number_of_hashes()
            if verbosity >= 2:
                sys.stderr.write('Hashing file %s\n' % self.name)
        with open(self.name, 'rb') as file_:
            m = hashlib.md5()
            while True:
                buf = file_.read(self.bufsize)
                if not buf:
                    break
                m.update(buf)
            self.hash = m.digest()

    def __lt__(self, other):
        """Return true iff self < other."""
        result = self.cmp(other)
        return result < 0

    def __gt__(self, other):
        """Return true iff self > other."""
        result = self.cmp(other)
        return result > 0

    def __eq__(self, other):
        """Return true iff self == other."""
        result = self.cmp(other)
        return result == 0

    def cmp(self, other):
        """Compare two files."""
        if verbosity >= 1:
            self.stats.bump_number_of_comparisons()
            if verbosity >= 2 and self.stats.number_of_comparisons % 1000 == 0:
                sys.stderr.write('%d comparisons of whatever depth\n' % self.stats.number_of_files)
        if self.device == -1:
            if verbosity >= 1:
                self.stats.bump_number_of_stats()
                if verbosity >= 2:
                    sys.stderr.write('Getting stat data for file %s\n' % self.name)
            result = os.stat(self.name)
            self.device = result[stat.ST_DEV]
            self.inode = result[stat.ST_INO]
            self.size = result[stat.ST_SIZE]
        if other.device == -1:
            if verbosity >= 1:
                other.stats.bump_number_of_stats()
                if verbosity >= 2:
                    sys.stderr.write('Getting stat data for file %s\n' % self.name)
            result = os.stat(other.name)
            other.device = result[stat.ST_DEV]
            other.inode = result[stat.ST_INO]
            other.size = result[stat.ST_SIZE]
        if self.device == other.device and self.inode == other.inode:
            # same device, same inode, must be identical files except with a broken webdav client
            return 0
#        sys.stderr.write('foo %s %s\n' % (self.size, other.size))
        if self.size < other.size:
            return -1
        elif self.size > other.size:
            return 1
        # if we've reached this point, the files are not hardlinks, and their lengths are identical
        # so slurp in the prefixes if needed, then compare them
        if self.prefix == -1:
            if verbosity >= 1:
                self.stats.bump_number_of_prefixes()
                if verbosity >= 2:
                    sys.stderr.write('Getting prefix for file %s\n' % self.name)
            file = open(self.name, 'rb')
            self.prefix = file.read(self.max_prefix_length)
            file.close()
        if other.prefix == -1:
            if verbosity >= 1:
                other.stats.bump_number_of_prefixes()
                if verbosity >= 2:
                    sys.stderr.write('Getting prefix for file %s\n' % other.name)
            file = open(other.name, 'rb')
            other.prefix = file.read(self.max_prefix_length)
            file.close()
        if self.prefix < other.prefix:
            return -1
        elif self.prefix > other.prefix:
            return 1
        # if we've reached this point, we know that:
        # 1) The files are not hardlinks of each other
        # 2) They have identical sizes
        # 3) Their prefixes are identical
        # 4) We haven't yet tried doing a cryptographic hash, so compute them if needed, and compare them
        if self.hash == '':
            self.gen_hash()
        if other.hash == '':
            other.gen_hash()
        if self.hash < other.hash:
            return -1
        elif self.hash > other.hash:
            return 1
        # if we were asked not to do full comparisons, just trust the result so far - don't do the ultra-expensive
        # "compare every single byte" procedure
        if self.no_full_comparisons:
            return 0
        # if we've reached this point, we know that:
        # 1) The files are not hardlinks of each other
        # 2) They have identical sizes
        # 3) Their prefixes are identical
        # 4) The cryptographic hashes are identical
        # 5) All that remains is a "long form" comparison, so bite the bullet and do the I/O
        if verbosity >= 1:
            self.stats.bump_number_of_full_compares()
            if verbosity >= 2:
                sys.stderr.write('Doing byte for byte comparison of %s and %s\n' % (self.name, other.name))
        self_fp = open(self.name, 'rb')
        other_fp = open(other.name, 'rb')
        while True:
            self_buf = self_fp.read(self.bufsize)
            other_buf = other_fp.read(self.bufsize)
            if self_buf < other_buf:
                self_fp.close()
                other_fp.close()
                return -1
            elif self_buf > other_buf:
                self_fp.close()
                other_fp.close()
                return 1
            if not self_buf and not other_buf:
                self_fp.close()
                other_fp.close()
                return 0
            if not self_buf:
                self_fp.close()
                other_fp.close()
                return -1
            if not other_buf:
                self_fp.close()
                other_fp.close()
                return 1

    def __repr_(self):
        return self.name


def usage(retval):
    """Output a usage message."""
    sys.stderr.write('Usage: %s [-v] [-s] [-0] [-h] [-f file1 file2 ... filen]\n' % sys.argv[0])
    sys.stderr.write('-v says to operate verbosely\n')
    sys.stderr.write('-s says to get filenames from stdin instead of from the command line\n')
    sys.stderr.write('-0 says that when getting filenames from stdin, assume null termination, not newline termination\n')
    sys.stderr.write('   Also changes the default output delimiter to a null byte, because some shells cannot handle that\n')
    sys.stderr.write('-h says to give this help message\n')
    sys.stderr.write('-f file1 file2 ... filen says to use the listed files, not files from stdin\n')
    sys.stderr.write('-p bytes says to cache this many bytes of each file for faster comparisons (may be 0)\n')
    sys.stderr.write('-d delim says to use "delim" as the output delimiter character within a line\n')
    sys.stderr.write('-c says to not do full comparisions - instead trust hash comparisons.\n')
    sys.stderr.write('   Faster at the expense of some accuracy\n')
    sys.stderr.write('--canonicalize sort filenames for comparison with other tools\n')
    sys.exit(retval)


def main():
    """Divide up te files into like groups."""
    global verbosity
    null_termination = 0
    stdin = 0
    filenames = []
    prefix_length = 16
    output_delimiter = b' '
    no_full_comparisons = 0
    canonicalize = 0
    while sys.argv[1:]:
        if sys.argv[1] == '-v':
            verbosity += 1
        elif sys.argv[1] == '--canonicalize':
            canonicalize = 1
        elif sys.argv[1] == '-c':
            no_full_comparisons = 1
        elif sys.argv[1] == '-s':
            stdin = 1
        elif sys.argv[1] == '-p':
            prefix_length = string.atoi(sys.argv[2])
            del sys.argv[1]
        elif sys.argv[1] == '-d':
            output_delimiter = sys.argv[2].encode('UTF-8')
            del sys.argv[1]
        elif sys.argv[1] == '-h':
            usage(0)
        elif sys.argv[1] == '-0':
            null_termination = 1
            output_delimiter = b'\0'
        elif sys.argv[1] == '-f':
            del sys.argv[1]
            filenames = sys.argv[1:]
            break
        else:
            usage(1)
        del sys.argv[1]
    if stdin + (len(filenames) != 0) != 1:
        sys.stderr.write('You must give -s or -f, but not both\n')
        usage(1)
    if not stdin and null_termination:
        sys.stderr.write('Sorry, -0 requires -s\n')
        usage(1)
    file_list_to_sort = []
    if len(filenames) != 0:
        for filename in filenames:
            file_list_to_sort.append(file_class(filename, no_full_comparisons))
    elif stdin:
        if null_termination:
            terminator = b'\0'
        else:
            terminator = b'\n'
        for line in readline0.readline0(sys.stdin.buffer, separator=terminator):
            # strip off the trailing newline
            if line[-1:] == '\n':
                line = line[:-1]
            file_list_to_sort.append(file_class(line, no_full_comparisons))
    file_list_to_sort.sort()
    if canonicalize:
        file_class_list = []
        bucket = []
        if file_list_to_sort:
            # go from a flat, sorted list; to a list of lists
            previous_file_class_obj = file_list_to_sort[0]
            bucket.append(previous_file_class_obj)
            for file_class_obj in file_list_to_sort[1:]:
                if previous_file_class_obj == file_class_obj:
                    bucket.append(file_class_obj)
                else:
                    previous_file_class_obj = file_class_obj
                    file_class_list.append(bucket)
                    bucket = [file_class_obj]
            file_class_list.append(bucket)
            # covert to a list of lists of filenames
            filename_list = []
            for file_class_bucket in file_class_list:
                filename_bucket = [file_class_obj.name for file_class_obj in file_class_bucket]
                filename_bucket.sort()
                filename_list.append(filename_bucket)
            # sort and output the result
            filename_list.sort()
            for filename_bucket in filename_list:
                os.write(1, output_delimiter.join(filename_bucket) + b'\n')
    else:
        if file_list_to_sort:
            previous_file_class_obj = file_list_to_sort[0]
            previous_file_class_obj.set_prefix_length(prefix_length)
            del file_list_to_sort[0]
            os.write(1, previous_file_class_obj.name)
            for file_ in file_list_to_sort:
                if previous_file_class_obj == file_:
                    os.write(1, b'%s%s' % (output_delimiter, file_.name))
                else:
                    previous_file_class_obj = file_
                    os.write(1, b'\n%s' % file_.name)
            os.write(1, b'\n')


main()