#!/usr/bin/env python3 r""" Read duplicates, report on them and/or hardlink them together. The input format looks like: dup1\\0dup2\\0dup3\\0\\0dup4\\0dup5\\0\\0 In this example, dup1 == dup2 == dup3, and dup4 == dup5. """ import os import sys import readline0 def usage(retval: int) -> None: """Output a usage message and exit.""" if retval == 0: file_ = sys.stdout else: file_ = sys.stderr print(f"Usage: {sys.argv[0]} --report --hardlinks", file=file_) print(file=file_) print("Report and/or hardlink duplicate files. For already-hardlinked duplicates, we only report one.", file=file_) sys.exit(retval) class Options: """Parse and hold command line options.""" def __init__(self) -> None: self.report = False self.hardlinks = False while sys.argv[1:]: match sys.argv[1]: case "--help" | "-h": usage(0) case "--report": self.report = True case "--hardlinks": self.hardlinks = True case _: print(f"{sys.argv[0]}: unrecognized option: {sys.argv[1]}", file=sys.stderr) usage(1) del sys.argv[1] preflight_good = True if self.report + self.hardlinks < 1: print(f"{sys.argv[0]}: you must specify at least one of --report and --hardlinks", file=sys.stderr) preflight_good = False if not preflight_good: print(f"{sys.argv[0]}: one or more items in preflight check failed", file=sys.stderr) usage(1) def get_st_nlink(filename: str) -> int: """Look up the number of links this file has.""" s = os.stat(filename) return s.st_nlink def main() -> None: """Start the ball rolling.""" options = Options() first_time = True for equal_files in readline0.readline0(file_=0, separator=b"\0\0"): filenames = equal_files.split(b"\0") if options.hardlinks: # If we're hardlinking, we need to be careful not to split any groups of pre-existing hardlinks. So we pick # the file with the highest hard link count to link the others to. # BTW, this could be done in O(n) time, even though this currently does it in O(nlogn) - where n is the # number of identical files. list_ = [(get_st_nlink(filename), filename) for filename in filenames] list_.sort(reverse=True) filenames = [tuple_[1] for tuple_ in list_] if first_time: first_time = False else: print() filename0 = filenames[0] if options.report: print(filename0) for filename in filenames[1:]: if options.report: print(" ", filename) if options.hardlinks: # Hardlink the current filename. # We do this temp_file stuff to gain atomicity. temp_file = filename0 + b".temp" os.link(filename0, temp_file) os.rename(temp_file, filename) if __name__ == "__main__": main()