#!/usr/bin/env python3 """ Get full hashes. Remove end hashes. Each record in our input corresponds to one file. These records in our input have 2 columsn: the end hash, and the pathname. To form our output, we remove the first column, then prepend the "full hash", which is the blake2b hash of the last Kilobyte of the file. There's not a lot of opportunity for reuse here, so we forgo most of the usual parameterization. """ import hashlib import os import sys import readline0 def get_full_hash(filename: bytes) -> bytes: """Get a cryptographic digest of `filename` in its entirety.""" hasher = hashlib.blake2b(digest_size=32) # Can produce digests between 1 and 64 bytes with open(filename, "rb") as file_: while True: buffer = file_.read(2**20) if not buffer: break hasher.update(buffer) return hasher.hexdigest().encode("UTF-8") def usage(retval: int) -> None: """Output a usage message.""" if retval == 0: file_ = sys.stdout else: file_ = sys.stderr print(f"Usage: {sys.argv[0]} --write-file-to /path/name.txt --help", file=file_) sys.exit(retval) def main() -> None: """Start the ball rolling.""" write_count_to = "" while sys.argv[1:]: match sys.argv[1]: case "--write-count-to": write_count_to = sys.argv[2] del sys.argv[1] case "--help" | "-h": usage(0) case _: print(f"{sys.argv[0]}: unrecognized option: {sys.argv[1]}", file=sys.stderr) usage(1) del sys.argv[1] for index, record in enumerate(readline0.readline0(file_=0, separator=b"\0")): if not record: break input_fields = record.split(b",", 2) full_hash = get_full_hash(input_fields[1]) output_fields = [full_hash, input_fields[1]] os.write(1, b",".join(output_fields) + b"\0") if write_count_to: with open(write_count_to, "w") as file_: file_.write(str(index) + "\n") main()