Source code for expire_mod


# pylint: disable=superfluous-parens

"""
Functions related to expiring old data from a repo.

Sort of belongs in repo_mod, but doesn't really have much overlap, and repo_mod is getting kind of big.
"""

import decimal
import glob
import os
import re
import shutil
import sys
import time

import chunk_mod
import dirops_mod
import helpers
import repo_mod
import saveset_summary_mod
import xz_mod


def make_used(*var):
    """Persuade linters that var is used."""
    assert True or var


def expire(current_time, check_hashes):
    """Expire old chunks, files and savesets from this repository."""
    # Please note that we decide whether something is a hash based on directory+file lengths.  We don't (yet?)
    # attempt to confirm that a directory+file is lower case hexadecimal or anything like that.
    our_directory = 'overall'
    dirops_mod.my_mkdir(our_directory)
    maximum_age = get_maximum_age(our_directory)
    expire_savesets(current_time, maximum_age)
    expire_files(current_time, maximum_age)
    expire_hashes(current_time, maximum_age, check_hashes)


def get_maximum_age(our_directory):
    """Get the maximum age for the repo, one way or another."""
    our_filename = os.path.join(our_directory, 'maximum-age')
    try:
        file_ = open(our_filename, 'r')
    except (OSError, IOError):
        # We don't appear to have a maximum-age file yet - ask the user for a time, and create one
        sys.stderr.write('This appears to be the first time you have used --expire;\n')
        sys.stderr.write('Please enter the maximum age of files in seconds.\n')
        sys.stderr.write('For a 365 day year, compute 60*60*24*365 == 31536000 and enter 31536000.\n')
        sys.stderr.write('To account for leap years, etcetera, you can use 60*60*24*365.2421891 == 31556925.\n')
        sys.stderr.write('> ')
        line = sys.stdin.readline()
        maximum_age = float(line)
        file_ = open(our_filename, 'w')
        file_.write('%f\n' % maximum_age)
        file_.close()
    else:
        # Reuse a previous maximum-age file
        line = file_.readline()
        maximum_age = float(line)
        file_.close()
    return maximum_age


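# The interactive prompt above appears only on the first --expire; afterwards the value persists in
# overall/maximum-age.  Below is a minimal sketch, not part of the original module, of seeding that file
# ahead of time so --expire never has to prompt; the helper name and the one-year default are assumptions
# taken from the prompt text:
def seed_maximum_age(repo_root, seconds=60 * 60 * 24 * 365):
    """Hypothetical helper: write overall/maximum-age non-interactively."""
    file_ = open(os.path.join(repo_root, 'overall', 'maximum-age'), 'w')
    file_.write('%f\n' % seconds)
    file_.close()

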
def expire_hashes(current_time, maximum_age, check_hashes):
    """Expire all old chunk hashes."""
    sys.stderr.write('Removing old chunks\n')
    sample_hash = repo_mod.get_hash(helpers.null_byte)
    hash_len = len(sample_hash)
    chunks_string = 'chunks'
    chunks_expiration_traversal(current_time, maximum_age, chunks_string, hash_len, check_hashes)


def expire_files(current_time, maximum_age):
    """Expire all old files hierarchies."""
    files_string = 'files'
    files_expiration(current_time, maximum_age, files_string)


def rmtree_onerror(function, path, exception):
    """Deal with errors from rmtree."""
    make_used(function, exception)
    sys.stderr.write('{}: {} failed to unlink\n'.format(sys.argv[0], path))


def files_expiration(current_time, maximum_age, files_string):
    """Recursively remove files directories that are now too old."""
    for dirname in glob.glob('%s/*' % files_string):
        dirtime = os.stat(dirname).st_mtime
        if current_time - dirtime > maximum_age:
            sys.stdout.write('Removing files hierarchy %s\n' % dirname)
            shutil.rmtree(dirname, onerror=rmtree_onerror)


def is_empty_dir(prefix, list_of_subdirs, list_of_filenames, hash_len):
    """
    Return True iff directory is empty.

    prefix is the hexadecimal derived from this directory.
    list_of_subdirs is a (possibly empty) list of subdirectories of this directory.
    list_of_filenames is a (possibly empty) list of filenames in this directory.
    hash_len is the length we expect a good hash to have.
    """
    if list_of_subdirs:
        # We have subdirectories, so this is not empty
        return False
    # This gives us a list of filenames which are hashes.  It intentionally excludes temporary filenames as
    # created by chunk_mod_traditional.py.
    # We could improve it a bit by checking for temporary files that are recent/old, but there would still be
    # a race condition.
    hash_filenames = []
    for filename in list_of_filenames:
        # Note that it's possible for full_hash to not be an actual hash!  EG if someone stashed a README in a
        # chunk directory.
        full_hash = prefix + filename
        # Temp files have underscores in them, until they are renamed.
        if len(full_hash) == hash_len and '_' not in filename:
            hash_filenames.append(filename)
    if hash_filenames:
        # We have at least one hash filename, so this directory is not 'empty'
        return False
    return True


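# Illustrative calls (the prefix, filenames and hash_len are made up; a real hash_len comes from
# repo_mod.get_hash, as in expire_hashes above):
#     is_empty_dir('ab12', [], ['cd34ef56'], 12)   -> False  (one plausible hash file)
#     is_empty_dir('ab12', [], ['tmp_ab_12'], 12)  -> True   (only a temporary file)

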
def is_old_dir(directory, high_age):
    """Return True iff directory is old."""
    current_time = time.time()
    directory_time = os.path.getmtime(directory)
    age = current_time - directory_time
    return age >= high_age


# high_age, defined below, is about 2 months: 60*60*24*30*2 == 5184000 seconds.
def chunks_expiration_traversal(current_time, maximum_age, chunks_string, hash_len, check_hashes):  # pylint: disable=too-many-locals
    """Traverse the chunks tree, deleting old files and optionally verifying hashes as we go."""
    # The followlinks option to os.walk wasn't introduced until Python 2.6, and we support Python 2.5,
    # including early PyPy, and Jython.
    previous_hex = ''
    high_age = 60 * 60 * 24 * 30 * 2
    directories_deleted = 0
    hashes_deleted = 0
    for internal_root_dir, list_of_dirs, list_of_filenames in os.walk(chunks_string, topdown=True):
        # Sorting in place makes os.walk give us back directories in sorted order - to maximize the benefit
        # of our in-filesystem trie datastructure.
        list_of_dirs.sort()
        fields = internal_root_dir.split(os.sep)
        if len(fields) == 2:
            assert fields[0] == chunks_string
            current_hex = fields[1][:2]
            if current_hex != previous_hex:
                print('0x{} of 0x100 ({} hashes deleted, {} directories deleted)'.format(
                    current_hex,
                    hashes_deleted,
                    directories_deleted,
                ))
                previous_hex = current_hex
        prefix = internal_root_dir
        for regex in [re.compile('^%s%s' % (chunks_string, os.path.sep)), re.compile(os.path.sep)]:
            prefix = re.sub(regex, '', prefix)
        # Is this an old, 'empty' directory?
        if is_empty_dir(prefix, list_of_dirs, list_of_filenames, hash_len) and is_old_dir(internal_root_dir, high_age):
            # Yes, so remove it.  We use rmtree because directories containing nothing but temporaries are
            # considered 'empty'.
            shutil.rmtree(internal_root_dir)
            directories_deleted += 1
            continue
        for filename in list_of_filenames:
            full_hash = prefix + filename
            full_path = os.path.join(internal_root_dir, filename)
            if len(full_hash) == hash_len:
                if expire_one_hash(current_time, maximum_age, full_path, full_hash, check_hashes):
                    hashes_deleted += 1


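# A worked example of the prefix stripping above, assuming a hypothetical path and a POSIX os.sep:
# internal_root_dir 'chunks/ab/cd' loses the leading 'chunks/' and then every remaining separator,
# leaving prefix 'abcd'; prefix + filename then reassembles the full hash that the in-filesystem trie
# spread across directory levels, e.g. 'abcd' + 'ef0123' == 'abcdef0123'.

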
def expire_one_hash(current_time, maximum_age, full_path, full_hash, check_hashes):
    """Expire one hash file."""
    deleted = False
    statbuf = os.stat(full_path)
    if statbuf.st_mtime < current_time - maximum_age:
        # Chunk file is too old - remove it
        os.unlink(full_path)
        deleted = True
    elif statbuf.st_size == 0:
        # Most commonly, if a filesystem gets corrupted, that'll manifest as zero-length files
        os.unlink(full_path)
        deleted = True
        sys.stderr.write('%s: unlinked zero-length (corrupted) hash file %s.\n' % (sys.argv[0], full_path))
        sys.stderr.write('Next time you back up a file with this hash, it will be recreated.\n')
    elif check_hashes:
        chunk = chunk_mod.Chunk(full_hash)
        try:
            data = chunk.read_chunk()
        except OSError:
            sys.stderr.write('%s: Got an OSError (disk, filesystem) reading %s.\n' % (sys.argv[0], full_path))
            sys.stderr.write('You could try removing the file and attempting again, but this likely means disk or\n')
            sys.stderr.write('filesystem corruption!\n')
            sys.exit(1)
        except xz_mod.DecompressionError:
            # Note that we've checked for decompression problems in chunks, but not in metadata.  Doing the
            # same for metadata would be good too.
            sys.stderr.write('%s: Warning: Chunk failed to decompress; unlinking: %s\n' % (sys.argv[0], full_path))
            deleted = True
            os.unlink(full_path)
        else:
            if repo_mod.get_hash(data) != full_hash:
                sys.stderr.write('%s: Warning: Chunk found with bad digest; unlinking: %s\n' % (sys.argv[0], full_path))
                deleted = True
                os.unlink(full_path)
    return deleted


def expire_savesets(current_time, maximum_age):
    """Expire old savesets."""
    # canonical_hostname is unused when backup_id is None.
    # We've already cd'd - no need to pass anything special into save_directory.
    saveset_summaries = saveset_summary_mod.get_all_saveset_summaries(save_directory='.', canonical_hostname='')
    # It's kind of important that we use finish_time here and not start_time.  Otherwise, we could end up
    # expiring a saveset corresponding to partially-expired chunks, because sometimes saves take a while.
    youngest_to_keep = decimal.Decimal(str(current_time - maximum_age))
    for saveset_summary in saveset_summaries:
        saveset_summary_path = os.path.join(saveset_summary_mod.SUMMARIES_NAME, saveset_summary.backup_id)
        # Sometimes a save terminates early without a finish_time recorded; we cope with that by using the
        # summary's mtime instead.
        finish_time = saveset_summary.finish_time
        if finish_time is None:
            statbuf = os.stat(saveset_summary_path)
            finish_time = decimal.Decimal(str(statbuf.st_mtime))
        else:
            finish_time = decimal.Decimal(finish_time)
        if finish_time < youngest_to_keep:
            sys.stdout.write('Unlinking saveset summary %s with finish_time %s (%s)\n' % (
                saveset_summary.backup_id,
                saveset_summary.finish_time,
                time.ctime(float(finish_time)),
            ))
            os.unlink(saveset_summary_path)


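# A minimal usage sketch (assuming, as the functions above do, that the current working directory is
# already the repository root):
#     import time
#     import expire_mod
#     expire_mod.expire(current_time=time.time(), check_hashes=True)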