Source code for saveset_summary_mod

#!/usr/bin/env python

# pylint: disable=simplifiable-if-statement

"""Provides a class and related functions for operating on savesets."""

import os
import re
import sys
import time
import random
import socket
import decimal
import platform

import backshift_os_mod

CHECKPOINT_MULTIPLE = 1000

SUMMARIES_NAME = 'summaries'


def decimal_round(decimal_value, hundredths=decimal.Decimal('0.01')):
    """Round a decimal value to 2 places after the decimal point."""
    return decimal_value.quantize(hundredths, rounding=decimal.ROUND_HALF_EVEN)

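# Illustrative examples (not part of the original module) of the banker's rounding
# used above - ties go to the even digit:
#     >>> decimal_round(decimal.Decimal('1.005'))
#     Decimal('1.00')
#     >>> decimal_round(decimal.Decimal('1.015'))
#     Decimal('1.02')
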
def canonicalize_hostname(user_hostname=None):
    """Return the current machine's hostname - canonicalized."""
    bad_hostnames = ['localhost', 'localhost.localdomain']
    if user_hostname is None:
        non_canonical_hostname = platform.node()
        try:
            canonical_hostname = socket.gethostbyaddr(socket.gethostbyname(non_canonical_hostname))[0]
        except (socket.gaierror, socket.herror):
            canonical_hostname = 'localhost'
        if canonical_hostname not in bad_hostnames:
            result = canonical_hostname
        elif non_canonical_hostname not in bad_hostnames:
            result = non_canonical_hostname
        else:
            raise AssertionError('user_hostname is None, and both canonical and noncanonical hostnames are blacklisted')
    else:
        result = user_hostname
    return result.lower()

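# Hedged usage sketch (the hostname here is made up): when a hostname is supplied
# explicitly, it is simply lowercased; DNS is only consulted when it is not:
#     >>> canonicalize_hostname('BenchBox')
#     'benchbox'
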
def random_string(length=16):
    """Return a hexadecimal string of the given length."""
    list_ = []
    for _unused in range(length):
        list_.append(random_hex_digit())
    return ''.join(list_)

def random_hex_digit():
    """Return a single, random hex digit."""
    return '0123456789abcdef'[int(random.random() * 16)]

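# Example (output is random; this particular value is made up):
#     >>> random_string(4)
#     '3fa9'
# The random suffix helps keep two backups that share the same start time (to the
# hundredth of a second), hostname and subset from colliding on backup id.
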
def get_all_saveset_summaries(save_directory, canonical_hostname, last_n=None):
    """Get all the savesets, creating one Saveset_summary object for each."""
    saveset_directory = os.path.join(save_directory, SUMMARIES_NAME)
    all_savesets_list = []
    # We skip temporary files - otherwise we'd sometimes see a saveset twice, and sometimes get a traceback.
    files = [filename for filename in os.listdir(saveset_directory) if not filename.endswith('.temp')]
    files.sort(key=lambda filename: float(filename.split('_')[0]))
    if last_n is not None:
        files = files[-last_n:]  # pylint: disable=invalid-unary-operand-type
    for filename in files:
        saveset = Saveset_summary(saveset_directory, canonical_hostname, backup_id=filename)
        all_savesets_list.append(saveset)
    all_savesets_list.sort()
    return all_savesets_list

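# Hedged usage sketch ('/backups' is a made-up path): summary filenames begin with
# their start timestamp, so the sort above is chronological and last_n trims to the
# most recent n savesets:
#     summaries = get_all_saveset_summaries('/backups', 'benchbox', last_n=3)
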
def backup_id_present(saveset_summary, saveset_summary_list):
    """Return True if saveset_summary's backup id is present in saveset_summary_list."""
    for candidate_summary in saveset_summary_list:
        if saveset_summary.backup_id == candidate_summary.backup_id:
            return True
    return False

def get_hostname_subset_matches(all_savesets_list, canonical_hostname, subset):
    """Get a list of saveset summaries that match on both hostname and subset."""
    hostname_subset_match_savesets = []
    for saveset in all_savesets_list:
        if saveset.canonical_hostname == canonical_hostname and saveset.subset == subset:
            # This saveset is in the set we're interested in
            hostname_subset_match_savesets.append(saveset)
    return hostname_subset_match_savesets

def get_tweaked_matches(current_saveset_summary, hostname_subset_match_savesets):
    """Exclude the current saveset, because doing a save relative to itself makes no sense."""
    tweaked_matches = []
    for saveset in hostname_subset_match_savesets:
        if saveset.backup_id != current_saveset_summary.backup_id:
            tweaked_matches.append(saveset)
    return tweaked_matches

def pick_saveset_summaries(save_directory, canonical_hostname, subset, current_saveset_summary):
    """Pick up to 3 savesets against which to perform an incremental backup."""
    all_savesets_list = get_all_saveset_summaries(save_directory, canonical_hostname)
    hostname_subset_match_savesets = get_hostname_subset_matches(all_savesets_list, canonical_hostname, subset)
    tweaked_matches = get_tweaked_matches(current_saveset_summary, hostname_subset_match_savesets)

    # For our (hostname, subset) tuple, pick the most recent (by start time) - whether complete or not
    most_recent_saveset = None
    for saveset in tweaked_matches:
        if most_recent_saveset is None or saveset.start_time > most_recent_saveset.start_time:
            most_recent_saveset = saveset

    # Pick the most recent (by start time) completed saveset with more than 1 file in it - because
    # a backup of an empty mountpoint looks like a completed fullsave, but isn't.  And
    # if a save should have only 1 file in it, we don't need to optimize that.
    most_recent_completed_saveset = None
    for saveset in tweaked_matches:
        tail = most_recent_completed_saveset is None or saveset.start_time > most_recent_completed_saveset.start_time
        if saveset.finish_time is not None and tail and saveset.number_of_files > 1:
            most_recent_completed_saveset = saveset

    # Pick the one with the most files in it - the count might not be precise, but it should be
    # within CHECKPOINT_MULTIPLE of the true value
    most_files_saveset = None
    for saveset in tweaked_matches:
        if most_files_saveset is None or saveset.number_of_files > most_files_saveset.number_of_files:
            most_files_saveset = saveset

    # Create a list with the 3, but excluding duplicates - so it could end up with fewer than 3 elements
    list_ = []
    if most_recent_saveset is not None:
        list_.append(most_recent_saveset)
    if most_recent_completed_saveset is not None and not backup_id_present(most_recent_completed_saveset, list_):
        list_.append(most_recent_completed_saveset)
    if most_files_saveset is not None and not backup_id_present(most_files_saveset, list_):
        list_.append(most_files_saveset)

    # Sort the list
    list_.sort()

    if list_:
        sys.stderr.write('Backing up relative to:\n')
        for element in list_:
            sys.stderr.write('    %s\n' % element.backup_id)
    else:
        sys.stderr.write('Found nothing to backup relative to.\n')
    # Attempt to get the "backing up relative to" output near the beginning of the looper
    # subprocess' output, not near the end.
    sys.stderr.flush()

    # Return the resulting list.  Note: if list_ is empty, this is a fullsave.
    return list_

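# Hedged usage sketch (path and variable names are made up).  The result feeds an
# incremental backup; an empty list means a fullsave:
#     relative_to = pick_saveset_summaries('/backups', 'benchbox', 'slash', current_summary)
#     for prior in relative_to:
#         ...  # consult the prior saveset while deciding what needs backing up
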
def create_dir_if_needed(init_savedir):
    """Create the summaries directory if it's needed."""
    # If the summaries directory does not yet exist
    if not os.path.isdir(SUMMARIES_NAME):
        # If the user requested save directory creation, create it.  Else, give a mostly-helpful error message.
        if init_savedir:
            os.mkdir(SUMMARIES_NAME)
        else:
            sys.stderr.write('summaries directory does not exist - rerun with the --init-savedir option?\n')
            sys.exit(1)

class Saveset_summary(object):
    # pylint: disable=R0902
    # R0902: We need a handful of instance attributes
    """A class for saveset summaries - both new and existing."""

    def __init__(self, directory, canonical_hostname, backup_id=None, subset=None, init_savedir=False):
        """Initialize."""
        # pylint: disable=R0913,too-many-statements
        # R0913: We seem to need a few arguments
        # too-many-statements: I'm willing to allow a large __init__ in this case.
        create_dir_if_needed(init_savedir)
        self.last_checkpoint_time = time.time()
        self.directory = directory
        if backup_id is None:
            self.new_saveset = True
            self.finished = False
            self.number_of_files = 0
            self.start_time = decimal.Decimal(str(time.time()))
            self.canonical_hostname = canonical_hostname
            assert subset is not None
            self.subset = subset
            self.random_string = random_string()
            self.backup_id = self.get_backup_id()
        else:
            # Example: 1303099413.69_benchbox_slash_Sun-Apr-17-21-03-33-2011_ea7b44aa32d9fc64
            self.new_saveset = False
            fields = backup_id.split('_')
            self.start_time = decimal.Decimal(fields[0])
            self.canonical_hostname = fields[1]
            self.subset = fields[2]
            # We intentionally skip fields[3], the human-readable form of the date, because we can
            # easily get it back from the machine-readable version
            self.random_string = fields[4]
            self.backup_id = backup_id
        self.backup_id_filename = os.path.join(self.directory, self.backup_id)
        self.temp_backup_id_filename = '%s.temp' % self.backup_id_filename
        if backup_id is None:
            sys.stderr.write('Creating backup id %s\n' % self.backup_id)
        else:
            file_ = open(self.backup_id_filename, 'r')
            for line in file_:
                saveset_fields = line.split()
                if saveset_fields[0] == 'start_time':
                    saveset_start_time = decimal.Decimal(saveset_fields[1])
                elif saveset_fields[0] == 'number_of_files':
                    self.number_of_files = int(saveset_fields[1])
                elif saveset_fields[0] == 'finish_time':
                    finish_time_string = saveset_fields[1]
                    # Note: We really do want the string 'None' here, not the None singleton
                    if finish_time_string == 'None':
                        self.finished = False
                        self.finish_time = None
                    else:
                        self.finished = True
                        self.finish_time = decimal.Decimal(finish_time_string)
            file_.close()
            saveset_start_time_2 = decimal_round(saveset_start_time)
            start_time_2 = decimal_round(self.start_time)
            if abs(saveset_start_time_2 - start_time_2) > decimal.Decimal('10.0'):
                sys.stderr.write('%s (type: %s) != %s (type: %s) - tolerance 10.0\n' % (
                    saveset_start_time_2,
                    type(saveset_start_time_2),
                    start_time_2,
                    type(start_time_2),
                ))
                sys.stderr.write('Filename is %s\n' % self.backup_id_filename)
                raise AssertionError('saveset_start_time_2 != self.start_time_2 within 10.0 tolerance')
            self.start_time = saveset_start_time
        if self.new_saveset:
            self.update(number_of_files=0)

    def __str__(self):
        """Convert to string."""
        finish_time = getattr(self, 'finish_time', None)
        return 'ss %s / %s / %d / %s' % (self.backup_id, self.start_time, self.number_of_files, finish_time)

    def __repr__(self):
        """Return a representation of this object."""
        return str(self)

    def __cmp__(self, other):
        """Sort by start_time and backup_id - the python 2 way."""
        if self.start_time < other.start_time:
            return -1
        if self.start_time > other.start_time:
            return 1
        if self.backup_id < other.backup_id:
            return -1
        if self.backup_id > other.backup_id:
            return 1
        return 0

    def __lt__(self, other):
        """Sort by start_time, then backup_id - the python 3 way."""
        if self.start_time != other.start_time:
            return self.start_time < other.start_time
        return self.backup_id < other.backup_id

    def get_backup_id(self):
        """Create a backup id from this Saveset_summary."""
        human_readable_time = re.sub('[ :]', '-', time.ctime(float(self.start_time)))
        elements = ['%.2f' % self.start_time, self.canonical_hostname, self.subset, human_readable_time, self.random_string]
        # We downcase the backup id, because otherwise case sensitivity might cause problems.
        return '_'.join(elements).lower()

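    # Illustrative result (values made up): for start_time 1303099413.69, hostname
    # 'benchbox' and subset 'slash', this yields something like
    # '1303099413.69_benchbox_slash_sun-apr-17-21-03-33-2011_ea7b44aa32d9fc64'
    # - note that the .lower() also downcases the ctime portion.
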
    def checkpoint_interval_elapsed(self):
        """Return True iff it has been 10 minutes since we last checkpointed."""
        current_time = time.time()
        return self.last_checkpoint_time + 600 < current_time

    def update(self, number_of_files=None, finished=False):
        """Update the backup_id file content."""
        # Note that finished and self.finished are two different things at this point in time
        if self.new_saveset:
            if number_of_files is not None:
                self.number_of_files = number_of_files
            if finished:
                self.finish_time = time.time()
            if self.number_of_files % CHECKPOINT_MULTIPLE == 0 or self.checkpoint_interval_elapsed() or finished:
                # We only do the update once in a while - because updating for every file would slow
                # things down too much and not add much greater accuracy
                file_ = open(self.temp_backup_id_filename, 'w')
                file_.write('start_time %f\n' % self.start_time)
                file_.write('number_of_files %d\n' % self.number_of_files)
                if finished:
                    # This means the backup finished well :)
                    file_.write('finish_time %f\n' % self.finish_time)
                else:
                    # If this is present in a backup_id file long term, that indicates there was a crash
                    # of some sort that prevented the backup from completing normally.
                    file_.write('finish_time None\n')
                # Make sure the data gets committed to disk
                file_.flush()
                os.fsync(file_.fileno())
                file_.close()
                # We create a temporary file and rename, because that reduces the window during which a
                # backup id's data could be lost due to a crash.
                backshift_os_mod.safe_rename(self.temp_backup_id_filename, self.backup_id_filename)
                self.last_checkpoint_time = time.time()
            if finished:
                self.finished = True
        else:
            raise AssertionError('update called on old saveset')

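    # The resulting summary file looks like this while a backup is running (values
    # illustrative):
    #     start_time 1303099413.690000
    #     number_of_files 12000
    #     finish_time None
    # ...and gains a numeric finish_time once update(finished=True) runs.
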
#     def __enter__(self):
#         return self
#
#     def __exit__(self, type_, value, traceback):
#         if value is None:
#             self.close()
#             return True
#         else:
#             return False

    def close(self):
        """Close the saveset."""
        if self.new_saveset:
            self.update(finished=True)
            self.finish_time = time.time()
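
# Hedged end-to-end sketch (directory, subset and loop variables are made up):
#     summary = Saveset_summary('/backups/summaries', canonicalize_hostname(),
#                               subset='slash', init_savedir=True)
#     for count, filename in enumerate(files_to_save):
#         ...  # save the file, then:
#         summary.update(number_of_files=count + 1)  # only checkpoints occasionally
#     summary.close()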