Source code for saveset_summary_mod

#!/usr/bin/env python

# pylint: disable=simplifiable-if-statement

"""Provides a class and related functions for operating on savesets."""

import os
import re
import sys
import time
import random
import socket
import decimal
import platform

import backshift_os_mod

CHECKPOINT_MULTIPLE = 1000

SUMMARIES_NAME = 'summaries'


def decimal_round(decimal_value, hundredths=decimal.Decimal('0.01')):
    """Round a decimal value to 2 places after the decimal point."""
    return decimal_value.quantize(hundredths, rounding=decimal.ROUND_HALF_EVEN)

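# Illustrative examples (not part of the original module) of the banker's rounding
# used above - ties go to the even digit:
#     >>> decimal_round(decimal.Decimal('1.005'))
#     Decimal('1.00')
#     >>> decimal_round(decimal.Decimal('1.015'))
#     Decimal('1.02')
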
def canonicalize_hostname(user_hostname=None):
    """Return the current machine's hostname - canonicalized."""
    bad_hostnames = ['localhost', 'localhost.localdomain']
    if user_hostname is None:
        non_canonical_hostname = platform.node()
        try:
            canonical_hostname = socket.gethostbyaddr(socket.gethostbyname(non_canonical_hostname))[0]
        except (socket.gaierror, socket.herror):
            canonical_hostname = 'localhost'
        if canonical_hostname not in bad_hostnames:
            result = canonical_hostname
        elif non_canonical_hostname not in bad_hostnames:
            result = non_canonical_hostname
        else:
            raise AssertionError('user_hostname is None, and both canonical and noncanonical hostnames are blacklisted')
    else:
        result = user_hostname
    return result.lower()

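# Hedged usage sketch (the hostname here is made up): when a hostname is supplied
# explicitly, it is simply lowercased; DNS is only consulted when it is not:
#     >>> canonicalize_hostname('BenchBox')
#     'benchbox'
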
def random_string(length=16):
    """Return a hexadecimal string of the given length."""
    list_ = []
    for _unused in range(length):
        list_.append(random_hex_digit())
    return ''.join(list_)

def random_hex_digit():
    """Return a single, random hex digit."""
    return '0123456789abcdef'[int(random.random() * 16)]

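# Example (output is random; this particular value is made up):
#     >>> random_string(4)
#     '3fa9'
# The random suffix helps keep two backups that share the same start time (to the
# hundredth of a second), hostname and subset from colliding on backup id.
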
def get_all_saveset_summaries(save_directory, canonical_hostname, last_n=None):
    """Get all the savesets, creating one Saveset_summary object for each."""
    saveset_directory = os.path.join(save_directory, SUMMARIES_NAME)
    all_savesets_list = []
    # We skip temporary files - otherwise we'd sometimes see a saveset twice, and sometimes get a traceback.
    files = [filename for filename in os.listdir(saveset_directory) if not filename.endswith('.temp')]
    files.sort(key=lambda filename: float(filename.split('_')[0]))
    if last_n is not None:
        files = files[-last_n:]  # pylint: disable=invalid-unary-operand-type
    for filename in files:
        saveset = Saveset_summary(saveset_directory, canonical_hostname, backup_id=filename)
        all_savesets_list.append(saveset)
    all_savesets_list.sort()
    return all_savesets_list

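# Hedged usage sketch ('/backups' is a made-up path): summary filenames begin with
# their start timestamp, so the sort above is chronological and last_n trims to the
# most recent n savesets:
#     summaries = get_all_saveset_summaries('/backups', 'benchbox', last_n=3)
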
def backup_id_present(saveset_summary, saveset_summary_list):
    """Return True if saveset_summary's backup id is present in saveset_summary_list."""
    for candidate_summary in saveset_summary_list:
        if saveset_summary.backup_id == candidate_summary.backup_id:
            return True
    return False

def get_hostname_subset_matches(all_savesets_list, canonical_hostname, subset):
    """Get a list of saveset summaries that match on both hostname and subset."""
    hostname_subset_match_savesets = []
    for saveset in all_savesets_list:
        if saveset.canonical_hostname == canonical_hostname and saveset.subset == subset:
            # This saveset is in the set we're interested in
            hostname_subset_match_savesets.append(saveset)
    return hostname_subset_match_savesets

def get_tweaked_matches(current_saveset_summary, hostname_subset_match_savesets):
    """Exclude the current saveset, because doing a save relative to itself makes no sense."""
    tweaked_matches = []
    for saveset in hostname_subset_match_savesets:
        if saveset.backup_id != current_saveset_summary.backup_id:
            tweaked_matches.append(saveset)
    return tweaked_matches

def pick_saveset_summaries(save_directory, canonical_hostname, subset, current_saveset_summary):
    """Pick up to 3 savesets against which to perform an incremental backup."""
    all_savesets_list = get_all_saveset_summaries(save_directory, canonical_hostname)
    hostname_subset_match_savesets = get_hostname_subset_matches(all_savesets_list, canonical_hostname, subset)
    tweaked_matches = get_tweaked_matches(current_saveset_summary, hostname_subset_match_savesets)

    # For our (hostname, subset) tuple, pick the most recent (by start time) - whether complete or not
    most_recent_saveset = None
    for saveset in tweaked_matches:
        if most_recent_saveset is None or saveset.start_time > most_recent_saveset.start_time:
            most_recent_saveset = saveset

    # Pick the most recent (by start time) completed saveset with more than 1 file in it - because
    # a backup of an empty mountpoint looks like a completed fullsave, but isn't.  And
    # if a save should have only 1 file in it, we don't need to optimize that.
    most_recent_completed_saveset = None
    for saveset in tweaked_matches:
        tail = most_recent_completed_saveset is None or saveset.start_time > most_recent_completed_saveset.start_time
        if saveset.finish_time is not None and tail and saveset.number_of_files > 1:
            most_recent_completed_saveset = saveset

    # Pick the one with the most files in it - the count might not be precise, but it should be
    # within CHECKPOINT_MULTIPLE of the true value
    most_files_saveset = None
    for saveset in tweaked_matches:
        if most_files_saveset is None or saveset.number_of_files > most_files_saveset.number_of_files:
            most_files_saveset = saveset

    # Create a list with the 3, but excluding duplicates - so it could end up with fewer than 3 elements
    list_ = []
    if most_recent_saveset is not None:
        list_.append(most_recent_saveset)
    if most_recent_completed_saveset is not None and not backup_id_present(most_recent_completed_saveset, list_):
        list_.append(most_recent_completed_saveset)
    if most_files_saveset is not None and not backup_id_present(most_files_saveset, list_):
        list_.append(most_files_saveset)

    # Sort the list
    list_.sort()

    if list_:
        sys.stderr.write('Backing up relative to:\n')
        for element in list_:
            sys.stderr.write('    %s\n' % element.backup_id)
    else:
        sys.stderr.write('Found nothing to backup relative to.\n')
    # Attempt to get the "backing up relative to" output near the beginning of the looper
    # subprocess' output, not near the end.
    sys.stderr.flush()

    # Return the resulting list.  Note: if list_ is empty, this is a fullsave.
    return list_

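# Hedged usage sketch (path and variable names are made up).  The result feeds an
# incremental backup; an empty list means a fullsave:
#     relative_to = pick_saveset_summaries('/backups', 'benchbox', 'slash', current_summary)
#     for prior in relative_to:
#         ...  # consult the prior saveset while deciding what needs backing up
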
def create_dir_if_needed(init_savedir):
    """Create the summaries directory if it's needed."""
    # If the summaries directory does not yet exist
    if not os.path.isdir(SUMMARIES_NAME):
        # If the user requested save directory creation, create it.  Else, give a mostly-helpful error message.
        if init_savedir:
            os.mkdir(SUMMARIES_NAME)
        else:
            sys.stderr.write('summaries directory does not exist - rerun with the --init-savedir option?\n')
            sys.exit(1)

class Saveset_summary(object):
    # pylint: disable=R0902
    # R0902: We need a handful of instance attributes
    """A class for saveset summaries - both new and existing."""

    def __init__(self, directory, canonical_hostname, backup_id=None, subset=None, init_savedir=False):
        """Initialize."""
        # pylint: disable=R0913,too-many-statements
        # R0913: We seem to need a few arguments
        # too-many-statements: I'm willing to allow a large __init__ in this case.
        create_dir_if_needed(init_savedir)
        self.last_checkpoint_time = time.time()
        self.directory = directory
        if backup_id is None:
            self.new_saveset = True
            self.finished = False
            self.number_of_files = 0
            self.start_time = decimal.Decimal(str(time.time()))
            self.canonical_hostname = canonical_hostname
            assert subset is not None
            self.subset = subset
            self.random_string = random_string()
            self.backup_id = self.get_backup_id()
        else:
            # Example: 1303099413.69_benchbox_slash_Sun-Apr-17-21-03-33-2011_ea7b44aa32d9fc64
            self.new_saveset = False
            fields = backup_id.split('_')
            self.start_time = decimal.Decimal(fields[0])
            self.canonical_hostname = fields[1]
            self.subset = fields[2]
            # We intentionally skip fields[3], the human-readable form of the date, because we can
            # easily get it back from the machine-readable version
            self.random_string = fields[4]
            self.backup_id = backup_id
        self.backup_id_filename = os.path.join(self.directory, self.backup_id)
        self.temp_backup_id_filename = '%s.temp' % self.backup_id_filename
        if backup_id is None:
            sys.stderr.write('Creating backup id %s\n' % self.backup_id)
        else:
            file_ = open(self.backup_id_filename, 'r')
            for line in file_:
                saveset_fields = line.split()
                if saveset_fields[0] == 'start_time':
                    saveset_start_time = decimal.Decimal(saveset_fields[1])
                elif saveset_fields[0] == 'number_of_files':
                    self.number_of_files = int(saveset_fields[1])
                elif saveset_fields[0] == 'finish_time':
                    finish_time_string = saveset_fields[1]
                    # Note: We really do want the string 'None' here, not the None singleton
                    if finish_time_string == 'None':
                        self.finished = False
                        self.finish_time = None
                    else:
                        self.finished = True
                        self.finish_time = decimal.Decimal(finish_time_string)
            file_.close()
            saveset_start_time_2 = decimal_round(saveset_start_time)
            start_time_2 = decimal_round(self.start_time)
            if abs(saveset_start_time_2 - start_time_2) > decimal.Decimal('10.0'):
                sys.stderr.write('%s (type: %s) != %s (type: %s) - tolerance 10.0\n' % (
                    saveset_start_time_2,
                    type(saveset_start_time_2),
                    start_time_2,
                    type(start_time_2),
                ))
                sys.stderr.write('Filename is %s\n' % self.backup_id_filename)
                raise AssertionError('saveset_start_time_2 != self.start_time_2 within 10.0 tolerance')
            self.start_time = saveset_start_time
        if self.new_saveset:
            self.update(number_of_files=0)

    def __str__(self):
        """Convert to string."""
        finish_time = getattr(self, 'finish_time', None)
        return 'ss %s / %s / %d / %s' % (self.backup_id, self.start_time, self.number_of_files, finish_time)

    def __repr__(self):
        """Return a representation of this object."""
        return str(self)

    def __cmp__(self, other):
        """Sort by start_time and backup_id - the python 2 way."""
        if self.start_time < other.start_time:
            return -1
        if self.start_time > other.start_time:
            return 1
        if self.backup_id < other.backup_id:
            return -1
        if self.backup_id > other.backup_id:
            return 1
        return 0

    def __lt__(self, other):
        """Sort by start_time, then backup_id - the python 3 way."""
        if self.start_time != other.start_time:
            return self.start_time < other.start_time
        return self.backup_id < other.backup_id

    def get_backup_id(self):
        """Create a backup id from this Saveset_summary."""
        human_readable_time = re.sub('[ :]', '-', time.ctime(float(self.start_time)))
        elements = ['%.2f' % self.start_time, self.canonical_hostname, self.subset, human_readable_time, self.random_string]
        # We downcase the backup id, because otherwise case sensitivity might cause problems.
        return '_'.join(elements).lower()

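    # Illustrative result (values made up): for start_time 1303099413.69, hostname
    # 'benchbox' and subset 'slash', this yields something like
    # '1303099413.69_benchbox_slash_sun-apr-17-21-03-33-2011_ea7b44aa32d9fc64'
    # - note that the .lower() also downcases the ctime portion.
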
    def checkpoint_interval_elapsed(self):
        """Return True iff it has been 10 minutes since we last checkpointed."""
        current_time = time.time()
        return self.last_checkpoint_time + 600 < current_time

    def update(self, number_of_files=None, finished=False):
        """Update the backup_id file content."""
        # Note that finished and self.finished are two different things at this point in time
        if self.new_saveset:
            if number_of_files is not None:
                self.number_of_files = number_of_files
            if finished:
                self.finish_time = time.time()
            if self.number_of_files % CHECKPOINT_MULTIPLE == 0 or self.checkpoint_interval_elapsed() or finished:
                # We only do the update once in a while - because updating for every file would slow
                # things down too much and not add much greater accuracy
                file_ = open(self.temp_backup_id_filename, 'w')
                file_.write('start_time %f\n' % self.start_time)
                file_.write('number_of_files %d\n' % self.number_of_files)
                if finished:
                    # This means the backup finished well :)
                    file_.write('finish_time %f\n' % self.finish_time)
                else:
                    # If this is present in a backup_id file long term, that indicates there was a crash
                    # of some sort that prevented the backup from completing normally.
                    file_.write('finish_time None\n')
                # Make sure the data gets committed to disk
                file_.flush()
                os.fsync(file_.fileno())
                file_.close()
                # We create a temporary file and rename, because that reduces the window during which a
                # backup id's data could be lost due to a crash.
                backshift_os_mod.safe_rename(self.temp_backup_id_filename, self.backup_id_filename)
                self.last_checkpoint_time = time.time()
            if finished:
                self.finished = True
        else:
            raise AssertionError('update called on old saveset')

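    # The resulting summary file looks like this while a backup is running (values
    # illustrative):
    #     start_time 1303099413.690000
    #     number_of_files 12000
    #     finish_time None
    # ...and gains a numeric finish_time once update(finished=True) runs.
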
#     def __enter__(self):
#         return self
#
#     def __exit__(self, type_, value, traceback):
#         if value is None:
#             self.close()
#             return True
#         else:
#             return False

    def close(self):
        """Close the saveset."""
        if self.new_saveset:
            self.update(finished=True)
            self.finish_time = time.time()
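
# Hedged end-to-end sketch (directory, subset and loop variables are made up):
#     summary = Saveset_summary('/backups/summaries', canonicalize_hostname(),
#                               subset='slash', init_savedir=True)
#     for count, filename in enumerate(files_to_save):
#         ...  # save the file, then:
#         summary.update(number_of_files=count + 1)  # only checkpoints occasionally
#     summary.close()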