Source code for hardlinks_mod

#!/usr/bin/python

'''Manage a list of hardlinks for a given backup; Also provides classes for hardlink restores.'''

# This is pretty good at dealing with 1,000,000,000 files, only 1,000 of which are hardlinks; this is
# probably the most common scenario for a backup program.
#
# It's also good at dealing with 1,000 distinct files, each of which has 1,000,000,000 hardlinks.
#
# It's not so good at dealing with 1,000,000,000 distinct files, each hardlinked to something else, for
# a total of 2,000,000,000 files.

import os
import collections

import dirops_mod
import bloom_filter_mod


def one():
    '''
    Return the constant 1.

    This exists so that a collections.defaultdict can be given a named default factory
    that yields 1 instead of the usual 0.
    '''
    return 1
class Save_device(object):
    # pylint: disable=R0903
    # R0903: We don't need a lot of public methods
    '''
    Track inode numbers seen on one "device" during a save.

    In unix/linux filesystems a device corresponds to a filesystem.  Inode numbers are
    fed in with +=, and the inodes that were counted more than once can be dumped with
    write().
    '''

    # Shared by all instances; passed to the bloom filter as its ideal element count.
    expected_size = None

    def __init__(self, deviceno):
        '''Set up an empty bloom filter and an empty duplicate-count table for deviceno.'''
        self.deviceno = deviceno

        # A bloom filter always answers True for a member of the set, and almost always
        # answers False for a non-member.  Occasionally it answers True for a non-member,
        # so counts are kept and dealt with later - but only for inodes the filter claimed
        # were duplicates.  That keeps inode_count from ballooning with a huge number of
        # 1's; the bloom filter is far smaller than that.
        ideal_size = Save_device.expected_size
        self.bloom_filter = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=ideal_size, error_rate_p=0.01)

        # The default is one (not zero) because an inode is only placed here once the
        # bloom filter reports that it has already been seen.
        self.inode_count = collections.defaultdict(one)

    def __iadd__(self, inodeno):
        '''Record one more sighting of inodeno; returns self so that += rebinds correctly.'''
        if inodeno not in self.bloom_filter:
            # First sighting, as far as the filter knows: just remember it.
            self.bloom_filter += inodeno
            return self

        # The filter says we've seen this one before, so start (or keep) counting it.
        self.inode_count[inodeno] += 1
        return self

    def write(self, file_):
        '''Write one "inodeno count" line to file_ for every inode counted more than once.'''
        for inodeno in self.inode_count:
            count = self.inode_count[inodeno]
            if count > 1:
                file_.write('%d %d\n' % (inodeno, count))
class Restore_inode(object):
    '''
    Restore-time state for a single inode.

    Holds the inode number, an expected count supplied by the caller, a running tally of
    how many times the filename has been requested, and the filename itself.
    '''

    def __init__(self, inodeno, ideal_count):
        '''
        Start with no filename and a zero tally.

        NOTE(review): ideal_count is presumably the link count recorded at save time -
        confirm against the caller; it is only stored here.
        '''
        self._inodeno = inodeno
        self._filename = None
        self._ideal_count = ideal_count
        self._actual_count = 0

    def get_filename(self):
        '''
        Return the single filename associated with the first occurrence of this
        device+inode pair (None if it has not been set yet).

        Every call also bumps the internal tally of lookups by one.
        '''
        self._actual_count = self._actual_count + 1
        return self._filename

    def set_filename(self, filename):
        '''Remember filename as the one associated with the first occurrence of this device+inode pair.'''
        self._filename = filename