'''Provides a very simple rolling checksum for file data''' import os import sys ifdef(`pyx',cdef )class Tri_level_chunker: # pylint: disable=R0902,R0903 # R0902: We want a few instance attributes # R0903: We don't need a lot of public methods '''Chunk up a file into blocks in a pretty repeatable way, even in the face of small changes to a file''' ifdef(`pyx',cdef object _rolling_checksum) ifdef(`pyx',cdef int _file_handle) ifdef(`pyx',cdef int _threshold) ifdef(`pyx',cdef int _consecutive_boundaries) ifdef(`pyx',cdef object _block) ifdef(`pyx',cdef int _at_eof) ifdef(`pyx',cdef int _done) def __init__(self, file_handle): '''Divide a file up into portions''' self._rolling_checksum = Rolling_checksum() self._file_handle = file_handle # this threshold should give us about 2**20 byte chunks on average self._threshold = self._rolling_checksum.get_modulus() * 0.01 #sys.stdout.write('self._threshold is %d\n' % self._threshold) self._consecutive_boundaries = 0 self._block = os.read(self._file_handle, 2**22) self._at_eof = False self._done = False def __iter__(self): return self def __next__(self): # pylint: disable=R0912 # R0912: Too many branches: Tell me about it. If we had a yield, this'd be much simpler, but cython 0.14 doesn't do that '''Yield (without using the yield keyword) the next block in the sequence''' for byteno, character in enumerate(self._block): if isinstance(character, str): byte = ord(character) else: byte = character self._rolling_checksum.add(byte) checksum = self._rolling_checksum.get_checksum() #sys.stdout.write('checksum is %d\n' % checksum) if checksum < self._threshold: self._consecutive_boundaries += 1 #sys.stdout.write('self._consecutive_boundaries is %d\n' % self._consecutive_boundaries) if self._consecutive_boundaries == 3: result = self._block[:byteno] if self._at_eof: replenishment = '' else: replenishment = os.read(self._file_handle, byteno) if not replenishment: self._at_eof = True if replenishment: self._block = self._block[byteno:] + replenishment else: self._block = self._block[byteno:] self._consecutive_boundaries = 0 return result elif not (0 < self._consecutive_boundaries < 3): sys.stderr.write('%s: Internal error: not 0 < _consecutive_boundaries < 3\n' % sys.argv[0]) sys.exit(1) else: continue else: self._consecutive_boundaries = 0 if self._done: raise StopIteration # we're done processing chunks - return what we had, and raise StopIteration next time self._done = True return self._block next = __next__ ifdef(`pyx',cdef )class Rolling_checksum: # pylint: disable=R0903,R0902 # R0903: We don't need a lot of public methods # R0902: We want lots of instance attributes ifdef(`pyx',cdef object _total) ifdef(`pyx',cdef int _width) ifdef(`pyx',cdef long _multiplicand) ifdef(`pyx',cdef long _modulus) ifdef(`pyx',cdef long _addend) ifdef(`pyx',cdef long _offset) ifdef(`pyx',cdef list _window) ifdef(`pyx',cdef long _magic_checksum) '''Compute a very simple, fast, rolling checksum''' #def __init__(self, width=2039, multiplicand=1021, modulus=32713): #def __init__(self, width = 3606721, multiplicand = 583621, modulus = 1048573, addend = 11515687): def __init__(self, width = 47, multiplicand = 5059, modulus = 389, addend = 83093): self._total = addend * width self._width = width self._multiplicand = multiplicand self._modulus = modulus self._addend = addend self._offset = 0 self._window = [ addend ] * self._width #self._multiplicand = list(primes.primes(self._width)) #self._magic_checksum = self._width * 64 self._magic_checksum = 0 ifdef(`pyx',cp)def add(self, byte): '''Add a byte into our rolling checksum function''' # Inspired by linear congruential random number generation. It's not quite the same thing though. if isinstance(byte, str): byte = ord(byte) #self._window[self._offset] = byte * self._multiplicand[self._offset] + self._offset + 1 adjusted = byte * self._multiplicand + self._addend self._window[self._offset] = adjusted #self._total += self._window[self._offset] self._total += adjusted new_offset = (self._offset + 1) % self._width self._offset = new_offset self._total -= self._window[new_offset] return self ifdef(`pyx',cp)def get_checksum(self): '''Return the current checksum''' return self._total % self._modulus ifdef(`pyx',cp)def demarcation(self): '''Is this the dividing line between two chunks?''' #print('self._total is %d, checksum is %d' % (self._total, self._total % self._modulus)) return self.get_checksum() == self._magic_checksum ifdef(`pyx',cp)def get_modulus(self): '''Return the modulus''' return self._modulus