"""Provides a very simple rolling checksum for file data""" import sys ifdef(`pyx',from cpython cimport array,import array) ifdef(`pyx',language_level=3) ifdef(`pyx',from libc.stdint cimport uint64_t) ifdef(`pyx',from libc.stdint cimport uint32_t) def min_max_chunker(file_): """Make sure chunk sizes are above and below a pair of thresholds""" empty = True minimum_length = 2 ** 19 # 0.5 mebibytes maximum_length = 2 ** 22 # 4.0 mebibytes for chunk in n_level_chunker(file_, levels=3): if empty: chunk_so_far = chunk else: chunk_so_far += chunk empty = False len_chunk_so_far = `len'(chunk_so_far) if len_chunk_so_far < minimum_length: # go back up for another piece continue if len_chunk_so_far < maximum_length: # good - we're in the sweet spot yield chunk_so_far empty = True continue if len_chunk_so_far >= maximum_length: # we have a long chunk - split it and go around for more yield chunk_so_far[:maximum_length] chunk_so_far = chunk_so_far[maximum_length:] continue raise AssertionError("Should never reach this point") if not empty and chunk_so_far: yield chunk_so_far def n_level_chunker(file_, ifdef(`pyx',int )levels=1): # pylint: disable=R0912,R0914 # R0912: I looked into reducing the branch count here, but it seemed to make things more complex, not less # R0914: I believe we need a few local variables """Divide a file up into portions when we get 3 in-range checksums in a row""" ifdef(`pyx',` 'cdef uint64_t byte) ifdef(`pyx',` 'cdef uint64_t modulus) ifdef(`pyx',` 'cdef float threshold) ifdef(`pyx',` 'cdef int consecutive_boundaries) ifdef(`pyx',` 'cdef int preferred_block_len) ifdef(`pyx',` 'cdef int byteno) rolling_checksum = Rolling_checksum() # this threshold should give us about 2**20 byte chunks on average modulus = rolling_checksum.get_modulus() threshold = modulus * 0.007 consecutive_boundaries = 0 two_to_the_22nd = 2 ** 22 preferred_block_len = two_to_the_22nd block = file_.read(preferred_block_len) while True: if not block[preferred_block_len - 1:]: block = block + file_.read(preferred_block_len - `len'(block)) if not block: break for byteno, character in enumerate(block): byte = character checksum = rolling_checksum.add_byte(byte) if checksum < threshold: consecutive_boundaries += 1 else: consecutive_boundaries = 0 continue if consecutive_boundaries == levels: consecutive_boundaries = 0 result = block[:byteno] block = block[byteno:] if result: # result is not empty, so yield it yield result # we must break out of the loop to restart byteno at 0 break else: # We made it all the way through the enumeration without yielding anything, so yield all we have and empty block if block: yield block # in other words, make block an empty string block = block[0:0] # we're done processing chunks - return what's left, if it's not empty if block: # result is not empty, so yield it sys.stderr.write('yielding block (3) of length %d\n' % `len'(result)) yield block block = file_.read(two_to_the_22nd) assert not block # This one actually seems to be better as an iterator than a generator ifdef(`pyx',cdef )class Rolling_checksum(object): ifdef(`pyx',` 'cdef uint64_t _total) ifdef(`pyx',` 'cdef uint32_t _width) ifdef(`pyx',` 'cdef uint64_t _multiplicand) ifdef(`pyx',` 'cdef uint64_t _modulus) ifdef(`pyx',` 'cdef uint64_t _addend) ifdef(`pyx',` 'cdef uint32_t _offset) ifdef(`pyx',` 'cdef array.array _window) ifdef(`pyx',` 'cdef uint64_t _magic_checksum) # pylint: disable=R0902 # R0902: We need a few instance attributes; we're a nontrivial iterator - but I suspect this'll beat polynomials bigtime """ Compute a very simple, fast, rolling checksum. We don't maintain a list of bytes or anything - we just produce checksums on demand. Inspired by linear congruential random number generation. It's not quite the same thing though. """ def __init__(self): ifdef(`pyx',` 'cdef uint32_t width) ifdef(`pyx',` 'cdef uint64_t multiplicand) ifdef(`pyx',` 'cdef uint64_t modulus) ifdef(`pyx',` 'cdef uint64_t addend) width = 607 multiplicand = 7 modulus = 3677 addend = 66041 self._total = addend * width self._width = width self._multiplicand = multiplicand self._modulus = modulus self._addend = addend self._offset = 0 self._window = array.array('Q', [addend] * self._width) ifdef(`pyx',c)def add_byte(self, ifdef(`pyx',uint64_t )byte): """Add a byte into our rolling checksum function""" ifdef(`pyx',cdef uint32_t new_offset) ifdef(`pyx',cdef uint32_t offset_p1) ifdef(`pyx',cdef uint64_t adjusted) adjusted = byte * self._multiplicand + self._addend self._window[self._offset] = adjusted self._total += adjusted offset_p1 = self._offset + 1 # new_offset = offset_p1 % self._width new_offset = offset_p1 # Usually I would use %, but Cython's -a report says that's slow in this case. if offset_p1 >= self._width: new_offset -= self._width self._offset = new_offset self._total -= self._window[new_offset] result = self._total % self._modulus return result ifdef(`pyx',cp)def get_modulus(self): """Return the modulus""" return self._modulus