'''Provides a very simple rolling checksum for file data'''

#mport os
import sys

#mport comma_mod

ifdef(`pyx',from libc.stdint cimport uint64_t)

def min_max_chunker(file_):
    '''Make sure chunk sizes are above and below a pair of thresholds'''

    empty = True
    minimum_length = 2 ** 19  # 0.5 mebibytes
    maximum_length = 2 ** 22  # 4.0 mebibytes

    for chunk in n_level_chunker(file_, levels=3):
        if empty:
            chunk_so_far = chunk
        else:
            chunk_so_far += chunk
        empty = False

        len_chunk_so_far = `len'(chunk_so_far)

        if len_chunk_so_far < minimum_length:
            # go back up for another piece
            continue

        if len_chunk_so_far < maximum_length:
            # good - we're in the sweet spot
            yield chunk_so_far
            empty = True
            continue

        if len_chunk_so_far >= maximum_length:
            # we have a long chunk - split it and go around for more
            yield chunk_so_far[:maximum_length]
            chunk_so_far = chunk_so_far[maximum_length:]
            continue

        raise AssertionError("Should never reach this point")

    if not empty and chunk_so_far:
        yield chunk_so_far


def n_level_chunker(file_, ifdef(`pyx',int )levels=1):

    # pylint: disable=R0912,R0914
    # R0912: I looked into reducing the branch count here, but it seemed to make things more complex, not less
    # R0914: I believe we need a few local variables

    '''Divide a file up into portions when we get 3 in-range checksums in a row'''
ifdef(`pyx',`    'cdef uint64_t byte)
ifdef(`pyx',`    'cdef uint64_t modulus)
ifdef(`pyx',`    'cdef float threshold)
ifdef(`pyx',`    'cdef int consecutive_boundaries)
ifdef(`pyx',`    'cdef int preferred_block_len)
ifdef(`pyx',`    'cdef int need_ord)
ifdef(`pyx',`    'cdef int first_time)
ifdef(`pyx',`    'cdef int byteno)

    rolling_checksum = Rolling_checksum()

    # this threshold should give us about 2**20 byte chunks on average
    modulus = rolling_checksum.get_modulus()
    threshold = modulus * 0.007

    consecutive_boundaries = 0

    two_to_the_22nd = 2 ** 22
    preferred_block_len = two_to_the_22nd

    block = file_.read(preferred_block_len)

    first_time = True

    while True:
        if not block[preferred_block_len - 1:]:
            block = block + file_.read(preferred_block_len - `len'(block))

        if not block:
            break

        for byteno, character in enumerate(block):
            # This if deals with python 2 returing single-character str's, and python 3 returning an integer between 0 and 255
            if first_time:
                if isinstance(character, int):
                    need_ord = False
                else:
                    need_ord = True
                first_time = False

            if need_ord:
                byte = ord(character)
            else:
                byte = character

            checksum = rolling_checksum.add_byte(byte)

            if checksum < threshold:
                consecutive_boundaries += 1
            else:
                consecutive_boundaries = 0
                continue

            if consecutive_boundaries == levels:
                consecutive_boundaries = 0
                result = block[:byteno]
                block = block[byteno:]

                if result:
                    # result is not empty, so yield it
                    yield result
                # we must break out of the loop to restart byteno at 0
                break
        else:
            # We made it all the way through the enumeration without yielding anything, so yield all we have and empty block
            if block:
                yield block
                # in other words, make block an empty string
                block = block[0:0]

    # we're done processing chunks - return what's left, if it's not empty
    if block:
        # result is not empty, so yield it
        sys.stderr.write('yielding block (3) of length %d\n' % `len'(result))
        yield block
        block = file_.read(two_to_the_22nd)
        assert not block


# This one actually seems to be better as an iterator than a generator
ifdef(`pyx',cdef )class Rolling_checksum(object):
ifdef(`pyx',`    'cdef uint64_t _total)
ifdef(`pyx',`    'cdef uint64_t _width)
ifdef(`pyx',`    'cdef uint64_t _multiplicand)
ifdef(`pyx',`    'cdef uint64_t _modulus)
ifdef(`pyx',`    'cdef uint64_t _addend)
ifdef(`pyx',`    'cdef uint64_t _offset)
ifdef(`pyx',`    'cdef list _window)
ifdef(`pyx',`    'cdef uint64_t _magic_checksum)
ifdef(`pyx',`    'cdef int first_time)
ifdef(`pyx',`    'cdef int need_ord)

    # pylint: disable=R0902
    # R0902: We need a few instance attributes; we're a nontrivial iterator - but I suspect this'll beat polynomials bigtime

    '''
    Compute a very simple, fast, rolling checksum.  We don't maintain a list of bytes or anything - we just produce checksums
    on demand.  Inspired by linear congruential random number generation.  It's not quite the same thing though.
    '''

    def __init__(self, width=607, multiplicand=7, modulus=3677, addend=66041):
        # pylint: disable=R0913
        # R0913: We need a few arguments
        self._total = addend * width
        self._width = width
        self._multiplicand = multiplicand
        self._modulus = modulus
        self._addend = addend
        self._offset = 0
        self._window = [addend] * self._width

    ifdef(`pyx',cp)def add_byte(self, ifdef(`pyx',uint64_t )byte):
        '''Add a byte into our rolling checksum function'''
        adjusted = byte * self._multiplicand + self._addend
        self._window[self._offset] = adjusted
        #self._total += self._window[self._offset]
        self._total += adjusted
        new_offset = (self._offset + 1) % self._width
        self._offset = new_offset
        self._total -= self._window[new_offset]
        return self._total % self._modulus

    ifdef(`pyx',cp)def get_modulus(self):
        '''Return the modulus'''
        return self._modulus