# cython: profile=True

import sys
# NOTE(review): `exceptions` is a Python 2-only module. The import is kept so
# the module namespace is unchanged, but nothing below references it any more.
import exceptions

sys.path.insert(0, 'drs_buffer')
import cython_drs_buffer_mod as drs_buffer_mod

ctypedef unsigned long long u_int64_t
ctypedef unsigned char u_char_t

cdef extern from "rabinpoly.h":
    ctypedef struct c_Window "window":
        int size
        u_int64_t fingerprint
        int bufpos
        u_int64_t U
        u_char_t *buf
        u_int64_t slide8(u_char_t byte_to_add)
        void reset()
    c_Window *new_Window "new window"(u_int64_t polynomial, unsigned int window_size)
    void del_Window "delete"(c_Window *window)


cdef class Window:
    """Wrapper owning one `window` object declared in rabinpoly.h.

    The underlying object is created with `new window(polynomial, window_size)`
    in `__cinit__` and released with `delete` in `__dealloc__`.
    """

    cdef c_Window *thisptr

    def __cinit__(self, unsigned int window_size=16):
        """Allocate the underlying window.

        window_size: forwarded unchanged to the `window` constructor.
        """
        # Fixed polynomial handed to the `window` constructor.
        fingerprint_pt = 0xbfe6b8a5bf378d83
        self.thisptr = new_Window(fingerprint_pt, window_size)

    def __dealloc__(self):
        """Release the underlying window."""
        del_Window(self.thisptr)

    cpdef reset(self):
        """Call `reset()` on the underlying window."""
        self.thisptr.reset()

    cpdef slide8(self, character):
        """Feed one byte value to the window and return the resulting fingerprint.

        character: converted to `unsigned char` before being passed to `slide8`.
        """
        return self.thisptr.slide8(character)


cdef class Chunker(object):
    """Iterator that splits the data read from a file-like object into chunks.

    Every byte is pushed through a `Window`; a chunk ends when the low
    `size_exponent` bits of the fingerprint equal the demarcation value, or
    when the chunk reaches the maximum size (twice `2 ** size_exponent`).
    Bytes left over when the input is exhausted are returned as a final chunk.
    """

    cdef Window _window
    cdef object _file
    cdef int _mask
    cdef int _size_exponent
    cdef int _average_size
    cdef int _maximum_size
    cdef object _read_buffer
    cdef int _demarcation
    cdef int _read_buffer_position
    cdef int _read_buffer_len

    def __init__(self, file_, int size_exponent=20):
        """Set up chunking of `file_`.

        file_: object whose `read(n)` returns a block of data, and an empty
            (falsy) value once the input is exhausted.
        size_exponent: the nominal chunk size is `2 ** size_exponent` bytes.
        """
        self._window = Window()
        self._file = file_
        self._read_buffer = drs_buffer_mod.DRS_buffer()
        self._read_buffer_position = 0
        self._read_buffer_len = 0
        # We arbitrarily pick a magic fingerprint of half the mask, because we
        # need some arbitrary value to use as a chunk separator, and using 0
        # tends to make files that start with nulls have too many demarcations.
        self._size_exponent = size_exponent
        self._average_size = 2 ** self._size_exponent
        self._demarcation = self._average_size // 2
        self._mask = self._average_size - 1
        self._maximum_size = self._average_size * 2

    def __iter__(self):
        """Return self; the chunker is its own iterator."""
        return self

    def _do_chunk(self):
        """Cut the scanned prefix off the read buffer.

        Returns `(True, chunk)` where `chunk` is everything in the read buffer
        before the current scan position. The scan position is reset to 0 and
        the cached buffer length is refreshed.
        """
        temp = self._read_buffer[:self._read_buffer_position]
        del self._read_buffer[:self._read_buffer_position]
        self._read_buffer_len = len(self._read_buffer)
        self._read_buffer_position = 0
        return True, temp

    def _get_fingerprint(self, int byte):
        """Slide `byte` into the window and return the new fingerprint."""
        return self._window.slide8(byte)

    def _process_byte_prep(self):
        """Consume the byte at the scan position and return its fingerprint.

        Advances the scan position by one.
        """
        cdef int byte
        # Same type that slide8 returns, so a fingerprint with the top bit set
        # cannot overflow the local.
        cdef u_int64_t fingerprint
        byte = self._read_buffer[self._read_buffer_position]
        fingerprint = self._get_fingerprint(byte)
        self._read_buffer_position += 1
        return fingerprint

    def _process_byte(self):
        """Scan one byte and cut a chunk if a boundary was reached.

        Returns `(True, chunk)` when the masked fingerprint equals the
        demarcation value or the scanned length equals the maximum chunk size;
        otherwise returns `(False, '')`.
        """
        fingerprint = self._process_byte_prep()
        if (fingerprint & self._mask == self._demarcation
                or self._read_buffer_position == self._maximum_size):
            return self._do_chunk()
        return False, ''

    def _feed_read_buffer(self):
        """Append the next 2**20-byte read from the file to the read buffer.

        Raises StopIteration when the file returns an empty block.
        """
        block = self._file.read(2 ** 20)
        if not block:
            raise StopIteration
        self._read_buffer.extend(block)
        self._read_buffer_len = len(self._read_buffer)

    def __next__(self):
        """Return the next chunk.

        Raises StopIteration once the input is exhausted and every buffered
        byte has been returned.
        """
        while True:
            if self._read_buffer_position < self._read_buffer_len:
                done, buf = self._process_byte()
                if done:
                    return buf
            elif self._read_buffer_position == self._read_buffer_len:
                try:
                    self._feed_read_buffer()
                except StopIteration:
                    # End of input. Bytes scanned since the last boundary are
                    # still in the buffer; hand them out as the final chunk
                    # instead of silently dropping the tail of the file.
                    if self._read_buffer_position > 0:
                        return self._do_chunk()[1]
                    raise
            else:
                raise AssertionError("This should never happen")