#!/usr/bin/python

# cython: language_level=3

# We want profile data for this module, even when we're built with cython - during testing.  For RL, we want to turn this off.
# cython: profile=False

"""This module provides a funnel sort routine - IOW, a k-ary (or exponential) merge sort with an insertion sort at the bottom."""

#mport sys
import math
#mport time
import heapq
#mport pprint
#mport exceptions
#mport collections

# In CPython, insertion sort seems to outperform binarysort almost always.
# In Cython, insertion sort beats binarysort a little less often, but it still wins most of the time.
# So we use insertion sort, not binary sort
# However, binarysort is left here as a form of documentation about what I tried.

import insertionsort

def funnelsort(ifdef(`m4pyx',list )list_):
    """Perform a funnel sort on an entire list."""
    _funnelsort(list_, 0, `len'(list_) - 1)

ifdef(`m4pyx',cdef )class Semi_queue(object):
    """
    Class provides a queue datastructure.  Initially it doles out portions of a passed-in list, but later it duplicates
    the relevant subregion to facilitate in-place merging - after the size of the queue has already decreased, often
    significantly.
    """
ifdef(`m4pyx',`    cdef list list')
ifdef(`m4pyx',`    cdef public unsigned int subleft')
# we use adjust to make the sort stable - to give a relative position within our first (and sometimes second) buffer(s)
ifdef(`m4pyx',`    cdef int adjust')
ifdef(`m4pyx',`    cdef unsigned int subright')
ifdef(`m4pyx',`    cdef unsigned int empty')
ifdef(`m4pyx',`    cdef unsigned int has_some')

    def __init__(self, ifdef(`m4pyx',list )list_, ifdef(`m4pyx',unsigned int )subleft, ifdef(`m4pyx',unsigned int )subright):
        """Constructor."""
        self.list = list_
        #self.original_length = `len'(list_)
        self.adjust = -subleft
        self.subleft = subleft
        self.subright = subright
        if self.subleft > self.subright:
            self.has_some = False
        else:
            self.has_some = True
#        self.duplicated = False

    def __str__(self):
        return 'list: %s, has_some: %s' % (self.list[self.subleft:self.subright+1], self.has_some)

    __repr__ = __str__

    ifdef(`m4pyx',cp)def popleft(self):
        """Remove an element from the queue."""
        value = self.list[self.subleft]
        self.subleft += 1
        if self.subleft > self.subright:
            self.has_some = False
        return self.subleft + self.adjust, value

    def __nonzero__(self):
        """Is the queue nonempty?"""
        return self.has_some

    __bool__ = __nonzero__

    ifdef(`m4pyx',cp)def duplicate(self):
        """Duplicate the still-relevant subregion, to facilitate mostly in-place merging."""
#        if self.duplicated:
#            print 'Eh?  Already duplicated'
#            sys.exit(1)
        self.list = self.list[self.subleft:self.subright+1]
        self.adjust = self.subleft + self.adjust
        self.subright -= self.subleft
        self.subleft = 0
#        self.duplicated = True

# This seems like a good idea, but in reality, it's very slow
ifdef(`m4pyx',cp)def int_cube_root_ceil(input_value):
    """Binary search for the cubed root of input_value, rounded up."""
    if input_value in (0, 1):
        return input_value
    # the cube root will always be >= 0, because we're only defined for nonnegative whole numbers
    low_guess = 0
    # the cube root will always be less than input_value.  Actually, we know it'll be significantly lower than that, but starting
    # any lower than input_value gives us rounding problems sometimes.
    high_guess = input_value
    while True:
#        if low_guess > real_value:
#            print 'low_guess got too high:', low_guess, real_value
#            sys.exit(1)
#        if high_guess < real_value:
#            #print 'high_guess got too high:', high_guess, real_value
#            sys.exit(1)
        midpoint = (low_guess + high_guess) // 2
        #print low_guess, midpoint, high_guess
        trial_value = midpoint * midpoint * midpoint
        if trial_value == input_value:
            # This is exact - return the midpoint
            #print 'exact:', midpoint
            return midpoint
        if low_guess + 1 == high_guess:
            # This one's not exact, and we know it's somewhere between low_guess and high_guess, and we know we aren't
            # going to get any more precision.  And, we need to round up.  Because the result isn't precise, we know we
            # need high_guess, not low_guess.
            #print 'inexact, round up:', high_guess
            return high_guess
        if trial_value > input_value:
            #print 'decreasing high_guess from', high_guess, 'to', midpoint, 'because %d > %f' % (trial_value, input_value)
            high_guess = midpoint
        elif trial_value < input_value:
            #print 'increasing low_guess from', low_guess, 'to', midpoint, 'because %d < %f' % (trial_value, input_value)
            low_guess = midpoint

# It appears that Newton's method of finding a (cubed) root doesn't work for integer values.
# However, we can still compute the real root of x**3 - input_value == 0 to a low
# degree of precision, and then round up.
# However, even this is still too slow.
def newtons_cube_root(input_value):
    """Compute a cubed root using newton's method."""
    #print 'starting', input_value
    float_input_value = float(input_value)
    trial_value = float_input_value / 3.0
    epsilon = 0.499
    one_minus_epsilon = 1 - epsilon
    while True:
        trial_value_squared = trial_value * trial_value
        trial_value_cubed = trial_value_squared * trial_value
        if trial_value_cubed == float_input_value:
            #print 'ending exact'
            return int(trial_value)
        if abs(trial_value_cubed - float_input_value) < epsilon:
            #print 'ending close enough'
            return int(trial_value + one_minus_epsilon)
        new_trial_value = trial_value - (trial_value_cubed - float_input_value) / (3 * trial_value_squared)
        #print 'got to', new_trial_value, 'from', trial_value
        trial_value = new_trial_value

#ifdef(`m4purepy',`@profile')
ifdef(`m4pyx',c)def divide_and_subsort( \
    ifdef(`m4pyx',unsigned int )width, \
    ifdef(`m4pyx',list )list_, \
    ifdef(`m4pyx',unsigned int )left, \
    ifdef(`m4pyx',unsigned int )right, \
    ):
    """Give each queue an initial values."""
ifdef(`m4pyx',`    cdef unsigned int subwidth')
ifdef(`m4pyx',`    cdef unsigned int portions')
ifdef(`m4pyx',`    cdef unsigned int portionno')
ifdef(`m4pyx',`    cdef unsigned int subleft')
ifdef(`m4pyx',`    cdef unsigned int subright')
    # This seems to be faster even than newton's method with a low precision.  I wonder if it's getting done in hardware?
    # It's also faster than math.pow.
    subwidth = int(math.ceil(width ** 0.33333333333))
    queues = []
    # get k (portions) sorted sublists
    portions = int((width + subwidth - 1) // subwidth)
    for portionno in range(portions):
        subleft = left + portionno * subwidth
        subright = subleft + subwidth - 1
        if subright > right:
            subright = right
        if subleft > subright:
            # nothing to do
            continue
        _funnelsort(list_, subleft, subright)
        queue = Semi_queue(list_, subleft, subright)
        queues.append(queue)
    return queues

#ifdef(`m4purepy',`@profile')
ifdef(`m4pyx',c)def seed_heap(queues):
    """Fill the heap initially with one value from each queue."""

    # BTW, I tried using a treap instead of a heapq - it's slower than using heapq

    heap = []
    # pull 1 value from each queue to seed the heap (0 values for an already-empty queue)
    for queueno, queue in enumerate(queues):
        if queue:
            offset, value = queue.popleft()
            heap.append((value, queueno, offset))
    heapq.heapify(heap)
    return heap

#ifdef(`m4purepy',`@profile')
def merge(ifdef(`m4pyx',list )list_, queues, ifdef(`m4pyx',list )heap, ifdef(`m4pyx',unsigned int )add_at):
    """Merge the queues into list_."""
ifdef(`m4pyx',`    cdef unsigned int duplicated_queues')
    duplicated_queues = 0
    while heap:
        # get the minimum value
        minimum_value, queueno, offset = heapq.heappop(heap)
        # expose space in list_ that we can fill into, as necessary
        while True:
            try:
                possible_duplicate_queue = queues[duplicated_queues]
            except IndexError:
                # all queues have been duplicated - albeit odds are most were duplicated only after having shrunken quite a lot
                break
            else:
                if possible_duplicate_queue.subleft == add_at:
                    possible_duplicate_queue.duplicate()
                    duplicated_queues += 1
                    # and go back up for another iteration, in case we need to duplicate something else too
                else:
                    # nothing to duplicate
                    break
        list_[add_at] = minimum_value
        add_at += 1
        possible_pull_from_queue = queues[queueno]
        if possible_pull_from_queue:
            offset, value = possible_pull_from_queue.popleft()
            heapq.heappush(heap, (value, queueno, offset))

#ifdef(`m4purepy',`@profile')
ifdef(`m4pyx',c)def _funnelsort(ifdef(`m4pyx',list )list_, ifdef(`m4pyx',unsigned int )left, ifdef(`m4pyx',unsigned int )right):
    """funnel sort a subregion of a list."""
ifdef(`m4pyx',`    cdef unsigned int width')
ifdef(`m4pyx',`    cdef unsigned int queueno')
    width = right - left + 1
    # 72 seemed to perform the best on DRS' machine, and while that will probably be different on other machines, the performance
    # difference isn't extreme for other values, and optimizing this parameter dynamically is complex.
    if width <= 72:
        # we have a list of small width, it's faster to use insertion sort
        insertionsort.insertionsort(list_, left, right)
    else:
        # divide the list into sublists, recursively, and sort each list as we come back
        queues = divide_and_subsort(width, list_, left, right)

        # Set up a heap with one value from each queue
        heap = seed_heap(queues)

        # Now merge those regions, in place, using the heap to keep track of what the least element is each step of the way.
        # We use a heap rather than a straightforward list, because we will sometimes have a large number of lists to merge.
        merge(list_, queues, heap, left)