#!/usr/bin/python

# We want profile data for this module, even when we're built with cython - during testing.  For RL, we want to turn this off.
# cython: profile=False

'''This module provides a funnel sort routine - IOW, a k-ary (or exponential) merge sort with an insertion sort at the bottom'''

#mport sys
import math
#mport time
import heapq
#mport pprint
import exceptions
#mport collections

# In CPython, insertion sort seems to outperform binarysort almost always.
# In Cython, insertion sort beats binarysort a little less often, but it still wins most of the time.
# So we use insertion sort, not binary sort
# However, binarysort is left here as a form of documentation about what I tried.

import insertionsort

def funnelsort(ifdef(`m4pyx',list )list_):
	'''Perform a funnel sort on an entire list'''
	_funnelsort(list_, 0, `len'(list_) - 1)

ifdef(`m4pyx',cdef )class Semi_queue:
	'''
	Class provides a queue datastructure.  Initially it doles out portions of a passed-in list, but later it duplicates
	the relevant subregion to facilitate in-place merging - after the size of the queue has already decreased, often
	significantly.
	'''
	ifdef(`m4pyx',cdef list list)
	ifdef(`m4pyx',cdef public unsigned int subleft)
	# we use adjust to make the sort stable - to give a relative position within our first (and sometimes second) buffer(s)
	ifdef(`m4pyx',cdef int adjust)
	ifdef(`m4pyx',cdef unsigned int subright)
	ifdef(`m4pyx',cdef unsigned int empty)
#	ifdef(`m4pyx',cdef unsigned int duplicated)
	ifdef(`m4pyx',cdef unsigned int has_some)

	def __init__(self, ifdef(`m4pyx',list )list_, ifdef(`m4pyx',unsigned int )subleft, ifdef(`m4pyx',unsigned int )subright):
		'''Constructor'''
		self.list = list_
		#self.original_length = `len'(list_)
		self.adjust = -subleft
		self.subleft = subleft
		self.subright = subright
		if self.subleft > self.subright:
			self.has_some = False
		else:
			self.has_some = True
#		self.duplicated = False

	def __str__(self):
		return 'list: %s, has_some: %s' % (self.list[self.subleft:self.subright+1], self.has_some)

	__repr__ = __str__

	ifdef(`m4pyx',cp)def popleft(self):
		'''Remove an element from the queue'''
		value = self.list[self.subleft]
		self.subleft += 1
		if self.subleft > self.subright:
			self.has_some = False
		return self.subleft + self.adjust, value
	
	def __nonzero__(self):
		'''Is the queue nonempty?'''
		return self.has_some

	ifdef(`m4pyx',cp)def duplicate(self):
		'''Duplicate the still-relevant subregion, to facilitate mostly in-place merging'''
#		if self.duplicated:
#			print 'Eh?  Already duplicated'
#			sys.exit(1)
		self.list = self.list[self.subleft:self.subright+1]
		self.adjust = self.subleft + self.adjust
		self.subright -= self.subleft
		self.subleft = 0
#		self.duplicated = True

# This seems like a good idea, but in reality, it's very slow
ifdef(`m4pyx',cp)def int_cube_root_ceil(input_value):
	'''Binary search for the cubed root of input_value, rounded up'''
	if input_value in (0, 1):
		return input_value 
	# the cube root will always be >= 0, because we're only defined for nonnegative whole numbers
	low_guess = 0
	# the cube root will always be less than input_value.  Actually, we know it'll be significantly lower than that, but starting
	# any lower than input_value gives us rounding problems sometimes.
	high_guess = input_value
#	real_value = math.pow(input_value, 1/3.0)
	while True: 
#		if low_guess > real_value:
#			print 'low_guess got too high:', low_guess, real_value
#			sys.exit(1)
#		if high_guess < real_value:
#			#print 'high_guess got too high:', high_guess, real_value
#			sys.exit(1)
		midpoint = (low_guess + high_guess) // 2
		#print low_guess, midpoint, high_guess
		trial_value = midpoint * midpoint * midpoint 
		if trial_value == input_value:
			# This is exact - return the midpoint
			#print 'exact:', midpoint
			return midpoint 
		if low_guess + 1 == high_guess:
			# This one's not exact, and we know it's somewhere between low_guess and high_guess, and we know we aren't
			# going to get any more precision.  And, we need to round up.  Because the result isn't precise, we know we
			# need high_guess, not low_guess.
			#print 'inexact, round up:', high_guess
			return high_guess
		if trial_value > input_value:
			#print 'decreasing high_guess from', high_guess, 'to', midpoint, 'because %d > %f' % (trial_value, input_value)
			high_guess = midpoint
		elif trial_value < input_value:
			#print 'increasing low_guess from', low_guess, 'to', midpoint, 'because %d < %f' % (trial_value, input_value)
			low_guess = midpoint

# It appears that Newton's method of finding a (cubed) root doesn't work for integer values.
# However, we can still compute the real root of x**3 - input_value == 0 to a low
# degree of precision, and then round up.
# However, even this is still too slow.
def newtons_cube_root(input_value):
	'''Compute a cubed root using newton's method'''
	#print 'starting', input_value
	float_input_value = float(input_value)
	trial_value = float_input_value / 3.0
	epsilon = 0.499
	one_minus_epsilon = 1 - epsilon
	while True:
		trial_value_squared = trial_value * trial_value
		trial_value_cubed = trial_value_squared * trial_value 
		if trial_value_cubed == float_input_value:
			#print 'ending exact'
			return int(trial_value)
		if abs(trial_value_cubed - float_input_value) < epsilon:
			#print 'ending close enough'
			return int(trial_value + one_minus_epsilon)
		new_trial_value = trial_value - (trial_value_cubed - float_input_value) / (3 * trial_value_squared)
		#print 'got to', new_trial_value, 'from', trial_value
		trial_value = new_trial_value

#ifdef(`m4purepy',`@profile')
ifdef(`m4pyx',c)def divide_and_subsort( \
	ifdef(`m4pyx',unsigned int )width, \
	ifdef(`m4pyx',list )list_, \
	ifdef(`m4pyx',unsigned int )left, \
	ifdef(`m4pyx',unsigned int )right, \
	):
	'''Give each queue an initial values'''
	ifdef(`m4pyx',cdef unsigned int subwidth)
	ifdef(`m4pyx',cdef unsigned int portions)
	ifdef(`m4pyx',cdef unsigned int portionno)
	ifdef(`m4pyx',cdef unsigned int subleft)
	ifdef(`m4pyx',cdef unsigned int subright)
	# This seems to be faster even than newton's method with a low precision.  I wonder if it's getting done in hardware?
	# It's also faster than math.pow.
	subwidth = int(math.ceil(width ** 0.33333333333))
	queues = []
	# get k (portions) sorted sublists
	#portions = int(math.ceil(float(width) / subwidth))
	portions = int((width + subwidth - 1) // subwidth)
	for portionno in xrange(portions):
		subleft = left + portionno * subwidth
		subright = subleft + subwidth - 1
		if subright > right:
			subright = right
		if subleft > subright:
			# nothing to do
			continue
		_funnelsort(list_, subleft, subright)
		queue = Semi_queue(list_, subleft, subright)
		queues.append(queue)
	return queues

#ifdef(`m4purepy',`@profile')
ifdef(`m4pyx',c)def seed_heap(queues):
	'''Fill the heap initially with one value from each queue'''

	# BTW, I tried using a treap instead of a heapq - it's slower than using heapq

	heap = []
	# pull 1 value from each queue to seed the heap (0 values for an already-empty queue)
	for queueno, queue in enumerate(queues):
		if queue:
			offset, value = queue.popleft()
			heap.append((value, queueno, offset))
	heapq.heapify(heap)
	return heap

#ifdef(`m4purepy',`@profile')
def merge(ifdef(`m4pyx',list )list_, queues, ifdef(`m4pyx',list )heap, ifdef(`m4pyx',unsigned int )add_at):
	'''Merge the queues into list_'''
	ifdef(`m4pyx',cdef unsigned int duplicated_queues)
	duplicated_queues = 0
	while heap:
		# get the minimum value
		minimum_value, queueno, offset = heapq.heappop(heap)
		# expose space in list_ that we can fill into, as necessary
		while True:
			try:
				possible_duplicate_queue = queues[duplicated_queues]
			except exceptions.IndexError:
				# all queues have been duplicated - albeit odds are most were duplicated only after having shrunken quite a lot
				break
			else:
				if possible_duplicate_queue.subleft == add_at:
					possible_duplicate_queue.duplicate()
					duplicated_queues += 1
					# and go back up for another iteration, in case we need to duplicate something else too
				else:
					# nothing to duplicate
					break
		list_[add_at] = minimum_value
		add_at += 1
		possible_pull_from_queue = queues[queueno]
		if possible_pull_from_queue:
			offset, value = possible_pull_from_queue.popleft()
			heapq.heappush(heap, (value, queueno, offset))

#ifdef(`m4purepy',`@profile')
ifdef(`m4pyx',c)def _funnelsort(ifdef(`m4pyx',list )list_, ifdef(`m4pyx',unsigned int )left, ifdef(`m4pyx',unsigned int )right):
	'''funnel sort a subregion of a list'''
	ifdef(`m4pyx',cdef unsigned int width)
	ifdef(`m4pyx',cdef unsigned int queueno)
	width = right - left + 1
	# 72 seemed to perform the best on DRS' machine, and while that will probably be different on other machines, the performance
	# difference isn't extreme for other values, and optimizing this parameter dynamically is complex.
	if width <= 72:
		# we have a list of small width, it's faster to use insertion sort
		insertionsort.insertionsort(list_, left, right)
	else:
		# divide the list into sublists, recursively, and sort each list as we come back
		queues = divide_and_subsort(width, list_, left, right)

		# Set up a heap with one value from each queue
		heap = seed_heap(queues)

		# Now merge those regions, in place, using the heap to keep track of what the least element is each step of the way.
		# We use a heap rather than a straightforward list, because we will sometimes have a large number of lists to merge.
		merge(list_, queues, heap, left)