#!/usr/bin/python

# We want profile data for this module, even when we're built with cython - during testing.  For RL, we want to turn this off.
# cython: profile=False

'''This module provides a funnel sort routine - IOW, a k-ary (or exponential) merge sort with an insertion sort at the bottom'''

#mport sys
import math
#mport time
import heapq
#mport pprint
import exceptions
#mport collections

# In CPython, insertion sort seems to outperform binarysort almost always.
# In Cython, insertion sort beats binarysort a little less often, but it still wins most of the time.
# So we use insertion sort, not binary sort
# However, binarysort is left here as a form of documentation about what I tried.

ifdef(`m4purepy',`@profile')
ifdef(`m4pyx',cp)def binarysort(list_, ifdef(`m4pyx',int )initial_left, ifdef(`m4pyx',int )initial_right):
	ifdef(`m4pyx', cdef int left)
	ifdef(`m4pyx', cdef int left_for_binary_search)
	ifdef(`m4pyx', cdef int right_for_binary_search)

	'''
	Sort a (small) list using binary sort.  It's much like insertion sort, except we use a binary search
	on the already-sorted portion to find where things should be placed
	'''

	#print 'initial:', list_
	sorted_left = initial_left
	sorted_right = initial_left + 1
	#print 'foo'
	#sys.stdout.flush()
	while sorted_right < initial_right + 1:
		#print 'top of outer loop:', list_[sorted_left:sorted_right], list_[sorted_right+1:initial_right+1]
		#sys.stdout.flush()
		#time.sleep(0.25)
		left_for_binary_search = sorted_left
		right_for_binary_search = sorted_right
		pivot = list_[initial_right]
		#print 'put %s in its proper place' % pivot
		assert left_for_binary_search < right_for_binary_search
		while True:
			midpoint = (left_for_binary_search + right_for_binary_search) / 2
			#print 'binsearch', 'left_for_binary_search:', left_for_binary_search, 'midpoint:', midpoint, 'right_for_binary_search:', right_for_binary_search
			if pivot < list_[midpoint]:
				#print 'moving right to midpoint'
				right_for_binary_search = midpoint
			else:
				#print 'moving left to midpoint'
				left_for_binary_search = midpoint + 1
			if left_for_binary_search >= right_for_binary_search:
				final_point = left_for_binary_search
				break
		#print '%s belongs at index %d' % (pivot, right_for_binary_search)
		assert left_for_binary_search == right_for_binary_search
		#print 'before shifts', list_
		for shift_index in xrange(initial_right - 1, final_point - 1, -1):
			#print 'shifting index', shift_index, 'to index', shift_index+1
			list_[shift_index + 1] = list_[shift_index]
			#print 'after that shift:', list_
		#print 'just before final placement', list_
		#print 'putting %s at index %s' % (pivot, final_point)
		list_[final_point] = pivot

		sorted_right += 1
	#print 'final:', list_

ifdef(`m4purepy',`@profile')
ifdef(`m4pyx',cp)def insertionsort(list_, ifdef(`m4pyx',int )left, ifdef(`m4pyx',int) right):
	'''Perform an insertion sort on a subregion of a list'''
	ifdef(`m4pyx',cdef int i)
	ifdef(`m4pyx',cdef int j)
	for i in xrange(left+1, right+1):
		j = i
		temp = list_[j]
		#while j > left and list_[j-1] > temp:
		#	list_[j] = list_[j-1]
		#	j -= 1
		while True:
			list_j_minus_1 = list_[j - 1]
			if j > left and list_j_minus_1 > temp:
				list_[j] = list_j_minus_1
				j -= 1
			else:
				break
		list_[j] = temp

def funnelsort(list_):
	'''Perform a funnel sort on an entire list'''
	_funnelsort(list_, 0, `len'(list_) - 1)

ifdef(`m4pyx',cdef )class Semi_queue:
	'''
	Class provides a queue datastructure.  Initially it doles out portions of a passed-in list, but later it duplicates
	the relevant subregion to facilitate in-place merging - after the size of the queue has already decreased, often
	significantly.
	'''
	ifdef(`m4pyx',cdef object list)
	ifdef(`m4pyx',cdef public int subleft)
	ifdef(`m4pyx',cdef int subright)
	ifdef(`m4pyx',cdef int empty)
#	ifdef(`m4pyx',cdef int duplicated)
	ifdef(`m4pyx',cdef int has_some)

	def __init__(self, list_, ifdef(`m4pyx',int )subleft, ifdef(`m4pyx',int )subright):
		'''Constructor'''
		self.list = list_
		#self.original_length = `len'(list_)
		self.subleft = subleft
		self.subright = subright
		if self.subleft > self.subright:
			self.has_some = False
		else:
			self.has_some = True
#		self.duplicated = False

	def __str__(self):
		return 'list: %s, has_some: %s' % (self.list[self.subleft:self.subright+1], self.has_some)

	__repr__ = __str__

	ifdef(`m4pyx',cp)def popleft(self):
		'''Remove an element from the queue'''
		value = self.list[self.subleft]
		self.subleft += 1
		if self.subleft > self.subright:
			self.has_some = False
		return value
	
	def __nonzero__(self):
		'''Is the queue nonempty?'''
		return self.has_some

	ifdef(`m4pyx',cp)def duplicate(self):
		'''Duplicate the still-relevant subregion, to facilitate mostly in-place merging'''
#		if self.duplicated:
#			print 'Eh?  Already duplicated'
#			sys.exit(1)
		self.list = self.list[self.subleft:self.subright+1]
		self.subright -= self.subleft
		self.subleft = 0
#		self.duplicated = True

# This seems like a good idea, but in reality, it's very slow
ifdef(`m4pyx',cp)def int_cube_root_ceil(input_value):
	'''Binary search for the cubed root of input_value, rounded up'''
	if input_value in (0, 1):
		return input_value 
	# the cube root will always be >= 0, because we're only defined for nonnegative whole numbers
	low_guess = 0
	# the cube root will always be less than input_value.  Actually, we know it'll be significantly lower than that, but starting
	# any lower than input_value gives us rounding problems sometimes.
	high_guess = input_value
#	real_value = math.pow(input_value, 1/3.0)
	while True: 
#		if low_guess > real_value:
#			print 'low_guess got too high:', low_guess, real_value
#			sys.exit(1)
#		if high_guess < real_value:
#			#print 'high_guess got too high:', high_guess, real_value
#			sys.exit(1)
		midpoint = (low_guess + high_guess) // 2
		#print low_guess, midpoint, high_guess
		trial_value = midpoint * midpoint * midpoint 
		if trial_value == input_value:
			# This is exact - return the midpoint
			#print 'exact:', midpoint
			return midpoint 
		if low_guess + 1 == high_guess:
			# This one's not exact, and we know it's somewhere between low_guess and high_guess, and we know we aren't
			# going to get any more precision.  And, we need to round up.  Because the result isn't precise, we know we
			# need high_guess, not low_guess.
			#print 'inexact, round up:', high_guess
			return high_guess
		if trial_value > input_value:
			#print 'decreasing high_guess from', high_guess, 'to', midpoint, 'because %d > %f' % (trial_value, input_value)
			high_guess = midpoint
		elif trial_value < input_value:
			#print 'increasing low_guess from', low_guess, 'to', midpoint, 'because %d < %f' % (trial_value, input_value)
			low_guess = midpoint

# It appears that Newton's method of finding a (cubed) root doesn't work for integer values.
# However, we can still compute the real root of x**3 - input_value == 0 to a low
# degree of precision, and then round up.
# However, even this is still too slow.
def newtons_cube_root(input_value):
	'''Compute a cubed root using newton's method'''
	#print 'starting', input_value
	float_input_value = float(input_value)
	trial_value = float_input_value / 3.0
	epsilon = 0.499
	one_minus_epsilon = 1 - epsilon
	while True:
		trial_value_squared = trial_value * trial_value
		trial_value_cubed = trial_value_squared * trial_value 
		if trial_value_cubed == float_input_value:
			#print 'ending exact'
			return int(trial_value)
		if abs(trial_value_cubed - float_input_value) < epsilon:
			#print 'ending close enough'
			return int(trial_value + one_minus_epsilon)
		new_trial_value = trial_value - (trial_value_cubed - float_input_value) / (3 * trial_value_squared)
		#print 'got to', new_trial_value, 'from', trial_value
		trial_value = new_trial_value

#ifdef(`m4purepy',`@profile')
ifdef(`m4pyx',c)def divide_and_subsort( \
	ifdef(m4pyx,int )width, \
	list_, \
	ifdef(`m4pyx',int )left, \
	ifdef(`m4pyx',int )right, \
	):
	# pylint: disable-msg=R0913
	# R0913: We want 6 arguments
	'''Give each queue an initial values'''
	ifdef(`m4pyx',cdef int subwidth)
	ifdef(`m4pyx',cdef int portions)
	ifdef(`m4pyx',cdef int portionno)
	ifdef(`m4pyx',cdef int subleft)
	ifdef(`m4pyx',cdef int subright)
	# This seems to be faster even than newton's method with a low precision.  I wonder if it's getting done in hardware?
	# It's also faster than math.pow.
	subwidth = int(math.ceil(width ** 0.33333333333))
	queues = []
	# get k (portions) sorted sublists
	#portions = int(math.ceil(float(width) / subwidth))
	portions = int((width + subwidth - 1) // subwidth)
	for portionno in xrange(portions):
		subleft = left + portionno * subwidth
		subright = subleft + subwidth - 1
		if subright > right:
			subright = right
		if subleft > subright:
			# nothing to do
			continue
		_funnelsort(list_, subleft, subright)
		queue = Semi_queue(list_, subleft, subright)
		queues.append(queue)
	return queues

#ifdef(`m4purepy',`@profile')
ifdef(`m4pyx',c)def seed_heap(queues):
	'''Fill the heap initially with one value from each queue'''

	# BTW, I tried using a treap instead of a heapq - it's slower than using heapq

	heap = []
	# pull 1 value from each queue to seed the heap (0 values for an already-empty queue)
	for queueno, queue in enumerate(queues):
		if queue:
			value = queue.popleft()
			heap.append((value, queueno))
	heapq.heapify(heap)
	return heap

#ifdef(`m4purepy',`@profile')
def merge(list_, queues, heap, ifdef(`m4pyx',int )add_at):
	'''Merge the queues into list_'''
	ifdef(`m4pyx',cdef int duplicated_queues)
	duplicated_queues = 0
	while heap:
		# get the minimum value
		minimum_value, queueno = heapq.heappop(heap)
		# expose space in list_ that we can fill into, as necessary
		while True:
			try:
				possible_duplicate_queue = queues[duplicated_queues]
			except exceptions.IndexError:
				# all queues have been duplicated - albeit odds are most were duplicated only after having shrunken quite a lot
				break
			else:
				if possible_duplicate_queue.subleft == add_at:
					possible_duplicate_queue.duplicate()
					duplicated_queues += 1
					# and go back up for another iteration, in case we need to duplicate something else too
				else:
					# nothing to duplicate
					break
		list_[add_at] = minimum_value
		add_at += 1
		possible_pull_from_queue = queues[queueno]
		if possible_pull_from_queue:
			value = possible_pull_from_queue.popleft()
			heapq.heappush(heap, (value, queueno))

#ifdef(`m4purepy',`@profile')
ifdef(`m4pyx',c)def _funnelsort(list_, ifdef(`m4pyx',int )left, ifdef(`m4pyx',int )right):
	'''funnel sort a subregion of a list'''
	ifdef(`m4pyx',cdef int width)
	ifdef(`m4pyx',cdef int queueno)
	width = right - left + 1
	# 72 seemed to perform the best on DRS' machine, and while that will probably be different on other machines, the performance
	# difference isn't extreme for other values, and optimizing this parameter dynamically is complex.
	if width <= 72:
		# we have a list of small width, it's faster to use insertion sort
		insertionsort(list_, left, right)
	else:
		# divide the list into sublists, recursively, and sort each list as we come back
		queues = divide_and_subsort(width, list_, left, right)

		# Set up a heap with one value from each queue
		heap = seed_heap(queues)

		# Now merge those regions, in place, using the heap to keep track of what the least element is each step of the way.
		# We use a heap rather than a straightforward list, because we will sometimes have a large number of lists to merge.
		merge(list_, queues, heap, left)