#!/usr/bin/env python3 # get the numtokeep highest lines in the input, based on sorting on field fieldno, # and either in ASCII order or numeric order # This is version 1.1 # If we have the nest module (which depends on the treap module), then use treaps for O(nlogm) time. # Otherwise, use bisect and a list for O(nm) time. import sys import functools sys.path.append('/usr/local/lib') def usage(retval): sys.stderr.write('Usage: %s [-n numtokeep] [-f fieldno] [-a] [-r] [-h] [--filename fn]\n' % sys.argv[0]) sys.stderr.write('-n n specifies the number of numbers to keep\n') sys.stderr.write('-f n says to compare on the nth field (0 based)\n') sys.stderr.write('-a says to sort in ASCII order, rather than numerically\n') sys.stderr.write('-r says to sort in reverse\n') sys.stderr.write('-s sent says if a field cannot be converted to a float,\n') sys.stderr.write(' to use sentinel value "sent" instead when sorting\n') sys.stderr.write('-h says to give usage information (help)\n') sys.stderr.write('--disallow-duplicates (or just -d) says to only keep one value of a given size\n') sys.stderr.write('--filename fn says to read from file fn\n') sys.stderr.write('--bufsize n says to read using a buffer size of n; only meaningful with --filename\n') sys.stderr.write("--use-bisect Uses Python's bisect.insort\n") sys.stderr.write("--use-treap Uses a Treap\n") sys.stderr.write("--use-heap Uses Python's heapq module\n") sys.stderr.write("--use-rbtree Uses a Red-Black tree\n") sys.stderr.write("--use-fibonacci-heap Uses a Fibonacci heap\n") sys.stderr.write('\n') sys.stderr.write('Reads from stdin by default, but that\'s considerably slower than --filename /dev/stdin\n') sys.exit(retval) def get_initial(allow_duplicates, numtokeep, Line, file_): earlyeof = False highest=[] if allow_duplicates: # get up to numtokeep values and sort them - no duplicates to worry about for i in range(numtokeep): line = file_.readline() if not line: earlyeof = True break obj = Line(line.rstrip('\n')) highest.append(obj) else: # we don't want duplicates. # get numtokeep values and sort them # we use a dictionary to keep track of whether we have unique values at this stage. # Then the one at a time code ignores the dictionary count = 0 uniqueness_dict = {} while count < numtokeep: line = file_.readline() if not line: earlyeof = True break obj = Line(line.rstrip('\n')) if line in uniqueness_dict: pass else: highest.append(obj) uniqueness_dict[line] = 1 count += 1 del uniqueness_dict return (earlyeof, highest) def main(): numtokeep = 10 fieldno = 0 ascii_order = False reverse = False have_sentinel = False datastructure_specified = False use_bisect = False use_treap = False use_fibonacci_heap = False use_heap = False use_rbtree = False filename = '' file_ = sys.stdin allow_duplicates = True bufsize = 2**18 while sys.argv[1:]: if sys.argv[1] == '-n' and sys.argv[2:]: numtokeep = int(sys.argv[2]) del sys.argv[1] elif sys.argv[1] == '-s' and sys.argv[2:]: have_sentinel = True ascii_sentinel=sys.argv[2] numeric_sentinel=int(sys.argv[2]) del sys.argv[1] elif sys.argv[1] == '-f' and sys.argv[2:]: fieldno = int(sys.argv[2]) del sys.argv[1] elif sys.argv[1] in [ '--disallow-duplicates', '-d' ]: allow_duplicates = False elif sys.argv[1] == '--filename' and sys.argv[2:]: filename = sys.argv[2] del sys.argv[1] elif sys.argv[1] == '--bufsize' and sys.argv[2:]: bufsize = int(sys.argv[2]) del sys.argv[1] elif sys.argv[1] == '-a': ascii_order = True elif sys.argv[1] == '-r': reverse = True elif sys.argv[1] == '--use-bisect': use_bisect = True datastructure_specified = True elif sys.argv[1] == '--use-treap': use_treap = True datastructure_specified = True elif sys.argv[1] == '--use-fibonacci-heap': use_fibonacci_heap = True datastructure_specified = True elif sys.argv[1] == '--use-rbtree': use_rbtree = True datastructure_specified = True elif sys.argv[1] == '--use-heap': use_heap = True datastructure_specified = True elif sys.argv[1] == '-h': usage(0) else: usage(1) del sys.argv[1] if not datastructure_specified: use_heap = True if use_bisect: import bisect elif use_treap: import treap if allow_duplicates: import dupdict_mod elif use_rbtree: import red_black_dict_mod if allow_duplicates: import dupdict_mod elif use_heap: if not allow_duplicates: sys.stderr.write('%s: heap mode does not work with --disallow-duplicates\n' % sys.argv[0]) exit(1) import heapq elif use_fibonacci_heap: if not allow_duplicates: sys.stderr.write('%s: fibonacci heap mode does not work with --disallow-duplicates\n' % sys.argv[0]) exit(1) import fibonacci_heap_mod else: message1 = '%s: You must specify exactly one of --use-bisect, --use-heap, ' % (sys.argv[0], ) message2 = '--use-treap, --use-rbtree or --use-fibonacci-heap\n' sys.stderr.write(message1 + message2) sys.exit(1) if filename != '': file_ = open(filename, 'r', bufsize) if use_treap: def Line(line): fields=line.split() key = None if fields[fieldno:]: if ascii_order: try: key = fields[fieldno] except (ValueError, IndexError): key = ascii_sentinel else: try: key = float(fields[fieldno]) except (ValueError, IndexError): sys.stderr.write('Could not convert %s to a float\n' % fields[fieldno]) if have_sentinel: key = numeric_sentinel line = line else: sys.exit(1) else: line = line else: if have_sentinel: if ascii_order: key = ascii_sentinel else: key = numeric_sentinel line = line else: sys.stderr.write('Could not convert nonexistent field %d to float\n' % fieldno) if key is None: raise ValueError('key is None: %s' % line) return (key, line) elif use_heap or use_bisect or use_rbtree or use_fibonacci_heap: @functools.total_ordering class Line: def __init__(self, line): fields=line.split() self.key = None if fields[fieldno:]: if ascii_order: self.key = fields[fieldno] else: try: self.key = float(fields[fieldno]) except ValueError: sys.stderr.write('Could not convert %s to a float\n' % fields[fieldno]) if have_sentinel: if ascii_order: self.key = ascii_sentinel else: self.key = numeric_sentinel self.line = line else: sys.exit(1) else: self.line = line else: if have_sentinel: if ascii_order: self.key = ascii_sentinel else: self.key = numeric_sentinel self.line = line else: sys.stderr.write('Could not convert nonexistent field %d to float\n' % fieldno) if self.key is None: raise ValueError('key is None: %s' % line) def __cmp__(self, other): # For Python 2.x if reverse: if self.key > other.key: return -1 elif self.key < other.key: return 1 else: return 0 else: if self.key < other.key: return -1 elif self.key > other.key: return 1 else: return 0 def __lt__(self, other): # For Python 3.x if reverse: if self.key > other.key: return True else: return False else: if self.key < other.key: return True else: return False def __eq__(self, other): # For Python 3.x if self.key == other.key: return True else: return False def __str__(self): return self.line __repr__ = __str__ if use_treap: # use treaps, O(n*log2m) earlyeof = False if allow_duplicates: highest = dupdict_mod.Dupdict(dict_like_object=treap.treap()) else: highest = treap.treap() for i in range(numtokeep): line = file_.readline() if not line: earlyeof = True break obj = Line(line.rstrip('\n')) highest[obj] = 0 if not earlyeof: for line in file_: obj = Line(line.rstrip('\n')) highest[obj] = 0 #sys.stderr.write('type(highest) == %s\n' % (type(highest), )) if len(highest) > numtokeep: if reverse: dummy, dummy = highest.remove_max() else: dummy, dummy = highest.remove_min() if reverse: list_ = list(highest) else: list_ = list(highest.reverse_iterator()) for tupl in list_: print(tupl[1]) elif use_fibonacci_heap: fib_heap = fibonacci_heap_mod.Fibonacci_heap() # Attempt to inhale numtokeep values index = 0 for line in file_: line = line.rstrip('\n') fields = line.split() priority = float(fields[fieldno]) if reverse: priority = -priority fib_heap.enqueue(line, priority) index += 1 if index >= numtokeep: break # Inhale the remainder of stdin - if any if index == numtokeep: for line in file_: line = line.rstrip('\n') fields = line.split() priority = float(fields[fieldno]) if reverse: priority = -priority fib_heap.enqueue(line, priority) dummy = fib_heap.dequeue_min() # print the best numtokeep values list_ = [] while fib_heap: entry = fib_heap.dequeue_min() list_.append(entry.get_value()) list_.reverse() for element in list_: print(element) elif use_heap: early_eof, highest = get_initial(allow_duplicates, numtokeep, Line, file_) if not early_eof: heapq.heapify(highest) for line in file_: obj = Line(line.rstrip('\n')) if obj > highest[0]: # add obj, remove the lowest value dummy = heapq.heappushpop(highest, obj) reverse = not reverse highest.sort() for value in highest: print(value) elif use_rbtree: if allow_duplicates: highest = dupdict_mod.Dupdict(dict_like_object=red_black_dict_mod.RedBlackTree()) else: highest = red_black_dict_mod.RedBlackTree() for line in file_: obj = Line(line.rstrip('\n')) if len(highest) < numtokeep: # add obj highest[obj] = 0 elif obj > highest.find_min(): # add obj, remove the lowest value highest[obj] = 0 # This checked out fine, Wed Apr 23 14:49:54 PDT 2014 #assert len(highest) == highest._slow_len() if len(highest) > numtokeep: (key, value) = highest.remove_min() dummy = key assert value == 0 list_ = list(highest) list_.reverse() for value in list_: print(value) elif use_bisect: # don't use a binary tree of any sort, O(n*m) earlyeof, highest = get_initial(allow_duplicates, numtokeep, Line, file_) # a trick to speed things up - we pretend we're sorting in reverse order so we can delete highest[-1] instead # of highest[0] reverse = not reverse highest.sort() if not earlyeof: for line in file_: obj = Line(line.rstrip('\n')) if obj < highest[-1]: if allow_duplicates: bisect.insort(highest, obj) del highest[-1] else: insertion_point = bisect.bisect_left(highest, obj) if highest[insertion_point:] and highest[insertion_point] != obj: highest.insert(insertion_point, obj) del highest[-1] else: # it's already there, don't add it because duplicates are disallowed pass for value in highest: print(value) main()