#!/usr/bin/env python3 # get the numtokeep highest lines in the input, based on sorting on field fieldno, # and either in ASCII order or numeric order # This is version 1.1 # If we have the nest module (which depends on the treap module), then use treaps for O(nlogm) time. # Otherwise, use bisect and a list for O(nm) time. import sys import functools sys.path.append("/usr/local/lib") def usage(retval): sys.stderr.write("Usage: %s [-n numtokeep] [-f fieldno] [-a] [-r] [-h] [--filename fn]\n" % sys.argv[0]) sys.stderr.write("-n n specifies the number of numbers to keep\n") sys.stderr.write("-f n says to compare on the nth field (0 based)\n") sys.stderr.write("-a says to sort in ASCII order, rather than numerically\n") sys.stderr.write("-r says to sort in reverse\n") sys.stderr.write("-s sent says if a field cannot be converted to a float,\n") sys.stderr.write(' to use sentinel value "sent" instead when sorting\n') sys.stderr.write("-h says to give usage information (help)\n") sys.stderr.write("--disallow-duplicates (or just -d) says to only keep one value of a given size\n") sys.stderr.write("--filename fn says to read from file fn\n") sys.stderr.write("--bufsize n says to read using a buffer size of n; only meaningful with --filename\n") sys.stderr.write("--use-bisect Uses Python's bisect.insort\n") sys.stderr.write("--use-treap Uses a Treap\n") sys.stderr.write("--use-heap Uses Python's heapq module\n") sys.stderr.write("--use-rbtree Uses a Red-Black tree\n") sys.stderr.write("--use-fibonacci-heap Uses a Fibonacci heap; the -a option is not allowed with a fibonacci heap\n") sys.stderr.write("\n") sys.stderr.write("Reads from stdin by default, but that's considerably slower than --filename /dev/stdin\n") sys.exit(retval) def get_initial(allow_duplicates, numtokeep, Line, file_): earlyeof = False highest = [] if allow_duplicates: # get up to numtokeep values and sort them - no duplicates to worry about for i in range(numtokeep): line = file_.readline() if not line: earlyeof = True break obj = Line(line.rstrip("\n")) highest.append(obj) else: # we don't want duplicates. # get numtokeep values and sort them # we use a dictionary to keep track of whether we have unique values at this stage. # Then the one at a time code ignores the dictionary count = 0 uniqueness_dict = {} while count < numtokeep: line = file_.readline() if not line: earlyeof = True break obj = Line(line.rstrip("\n")) if line in uniqueness_dict: pass else: highest.append(obj) uniqueness_dict[line] = 1 count += 1 del uniqueness_dict return (earlyeof, highest) def main(): numtokeep = 10 fieldno = 0 ascii_order = False reverse = False have_sentinel = False datastructure_specified = False use_bisect = False use_treap = False use_fibonacci_heap = False use_heap = False use_rbtree = False filename = "" file_ = sys.stdin allow_duplicates = True bufsize = 2**18 while sys.argv[1:]: if sys.argv[1] == "-n" and sys.argv[2:]: numtokeep = int(sys.argv[2]) del sys.argv[1] elif sys.argv[1] == "-s" and sys.argv[2:]: have_sentinel = True ascii_sentinel = sys.argv[2] numeric_sentinel = int(sys.argv[2]) del sys.argv[1] elif sys.argv[1] == "-f" and sys.argv[2:]: fieldno = int(sys.argv[2]) del sys.argv[1] elif sys.argv[1] in ["--disallow-duplicates", "-d"]: allow_duplicates = False elif sys.argv[1] == "--filename" and sys.argv[2:]: filename = sys.argv[2] del sys.argv[1] elif sys.argv[1] == "--bufsize" and sys.argv[2:]: bufsize = int(sys.argv[2]) del sys.argv[1] elif sys.argv[1] == "-a": ascii_order = True elif sys.argv[1] == "-r": reverse = True elif sys.argv[1] == "--use-bisect": use_bisect = True datastructure_specified = True elif sys.argv[1] == "--use-treap": use_treap = True datastructure_specified = True elif sys.argv[1] == "--use-fibonacci-heap": use_fibonacci_heap = True datastructure_specified = True elif sys.argv[1] == "--use-rbtree": use_rbtree = True datastructure_specified = True elif sys.argv[1] == "--use-heap": use_heap = True datastructure_specified = True elif sys.argv[1] == "-h": usage(0) else: usage(1) del sys.argv[1] if not datastructure_specified: use_heap = True if use_bisect: import bisect elif use_treap: import treap if allow_duplicates: import dupdict_mod elif use_rbtree: import red_black_dict_mod if allow_duplicates: import dupdict_mod elif use_heap: if not allow_duplicates: sys.stderr.write("%s: heap mode does not work with --disallow-duplicates\n" % sys.argv[0]) exit(1) import heapq elif use_fibonacci_heap: if not allow_duplicates: sys.stderr.write("%s: fibonacci heap mode does not work with --disallow-duplicates\n" % sys.argv[0]) exit(1) import fibonacci_heap_mod if ascii_order: sys.stderr.write("%s: fibonacci heap mode does not work with -a\n" % sys.argv[0]) exit(1) else: message1 = "%s: You must specify exactly one of --use-bisect, --use-heap, " % (sys.argv[0],) message2 = "--use-treap, --use-rbtree or --use-fibonacci-heap\n" sys.stderr.write(message1 + message2) sys.exit(1) if filename != "": file_ = open(filename, "r", bufsize) if use_treap: def Line(line): fields = line.split() key = None if fields[fieldno:]: if ascii_order: try: key = fields[fieldno] except (ValueError, IndexError): key = ascii_sentinel else: try: key = float(fields[fieldno]) except (ValueError, IndexError): sys.stderr.write("Could not convert %s to a float\n" % fields[fieldno]) if have_sentinel: key = numeric_sentinel line = line else: sys.exit(1) else: line = line else: if have_sentinel: if ascii_order: key = ascii_sentinel else: key = numeric_sentinel line = line else: sys.stderr.write("Could not convert nonexistent field %d to float\n" % fieldno) if key is None: raise ValueError("key is None: %s" % line) return (key, line) elif use_heap or use_bisect or use_rbtree or use_fibonacci_heap: @functools.total_ordering class Line: def __init__(self, line): self.line = line fields = line.split() self.key = None if fields[fieldno:]: if ascii_order: self.key = fields[fieldno] else: try: self.key = float(fields[fieldno]) except ValueError: sys.stderr.write("Could not convert %s to a float\n" % fields[fieldno]) if have_sentinel: if ascii_order: self.key = ascii_sentinel else: self.key = numeric_sentinel else: sys.exit(1) else: self.line = line else: if have_sentinel: if ascii_order: self.key = ascii_sentinel else: self.key = numeric_sentinel self.line = line else: sys.stderr.write("Could not convert nonexistent field %d to float\n" % fieldno) if self.key is None: raise ValueError("key is None: %s" % line) def __cmp__(self, other): # For Python 2.x if reverse: if self.key > other.key: return -1 elif self.key < other.key: return 1 else: return 0 else: if self.key < other.key: return -1 elif self.key > other.key: return 1 else: return 0 def __lt__(self, other): # For Python 3.x if reverse: if self.key > other.key: return True else: return False else: if self.key < other.key: return True else: return False def __eq__(self, other): # For Python 3.x if self.key == other.key: return True else: return False def __str__(self): return self.line __repr__ = __str__ if use_treap: # use treaps, O(n*log2m) earlyeof = False if allow_duplicates: highest = dupdict_mod.Dupdict(dict_like_object=treap.treap()) else: highest = treap.treap() for i in range(numtokeep): line = file_.readline() if not line: earlyeof = True break obj = Line(line.rstrip("\n")) highest[obj] = 0 if not earlyeof: for line in file_: obj = Line(line.rstrip("\n")) highest[obj] = 0 # sys.stderr.write('type(highest) == %s\n' % (type(highest), )) if len(highest) > numtokeep: if reverse: dummy, dummy = highest.remove_max() else: dummy, dummy = highest.remove_min() if reverse: list_ = list(highest) else: list_ = list(highest.reverse_iterator()) for tupl in list_: print(tupl[1]) elif use_fibonacci_heap: fib_heap = fibonacci_heap_mod.Fibonacci_heap() # Attempt to inhale numtokeep values index = 0 for line in file_: line = line.rstrip("\n") fields = line.split() priority = float(fields[fieldno]) if reverse: priority = -priority fib_heap.enqueue(line, priority) index += 1 if index >= numtokeep: break # Inhale the remainder of stdin - if any if index == numtokeep: for line in file_: line = line.rstrip("\n") fields = line.split() priority = float(fields[fieldno]) if reverse: priority = -priority fib_heap.enqueue(line, priority) dummy = fib_heap.dequeue_min() # print the best numtokeep values list_ = [] while fib_heap: entry = fib_heap.dequeue_min() list_.append(entry.get_value()) list_.reverse() for element in list_: print(element) elif use_heap: early_eof, highest = get_initial(allow_duplicates, numtokeep, Line, file_) if not early_eof: heapq.heapify(highest) for line in file_: obj = Line(line.rstrip("\n")) if obj > highest[0]: # add obj, remove the lowest value dummy = heapq.heappushpop(highest, obj) reverse = not reverse highest.sort() for value in highest: print(value) elif use_rbtree: if allow_duplicates: highest = dupdict_mod.Dupdict(dict_like_object=red_black_dict_mod.RedBlackTree()) else: highest = red_black_dict_mod.RedBlackTree() for line in file_: obj = Line(line.rstrip("\n")) if len(highest) < numtokeep: # add obj highest[obj] = 0 elif obj > highest.find_min(): # add obj, remove the lowest value highest[obj] = 0 # This checked out fine, Wed Apr 23 14:49:54 PDT 2014 # assert len(highest) == highest._slow_len() if len(highest) > numtokeep: (key, value) = highest.remove_min() dummy = key assert value == 0 list_ = list(highest) list_.reverse() for value in list_: print(value) elif use_bisect: # don't use a binary tree of any sort, O(n*m) earlyeof, highest = get_initial(allow_duplicates, numtokeep, Line, file_) # a trick to speed things up - we pretend we're sorting in reverse order so we can delete highest[-1] instead # of highest[0] reverse = not reverse highest.sort() if not earlyeof: for line in file_: obj = Line(line.rstrip("\n")) if obj < highest[-1]: if allow_duplicates: bisect.insort(highest, obj) del highest[-1] else: insertion_point = bisect.bisect_left(highest, obj) if highest[insertion_point:] and highest[insertion_point] != obj: highest.insert(insertion_point, obj) del highest[-1] else: # it's already there, don't add it because duplicates are disallowed pass for value in highest: print(value) main()