#!/usr/bin/env python

# Get the numtokeep highest lines in the input, based on sorting on field
# fieldno, and either in ASCII order or numeric order.
#
# Lines are written to stdout best-first: highest key first by default,
# lowest key first with -r.  Lines with equal keys keep their input order.

# This is version 1.0

import os
import sys
import bisect  # retained from the original import list; selection now uses heapq
import heapq

# Machine names (os.uname()[4]) on which psyco was expected to be available.
X86_MACHINES = ['i386', 'i486', 'i586', 'i686']

try:
    import psyco
    psyco.full()
except Exception:
    # psyco is optional; only nag on machines where it could have helped.
    if os.uname()[4] in X86_MACHINES:
        sys.stderr.write('Psyco initialization failed.  You appear to be on an x86 system, so you might want\n')
        sys.stderr.write('to install it to get better performance.\n')
else:
    if os.uname()[4] not in X86_MACHINES:
        sys.stderr.write('Psyco initialization succeeded and that is interesting, because I thought it was x86-only :)\n')

# Run-time configuration.  parse_args() overwrites these from the command
# line, and Line reads fieldno/ascii_order/have_sentinel/sentinel/reverse.
numtokeep = 10
fieldno = 0
ascii_order = 0
reverse = 0
ignore = 0          # not referenced anywhere else in this file
have_sentinel = 0
sentinel = None     # only meaningful when have_sentinel is set
every = 0           # not referenced anywhere else in this file
filename = ''
bufsize = -1        # -1 lets open() pick its default buffering
highest = []        # the most recent selection; read by dump()


def usage(retval):
    """Write the usage message to stderr and exit with status retval."""
    # The synopsis lists exactly the options parse_args() accepts.
    sys.stderr.write('Usage: %s [-n numtokeep] [-f fieldno] [-s sent] [-a] [-r] [-h] [--filename fn] [--bufsize n]\n' % sys.argv[0])
    sys.stderr.write('-a says to sort in ASCII order, rather than numerically\n')
    sys.stderr.write('-r says to sort in reverse\n')
    sys.stderr.write('-s sent says if a field cannot be converted to a float,\n')
    sys.stderr.write(' to use sentinel value "sent" instead when sorting\n')
    sys.stderr.write('-h says to give usage information (help)\n')
    sys.stderr.write('\n')
    sys.stderr.write('Reads from stdin by default, but that\'s considerably slower\n')
    sys.exit(retval)


def parse_args(argv):
    """Update the module-level configuration from a list of arguments.

    argv is the argument list without the program name.  An unknown option,
    or an option that is missing its value, ends the program via usage(1);
    -h ends it via usage(0).
    """
    global numtokeep, fieldno, ascii_order, reverse
    global have_sentinel, sentinel, filename, bufsize

    pending = list(argv)
    while pending:
        option = pending.pop(0)
        wants_value = option in ('-n', '-s', '-f', '--filename', '--bufsize')
        if wants_value and pending:
            value = pending.pop(0)
            if option == '-n':
                numtokeep = int(value)
            elif option == '-s':
                sentinel = int(value)
                have_sentinel = 1
            elif option == '-f':
                fieldno = int(value)
            elif option == '--filename':
                filename = value
            else:
                bufsize = int(value)
        elif option == '-a':
            ascii_order = 1
        elif option == '-r':
            reverse = 1
        elif option == '-h':
            usage(0)
        else:
            usage(1)


class Line:
    """One input line together with the sort key taken from field fieldno.

    The key is the whitespace-separated field itself in ASCII mode, and its
    float value otherwise.  When the field is missing or is not a number,
    the sentinel is used as the key if one was configured; otherwise the
    program exits with status 1.
    """

    def __init__(self, line):
        # Always remember the text, whichever way the key is obtained.
        self.line = line
        fields = line.split()
        if fields[fieldno:]:
            if ascii_order:
                self.key = fields[fieldno]
            else:
                try:
                    self.key = float(fields[fieldno])
                except ValueError:
                    sys.stderr.write('Could not convert %s to a float\n' % fields[fieldno])
                    if have_sentinel:
                        self.key = sentinel
                    else:
                        sys.exit(1)
        else:
            if have_sentinel:
                # NOTE: in ASCII mode this mixes an int key with str keys,
                # which Python 3 refuses to compare.
                self.key = sentinel
            else:
                sys.stderr.write('Could not convert nonexistent field %d to float\n' % fieldno)
                # Same outcome as an unconvertible field: give up.
                sys.exit(1)

    def __lt__(self, other):
        # Rich comparison is what sort() and bisect use, and the only form
        # Python 3 honours.
        if reverse:
            return self.key > other.key
        return self.key < other.key

    def __cmp__(self, other):
        # Only consulted by Python 2, for comparisons other than "<".
        if reverse:
            if self.key > other.key:
                return -1
            elif self.key < other.key:
                return 1
            else:
                return 0
        else:
            if self.key < other.key:
                return -1
            elif self.key > other.key:
                return 1
            else:
                return 0

    def __str__(self):
        return self.line

    __repr__ = __str__


def _line_key(obj):
    """Return the sort key of a Line."""
    return obj.key


def top_lines(lines, count, lowest_first=None):
    """Return the best count lines of an iterable of strings, best first.

    lines         iterable of text lines
    count         how many to keep; zero or less yields an empty list
    lowest_first  true keeps the lowest keys in ascending order, false keeps
                  the highest keys in descending order; None (the default)
                  follows the module-level reverse flag

    The result is a list of Line objects.  Lines with equal keys come out in
    input order, and earlier lines win when a tie straddles the cut-off.
    """
    if lowest_first is None:
        lowest_first = reverse
    if count <= 0:
        return []
    parsed = (Line(text) for text in lines)
    # A bounded heap replaces the sorted-list-plus-insort scheme.
    if lowest_first:
        return heapq.nsmallest(count, parsed, key=_line_key)
    return heapq.nlargest(count, parsed, key=_line_key)


def dump():
    """Debugging aid: write the stripped text of the current selection."""
    sys.stdout.write('%s\n' % [str(x).strip() for x in highest])
    sys.stdout.write('\n')


def main():
    """Parse the command line, select the top lines and write them out."""
    global highest

    parse_args(sys.argv[1:])
    if filename != '':
        source = open(filename, 'r', bufsize)
    else:
        source = sys.stdin
    try:
        highest = top_lines(source, numtokeep)
    finally:
        if source is not sys.stdin:
            source.close()
    for entry in highest:
        sys.stdout.write(str(entry))


if __name__ == '__main__':
    main()