#!/usr/bin/python # def revcmp(a, b): return cmp(b, a) # a.sort(revcmp) import os import stat import sys import string class file_bucket: def __init__(self,attributes): self.attributes=attributes self.names=[attributes.name] def add_name(self,name): self.names = self.names + [name] def __str__(self): return string.join(self.names) class file_attributes: def __init__(self,name): self.name = name statbuf=os.stat(name) self.dev = statbuf[stat.ST_DEV] self.ino = statbuf[stat.ST_INO] self.len = statbuf[stat.ST_SIZE] self.len_summed = 0 self.checksum = 0 def extend_sum(self): max_chunk=32768 len=self.len-self.len_summed if len > max_chunk: len=max_chunk if len == 0: return # open for reading only fp=os.open(self.name,0) dummy=os.lseek(fp,self.len_summed,0) buf=os.read(fp,len) os.close(fp) # for chr in buf[0:len-1]: # self.checksum = self.checksum + ord(chr) # I suspect a large prime is good here... Make sure it's larger # than 256*max_chunk, or move the % into the loop! # self.checksum = self.checksum % 366829 self.checksum = (self.checksum + hash(buf)) % 366829 self.len_summed = self.len_summed + len # derived from Lib/cmp.py def do_cmp(self, other): # Compare two files, for real bufsize = 8096 # Could be tuned self_fp = os.open(self.name, 0) other_fp = os.open(other.name, 0) while 1: self_buf = os.read(self_fp,bufsize) other_buf = os.read(other_fp,bufsize) if self_buf <> other_buf: os.close(self_fp) os.close(other_fp) return 0 if not self_buf: os.close(self_fp) os.close(other_fp) return 1 # it might be helpful to make this idempotent, someday def are_equal(self,other): if self.dev == other.dev and self.ino == other.ino: return 1 if self.len <> other.len: return 0 if self.len_summed == 0: self.extend_sum() if other.len_summed == 0: other.extend_sum() # extend the len_summed's as needed, to make them equal. # don't forget that the files must have equal length to reach this # point while self.len_summed < other.len_summed: self.extend_sum() while other.len_summed < self.len_summed: other.extend_sum() # while the sums are equal, and we haven't hit the end of the files... while self.len_summed < self.len and self.checksum == other.checksum: self.extend_sum() other.extend_sum() if self.checksum <> other.checksum: return 0 # These two files have identical lengths and identical checksums # It's time to do it the hard way return file_attributes.do_cmp(self,other) def __repr__(self): return self.name def main(): equivs=[] # this is currently O(n^2), and operates much like a worst-case insertion # sort. It -could- be done O(nlogn), but I'm not sure I see how to do # so without giving up the incremental checksum heuristic. It might # be worth trying without the checksumming someday; probably wouldn't # take long - and I suppose either could be allowed with a command-line # switch if sys.argv[1] == '-v': del sys.argv[1] verbose=1 else: verbose=0 fileno=0 for filename in sys.argv[1:]: if verbose: fileno=fileno+1 sys.stderr.write('adding '+filename+' '+`fileno`+' ') sys.stderr.write(`len(sys.argv)-fileno`+' '+`len(equivs)`+'\n') #print filename,equivs nextfile = file_attributes(filename) for possible_match in equivs: if file_attributes.are_equal(nextfile,possible_match.attributes): possible_match = possible_match.add_name(filename) break else: equivs = equivs + [file_bucket(nextfile)] #print for file in equivs: print string.join(file.names) main() #filelist=[[]] #for file in sys.argv[1:]: # filelist[0]=filelist[0] + [file] # #print filelist # # #stat1=os.stat(filelist[0][0]) ##print stat #for filenum in range(len(filelist)-1): # stat2=os.stat(filelist[filenum+1][0]) # print filelist[filenum], filelist[filenum+1] # print stat1[stat.ST_SIZE],stat2[stat.ST_SIZE] # stat1=stat2 # # ##stat=os.stat(file) #