#!/usr/bin/env python3

import os
import sys

class file_bucket:
	"""A group of path names whose files all have identical contents."""

	def __init__(self, attributes):
		self.attributes = attributes
		self.names = [attributes.name]

	def add_name(self, name):
		self.names.append(name)

	def __str__(self):
		return ' '.join(self.names)
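
# Usage sketch for file_bucket (illustration only; 'a.txt' and
# 'copy_of_a.txt' are hypothetical paths, not part of this script):
#
#	bucket = file_bucket(file_attributes('a.txt'))
#	bucket.add_name('copy_of_a.txt')
#	print(bucket)		# prints: a.txt copy_of_a.txt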

class file_attributes:
	"""Identity, size and incrementally computed checksum of one file."""

	def __init__(self, name):
		self.name = name
		statbuf = os.stat(name)
		self.dev = statbuf.st_dev
		self.ino = statbuf.st_ino
		self.len = statbuf.st_size
		self.len_summed = 0
		self.checksum = 0

	def extend_sum(self):
		"""Fold the next chunk of the file into the running checksum."""
		max_chunk = 32768
		chunk_len = self.len - self.len_summed
		if chunk_len > max_chunk:
			chunk_len = max_chunk
		if chunk_len == 0:
			return
		fp = os.open(self.name, os.O_RDONLY)
		os.lseek(fp, self.len_summed, os.SEEK_SET)
		buf = os.read(fp, chunk_len)
		os.close(fp)
		# Reduce modulo a large prime so the running value stays bounded.
		# hash() of the bytes is stable within one run, which is all that
		# matters here, since checksums are only compared to each other.
		self.checksum = (self.checksum + hash(buf)) % 366829
		self.len_summed = self.len_summed + len(buf)
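
	# Possible alternative (a sketch, not what this script does): zlib's
	# adler32 accepts a running value, so the chunk checksum could be kept
	# as a rolling Adler-32 instead of summed hash() values:
	#
	#	import zlib
	#	...
	#	self.checksum = zlib.adler32(buf, self.checksum)
	#
	# adler32 conventionally starts from 1 (its default), so self.checksum
	# would then be initialised to 1 in __init__ rather than 0.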
	
	# derived from Lib/cmp.py
	def do_cmp(self, other):
		"""Compare two files byte for byte, for real."""
		bufsize = 8192	# could be tuned
		self_fp = os.open(self.name, os.O_RDONLY)
		other_fp = os.open(other.name, os.O_RDONLY)
		try:
			while True:
				self_buf = os.read(self_fp, bufsize)
				other_buf = os.read(other_fp, bufsize)
				if self_buf != other_buf:
					return False
				if not self_buf:
					return True
		finally:
			os.close(self_fp)
			os.close(other_fp)
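
	# Note: the standard library already offers an equivalent byte-by-byte
	# comparison via filecmp.cmp(path_a, path_b, shallow=False); a minimal
	# do_cmp could simply wrap it (sketch only, not what this script does):
	#
	#	import filecmp
	#	def do_cmp(self, other):
	#		return filecmp.cmp(self.name, other.name, shallow=False)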

	# it might be helpful to make this idempotent, someday
	def are_equal(self, other):
		"""Return True if self and other name files with identical contents."""
		if self.dev == other.dev and self.ino == other.ino:
			return True	# same device and inode: trivially identical
		if self.len != other.len:
			return False
		if self.len_summed == 0:
			self.extend_sum()
		if other.len_summed == 0:
			other.extend_sum()
		# Extend the len_summed's as needed, to make them equal.  The files
		# are already known to have equal length at this point.
		while self.len_summed < other.len_summed:
			self.extend_sum()
		while other.len_summed < self.len_summed:
			other.extend_sum()
		# While the checksums agree and we haven't hit the end of the files...
		while self.len_summed < self.len and self.checksum == other.checksum:
			self.extend_sum()
			other.extend_sum()
		if self.checksum != other.checksum:
			return False
		# Identical lengths and identical checksums: time to do it the hard way.
		return self.do_cmp(other)
			
	def __repr__(self):
		return self.name


def main():
	equivs = []
	# This is currently O(n^2) and operates much like a worst-case insertion
	# sort.  It could be done in O(n log n), but it is not obvious how to do
	# so without giving up the incremental-checksum heuristic.  It might be
	# worth trying without the checksumming someday; either approach could be
	# selected with a command-line switch.  (A sketch of a cheaper size-based
	# pre-bucketing pass appears after this function.)
	if sys.argv[1:] and sys.argv[1] == '-v':
		del sys.argv[1]
		verbose = True
	else:
		verbose = False
	fileno = 0
	for filename in sys.argv[1:]:
		if verbose:
			fileno = fileno + 1
			sys.stderr.write('adding %s %d %d %d\n' %
				(filename, fileno, len(sys.argv) - fileno, len(equivs)))
		nextfile = file_attributes(filename)
		for possible_match in equivs:
			if nextfile.are_equal(possible_match.attributes):
				possible_match.add_name(filename)
				break
		else:
			equivs.append(file_bucket(nextfile))
	for bucket in equivs:
		print(' '.join(bucket.names))
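
# A possible cheaper first pass (a sketch, not part of the original script):
# files of different sizes can never be equal, so grouping paths by st_size
# first would confine the O(n^2) bucket scan to files that share a size.
#
#	def group_by_size(filenames):
#		by_size = {}
#		for name in filenames:
#			by_size.setdefault(os.stat(name).st_size, []).append(name)
#		return by_size.values()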

if __name__ == '__main__':
	main()
