// Command-line duplicate-file finder.
//
// It reads NUL-separated filenames from standard input, groups them first by
// file size, then by the MD5 of a fixed-length prefix, and finally by the MD5
// of the whole file. Each resulting group is printed to standard output as
// one line: the key "size/prefixhash/fullhash" followed by the filenames,
// separated by single spaces. Diagnostics go to standard error so they never
// mix with the result listing.
package main

import (
	"bufio"
	"container/list"
	"crypto/md5"
	"encoding/hex"
	"fmt"
	"io"
	"os"
	"sort"
	"strconv"
)

// trimOptionalNull returns buffer without its final byte when that byte is
// NUL; otherwise buffer is returned unchanged. The result shares storage
// with buffer.
func trimOptionalNull(buffer []byte) []byte {
	if n := len(buffer); n > 0 && buffer[n-1] == '\000' {
		return buffer[:n-1]
	}
	return buffer
}

// getFileLength returns the size reported by os.Lstat for filename. When the
// stat fails, the error is reported on standard error and -1 is returned.
func getFileLength(filename string) int64 {
	fi, err := os.Lstat(filename)
	if err != nil {
		fmt.Fprintln(os.Stderr, "Error stat'ing", filename, ":", err)
		return -1
	}
	return fi.Size()
}

// getMd5Prefix returns the hex-encoded MD5 of at most the first minPrefixLen
// bytes of filename. A file shorter than minPrefixLen is hashed in full. On
// an open or read failure the error is reported on standard error and ""
// is returned.
func getMd5Prefix(filename string, minPrefixLen int64) string {
	// Based on:
	// https://gist.github.com/sergiotapia/8263278
	// https://tip.golang.org/pkg/crypto/md5/#example_New_file
	file, err := os.Open(filename)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return ""
	}
	defer file.Close()

	hasher := md5.New()
	// io.CopyN feeds the hasher only the bytes that were actually read, so a
	// short read is never padded with zero bytes. io.EOF just means the file
	// was shorter than the requested prefix.
	if _, err := io.CopyN(hasher, file, minPrefixLen); err != nil && err != io.EOF {
		fmt.Fprintln(os.Stderr, err)
		return ""
	}
	return hex.EncodeToString(hasher.Sum(nil))
}

// getFullMd5 returns the hex-encoded MD5 of the whole content of filename.
// On an open or read failure the error is reported on standard error and ""
// is returned, so a partially read file never yields a digest.
func getFullMd5(filename string) string {
	// Based on:
	// https://gist.github.com/sergiotapia/8263278
	// https://tip.golang.org/pkg/crypto/md5/#example_New_file
	file, err := os.Open(filename)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return ""
	}
	defer file.Close()

	hasher := md5.New()
	if _, err := io.Copy(hasher, file); err != nil {
		fmt.Fprintln(os.Stderr, err)
		return ""
	}
	return hex.EncodeToString(hasher.Sum(nil))
}

// groupFor returns the list stored under key in groups, creating and storing
// an empty list first when the key is not present yet.
func groupFor(groups map[string]*list.List, key string) *list.List {
	group, ok := groups[key]
	if !ok {
		group = list.New()
		groups[key] = group
	}
	return group
}

// bySizeToByPrefix takes the bySize map and computes a map whose key is
// "size/prefixhash". Groups whose size is below the prefix length, or that
// hold too few filenames, are passed through under the key "size/" without
// being hashed. Files whose prefix cannot be hashed are left out.
func bySizeToByPrefix(bySize map[int64]*list.List) map[string]*list.List {
	const minPrefixLen int64 = 1024

	byPrefix := make(map[string]*list.List)
	for size, names := range bySize {
		sizeKey := strconv.FormatInt(size, 10) + "/"

		// Degenerate case: we have either a short length, or too few
		// filenames to bother with. These are all considered the same so far.
		// NOTE(review): groups of exactly two files also take this path and
		// go straight to the full hash; the threshold is kept as it was.
		if size < minPrefixLen || names.Len() <= 2 {
			groupFor(byPrefix, sizeKey).PushBackList(names)
			continue
		}

		// Compute a prefix digest and append it to the size, separated by /.
		// Files of one size may end up under different map keys.
		for e := names.Front(); e != nil; e = e.Next() {
			name := e.Value.(string)
			prefixHash := getMd5Prefix(name, minPrefixLen)
			if prefixHash == "" {
				// Already reported; an unreadable file must not be grouped
				// with other unreadable files as if they were duplicates.
				continue
			}
			groupFor(byPrefix, sizeKey+prefixHash).PushBack(name)
		}
	}
	return byPrefix
}

// byPrefixToByFull takes the byPrefix map and computes a map whose key is
// "size/prefixhash/fullhash". Groups holding a single filename are dropped
// without hashing, and files whose content cannot be hashed are left out.
func byPrefixToByFull(byPrefix map[string]*list.List) map[string]*list.List {
	byFull := make(map[string]*list.List)
	for prefixKey, names := range byPrefix {
		// We do a full hash, unless the group holds only one file.
		if names.Len() <= 1 {
			continue
		}
		// Compute a whole-file digest and append it to the size+prefix key,
		// separated by /.
		for e := names.Front(); e != nil; e = e.Next() {
			name := e.Value.(string)
			fullHash := getFullMd5(name)
			if fullHash == "" {
				// Already reported; see bySizeToByPrefix for the rationale.
				continue
			}
			groupFor(byFull, prefixKey+"/"+fullHash).PushBack(name)
		}
	}
	return byFull
}

// If we want to switch from a (doubly linked) list to a slice, this might help:
// https://www.dotnetperls.com/container-list-go

// readBySize reads NUL-separated filenames from input and groups them by the
// size getFileLength reports. The NUL after the last filename is optional.
// Files that cannot be stat'ed are skipped.
func readBySize(input io.Reader) map[int64]*list.List {
	bySize := make(map[int64]*list.List)
	reader := bufio.NewReader(input)
	for {
		record, err := reader.ReadBytes('\000')
		// ReadBytes returns the data read so far together with the error, so
		// a final filename without a terminator is still in record here.
		if len(record) > 0 {
			name := string(trimOptionalNull(record))
			if size := getFileLength(name); size >= 0 {
				group, ok := bySize[size]
				if !ok {
					group = list.New()
					bySize[size] = group
				}
				group.PushBack(name)
			}
		}
		if err != nil {
			if err != io.EOF {
				fmt.Fprintln(os.Stderr, "Error reading filenames:", err)
			}
			return bySize
		}
	}
}

// writeGroups writes one line per group to w: the key, then each filename
// preceded by a single space. Keys are written in sorted order so the output
// does not depend on map iteration order.
func writeGroups(w io.Writer, groups map[string]*list.List) {
	keys := make([]string, 0, len(groups))
	for key := range groups {
		keys = append(keys, key)
	}
	sort.Strings(keys)

	for _, key := range keys {
		fmt.Fprint(w, key)
		for e := groups[key].Front(); e != nil; e = e.Next() {
			fmt.Fprint(w, " "+e.Value.(string))
		}
		fmt.Fprintln(w)
	}
}

// main reads filenames from standard input, narrows them down by size,
// prefix hash and full hash, and prints the resulting groups to standard
// output.
func main() {
	bySize := readBySize(os.Stdin)
	byPrefix := bySizeToByPrefix(bySize)
	byFull := byPrefixToByFull(byPrefix)

	// Print byFull to stdout.
	out := bufio.NewWriter(os.Stdout)
	writeGroups(out, byFull)
	if err := out.Flush(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}