#!/usr/bin/gawk -f
#
# Detect files with identical content and optionally replace the later
# copies with hard links to the first one seen.
#
# Every command-line argument is treated as an option and removed from ARGV,
# so records are read from standard input.  For each record the first field
# and the single space following it are discarded; the remainder is taken as
# a file path.
# NOTE(review): presumably the input is a "<size> <path>" listing (e.g. from
# find -printf); confirm against whatever feeds this script.
#
# Without -h/--hardlink the script only prints the mv/ln commands it would
# run.  With it, each duplicate is renamed to "<path>.relinked" and a hard
# link to the first file with that content is created at the original path.

# Print the option summary and terminate with status 0.  Sets earlybath so
# the END rule (which still runs after exit) does not print a total.
function usage() {
    print ARGV[0] " [-h|--hardlink]";
    print " --hardlink:\tmove the duplicate file (to X.relinked)";
    print " \t\tand hardlink to original file";
    print " \n\tWithout this argument, the script will only print";
    print " \tthe commands it would have executed";
    earlybath = 1;
    exit 0;
}

BEGIN {
    totsize = 0;
    args["hardlink"] = 0;
    earlybath = 0;
    for (i = 1; i < ARGC; i++) {
        if (ARGV[i] ~ /^(-h|--hardlink)/)
            args["hardlink"] = 1;
        else
            usage();
        # Remove the option so awk does not try to open it as an input file.
        delete ARGV[i];
    }
}

# Return s wrapped in single quotes so the shell passes it through verbatim.
# Embedded single quotes are written as '\'' (close, escaped quote, reopen).
function shell_quote(s) {
    gsub(/'/, "'\\''", s);
    return "'" s "'";
}

# Run cmd through the shell and return the first line of its output, or ""
# when the command printed nothing.  res is a local variable.
function do_cmd(cmd,    res) {
    res = "";
    cmd | getline res;
    close(cmd);
    return res;
}

# Inode number of file as reported by stat, "" if stat printed nothing.
function get_inode(file,    cmd) {
    cmd = "stat -c '%i' -- " shell_quote(file);
    return do_cmd(cmd);
}

# Device number of file as reported by stat, "" if stat printed nothing.
function get_dev(file,    cmd) {
    cmd = "stat -c '%d' -- " shell_quote(file);
    return do_cmd(cmd);
}

# Size of file in bytes as reported by stat, "" if stat printed nothing.
function get_size(file,    cmd) {
    cmd = "stat -c '%s' -- " shell_quote(file);
    return do_cmd(cmd);
}

# Replace copy with a hard link to master, keeping the old file as
# "<copy>.relinked".  Both commands are always printed; they are executed
# only when --hardlink was given.
# Returns 1 on success (or in print-only mode), 0 if a command failed.  When
# the link cannot be created the rename is undone so copy is not left missing.
function update_hardlink(master, copy,    cmd, backup) {
    backup = copy ".relinked";

    cmd = "mv -- " shell_quote(copy) " " shell_quote(backup);
    print cmd;
    if (args["hardlink"] && system(cmd) != 0) {
        print "failed: " cmd > "/dev/stderr";
        return 0;
    }

    cmd = "ln -- " shell_quote(master) " " shell_quote(copy);
    print cmd;
    if (args["hardlink"] && system(cmd) != 0) {
        print "failed: " cmd > "/dev/stderr";
        # Put the original back; e.g. ln fails across devices.
        system("mv -- " shell_quote(backup) " " shell_quote(copy));
        return 0;
    }
    return 1;
}

# MD5 digest of file's content, or "" if it could not be computed.
# The file is fed on stdin so its name never appears in the tool's output.
function md5sum(file,    cmd, md5, text) {
    cmd = "md5sum < " shell_quote(file);
    text = do_cmd(cmd);
    if (split(text, md5))
        return md5[1];
    return "";
}

# SHA-1 digest of file's content, or "" if it could not be computed.
# The file is fed on stdin so its name never appears in the tool's output.
function sha1sum(file,    cmd, sha1, text) {
    cmd = "sha1sum < " shell_quote(file);
    text = do_cmd(cmd);
    if (split(text, sha1))
        return sha1[1];
    return "";
}

# Return the shared inode number when file1 and file2 are on the same device
# and have the same inode (i.e. are already hard links), otherwise 0.
function is_same_inode(file1, file2,    inode) {
    if (get_dev(file1) != get_dev(file2))
        return 0;
    if ((inode = get_inode(file1)) != get_inode(file2))
        return 0;
    return inode;
}

{
    inode = 0;
    md5 = 0;

    # Strip the first field plus one space as a literal prefix; using $1 as
    # a regular expression would misbehave on metacharacters.
    prefix = $1 " ";
    if (index($0, prefix) == 1)
        $0 = substr($0, length(prefix) + 1);
    if ($0 == "")
        next;

    sum = sha1sum($0);
    if (sum == "") {
        # Unreadable or missing file: never treat it as anyone's duplicate.
        print "skipping, no checksum: " $0 > "/dev/stderr";
        next;
    }

    if (!(sum in sums)) {
        # First file seen with this content becomes the link target.
        sums[sum] = $0;
        next;
    }

    if ((inode = is_same_inode(sums[sum], $0))) {
        print "same: " $0 " and " sums[sum], inode;
    } else {
        print "duplicate: " $0 " and " sums[sum], inode, sum;
        # Second, independent digest as a guard against a SHA-1 false match.
        md5 = md5sum($0);
        if (md5 == "" || md5 != md5sum(sums[sum])) {
            print "eek, md5sum didnt match! \n" md5;
        } else if (update_hardlink(sums[sum], $0)) {
            totsize += get_size(sums[sum]);
        }
    }
    print "\n\n";
}

END {
    # totsize is accumulated in bytes; the printed figure is in KiB.
    if (!earlybath)
        print "total size: " totsize/1024 " KiB";
}