Here is try number 1 (warts and all):
#!/usr/bin/perl
#
# $Id: fdf.pl,v 1.1.1.1 2006-07-29 14:26:57 adam Exp $
#
# fdf - Find Duplicate Files.
#
# Pass 1 walks every start directory and groups readable, non-empty plain
# files by size.  Pass 2 computes a Digest::SHA checksum for every file
# whose size is shared with at least one other file and groups the files
# by checksum.  Every checksum shared by more than one file is printed as
# "checksum<TAB>path" lines followed by a blank line, either to STDOUT or
# to the log file named with -o.
#

use strict;
use warnings;

use File::Find;
use File::Glob;
use Digest::SHA;
use Getopt::Std;

my %options;
my $VERSION = 0.1;

# NOTE(review): -g is parsed and echoed in verbose mode, but nothing in this
# script filters files by the glob yet.
getopts( 'dvg:l:u:ho:', \%options );

if ( $options{'h'} ) {
    show_usage();
    exit;
}

my @start_dirs = @ARGV;
if ( scalar @start_dirs < 1 ) {
    show_usage();
    die "\nERROR: No start directory provided.\n\n";
}

foreach my $start_dir (@start_dirs) {
    die "Unable to locate start Directory: $start_dir\n"
        unless -d $start_dir;
}

die
    "Upper Limit ($options{'u'}) is less than the Lower Limit ($options{'l'}).\n"
    if ( ( $options{'u'} && $options{'l'} )
    && ( $options{'u'} < $options{'l'} ) );

# Debug mode implies verbose mode.
if ( $options{'d'} ) {
    $options{'v'} = 1;
}

if ( $options{'v'} ) {
    print {*STDERR} "Find Duplicate Files v$VERSION (verbose mode)\n";
    print {*STDERR} " Search GLOB: $options{'g'}\n"       if $options{'g'};
    print {*STDERR} " Minimum file size: $options{'l'}\n" if $options{'l'};
    print {*STDERR} " Maximum file size: $options{'u'}\n" if $options{'u'};
    print {*STDERR} " Output Log File: $options{'o'}\n"   if $options{'o'};
    foreach my $start_dir (@start_dirs) {
        print {*STDERR} "Finding all files in: $start_dir\n";
    }
    print {*STDERR} "\nThis may take a while...\n\n";
}

# Shared state filled in by pass_one():
#   %size_by_files - file size in bytes => array ref of paths with that size
#   $sub_total     - running sum of the sizes of files whose size was
#                    already seen (upper bound on duplicated bytes)
my %size_by_files;
my $sub_total = 0;

$File::Find::dont_use_nlink = 1;

# no_chdir keeps the working directory fixed, so the paths held in
# $File::Find::name stay valid for the file tests in pass_one() even when
# the start directories were given as relative paths.
find( { wanted => \&pass_one, no_chdir => 1 }, @start_dirs );

if ( $options{'v'} ) {
    print {*STDERR}
        "Pass 1 complete. Possibility of $sub_total bytes of duplication.\n",
        "Now calculating checksums. \nThis may take a little while longer...\n\n";
}

my $dupes_by_size = pass_two( \%size_by_files );

# Results go to the log file when -o was given, otherwise to STDOUT.
my $output = \*STDOUT;
if ( $options{'o'} ) {
    open my $log, '>', $options{'o'}
        or die "Unable to write to log file $options{'o'}: $!\n";
    $output = $log;
}

foreach my $key ( sort keys %{$dupes_by_size} ) {
    my @files = @{ $dupes_by_size->{$key} };
    next if scalar @files < 2;

    foreach my $file (@files) {
        print {$output} "$key\t$file\n";
    }
    print {$output} "\n";
}

if ( $options{'o'} ) {
    # Buffered write errors only surface when the handle is closed.
    close $output
        or die "Unable to close log file $options{'o'}: $!\n";
}

exit;

# pass_one()
#
# File::Find "wanted" callback.  Records the current path in
# %size_by_files, keyed by size, when it is a readable, non-empty plain
# file whose size lies within the optional -l / -u limits.  Adds the size
# to $sub_total whenever another file of the same size was already seen.
# Takes no arguments (reads $File::Find::name) and returns nothing.
sub pass_one {
    my $filename = $File::Find::name;

    return if !( -f $filename && -r _ );

    my $size = -s _;
    return if !( $size > 0 );
    return if $options{'l'} && $size < $options{'l'};
    return if $options{'u'} && $size > $options{'u'};

    if ( $size_by_files{$size} ) {
        $sub_total = $sub_total + $size;
    }
    push @{ $size_by_files{$size} }, $filename;

    return;
}

# pass_two( \%files_by_size )
#
# Takes a hash ref mapping file size => array ref of paths.  For every
# size shared by more than one path, computes each file's Digest::SHA hex
# digest and groups the paths by digest.  Files that cannot be
# checksummed are reported on STDERR and left out.  In debug mode (-d)
# every digest is echoed to STDERR; in verbose mode (-v) a summary is
# printed once all files are processed.
#
# Returns a hash ref mapping digest => array ref of paths.  Digests with
# a single path are included; the caller filters them out.
sub pass_two {
    my $file_list  = shift;
    my $dupe_total = 0;
    my %dupe_files;

    # Sizes are numbers, so sort them numerically rather than as strings.
    foreach my $size ( sort { $a <=> $b } keys %{$file_list} ) {
        my @files = @{ $file_list->{$size} };
        next if scalar @files < 2;

        foreach my $file (@files) {
            my $digest = eval {
                Digest::SHA->new()->addfile( $file, 'b' )->hexdigest;
            };

            if ( !$digest ) {
                # The file may have vanished or become unreadable since
                # pass 1; report it instead of silently dropping it.
                my $reason = $@ || 'unknown error';
                chomp $reason;
                print {*STDERR} "Unable to checksum $file: $reason\n";
                next;
            }

            if ( $dupe_files{$digest} ) {
                $dupe_total = $dupe_total + $size;
            }
            push @{ $dupe_files{$digest} }, $file;

            if ( $options{'d'} ) {
                print {*STDERR} "$digest\t$file\t$size\n";
            }
        }
    }

    if ( $options{'v'} ) {
        print {*STDERR}
            "Pass 2 complete. \n$dupe_total bytes of duplicates found.\n";
        if ( $options{'o'} ) {
            print {*STDERR} "Results logged to $options{'o'}\n\n";
        }
        else {
            print {*STDERR} "Results are show below:\n\n";
        }
    }

    return \%dupe_files;
}

# show_usage()
#
# Prints the version, command-line synopsis, copyright and licence notice
# to the currently selected output handle.  Returns nothing.
sub show_usage {
    print <<"USAGE";
This is Find Duplicate Files version $VERSION

Usage: fdf [ -l bytes ] [ -u bytes ] [ -v ] [ -d ] [ -o file ] directory ...

Options:
    -l    Lower limit of files size to scan, in bytes
    -u    Upper limit of files size to scan, in bytes
    -v    Verbose mode (sent to *STDERR)
    -d    Debug mode (implies -v; each checksum is sent to *STDERR)
    -o    Output log
    -h    This usage note

Copyright:
    Copyright Adam John Trickett / iredale consulting 2006

Licence:
    OSI Certified Open Source Software.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public Licence as published by
    the Free Software Foundation; either version 2 of the Licence, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
    General Public Licence for more details.

    You should have received a copy of the GNU General Public Licence
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
USAGE
    return;
}