Here is try number 1 (warts and all):
#!/usr/bin/perl
#
# $Id: fdf.pl,v 1.1.1.1 2006-07-29 14:26:57 adam Exp $
#
use strict;
use warnings;
use File::Find;
use File::Glob;
use Digest::SHA;
use Getopt::Std;
# Command-line options, filled in by getopts():
#   -d debug, -v verbose, -g search glob, -l lower size limit,
#   -u upper size limit, -h help, -o output log file.
my %options;
my $VERSION = 0.1;
getopts( 'dvg:l:u:ho:', \%options );

if ( $options{'h'} ) {
    show_usage();
    exit;
}

# Every remaining argument is a directory to scan; at least one is required.
my @start_dirs = @ARGV;
if ( scalar @start_dirs < 1 ) {
    show_usage();
    die "\nERROR: No start directory provided.\n\n";
}
foreach my $start_dir (@start_dirs) {
    die "Unable to locate start Directory: $start_dir\n" unless -d $start_dir;
}

# The size window is only sanity-checked when both limits are given.
die
  "Upper Limit ($options{'u'}) is less than the Lower Limit ($options{'l'}).\n"
  if ( ( $options{'u'} && $options{'l'} )
    && ( $options{'u'} < $options{'l'} ) );

# Debug mode implies verbose mode.
if ( $options{'d'} ) {
    $options{'v'} = 1;
}

if ( $options{'v'} ) {
    print {*STDERR} "Find Duplicate Files v$VERSION (verbose mode)\n";
    print {*STDERR} " Search GLOB: $options{'g'}\n"       if $options{'g'};
    print {*STDERR} " Minimum file size: $options{'l'}\n" if $options{'l'};
    print {*STDERR} " Maximum file size: $options{'u'}\n" if $options{'u'};
    print {*STDERR} " Output Log File: $options{'o'}\n"   if $options{'o'};
    foreach my $start_dir (@start_dirs) {
        print {*STDERR} "Finding all files in: $start_dir\n";
    }
    print {*STDERR} "\nThis may take a while...\n\n";
}

# State shared with pass_one(), which File::Find calls without arguments:
#   %size_by_files  file size => array ref of file names with that size
#   $sub_total      running total of bytes that might be duplicated
our %size_by_files;
our $sub_total = 0;

# Pass 1: walk the directory trees and group candidate files by size.
$File::Find::dont_use_nlink = 1;
find( \&pass_one, @start_dirs );

if ( $options{'v'} ) {
    print {*STDERR}
      "Pass 1 complete. Possibility of $sub_total bytes of duplication.\n",
      "Now calculating checksums. This may take a little while longer...\n\n";
}

# Pass 2: checksum the same-size candidates; the result maps each digest to
# an array ref of the files that produced it.
my $dupes_by_digest = pass_two( \%size_by_files );

# The report goes to the -o log file when one was requested, else to STDOUT.
my $output;
if ( $options{'o'} ) {

    # Report the log file name (-o) and the OS error, not the verbose flag.
    open $output, '>', $options{'o'}
      or die "Unable to write to log file $options{'o'}: $!\n";
}
my $report_fh = $output ? $output : \*STDOUT;

# Only digests shared by two or more files are duplicates; each group is
# printed as "digest<TAB>file" lines followed by a blank line.
foreach my $digest ( sort keys %{$dupes_by_digest} ) {
    my @files = @{ $dupes_by_digest->{$digest} };
    next if scalar @files < 2;

    foreach my $file (@files) {
        print {$report_fh} "$digest\t$file\n";
    }
    print {$report_fh} "\n";
}

if ($output) {

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close $output
      or die "Unable to finish writing log file $options{'o'}: $!\n";
}
exit;
# File::Find callback for pass 1: group candidate files by size.
#
# find() calls this once per directory entry, with no arguments. A file is
# recorded when it is a regular, readable, non-empty file whose size lies
# within the optional -l (lower) and -u (upper) limits. Recorded files are
# appended, by full path, to the list in $size_by_files{$size}. Whenever a
# file joins a size group that already has a member, its size is added to
# $sub_total, the number of bytes that might turn out to be duplicated.
#
# Returns nothing.
sub pass_one {
    my $filename = $File::Find::name;

    # By default find() has already changed into the file's directory, so
    # the file tests must use $_ (the name relative to that directory).
    # Testing $File::Find::name instead fails for every file below the top
    # level whenever the start directory was given as a relative path.
    return unless -f $_ && -r _;

    my $size = -s _;
    return unless $size > 0;
    return if $options{'l'} && $size < $options{'l'};
    return if $options{'u'} && $size > $options{'u'};

    # A second (or later) file of this size is a potential duplicate.
    $sub_total += $size if $size_by_files{$size};

    # Append in place rather than copying the whole list on every insert.
    push @{ $size_by_files{$size} }, $filename;
    return;
}
# Pass 2: checksum every file that shares its size with at least one other
# file, and group the files by checksum.
#
# Parameters:
#   $file_list - hash ref mapping a file size to an array ref of file names
#                with that size (as built by pass_one).
#
# Returns a hash ref mapping each hex digest to an array ref of the files
# that produced it. Groups with a single file are included; the caller is
# expected to ignore them.
#
# Files that cannot be checksummed are reported on STDERR and skipped. In
# debug mode (-d) each digest is echoed to STDERR; in verbose mode (-v) a
# summary is printed once the pass is complete.
sub pass_two {
    my $file_list  = shift;
    my $dupe_total = 0;
    my %dupe_files;

    # Sizes are numbers, so sort them numerically; this only affects the
    # order of the debug output.
    foreach my $size ( sort { $a <=> $b } keys %{$file_list} ) {
        my @files = @{ $file_list->{$size} };

        # A file with a unique size cannot have a duplicate.
        next if scalar @files < 2;

        foreach my $file (@files) {
            my $digest = eval {
                Digest::SHA->new()->addfile( $file, "b" )->hexdigest;
            };

            # Report unreadable files instead of silently dropping them, and
            # never let an undefined digest reach the debug output below.
            if ( !defined $digest ) {
                warn "Unable to checksum $file: ",
                  ( $@ || "unknown error\n" );
                next;
            }

            # A second (or later) file with this digest is a duplicate.
            $dupe_total += $size if $dupe_files{$digest};

            # Append in place rather than copying the list on every insert.
            push @{ $dupe_files{$digest} }, $file;

            if ( $options{'d'} ) {
                print {*STDERR} "$digest\t$file\t$size\n";
            }
        }
    }

    if ( $options{'v'} ) {
        print {*STDERR}
          "Pass 2 complete. $dupe_total bytes of duplicates found.\n";
        if ( $options{'o'} ) {
            print {*STDERR} "Results logged to $options{'o'}\n\n";
        }
        else {
            print {*STDERR} "Results are show below:\n\n";
        }
    }
    return \%dupe_files;
}
# Print the usage note, copyright and licence text to the currently
# selected output handle (STDOUT unless it has been changed).
#
# The synopsis lists every option accepted by the getopts() call as well as
# the start directories, at least one of which the script requires.
#
# Takes no arguments and returns nothing.
sub show_usage {
    print <<"USAGE";
This is Find Duplicate Files version $VERSION
Usage:
fdf [ -l bytes ] [ -u bytes ] [ -g glob ] [ -v ] [ -d ] [ -o file ] [ -h ] directory [ directory ... ]
Options:
-l Lower limit of files size to scan, in bytes
-u Upper limit of files size to scan, in bytes
-g Search GLOB (currently only reported in verbose mode)
-v Verbose mode (sent to *STDERR)
-d Debug mode: implies -v and also prints each file's checksum
-o Output log
-h This usage note
Copyright:
Copyright Adam John Trickett / iredale consulting 2006
Licence:
OSI Certified Open Source Software.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public Licence as published
by the Free Software Foundation; either version 2 of the Licence,
or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public Licence for more details.
You should have received a copy of the GNU General Public Licence
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111, USA.
USAGE
    return;
}