#!/usr/bin/perl -w
# NAME: findextent.pl (was findfiles.pl)
# AIM: Search for files of a given extension in a directory, recursive by default,
#      then report which of the matching files are byte-identical (same MD5).
# 17/03/2013 - Rename more to what it does - find files with a specific EXTENSION
# 09/12/2011 geoff mclane http://geoffair.net/mperl
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use File::stat;      # to get the file date (mtime) and size
use Digest::MD5 qw(md5 md5_hex md5_base64);
use Cwd;

# prt(), open_log(), close_log() and get_nn() are not defined in this file;
# presumably they are supplied by lib_utils.pl - verify against that library.
my $perl_dir = 'C:\GTools\perl';
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";

# log file stuff
our ($LF);
my $pgmname = $0;
$pgmname =~ s{\A.*[\\/]}{}s;    # keep only the part after the last path separator
my $outfile = $perl_dir."\\temp.$pgmname.txt";
open_log($outfile);

# user variables
my $VERS = "0.0.2 2013-03-17";
#my $VERS = "0.0.1 2011-11-03";
my $load_log = 0;
my $in_dir = '';
my $in_ext = '';
my $verbosity = 0;
my $out_xml = '';    # set by --out, but nothing in this file writes to it yet

### program variables
my @warnings = ();
my $cwd = cwd();
my $os = $^O;
my @file_list = ();     # every file seen: [name, full path, mtime, size, extension]
# Backslash on native Windows (as before); forward slash everywhere else,
# where a backslash would become part of the file name instead of a separator.
my $dir_sep = ($os eq 'MSWin32') ? "\\" : '/';
my $tot_count = 0;
my @found_files = ();   # files matching $in_ext, see process_files()

# While $debug_on is true, a missing directory/extension is replaced by the
# defaults below and the log is always loaded at exit.
my $debug_on = 1;
my $def_dir = 'C:\FGCVS\Flightgear\data';
my $def_ext = ".wav";

# Verbosity predicates.
sub VERB1 { return $verbosity >= 1; }
sub VERB2 { return $verbosity >= 2; }
sub VERB5 { return $verbosity >= 5; }
sub VERB9 { return $verbosity >= 9; }

# Print every collected warning. The argument (the pending exit value) is
# accepted for call compatibility but is not used.
sub show_warnings {
    my ($val) = @_;
    if (@warnings) {
        prt( "\nGot ".scalar(@warnings)." WARNINGS...\n" );
        foreach my $itm (@warnings) {
            prt("$itm\n");
        }
        prt("\n");
    } else {
        prt( "\nNo warnings issued.\n\n" ) if (VERB9());
    }
}

# Print an optional final message, show the warnings, close the log and
# exit the program with $val. Does not return.
sub pgm_exit {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if ($msg !~ /\n$/);
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}

# Print a warning now and remember it for the summary shown at exit.
sub prtw {
    my ($tx) = @_;
    $tx =~ s/\n$//;
    prt("$tx\n");
    push(@warnings,$tx);
}

# List the '#include' lines of a text file with their line numbers.
# Currently not called from MAIN (the call there is commented out).
sub process_in_file {
    my ($inf) = @_;
    my $fh;
    # three-argument open: the name is never interpreted as a mode or pipe
    if (!open($fh, '<', $inf)) {
        pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
    }
    my @lines = <$fh>;
    close($fh);
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");
    my $lnn = 0;
    foreach my $line (@lines) {
        chomp $line;
        $lnn++;
        if ($line =~ /\s*#\s*include\s+(.+)$/) {
            my $inc = $1;
            prt("$lnn: $inc\n");
        }
    }
}

# Recursively scan $dir, appending one record per plain file to @file_list
# and counting them in $tot_count. $lev is the recursion depth (0 for the
# directory given by the user). Problems are recorded as warnings and the
# scan carries on.
sub process_dir {
    my ($dir,$lev) = @_;
    my @dirs = ();
    if (opendir(my $dh, $dir)) {
        my @files = readdir($dh);
        closedir($dh);
        $dir .= $dir_sep if ($dir !~ /(\\|\/)$/);
        foreach my $file (@files) {
            next if ($file eq ".");
            next if ($file eq "..");
            my $ff = $dir.$file;
            if (-d $ff) {
                push(@dirs,$ff);    # descend only after this directory is done
            } elsif (-f $ff) {
                if (my $sb = stat($ff)) {    # File::stat object
                    my ($n,$d,$e) = fileparse($file, qr/\.[^.]*/);
                    #                 0     1   2          3         4
                    push(@file_list,[$file,$ff,$sb->mtime,$sb->size,$e]);
                    $tot_count++;
                    if (($tot_count % 1000) == 0) {
                        prt("Got $tot_count files...\n");
                    }
                } else {
                    prtw("WARNING: Unable to 'stat' $ff\n");
                }
            }
        }
    } else {
        prtw("WARNING: Unable to open directory $dir\n");
    }
    foreach my $sub_dir (@dirs) {
        process_dir($sub_dir,$lev+1);
    }
    if ($lev == 0) {
        prt("Got $tot_count files...\n");
    }
}

# Check that the top directory can be opened (fatal if not), then scan it.
sub process_in_dir {
    my ($dir) = @_;
    if (opendir(my $dh, $dir)) {
        closedir($dh);
    } else {
        pgm_exit(1,"ERROR: Unable to open directory $dir!\n");
    }
    process_dir($dir,0);
}

# Copy the records of $ra whose extension equals $in_ext (case-insensitive)
# into @found_files and return the total size of those files.
sub _select_by_extension {
    my ($ra) = @_;
    my $wanted = lc($in_ext);
    my $tot_siz = 0;
    foreach my $rec (@{$ra}) {
        my ($fil,$ff,$tm,$sz,$ext) = @{$rec};
        next if (lc($ext) ne $wanted);
        $tot_siz += $sz;
        #                    0    1   2   3   4    5=md5 6=group 7=reported
        push(@found_files,[$fil,$ff,$tm,$sz,$ext,'',   0,      0]);
    }
    return $tot_siz;
}

# Return the hex MD5 of the content of $path, or '' (after recording a
# warning) when the file cannot be opened or read.
sub _file_md5 {
    my ($path) = @_;
    my $fh;
    if (!open($fh, '<', $path)) {
        prtw("WARNING: open file [$path] FAILED\n");
        return '';
    }
    binmode($fh);
    # addfile() croaks on a read error; do not let one bad file stop the run
    my $digest = eval { Digest::MD5->new->addfile($fh)->hexdigest };
    close($fh);
    if (!defined $digest) {
        prtw("WARNING: read file [$path] FAILED\n");
        return '';
    }
    return $digest;
}

# Group the indexes of @found_files by digest. Returns a list of array refs,
# one per digest shared by two or more files, ordered by the first file of
# each group. Files without a digest (unreadable) are never grouped, so
# they cannot be reported as duplicates of each other.
sub _group_duplicates {
    my %members_of = ();
    my @order = ();
    for my $i (0 .. $#found_files) {
        my $digest = $found_files[$i][5];
        next if (!length($digest));
        push(@order,$digest) if (!exists $members_of{$digest});
        push(@{ $members_of{$digest} },$i);
    }
    return grep { @{$_} > 1 } map { $members_of{$_} } @order;
}

# Select the files of @{$ra} matching $in_ext, compute the MD5 of each, and
# report every set of files with identical content together with the bytes
# that would be saved by keeping only one copy of each set.
sub process_files {
    my ($ra) = @_; # = \@file_list
    my $cnt = scalar @{$ra};
    my $max = 75;    # never pad a path beyond this width
    prt("Found $cnt files to process... matching to [$in_ext]...\n");
    my $tot_siz = _select_by_extension($ra);

    my $fcnt = scalar @found_files;
    prt("Found $fcnt with extent $in_ext... total ".get_nn($tot_siz)." bytes... getting MD5 of each...\n");
    my $width = 0;   # longest full path seen, used to align the report
    foreach my $rec (@found_files) {
        my $len = length($rec->[1]);
        $width = $len if ($len > $width);
        $rec->[5] = _file_md5($rec->[1]);
    }
    $width = $max if ($width > $max);

    prt("Comparing MD5 of each, with others...\n");
    my @groups = _group_duplicates();
    my $mcnt = 0;    # number of files that repeat an earlier file
    foreach my $grp (@groups) {
        my $id = $grp->[0] + 1;    # group id = 1-based index of its first file
        $found_files[$_][6] = $id for @{$grp};
        $mcnt += scalar(@{$grp}) - 1;
    }
    prt("Found $mcnt of $fcnt with SAME MD5\n");

    my $scnt = 0;
    my $tot_dup = 0;
    foreach my $grp (@groups) {
        $scnt++;
        my $tcnt = 0;
        my $dupes = 0;    # size of every copy except the first
        foreach my $idx (@{$grp}) {
            my $rec = $found_files[$idx];
            $tcnt++;
            my $ff = sprintf('%-*s', $width, $rec->[1]);
            my $csz = sprintf('%14s', get_nn($rec->[3]));
            my $lead = ($tcnt == 1) ? "\n" : '';   # blank line before each set
            prt("$lead$scnt:$tcnt: $ff $csz\n");
            $dupes += $rec->[3] if ($tcnt > 1);
            $rec->[7] = 1;
        }
        $tot_dup += $dupes;
        prt("$tcnt dupes, save ".get_nn($dupes)." if eliminated.\n");
    }
    prt("Save ".get_nn($tot_dup)." bytes if all exact duplicates eliminated...\n");
}

#########################################
### MAIN ###
parse_args(@ARGV);
### prt( "$pgmname: in [$cwd]: Hello, World...\n" );
### process_in_file($in_dir);
process_in_dir($in_dir);
process_files(\@file_list);
pgm_exit(0,"");
########################################

# Print the usage text.
sub give_help {
    prt("$pgmname: version $VERS\n");
    prt("Usage: $pgmname [options] in-file\n");
    prt("Options:\n");
    prt(" --help (-h or -?) = This help, and exit 0.\n");
    prt(" --ext (-e) = Extensions to search for. Must commence with dot (.)\n");
    prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n");
    prt(" --load (-l) = Load LOG at end. ($outfile)\n");
    prt(" --out (-o) = Write output to this file.\n");
    prt("AIM: Search for files of a given extension in a directory, recursive by default.\n");
}

# Exit with an error unless the option $arg is followed by another argument.
# Call as need_arg(@av) with the option itself still at the front of @av.
sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}

# Parse the command line into the user variables, apply the debug defaults
# and validate the result. Exits the program on any error or on help.
sub parse_args {
    my (@av) = @_;
    while (@av) {
        my $arg = $av[0];
        if ($arg =~ /^-/) {
            (my $sarg = $arg) =~ s/^-+//;    # accept -x and --x alike
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^v/) {
                # non-greedy so that all trailing digits are taken: -v10 is 10
                if ($sarg =~ /^v.*?(\d+)$/) {
                    $verbosity = $1;
                } elsif ($sarg =~ /^(v+)/) {
                    $verbosity += length($1);    # -vvv bumps three times
                }
                prt("Verbosity = $verbosity\n") if (VERB1());
            } elsif ($sarg =~ /^l/) {
                $load_log = 1;
                prt("Set to load log at end.\n") if (VERB1());
            } elsif ($sarg =~ /^e/) {
                need_arg(@av);
                shift @av;
                $in_ext = $av[0];
                prt("Set extension to [$in_ext].\n") if (VERB1());
            } elsif ($sarg =~ /^o/) {
                need_arg(@av);
                shift @av;
                $out_xml = $av[0];
                prt("Set out file to [$out_xml].\n") if (VERB1());
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_dir = $arg;
            prt("Set input to [$in_dir]\n");
        }
        shift @av;
    }

    if ($debug_on) {
        prtw("WARNING: DEBUG is ON!\n");
        if (length($in_dir) == 0) {
            $in_dir = $def_dir;
            prt("Set DEFAULT directory [$in_dir]\n");
        }
        if (length($in_ext) == 0) {
            $in_ext = $def_ext;
            prt("Set DEFAULT extent [$in_ext]\n");
        }
        $load_log = 1;
    }
    if (length($in_dir) == 0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (length($in_ext) == 0) {
        pgm_exit(1,"ERROR: No extension to search for found in command!\n");
    }
    # Extensions recorded by process_dir() always begin with a dot, so a
    # value given without one could never match anything.
    $in_ext = ".$in_ext" if ($in_ext !~ /^\./);
    if (! -d $in_dir) {
        pgm_exit(1,"ERROR: Unable to find in directory [$in_dir]! Check name, location...\n");
    }
}

# eof - findextent.pl