chklinks02.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:24 2010 from chklinks02.pl 2008/07/02 50.1 KB.

#!/perl -w
# NAME: chklinks02.pl
# AIM: Given a input FOLDER, check all the HTML found for a <a href="...."
# reference and make sure that reference EXISTS,
# either as a LOCAL file,
# or that an IP address can be obtained for the HOST if http://<something> ...
# AND check ALL image links <img src="..."...>, if it is a LOCAL file,
# and other 'link' items, like .zip, .txt, etc.
# 18/06/2007 - Add some command parameters, and help
# 02/06/2007 - geoff mclane - geoffair.com/mperl/index.htm
use strict;
use warnings;
use File::Basename;
use Socket;
unshift(@INC, 'C:/GTools/perl');
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n";
# for htmltools, if functions used
my @imgs = ();
my @hrefs = ();
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
   my @tmpsp = split(/\\/,$pgmname);
   $pgmname = $tmpsp[-1];
}
my $outfile = "temp.$pgmname.txt";
open_log($outfile);
prt( "$pgmname ... Hello, World ...\n" );
# SET A DEFAULT INPUT FOLDER / FILE
###my $in_folder = "C:\\HOMEPAGE\\HOM\\test4\\index.htm";
###my @splexcludes = qw( ok includes );
###my $in_folder = "C:\\HOMEPAGE\\GA\\macpc\\index.htm";
###my $in_folder = "C:\\HOMEPAGE\\Max7\\anew\\index.html";
###my $in_folder = "C:\\HOMEPAGE\\simple\\index.htm";
###my $in_folder = "C:\\HOMEPAGE\\GA\\index.html";
my $in_folder = "C:\\HOMEPAGE\\GA\\travel\\maroc\\index.htm";
my @splexcludes = qw( macpc );
###my $in_folder = "C:\\HOMEPAGE\\GA\\flags\\index.htm";
###my $in_folder = "C:\\HOMEPAGE\\GeoffAir\\welcome.html";
# some FEATURES and USER variables
# my @excludes = qw( cvineng2.htm );
my @excludes = qw( desktop.ini php.ini blank.html blank.htm );
my $recurse = 1;   # recursive
my $ignfpd = 1;      # ignore FRONTPAGE folders
my $chkip = 1;      # check the IP address
my $showhreflinks = 0;   # show a WARNING when an IMG, ICO, etc is a REMOTE link
my $showlinks = 0;   # show the links for each file
my $showscripts = 0;   # show SCRIPT files
my $writeips = 1;   # write IP found to a file
my $refreships = 0;   # if $chkip, and $writeips, re-write NEW check file
my $shownohrefs = 0;   # show when NO HREF found in file
my $ipfile = "iplinks.txt";
my @ipsfound = ();
my $verbal = 0;
my @html_ext = qw( .htm .html .shtml .php );
my @graf_ext = qw( .jpg .jpeg .gif .png .bmp .ico .mpg );
my @css_ext  = qw( .css );
my @script_ext = qw( .js .class .cgi );
my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived );
my @excused = ( '?dir=test', '?dir=.' );
# program variables
# NOTE: Each of these is a multidimensional array - see offset below
my @htm_files = ();   # store files found in folder
my @img_files = ();
my @css_files = ();
my @zip_files = ();
my @txt_files = ();
my @script_files = ();
my @other_files = ();
# offsets in above arrays
my $of_ff = 0;   # full file name
my $of_hr = 1;   # array ref of href links
my $of_im = 2;   # array ref of image links
my $of_lk = 3;   # linked count
my $of_sp = 4;   # spare
my $of_to = 5;   # links TO
my $of_fm = 6;  # links FROM
my @donesrcs = ();
my @doneimgs = ();
my %ext_hash = ();
my $cnt = 0;
my $file = '';
my @warnings = ();   # list of errors, warnings during running
my @httprefs = ();   # set of HREF src values
my @httpsrefs = ();
my @ftprefs = ();
my @mtrefs = ();
my $hcnt = 0;
my $href = '';
my %hrefs = ();
my $val = '';
my $msg = '';
my @scripts = ();
my $scnt = 0;
my $imgcnt = 0;
my $procnt = 0;
my $homefile = '';
my $total_hrefs = 0;
my $total_imgs = 0;
my @missed = ();
my $excusecnt = 0;
my $hrflnkcnt = 0; # $showhreflinks
my $homeoffset = -1;
my @offsdone = ();
my @htmlinks = ();
# debug only bits
my $dbg1 = 0;   # show entering folder ...
my $dbg2 = 0;   # show ALL HREF entries ...
my $dbg3 = 0;   # show IP found ...
my $dbg4 = 0;   # show entered/exit script
my $dbg5 = 0;   # show 'ok' when found
my $dbg6 = 0;   # show processing lines
my $dbg7 = 0;   # show anchor count
my $dbg8 = 0;   # show unique anchor href
my $dbg9 = 0;   # show files with SCRIPTS
my $dbg10 = 0;   # show diag for get_img_srcs() ...
my $dbg11 = 0;   # in image processing show entered/exits script 
my $dbg12 = 0;   # in image processing show processing count
my $dbg13 = 0;   # in image processing show ok - found file
my $dbg14 = 0;   # in image processing show image count found
my $dbg15 = 0;   # in image processing show image count when NONE found
my $dbg16 = 0;   # show WARNINGS during run ...
my $dbg17 = 0;   # show MISSING or BLANK HREF in PHP file
my $dbg18 = 0;   # check_linkages: show 'ok', in 2nd link check
my $dbg19 = 0;   # check_local_links: show progress ...
my $dbg20 = 0;   # check_local_links: show ALL link COUNTS - NONE IS ALWAYS SHOWN ...
my $dbg21 = 0;   # check_local_links: show LINK when found ...
my $dbg22 = 0;   # mark_image_link: show comparing, and comparision ...
my $dbg23 = 0;   # mark_image_link: show count of new images marked ...
my $dbg24 = 0;   # show each image file being marked
my $dbg25 = 0;   # show NO LINK FOUND
my $dbg26 = 0;   # show EACH HTML FILE BEING PROCESSED
my $dbg27 = 0;   # show EACH extesnions, and counts
my $dbg28 = 0;   # show image links information ...
my $dbg29 = 0;   # show ZIP, TXT, CSS, SCRIPT and OTHER file links information ...
my $dbg30 = 0;   # show HTML HREF links information ...
my $dbg31 = 0;   # like $dbg20 - check_local_links: show ALL links - NONE IS ALWAYS SHOWN ...
my $dbg32 = 0;   # show missing during processing
my $dbg33 = 0;   # show HAS NO LINKS during processing
parse_args(@ARGV);
if (length($in_folder) == 0) {
   mydie( "No input folder (or file) given/found in command ...\n" );
}
if (-f $in_folder) {
   ($homefile, $in_folder) = fileparse($in_folder);
   $in_folder =~ s/[\\\/]$//;
}
show_startup();
process_folder( $in_folder );
show_found_counts();
process_file_array();
process_host_array();
if (length($homefile)) {
   ###trace_from_htm( $homefile, 0 );
   check_linkages( $homefile );
   check_local_links( $homefile );
   show_link_counts("HTML File", \@htm_files);
   show_link_counts("IMG Files", \@img_files);
   show_link_counts("CSS Files", \@css_files);
   show_link_counts("ZIP Files", \@zip_files);
   show_link_counts("TXT Files", \@txt_files);
   show_link_counts("Script Files", \@script_files);
   show_link_counts("Other Files", \@other_files);
}
$scnt = scalar @scripts;
if ($scnt && ($dbg9 || $showscripts)) {
   prt( "Got $scnt files containing SCRIPTS ...\n" );
   # push(@scripts, [$fil, $lns]);
   for (my $i = 0; $i < $scnt; $i++) {
      $file = $scripts[$i][0];
      $val = $scripts[$i][1];
      prt( "$file $val\n" );
   }
}
##############################################################
prt( "\n###### SHOW RESULTS ########\n" );
prt( "WARNING: $hrflnkcnt images by HREF not shown! (change \$showhreflinks)\n" ) if ($hrflnkcnt);
$cnt = scalar @warnings;
if ($cnt) {
   prt( "\nWARNINGS FOLLOW ($cnt):\n" );
   foreach my $w (@warnings) {
      prt( "$w\n" );
   }
} else {
   prt( "No warnings ...\n" );
}
if (@missed) {
   prt( "\nMISSING FOLLOW: ".scalar @missed."\n" );
   foreach $file (@missed) {
      prt( "$file\n");
   }
}
prt( "###### END RESULTS ########\n" );
##############################################################
close_log($outfile,1);
exit(0);
##################################
sub process_file_array {
   my $max = scalar @htm_files;
   my $bgntime = time();
   my $msg = '';
   for (my $i = 0; $i < $max; $i++) {
      $file = $htm_files[$i][$of_ff];
      my ($nm,$dir,$ext) = fileparse( $file, qr/\.[^.]*/ );
      my $htot = 0;
      my $itot = 0;
      $procnt++;
      if (open INF, "<$file") {
         my @lines = <INF>;
         close INF;
         @lines = drop_php_from_array( @lines ) if (lc($ext) eq '.php');
         # THIS IS USING htmltool.pl - get a single line of TEXT ...
         my $txt = join( '', @lines ); # get whole text
         my @is = ret_imgs_array( $txt );
         my $ntxt = remove_script( $txt );
         $ntxt = trimblanklines($ntxt);
         @hrefs = (); # clear
         my @hr = ret_hrefs_array( $ntxt );
         ### collecthrefs( $txt, 0 );
         ### collectimgs( $txt, 0 );
         # bump the counts of HREF and IMGS collected
         $itot += scalar @is;
         $htot += scalar @hr;
         # store the references ... that is a reference to an array
         $htm_files[$i][$of_hr] = \@hr;
         $htm_files[$i][$of_im] = \@is;
         # new code to do some similar things, but while in the array
         ###@lines = drop_php_from_array( @lines ) if (lc($ext) eq '.php');
         @lines = dropcomments_from_array(@lines);
         my @srcs = get_img_srcs($file, @lines);
         $imgcnt += check_images( $file, @srcs );
         @srcs = get_href_srcs($file, @lines);
         check_hrefs( $file, @srcs );
      }
      $total_hrefs += $htot;
      $total_imgs += $itot;
      if ((($procnt % 100) == 0)||($max < 10)) {
         ###local $| = 1;
         ###prt( "\rDone $procnt HTML files ..." );
         if ($max < 10) {
            prt( "Done $file HTML file ... href,other($htot,$itot)\n" );
         } else {
            $msg = "href,other ($total_hrefs,$total_imgs)";
            show_time( $max, $procnt, $bgntime, $msg );
            #prt( "Done $procnt HTML files ... href,other ($total_hrefs,$total_imgs)\n" );
         }
      }
   }
   prt( "Completed $procnt HTML files ... Found $total_hrefs HREF, and $total_imgs IMG/OTHER tokens.\n" );
}
sub get_href_type {
   my ($src) = shift;
   if ($src =~ /^http:/i) {
      #push(@httprefs, [$src, $fil, $lnnos] );
      return 1; # remote HREF
   } elsif ($src =~ /^https:/i) {
      return 1; # remote HREF
      #push(@httpsrefs, [$src, $fil, $lnnos] );
   } elsif ($src =~ /^ftp:/i) {
      #push(@ftprefs, [$src, $fil, $lnnos] );
      return 3; # remote HREF
   } elsif ($src =~ /^mailto:/i) {
      #push(@mtrefs, [$src, $fil, $lnnos] );
      return 4; # remote HREF
   } elsif ( $src =~ /^javascript:/i ) {
      return 5; # a JAVASCRIPT HREF
   } elsif ($src =~ /^file:/i) {
      return 5; # remote HREF
   } elsif ( substr($src,0,1) eq '#') {
      # local in page HREF
      return 6;
   } else {
      my $ind = index($src,'#');
      if ( $ind != -1 ) {
         $src = substr($src,0,$ind);
      }
      $ind = index($src,'?');
      if ( $ind != -1 ) {
         $src = substr($src,0,$ind);
      }
      $src =~ s/\/$//;
      if (length($src)) {
         return 7;
      }
   }
   return 0;
}
sub get_local_href {
   my ($src) = shift;
   my $ind = index($src,'#');
   if ( $ind != -1 ) {
      $src = substr($src,0,$ind);
   }
   $ind = index($src,'?');
   if ( $ind != -1 ) {
      $src = substr($src,0,$ind);
   }
   $src =~ s/\/$//;   # remove any TRAILING '/' char
   # 25/07/2007 - also 'convert' '%20' to space
   $src =~ s/%20/ /g;
   return $src;
}
sub dos_2_unix($) {
   my ($du) = shift;
   $du =~ s/\\/\//g;
   return $du;
}
### my @donesrcs = ();
sub in_done_srcs {
   my ($f) = shift;
   foreach my $fd (@donesrcs) {
      if ($fd eq $f) {
         return 1;
      }
   }
   return 0;
}
sub in_done_imgs {
   my ($f) = shift;
   foreach my $fd (@doneimgs) {
      if ($fd eq $f) {
         return 1;
      }
   }
   return 0;
}
sub fix_rel_unix_path {
   my ($path) = shift;
   $path = dos_2_unix($path);
   my @a = split(/\//, $path);
   my $npath = '';
   my $max = scalar @a;
   my @na = ();
   for (my $i = 0; $i < $max; $i++) {
      my $p = $a[$i];
      if ($p eq '.') {
         # ignore this
      } elsif ($p eq '..') {
         if (@na) {
            pop @na;   # discard previous
         } else {
            prt( "WARNING: Got relative .. without previous!!!\n" );
         }
      } else {
         push(@na,$p);
      }
   }
   foreach my $pt (@na) {
      $npath .= "/" if length($npath);
      $npath .= $pt;
   }
   return $npath;
}
sub add_new_link {
   my ($nlnk, $lnks) = @_;
   my @arr = split(',', $lnks);
   foreach my $lk (@arr) {
      if ($lk eq $nlnk) {
         return 0;
      }
   }
   return 1;
}
sub mark_image_link {
   my ($fmfil, $fnd, $src, $lev) = @_;
   my $fcnt = scalar @img_files;
   my $msrc = lc(dos_2_unix($src));
   my $lnks = '';
   prt( "Seeking [$msrc] in $fcnt images files ...\n" ) if ($dbg22);
   for (my $i = 0; $i < $fcnt; $i++) {
      my $fil = $img_files[$i][$of_ff];
      my $mfil = lc(dos_2_unix($fil));
      prt( "Comparing to $mfil ...\n" ) if ($dbg22);
      if ($msrc eq $mfil) {
         $val = $img_files[$i][$of_lk];
         $val++;
         $img_files[$i][$of_lk] = $val;
         # add image file linked to from what file
         $lnks = $img_files[$i][$of_fm];
         if (length($lnks) == 0) {
            $lnks = $fmfil;
         } elsif (add_new_link($fmfil, $lnks)) {
            $lnks .= ',';
            $lnks .= $fmfil;
         }
         $img_files[$i][$of_fm] = $lnks;
         prt( "IMG src in $fmfil, of $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg28);
         return 0;
      }
   }
   prt( "$src - NOT FOUND![1]\n" ) if ($dbg21);
   return 1;
}
sub mark_other_links {
   my ($fmfil, $fnd, $src, $lev) = @_;
   my $totcnt = 0;
   my $msrc = lc(dos_2_unix($src));
   my $fcnt = scalar @img_files;
   my $i = 0;
   my $lnks = '';
   $totcnt += $fcnt;
   if (mark_image_link( $fmfil, $fnd, $src, $lev ) == 0) {
      return 0;
   }
   # maybe ZIP files
   $fcnt = scalar @zip_files;
   $totcnt += $fcnt;
   for ($i = 0; $i < $fcnt; $i++) {
      my $fil = $zip_files[$i][$of_ff];
      my $mfil = lc(dos_2_unix($fil));
      if ($msrc eq $mfil) {
         $val = $zip_files[$i][$of_lk];
         $val++;
         $zip_files[$i][$of_lk] = $val;
         # add zip file linked to from what file
         $lnks = $zip_files[$i][$of_fm];
         if (length($lnks) == 0) {
            $lnks = $fmfil;
         } elsif (add_new_link($fmfil, $lnks)) {
            $lnks .= ',';
            $lnks .= $fmfil;
         }
         $zip_files[$i][$of_fm] = $lnks;
         prt( "ZIP link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
         return 0;
      }
   }
   # maybe TXT files
   $fcnt = scalar @txt_files;
   $totcnt += $fcnt;
   for ($i = 0; $i < $fcnt; $i++) {
      my $fil = $txt_files[$i][$of_ff];
      my $mfil = lc(dos_2_unix($fil));
      if ($msrc eq $mfil) {
         $val = $txt_files[$i][$of_lk];
         $val++;
         $txt_files[$i][$of_lk] = $val;
         # add txt file linked to from what file
         $lnks = $txt_files[$i][$of_fm];
         if (length($lnks) == 0) {
            $lnks = $fmfil;
         } elsif (add_new_link($fmfil, $lnks)) {
            $lnks .= ',';
            $lnks .= $fmfil;
         }
         $txt_files[$i][$of_fm] = $lnks;
         prt( "TXT link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
         return 0;
      }
   }
   # maybe CSS files
   $fcnt = scalar @css_files;
   $totcnt += $fcnt;
   for ($i = 0; $i < $fcnt; $i++) {
      my $fil = $css_files[$i][$of_ff];
      my $mfil = lc(dos_2_unix($fil));
      if ($msrc eq $mfil) {
         $val = $css_files[$i][$of_lk];
         $val++;
         $css_files[$i][$of_lk] = $val;
         # add txt file linked to from what file
         $lnks = $css_files[$i][$of_fm];
         if (length($lnks) == 0) {
            $lnks = $fmfil;
         } elsif (add_new_link($fmfil, $lnks)) {
            $lnks .= ',';
            $lnks .= $fmfil;
         }
         $css_files[$i][$of_fm] = $lnks;
         prt( "CSS link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
         return 0;
      }
   }
   # maybe SCRIPT files
   $fcnt = scalar @script_files;
   $totcnt += $fcnt;
   for ($i = 0; $i < $fcnt; $i++) {
      my $fil = $script_files[$i][$of_ff];
      my $mfil = lc(dos_2_unix($fil));
      if ($msrc eq $mfil) {
         $val = $script_files[$i][$of_lk];
         $val++;
         $script_files[$i][$of_lk] = $val;
         # add script file linked to from what file
         $lnks = $script_files[$i][$of_fm];
         if (length($lnks) == 0) {
            $lnks = $fmfil;
         } elsif (add_new_link($fmfil, $lnks)) {
            $lnks .= ',';
            $lnks .= $fmfil;
         }
         $script_files[$i][$of_fm] = $lnks;
         prt( "SCRIPT link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
         return 0;
      }
   }
   # OK, OTHER
   $fcnt = scalar @other_files;
   $totcnt += $fcnt;
   for ($i = 0; $i < $fcnt; $i++) {
      my $fil = $other_files[$i][$of_ff];
      my $mfil = lc(dos_2_unix($fil));
      if ($msrc eq $mfil) {
         $val = $other_files[$i][$of_lk];
         $val++;
         $other_files[$i][$of_lk] = $val;
         # add script file linked to from what file
         $lnks = $other_files[$i][$of_fm];
         if (length($lnks) == 0) {
            $lnks = $fmfil;
         } elsif (add_new_link($fmfil, $lnks)) {
            $lnks .= ',';
            $lnks .= $fmfil;
         }
         $other_files[$i][$of_fm] = $lnks;
         prt( "OTHER link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
         return 0;
      }
   }
   $totcnt += 1 if ($totcnt == 0);
   return $totcnt;
}
#######################################################################
# mark a link
# parameters:
# LINK IS IN FILE - $fmfil
# Offset in $htm_files[$fnd]
# Link item source - $src
sub mark_link {
   my ($fmfil, $fnd, $src, $lev) = @_;
   my $fcnt = scalar @htm_files;
   my $msrc = lc(dos_2_unix($src));
   my $ff = $htm_files[$fnd][$of_ff];
   my $i = 0;
   my $totcnt = $fcnt;
   my $fil = '';
   my $mfil = '';
   my $val = 0;
   my $lnks = '';
   for ($i = 0; $i < $fcnt; $i++) {
      if ($i != $fnd) {
         $fil = $htm_files[$i][$of_ff];
         $mfil = lc(dos_2_unix($fil));
         if ($msrc eq $mfil) {
            $val = $htm_files[$i][$of_lk];
            $val++;
            $htm_files[$i][$of_lk] = $val;
            # add HTML file linked to from what file
            $lnks = $htm_files[$i][$of_fm];
            if (length($lnks) == 0) {
               $lnks = $fmfil;
            } elsif (add_new_link($fmfil, $lnks)) {
               $lnks .= ',';
               $lnks .= $fmfil;
            }
            $htm_files[$i][$of_fm] = $lnks;
            prt( "HTML link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg30);
            ###prt( "$ff ($fnd) linked to $fil ($i) $val\n" ) if ($dbg21);
            my $hr = $htm_files[$i][$of_hr];   # extract HREF ref.array
            my $im = $htm_files[$i][$of_im];   # extract IMAGE ref.array
            my $hrc = scalar @{$hr};
            my $imc = scalar @{$im};         # get count of images
            my $j = 0;
            my ($itmnam, $itmdir) = fileparse($fil);   # get name and path
            prt( "$lev [$fil] has $hrc href, $imc img/other links ..\n" ) if ($dbg19 || $dbg26);
            for ($j = 0; $j < $hrc; $j++) {
               my $hrf = ${$hr}[$j];
               my $hrt = get_href_type($hrf); 
               if ($hrt == 7) {
                  my $nsrc = fix_rel_unix_path($itmdir.get_local_href($hrf));
                  if ( !in_done_srcs($nsrc) ) {
                     push(@donesrcs, $nsrc);   # put it in DONE list
                     mark_link( $fil, $i, $nsrc, $lev + 1 );   # and MARK its links now
                  }
               }
            }
            $val = 0;
            prt( "$fil - Checking $imc images files ...\n") if ($dbg24);
            for ($j = 0; $j < $imc; $j++) {      # do each, in this linked file
               my $img = ${$im}[$j];         # get the image string
               my $isrc = $itmdir.$img;      # join it with the path
               my $nisrc = fix_rel_unix_path($isrc);   # fix rel, and force unix path
               prt( "Marking [$nisrc] - ".($j+1)." of $imc img/other links ..\n" ) if ($dbg19 || $dbg26);
               if ( !in_done_imgs($nisrc) ) {
                  push(@doneimgs, $nisrc);   # put it in DONE list
                  mark_other_links( $fil, $j, $nisrc, 0 );   # and MARK the link in @img_files
                  $val++;
               } else {
                  prt( "Already IN doneimgs ...\n" ) if ($dbg19 || $dbg26);
               }
            }
            prt( "$fil - Marked $val of $imc images files ...\n") if ($val && $dbg24);
            return 0;
         }
      }
   }
   # hmmmm, LINK not found in HREF files, maybe IMAGES, zip, etc ...
   $val = mark_other_links( $fmfil, $fnd, $src, $lev );
   if ($val) {
      $totcnt += $val;
      prt( "NO LINK FOUND HREF [$src]($msrc) in $totcnt file - $ff ($fnd) - ($lev)!\n" ) if ($dbg25);
      return 1;
   }
   return 0;
}
###########################################################################
# show link count, and links, in passed multidimensional file array
#
# If showlinks (or $dbg20 or $dbg31) is ON, shows internal LINKS
# NOTE: Presently DOES NOT get all LINKS??? BAH!!!
###########################################################################
sub show_link_counts {
   my ($m, $hf) = @_;
   my $fcnt = scalar @{$hf};
   my $mcnt = 0;
   my $mss = "Checking LINKS for $fcnt $m files ...\n"; 
   if ($fcnt) {
      for (my $i = 0; $i < $fcnt; $i++) {
         my $fil = ${$hf}[$i][$of_ff];
         my $hrt = ${$hf}[$i][$of_lk];
         if ($hrt) {
            if ($dbg20 || $dbg31 || $showlinks) {
               prt( $mss ) if (length($mss));
               $mss = '';
               if ($dbg31 || $showlinks) {
                  my $lnks = ${$hf}[$i][$of_fm];
                  prt( "$i: $fil has $hrt links [$lnks]\n" );
               } else {
                  prt( "$i: $fil has $hrt links\n" );
               }
            }
         } else {
            prt( $mss ) if (length($mss));
            $mss = "WARNING: $fil($i) HAS NO LINKS!";
            prt( "$mss\n" ) if ($dbg33);
            push(@warnings, $mss);
            $mss = '';
            $mcnt++;
         }
      }
      if ($mcnt) {
         prt( "Done LINKS for $fcnt $m files ... MISSED $mcnt!!!\n" );
      }
   } else {
      prt( "There are NO $m files ...\n" ) if ($dbg20);
   }
}
sub in_excused {
   my ($tx) = shift;
   foreach my $t (@excused) {
      if ($t eq $tx) {
         return 1;
      }
   }
   return 0;
}
# using the given HOME PAGE, try to TRACE ALL LINKS
sub check_local_links {
   my ($hf) = shift;
   my $fcnt = scalar @htm_files;
   my ($hfnm,$hfdir) = fileparse($hf);
   my $lchf = lc($hfnm);
   my $fnd = get_home_offset($hf);
   my ($fil,$nm,$dir,$ext);
   if ($hfdir eq ".\\") {
      $hfdir = $in_folder."\\";
   }
   my $itmdir = '';
   my $itmnam = '';
   my $i = 0;
   my $i2 = 0;
   prt( "Checking local links, for $fcnt files, from $hf ...\n");
   if ($fnd == -1) {
      prt( "WARNING: check_local_links: Unable to find [$hf] ...\n" );
      return 1;
   }
   # process item 1 ...
   $procnt = 1;
   my $hr = $htm_files[$fnd][$of_hr];
   my $im = $htm_files[$fnd][$of_im];
   my $hrc = scalar @{$hr};
   my $imc = scalar @{$im};
   my $hrf = '';
   my $img = '';
   my $hrt = 0;
   my $src = '';
   my $nsrc = '';
   my $ff = '';
   my $shwerr = 0;
   my $emsg = '';
   $fil = $htm_files[$fnd][$of_ff];   # extract FULL PATH name of file ...
   $htm_files[$fnd][$of_lk] = 1;
   ($itmnam, $itmdir) = fileparse($fil);   # get name and path
   prt( "HOME [$fil] has $hrc href, $imc img/other links ..\n" ) if ($dbg19 || $dbg26);
   for ($i = 0; $i < $hrc; $i++) {
      $hrf = ${$hr}[$i];
      $hrt = get_href_type($hrf); 
      $i2 = $i + 1;
      $shwerr = 0;
      $emsg = "IN[$fil] $i2 HREF [$hrf]$hrt";
      if ($hrt == 0) {
         if (in_excused($hrf)) {
            $excusecnt++;
         } else {
            $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[1]";
            push(@warnings, "WARNING: $emsg!" );
            $shwerr = 1;
         }
      } elsif ($hrt < 5) {
         $emsg .= " REMOTE";
      } elsif ($hrt == 5) {
         $emsg .= " JAVASCRIPT";
      } elsif ($hrt == 6) {
         $emsg .= " IN PAGE";
      } else {
         my $lref = get_local_href($hrf);
         $src = $itmdir.$lref;
         my $nusrc = fix_rel_unix_path($src);
         ### prt( "REL PATH [$src] to UNIX PATH [$nusrc]\n" );
         push(@donesrcs, $nusrc);   # put it in DONE list
         # mark link - FROM $fil
         if ( mark_link( $fil, $fnd, $nusrc, 0 ) ) {
            $emsg .= " SITE REF [$nusrc] ***NO IN-SITE LINK***???";
            $msg = "$i2 [$fil] HREF [$hrf]$hrt SITE REF [$nusrc] ***NO IN-SITE LINK***???";
            if (-f $src) {
               $msg .= "\n*** BUT FILE EXISTS [$src] ***";
               $emsg .= "\n*** BUT FILE EXISTS [$src] ***";
               push(@warnings, "WARNING: Local HREF [$lref] in [$fil] OUTSIDE WEB! but EXISTS!" );
            } else {
               push(@missed, $msg );
               $shwerr = 1;
            }
         } else {
            $emsg .= " SITE REF [$src] ok" if ($dbg19);
         }
      }
      prt( "$emsg\n" ) if ($dbg19 || $shwerr);
   }
   prt( "HOME - Marking $imc images files ...\n") if ($dbg24);
   for ($i = 0; $i < $imc; $i++) {
      $img = ${$im}[$i];
      $src = $itmdir.$img;
      $nsrc = fix_rel_unix_path($src);
      prt( "HOME $fil - Mark $src ($nsrc) image ...\n" ) if ($dbg24);
      push(@doneimgs, $nsrc);   # put it in DONE list
      mark_other_links( $fil, $i, $nsrc, 0 );
   }
   return 0;
}
sub offset_done {
   my ($off, @done) = @_;
   foreach my $num (@done) {
      if ($off == $num) {
         return 1;
      }
   }
   return 0;
}
sub trace_from_htm {
   my ($hf, $lev) = @_;
   my $fnd = get_offset_of_htm($hf);
   my $msg = '';
   if (($fnd != -1) && !offset_done($fnd,@offsdone)) {
      push(@offsdone,$fnd);
      my $hr = $htm_files[$fnd][$of_hr];
      my $hrc = scalar @{$hr};
      my @offsets = ();
      my($itmnam, $itmdir) = fileparse($hf);   # get name and path
      for (my $i = 0; $i < $hrc; $i++) {
         my $hrf = ${$hr}[$i];
         my $hrt = get_href_type($hrf); 
         if ($hrt == 7) {
            #my $src = fix_rel_unix_path($itmdir.get_local_href($hrf));
            my $src = $itmdir.get_local_href($hrf);
            push(@offsets,$src);
            trace_from_htm($src, ($lev + 1));
         }
      }
      $hrc = scalar @offsets;
      my $cnt = $lev;
      $msg = sprintf("%4d ", $lev);
      prt( $msg );
      while($cnt) {
         prt( ' ' );
         $msg .= ' ';
         $cnt--;
      }
      prt( "$hf links to $hrc files ...\n" );
      $msg .= "$hf links to $hrc files ...";
      push(@htmlinks, [$lev, $msg]);
      foreach my $fil (@offsets) {
         $cnt = $lev;
         $msg = sprintf("%4d ", $lev);
         prt( $msg );
         while($cnt) {
            prt( ' ' );
            $msg .= ' ';
            $cnt--;
         }
         prt( "$fil\n" );
         $msg .= $fil;
         push(@htmlinks, [$lev, $msg]);
      }
   }
}
sub get_offset_of_htm {
   my ($hf) = shift;
   my $fcnt = scalar @htm_files;
   my ($hfnm,$hfdir) = fileparse($hf);
   if ($hfdir eq ".\\") {
      $hfdir = $in_folder."\\";
   }
   my $lchf = lc($hfnm);
   my $fnd = -1;
   for (my $i = 0; $i < $fcnt; $i++) {
      my $fil = $htm_files[$i][$of_ff];
      my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
      if (lc($nm.$ext) eq $lchf) {
         # have at least the NAME, but maybe not the FOLDER
         if (lc($hfdir) eq lc($dir)) {
            $fnd = $i;
            last;
         }
      }
   }
   return $fnd;
}
sub get_home_offset {
   if ($homeoffset != -1) {
      return $homeoffset;
   }
   my ($hf) = shift;
   prt( "Getting offset of HOME file $hf ...\n");
   my $fnd = get_offset_of_htm($hf);
   if ($fnd == -1) {
      prt( "WARNING: Unable to find [$hf] ...\n" );
      push(@warnings, "WARNING: Unable to find [$hf] ...");
   } else {
      prt( "Found $hf at index $fnd\n" );
   }
   $homeoffset = $fnd;
   return $homeoffset;
}
#############################################################################
# check linkages
#############################################################################
sub check_linkages {
   my ($hf) = shift;
   my $fcnt = scalar @htm_files;
   my ($hfnm,$hfdir) = fileparse($hf);
   my $lchf = lc($hfnm);
   my $fnd = get_home_offset($hf);
   my ($fil,$nm,$dir,$ext);
   if ($hfdir eq ".\\") {
      $hfdir = $in_folder."\\";
   }
   my $itmdir = '';
   my $itmnam = '';
   my $i = 0;
   my $i2 = 0;
   prt( "Re-checking HREF and IMG/OTHER links, for $fcnt files ...\n");
   if ($fnd == -1) {
      prt( "WARNING: Unable to find [$hf] ...\n" );
      push(@warnings, "WARNING: Unable to find [$hf] ...");
      return 1;
   }
   # process item 1 ...
   $procnt = 1;
   my $hr = $htm_files[$fnd][$of_hr];
   my $im = $htm_files[$fnd][$of_im];
   my $hrc = scalar @{$hr};
   my $imc = scalar @{$im};
   my $hrf = '';
   my $img = '';
   my $hrt = 0;
   my $src = '';
   my $ff = '';
   my $shwerr = 0;
   my $emsg = '';
   $fil = $htm_files[$fnd][$of_ff];   # extract FULL PATH name of file ...
   ($itmnam, $itmdir) = fileparse($fil);   # get name and path
   prt( "\n" ) if ($dbg18);
   prt( "$procnt [$fil] has $hrc href, and $imc image links ..\n" ) if ($dbg18);
   for ($i = 0; $i < $hrc; $i++) {
      $hrf = ${$hr}[$i];
      $hrt = get_href_type($hrf); 
      $i2 = $i + 1;
      $shwerr = 0;
      $emsg = "HH[$fil] ";
      $emsg .= "$i2 HREF [$hrf]$hrt";
      if ($hrt == 0) {
         if (in_excused($hrf)) {
            $excusecnt++;
         } else {
            $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[2]";
            push(@warnings, "WARNING: $emsg" );
            $shwerr = 1;
         }
      } elsif ($hrt < 5) {
         $emsg .= " REMOTE";
      } elsif ($hrt == 5) {
         $emsg .= " JAVASCRIPT";
      } elsif ($hrt == 6) {
         $emsg .= " LOCAL";
      } else {
         $src = $itmdir.get_local_href($hrf);
         if (-f $src) {
            $emsg .= " SITE REF [$src] ok";
         } else {
            $emsg .= " SITE REF [$src] ***MISSING***?[1]";
            push(@missed, $emsg );
            $shwerr = 1;
         }
      }
      prt( "$emsg\n" ) if ($dbg18 || $shwerr);
   }
   # From this beginning
   for (my $j = 0; $j < $fcnt; $j++) {
      $fil = $htm_files[$j][$of_ff];
      ($itmnam, $itmdir) = fileparse($fil);   # get name and path
      if ($j != $fnd) {
         $procnt++;
         $hr = $htm_files[$j][$of_hr];
         $im = $htm_files[$j][$of_im];
         $hrc = scalar @{$hr};
         $imc = scalar @{$im};
         prt( "\n" ) if ($dbg18);
         prt( "$procnt [$fil] has $hrc href, and $imc image links ..\n" ) if ($dbg18);
         for ($i = 0; $i < $hrc; $i++) {
            $hrf = ${$hr}[$i];
            $hrt = get_href_type($hrf); 
            $i2 = $i + 1;
            $shwerr = 0;
            $emsg = "HF[$fil] ";
            $emsg .= "$i2 HREF [$hrf]$hrt";
            if ($hrt == 0) {
               if (in_excused($hrf)) {
                  $excusecnt++;
               } else {
                  $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[3]";
                  push(@warnings, "WARNING: $emsg");
                  $shwerr = 1;
               }
            } elsif ($hrt < 5) {
               $emsg .= " REMOTE";
            } elsif ($hrt == 5) {
               $emsg .= " JAVASCRIPT";
            } elsif ($hrt == 6) {
               $emsg .= " LOCAL";
            } else {
               $src = get_local_href($hrf);
               if ($src eq '.') {
                  if (length($homefile)) { # && ($fdir eq $in_folder)
                     $src = $homefile;   # translate a DOT to HOME FILE
                  }
               }
               $ff = $itmdir.$src;
               if (-f $ff) {
                  $emsg .= " SITE REF [$ff] ok";
               } else {
                  $emsg .= " SITE REF [$src][$ff] ***MISSING***?[3]";
                  push(@missed, $emsg );
                  $shwerr = 1;
               }
            }
            prt( "$emsg\n" ) if ($dbg18 || ($shwerr && $dbg32));
         }
         for (my $i = 0; $i < $imc; $i++) {
            $img = ${$im}[$i];
            $emsg = "IF[$fil] [$img] ";
            if ($img =~ /^http:\/\/.*/i) {
               if ($showhreflinks) {
                  push(@warnings, "WARNING: IMG link is HREF $emsg [1]");
               } else {
                  $hrflnkcnt++;
               }
            } else {
               # 25/07/2007 - deal with '%20' in text
               $img =~ s/%20/ /g;
               $src = $itmdir.$img;
               $shwerr = 0;
               if (-f $src) {
                  $emsg .= " IMG ok";
               } else {
                  $emsg .= " IMG ***MISSING***?[5]";
                  push(@missed, $emsg );
                  $shwerr = 1;
               }
            }
            prt( "$emsg\n" ) if ($dbg18 || ($shwerr && $dbg32));
         }
      }
   }
   return 0;
}
sub check_images {
   my ($ifile, @srcs) = @_;
   my ($nm, $dir) = fileparse($ifile);
   my $scnt = scalar @srcs;
   if ($scnt) {
      prt( "Found $scnt imgs in $nm ...\n" ) if ($dbg14);
      for (my $i = 0; $i < $scnt; $i++) {
         my $src = $srcs[$i][0];
         my $lnn = $srcs[$i][1];
         if ($src =~ /^http:\/\//i) {
            # remote HREF
         } else {
            # 25/07/2007 - deal with '%20' to space
            $src =~ s/%20/ /g;
            my $ff = $dir.$src;
            if ( -f $ff ) {
               prt( "$src - ok\n" ) if ($dbg13);
            } else {
               my $msg = "WARNING: [$src] $ifile:$lnn NOT FOUND![2]";
               push(@warnings, $msg);
               prt( "$msg\n" ) if ($dbg16);
            }
         }
      }
   } else {
      prt( "Found NO imgs in [$ifile] ...\n" ) if ($dbg15);
   }
   return $scnt;
}
sub get_img_srcs {
   my ($fil, @lns) = @_;
   my $lc = scalar @lns;
   my $scnt = 0;
   my ($nm,$dir) = fileparse( $fil );
   #my $sdbg12 = $dbg12;
   #my $sdbg11 = $dbg11;
   #my $sdbg16 = $dbg16;
   #my $sdbg10 = $dbg10;
   #if (lc($nm) eq 'moon.htm') {
   #   $dbg12 = 1;
   #   $dbg11 = 1;
   #   $dbg16 = 1;
   #   $dbg10 = 1;
   #}
   prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg12);
   my @isrc = ();
   my $ln = '';
   my $bal = '';
   my $inscript = 0;
   my $msg = '';
   my $bgnln = 0;
   my $lnnos = '';
   for (my $i = 0; $i < $lc; $i++) {
      $ln = $bal;
      $bal = '';
      $ln .= $lns[$i];
      chomp $ln;
      prt( "$i [$ln] ...\n" ) if ($dbg10);
      if ($inscript) {
         if ($ln =~ /<\/script>/i) {
            $inscript =0;
            prt( "EXIT a SCRIPT ...\n" ) if ($dbg11);
         }
         next;
      }
      if ( $ln =~ /<img\s+(.*)/i ) {
         my $iln = $1;
         if ( $ln =~ /<script.*>/i ) {
            $msg = "WARNING: Also found SCRIPT in IMG line ...[$ln]";
            push(@warnings, $msg);
            prt( "$msg\n" ) if ($dbg16);
         }
         prt( "Found [$iln] ...\n" ) if ($dbg10);
         $bgnln = $i;
         while ( !($iln =~ />/) && ($i < $lc)) {
            $i++;
            my $nxln = $lns[$i];
            chomp $nxln;
            prt( "Adding [$nxln] ...\n" ) if ($dbg10);
            $iln .= ' ' if !($iln =~ /=$/);
            $iln .= $nxln;
         }
         $lnnos = "$bgnln:$i";
         my $ind = index($iln, '>');
         if ($ind != -1) {
            $bal = substr($iln, $ind+1);
            $iln = substr($iln, 0, $ind+1);
         }
         $iln = trim_all($iln);
         #if ($iln =~ /src=\"(.+)\"/i) {
         if ($iln =~ /src=\s*\"(\S+)\"/i) {
            prt( "SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10);
            push(@isrc, [$1, $lnnos, $fil]);
            $scnt++;
         } elsif ($iln =~ /src=\s*(\S+)/i) {   # without QUOTES
            prt( "SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10);
            push(@isrc, [$1, $lnnos, $fil]);
            $scnt++;
         } elsif ($iln =~ /src=\s*\'(\S+)\'/i) {   # single QUOTES
            prt( "SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10);
            push(@isrc, [$1, $lnnos, $fil]);
            $scnt++;
         } else {
            $msg = "WARNING: SRC NOT FOUND in [$iln]$fil:$lnnos...";
            push(@warnings, $msg);
            prt( "$msg\n" ) if ($dbg16);
         }
      } elsif ( $ln =~ /<script.*>/i ) {
         $inscript = 1;
         prt( "Entered a SCRIPT ...\n" ) if ($dbg11);
         if ($ln =~ /<\/script>/i) {
            $inscript =0;
            prt( "EXIT a SCRIPT ...\n" ) if ($dbg11);
         }
      }
   }
   prt( "Returning $scnt img sources ...\n") if ($dbg10);
   #$dbg12 = $sdbg12;
   #$dbg11 = $sdbg11;
   #$dbg16 = $sdbg16;
   #$dbg10 = $sdbg10;
   return @isrc;
}
sub check_hrefs {
   my ($fil, @srcs) = @_;
   my ($fnm,$fdir,$fext) = fileparse( $fil, qr/\.[^.]*/ );
   my $scnt = scalar @srcs;
   my $isphp = (lc($fext) eq '.php');
   if ($scnt) {
      prt( "Found $scnt anchor href= in $fnm$fext ...\n" ) if ($dbg7);
      for (my $i = 0; $i < $scnt; $i++) {
         my $orgsrc = $srcs[$i][0];
         my $lnnos = $srcs[$i][1];
         my $src = $orgsrc;
         if ($src =~ /^http:/i) {
            # remote HREF
            push(@httprefs, [$src, $fil, $lnnos] );
         } elsif ($src =~ /^https:/i) {
            # remote HREF
            push(@httpsrefs, [$src, $fil, $lnnos] );
         } elsif ($src =~ /^ftp:/i) {
            # remote HREF
            push(@ftprefs, [$src, $fil, $lnnos] );
         } elsif ($src =~ /^mailto:/i) {
            # remote HREF
            push(@mtrefs, [$src, $fil, $lnnos] );
         } elsif ( $src =~ /^#/ ) {
            # local in page HREF
         } elsif ( $src =~ /^javascript:/i ) {
            # a JAVASCRIPT HREF
         } else {
            my $ind = index($src,'#');
            if ( $ind != -1 ) {
               $src = substr($src,0,$ind);
            }
            $ind = index($src,'?');
            if ( $ind != -1 ) {
               $src = substr($src,0,$ind);
            }
            $src =~ s/\/$//;
            if (length($src)) {
               if ($src eq '.') {   # HREF is just a DOT
                  if (length($homefile)) { # && ($fdir eq $in_folder)
                     $src = $homefile;   # translate a DOT to HOME FILE
                  }
               }
               # 25/07/2007 - deal with '%20' back to space
               $src =~ s/%20/ /g;
               my $ff = $fdir.$src;
               if ( -f $ff ) {
                  prt( "$src - ok\n" ) if ($dbg5);
               } else {
                  my $msg = "WARNING: href [$src] file NOT FOUND![3] in [$fil]$lnnos";
                  push(@warnings, $msg);
                  prt( "$msg\n" ) if ($dbg16);
               }
            } else {
               if ($isphp) {
                  prt( "Found BLANK HREF in PHP $fil ...\n" ) if ($dbg17);
               } else {
                  $msg = "WARNING: Found BLANK HREF in $fil ...";
                  push(@warnings, $msg);
                  prt( "$msg\n" ) if ($dbg16);
               }
            }
         }
      }
   } else {
      if ($isphp) {
         prt( "Found NO HREFs in PHP $fil ...\n" ) if ($dbg17);
      } else {
         prt( "NO HREF FOUND in $fil ...\n" ) if ($shownohrefs);
      }
   }
}
############################################################
# Only used is $chkip = 1;
# Show IP Address
# uses sockets, gethostbyname
# Return 0, if can NOT be resolved.
# else the number of IP addresses resolved.
############################################################
sub showIPAddress {
   my ($nm) = shift;
   my @addr = gethostbyname($nm);
   my $cnt = 0;
   if( !@addr ) {
      prt( "Can't resolve $nm: $!\n" );
      return 0;
   }
   @addr = map { inet_ntoa($_) } @addr[4 .. $#addr];
   foreach my $k (@addr) {
      $cnt++;
      prt( "$cnt: $nm resolves to IP [$k]\n" ) if ($dbg3);
   }
   return $cnt;
}
################################################
# Add to @scripts multidimensional array,
# if NOT already in there, when on the line
# numbers are added.
###############################################
sub add_2_scripts {
   my ($fil, $lns) = @_;
   my $sc = scalar @scripts;
   for (my $i = 0; $i < $sc; $i++) {
      my $cf = $scripts[$i][0];
      if ($cf eq $fil) {
         my $lc = $scripts[$i][1];
         $lc .= ":$lns";
         $scripts[$i][1] = $lc;
         return 0;
      }
   }
   push(@scripts, [$fil, $lns]);
   return 1;
}
####################################################
# Get HREF sources
# Given an ARRAY of file lines, check for
# anchor href="something" ...
# Return the "something" in an array
####################################################
sub get_href_srcs {
   my ($fil, @lns) = @_;
   my $lc = scalar @lns;
   my $scnt = 0;
   my $slns = 0;   # count the SCRIPT lines
   my ($nm,$dir) = fileparse( $fil );
   prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg6);
   my @isrc = ();
   my $ln = '';
   my $bal = '';
   my $inscript = 0;
   $slns = 0;
   my $bgnln = 0;
   my $endln = 0;
   for (my $i = 0; $i < $lc; $i++) {
      $ln = $bal;
      $bal = '';
      $ln .= $lns[$i];
      chomp $ln;
      prt( "$i [$ln] ...\n" ) if ($dbg10);
      if ($inscript) {
         if ($ln =~ /<\/script>/i) {
            $inscript =0;
            prt( "EXIT a SCRIPT ...\n" ) if ($dbg4);
            add_2_scripts( $fil, $slns );
            $slns = 0;
            next;
         }
         $slns++;
         next;
      }
      if ( $ln =~ /<a\s+(.*)/i ) {
         my $iln = $1;
         prt( "Found [$iln] ...\n" ) if ($dbg10);
         $bgnln = $i;
         while ( !($iln =~ />/) && ($i < $lc)) {
            $i++;
            my $nxln = $lns[$i];
            chomp $nxln;
            prt( "Adding [$nxln] ...\n" ) if ($dbg10);
            $iln .= ' ' if !($iln =~ /=$/);
            $iln .= $nxln;
         }
         $endln = $i;
         my $ind = index($iln, '>');
         if ($ind != -1) {
            $bal = substr($iln, $ind+1);
            $iln = substr($iln, 0, $ind+1);
         }
         #if ($iln =~ /src=\"(.+)\"/i) {
         if ($iln =~ /href\s*=\s*\"(\S+)\"/i) {
            prt( "HREF = $1\nIn line [$iln]...\n" ) if ($dbg10);
            push(@isrc, [$1, "$bgnln:$endln"] );
            $scnt++;
         } else {
            if (( $iln =~ /name=\s*\"(\S+)\"/i )||( $iln =~ /name=(\S+)/i )) {
               # ignore BOOKMARKS
            } else {
               $msg = "WARNING: HREF NOT FOUND in [$iln]...";
               push(@warnings, $msg);
               prt( "$msg\n" ) if ($dbg16);
            }
         }
      } elsif ( $ln =~ /<script.*>/i ) {
         $inscript = 1;
         prt( "Entered a SCRIPT ...\n" ) if ($dbg4);
         $slns = 0;
         $ln = substr($ln, 7);
         if ($ln =~ /<\/script>/i) {
            $inscript =0;
            prt( "EXIT a SCRIPT ...\n" ) if ($dbg4);
            add_2_scripts( $fil, 1 );
            $slns = 0;
         }
      }
   }
   if ($inscript) {
      $msg = "WARNING: EXIT WHILE IN SCRIPT in [$fil]...";
      push(@warnings, $msg);
      prt( "$msg\n" ) if ($dbg16);
   }
   prt( "Returning $scnt HREF sources ...\n") if ($dbg10);
   return @isrc;
}
#########################################################
# Passed an array of extensions,
# check if this is one of them?
#########################################################
sub is_my_ext {
   my ($fil, @exts) = @_;
   my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
   foreach my $ex (@exts) {
      if (lc($ex) eq lc($ext)) {
         return 1;
      }
   }
   return 0;
}
############################################
# only looking for HTM, HTML, PHP,
# could be extended to others maybe ...
############################################
sub is_htm_ext {
   my ($fil) = shift;
   return( is_my_ext($fil, @html_ext) );
}
sub is_graphic_ext {
   my ($fil) = shift;
   return( is_my_ext($fil, @graf_ext) );
}
sub is_zip_ext {
   my ($fil) = shift;
   my @arr = qw( .zip );
   return( is_my_ext($fil, @arr) );
}
sub is_css_ext {
   my ($fil) = shift;
   return( is_my_ext($fil, @css_ext) );
}
sub is_txt_ext {
   my ($fil) = shift;
   my @arr = qw( .txt );
   return( is_my_ext($fil, @arr) );
}
sub is_script_ext {
   my ($fil) = shift;
   return( is_my_ext($fil, @script_ext) );
}
################################################
# my $ignfpd = 1;   # ignore FRONTPAGE folders
################################################
sub is_fp_folder {
   my ($inf) = shift;
   foreach my $fil (@fpfolders) {
      if (lc($inf) eq lc($fil)) {
         return 1;
      }
   }
   return 0;
}
####################################
# Check if FILE is in EXCLUDE list
####################################
sub in_excludes {
   my ($fil) = shift;
   my $lcf = lc($fil);
   foreach my $f (@excludes) {
      if (lc($f) eq $lcf) {
         return 1;
      }
   }
   return 0;
}
sub in_spl_excludes {
   my ($fldr) = shift;
   my $lfldr = lc($fldr);
   foreach my $f (@splexcludes) {
      if (lc($f) eq $lfldr) {
         return 1;
      }
   }
   return 0;
}
####################################################################
# process_folder(folder) 
# Main DIRECTORY processing function
#
# Open the FOLDER given, and collect ALL files found,
# iterate into sub-directories, if $recurse is non-zero,
# and it is NOT a special FRONTPAGE (hidden) FOLDER.
#
# Files are collected into multidemensional arrays
####################################################################
sub process_folder {
   my ($inf) = shift;
   my $fcnt = 0;
   prt( "Processing $inf folder ...\n" ) if ($dbg1);
   if ( opendir( DIR, $inf ) ) {
      my @files = readdir(DIR);
      closedir DIR;
      foreach my $fil (@files) {
         if (($fil eq ".")||($fil eq "..")) {
            next;
         }
         my $ff = $inf."\\".$fil;
         if ( -d $ff ) {
            if ($recurse) {
               if ($ignfpd && is_fp_folder($fil)) {   # ignore FRONTPAGE folders
                  next;
               }
               if (@splexcludes && in_spl_excludes($fil)) {
                  next;
               }
               process_folder( $ff );
            }
         } else {
            # NOTE: multidimensional arrays pushed - offsets into arrays
            # my $of_ff = 0;   # 1 - full file name
            # my $of_hr = 1;   # 2 - array ref of href links
            # my $of_im = 2;   # 3 - array ref of image links
            # my $of_lk = 3;   # 4 - linked count
            # my $of_sp = 4;   # 5 - spare
            # my $of_to = 5;   # links TO
            # my $of_fm = 6;    # links FROM
            if ( !in_excludes($fil) ) {  # NOT in @excludes
               my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
               my $val = 0;
               $val = $ext_hash{$ext} if ( defined $ext_hash{$ext} );
               $val++;
               $ext_hash{$ext} = $val;
               if (is_htm_ext($fil)) {
                  push(@htm_files, [$ff, '', '', 0, 0, '', ''] );
                  $fcnt++;
               } elsif (is_graphic_ext($fil)) {
                  push(@img_files, [$ff, '', '', 0, 0, '', ''] );
               } elsif (is_zip_ext($fil)) {
                  push(@zip_files, [$ff, '', '', 0, 0, '', ''] );
               } elsif (is_css_ext($fil)) {
                  push(@css_files, [$ff, '', '', 0, 0, '', ''] );
               } elsif (is_txt_ext($fil)) {
                  push(@txt_files, [$ff, '', '', 0, 0, '', ''] );
               } elsif (is_script_ext($fil)) {
                  push(@script_files, [$ff, '', '', 0, 0, '', ''] );
               } else {
                  push(@other_files, [$ff, '', '', 0, 0, '', ''] );
               }
            }
         }
      }
      prt( "Processed $inf folder finding $fcnt HTML files ...\n" ) if ($dbg1);
   } else {
      prt( "ERROR: Failed to open folder $inf ...\n" );
   }
}
##############################################
# Just to show the COUNTS in the ARRAYS
##############################################
sub show_found_counts {
   my $cnt = scalar @htm_files;
   prt( "Found $cnt HTML, ");
   $cnt = scalar @img_files;
   prt( "$cnt images, " );
   $cnt = scalar @css_files;
   prt( "$cnt css, " );
   $cnt = scalar @zip_files;
   prt( "$cnt zip, " );
   $cnt = scalar @txt_files;
   prt( "$cnt txt, " );
   $cnt = scalar @script_files;
   prt( "$cnt script, " );
   $cnt = scalar @other_files;
   prt( "and $cnt others ...\n" );
   $cnt = scalar keys %ext_hash;
   if ($dbg27 || $verbal) {
      prt( "$cnt extensions, and each count ...\n" );
      foreach my $key (keys %ext_hash) {
         my $val = $ext_hash{$key};
         prt( "$val $key ");
      }
      prt("\n");
   }
}
# @ipsfound = <INF>;
sub in_ips_found {
   my ($ip) = shift;
   my $lcip = lc($ip);
   foreach my $i (@ipsfound) {
      chomp $i;
      if (lc($i) eq $lcip) {
         return 1;
      }
   }
   return 0;
}
#######################################################
# Process the HTTP HREF sources
# if $chkip = 1; then attempt to resolve the IP
# addresses from the host name.
#######################################################
sub process_host_array {
   $hcnt = scalar @httprefs;
   if ($hcnt) {
      prt( "Found $hcnt HREF entries ...\n" );
      for (my $i = 0; $i < $hcnt; $i++) {
         $href = $httprefs[$i][0];
         $file = $httprefs[$i][1];
         my $lnn = $httprefs[$i][2];
         my ($nm,$dir) = fileparse($file);
         if (defined( $hrefs{$href} )) {
            $val = $hrefs{$href};
            $val .= ' '.$file;
         } else {
            $val = $file;
         }
         $val .= ":$lnn";
         $hrefs{$href} = $val;
         prt( "$href in [$file]$lnn\n" ) if ($dbg2);
      }
      $hcnt = scalar keys(%hrefs);
      prt( "Found $hcnt different entries ...\n" );
      if ($chkip) {
         my $inips = 0;
         prt( "Checking $hcnt IP addresses ... " );
         if ( !$refreships && ( -f $ipfile)) {
            if (open INF, "<$ipfile") {
               @ipsfound = <INF>;
               close INF;
               prt( "Have ".scalar @ipsfound." in $ipfile" );
            } else {
               prt( "Warning: Failed to open $ipfile" );
            }
         }
         prt("\n");
         $procnt = 0;
         foreach my $key (keys %hrefs) {
            $val = $hrefs{$key};
            $procnt++;
            prt( "$key in $val\n" ) if ($dbg8);
            if ($key =~ /^http:\/\//i) {
               my $hkey = substr($key, 7);
               my @arr = split( /\//, $hkey );
               $hkey = $arr[0];
               if ( !in_ips_found($hkey) ) {
                  if (showIPAddress( $hkey ) == 0) {
                     $msg = "FAILED: NO IP FOR HOST [$hkey][$val]";
                     push(@warnings, $msg);
                     prt( "$msg\n" ) if ($dbg16);
                  } elsif ($writeips) {
                     push(@ipsfound,"$hkey\n");
                  }
               } else {
                  $inips++;
               }
            }
            if (($procnt % 100) == 0) {
               prt( "Done $procnt IP Addresses ...\n" );
            }
         }
         prt( "Completed $procnt IP Addresses ... " );
         if ($writeips) {
            $val = join("\n", sort @ipsfound);
            $val = trimblanklines($val);
            write2file($val, $ipfile);
            prt( "$inips in previous. Written ".scalar @ipsfound." to $ipfile" );
         }
         prt("\n");
      }
   }
}
sub show_startup {
   prt( "Checking $in_folder ..." );
   prt( " Using HOME file $homefile ..." ) if length($homefile);
   prt("\n");
   if ($verbal) {
      prt( "\nOptions:\n" );
      prt( sprintf(" -checkips - HREF check %s, \n", ($chkip ? "On" : "Off")) );
      prt( sprintf(" -showhreflinks - Show HREF %s, \n", ($showhreflinks ? "On" : "Off")) );
      prt( sprintf(" -showlinks - Show links %s, \n", ($showlinks ? "On" : "Off")) );
      prt( sprintf(" -showscripts - Show script files %s, \n", ($showscripts ? "On" : "Off")) );
      prt( sprintf(" -writeips - Write HREF $ipfile %s, \n", ($writeips ? "On" : "Off")) );
      prt( sprintf(" -refreships - Refresh HREF %s\n", ($refreships ? "On" : "Off")) );
      prt( sprintf(" -shownohrefs - Show NO HREF found %s\n", ($shownohrefs ? "On" : "Off")) );
      if (@excludes) {
         prt( "Have ".scalar @excludes." excluded files - " );
         foreach my $ex (@excludes) {
            prt( "$ex " );
         }
         prt("\n");
      }
   }
}
sub show_help {
   prt( "$pgmname [Options] input-folder or home-file-name.\n" );
   prt( "Purpose: To take a folder, or home file in a folder, and\n" );
   prt( "check the assume local Web Site for internal consistency.\n" );
   prt( "Options:\n" );
   prt( " -checkips - Will check the IP resolution of REMOTE HREF items.\n" );
   prt( " -showhreflinks - Show a WARNING when an IMG, ICO, etc is a REMOTE link\n" );
   prt( " -showlinks - Show the links for each file ...\n" );
   prt( " -showscripts - Show SCRIPT files ...\n" );
   prt( " -writeips - Write HREF of IP found to a file ...\n" );
   prt( " -refreships - If -checkips, and -writeips, re-write NEW check file...\n" );
   prt( " -ipfile out-file - Set HREF output file. Default is $ipfile.\n" );
   prt( " -ignore in-file - Ignore this file. Repeat for more. use '.none. to reset list.\n" );
   prt( " -shownohrefs - Show when NO HREF found in a file.\n" );
   prt( " -v - Set verbal on.\n" );
   prt( "If an input-folder given, then no trace of internal links will be done.\n" );
   prt( "If a home file name is given, the folder used will be of that file.\n" );
   prt( "Following are the current default settings ...\n" );
   $verbal = 1;
   show_startup();
   mydie("                                                      Happy link checking ;=))\n");
}
# Ensure argument exists, or die.
sub require_arg {
    my ($arg, @arglist) = @_;
    mydie( "ERROR: no argument given for option '$arg' ...\n" ) if ! @arglist;
}
##########################################################
# Parse USER input
# Largerly still to be done
##########################################################
sub parse_args {
   my (@av) = @_;
   while (@av) {
      my $arg = $av[0];
      if (substr($arg,0,1) eq '-') {
         if (($arg eq '-h')||($arg eq '--help')||($arg eq '-?')) {
            show_help();
         } elsif ($arg eq '-v') {
            $verbal = 1;
            prt( "Set verbal ON.\n" );
         } elsif ($arg eq '-checkips') {
            $chkip = 1;
            prt( "Will check IP of REMOTE HREF items.\n" );
         } elsif ($arg eq '-showhreflinks') {
            $showhreflinks = 1;
            prt( "Show a WARNING when an IMG, ICO, etc is a REMOTE link\n" );
         } elsif ($arg eq '-showlinks') {
            $showlinks = 1;
            prt( "Show the links for each file ...\n" );
         } elsif ($arg eq '-showscripts') {
            $showscripts = 1;
            prt( "Show SCRIPT files ...\n" );
         } elsif ($arg eq '-writeips') {
            $writeips = 1;
            prt( "Write HREF of IP found to a file ...\n" );
         } elsif ($arg eq '-refreships') {
            $refreships = 1;
            prt( "If -checkips, and -writeips, re-write NEW check file...\n" );
         } elsif ($arg eq '-ipfile') {
            require_arg(@av);
            shift @av;
            $ipfile = $av[0];
            prt( "HREF output set to $ipfile ...\n" );
         } elsif ($arg eq '-ignore') {
            require_arg(@av);
            shift @av;
            $arg = $av[0];
            if ($arg eq '.none.') {
               @excludes = ();
               prt( "Reset EXCLUDES array ...\n" );
            } else {
               push(@excludes, $arg);
               prt( "Added file [$arg] to EXCLUDES ...\n" );
            }
         } else {
            mydie( "ERROR: Unknown argument [$arg] ...\n" );
         }
      } else {
         # no leading '-'
         $in_folder = $av[0];
         prt( "Input folder set to [$in_folder]...\n" );
      }
      shift @av;
   }
   # check the INPUT folder
   if ( !( (-d $in_folder) || (-f $in_folder ) ) ) {
      mydie( "ERROR: $in_folder is NOT VALID FOLDER OR FILE NAME\n" );
   }
}
sub secs_2_hhmmss {
   my ($secs) = shift;
   my $rt = '';
   my $mins = int($secs / 60);
   $secs = $secs - ($mins * 60);
   $secs = (int(($secs * 10) + 0.5)) / 10;
   if ($mins > 60) {
      my $hrs = int($mins / 60);
      $mins = $mins - ($hrs * 60);
      $mins = '0'.$mins if ($mins < 10);
      $secs = '0'.$secs if ($secs < 10);
      $rt = "$hrs:$mins:$secs";
   } else {
      $mins = '0'.$mins if ($mins < 10);
      $secs = '0'.$secs if ($secs < 10);
      $rt = "$mins:$secs";
   }
   return $rt;
}
sub show_time {
   my ($totcnt, $lncnt, $bgntime, $msg) = @_;
   my ($currtime, $difftime, $persec, $remains, $remsecs, $tenths, $remtm, $elapsed);
   $currtime = time();
   $difftime = $currtime - $bgntime;
   $persec = $lncnt / $difftime;
   $remains = $totcnt - $lncnt;
   $remsecs = $remains / $persec;
   $tenths = (int(($persec * 100) + 0.05)) / 100;
   $remtm = secs_2_hhmmss($remsecs);
   $elapsed = secs_2_hhmmss($difftime);
   prt( "$elapsed Done $lncnt, at $tenths/sec, left $remains in $remtm - $msg\n" );
}
# eof - chklinks02.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional