chkhlinks.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:23 2010 from chkhlinks.pl 2007/06/01 8.6 KB.

#!/perl -w
# NAME: chkhlinks.pl
# AIM: Given a input FOLDER, check all the HTML found for a <a href="...."
# reference and make sure that reference EXISTS,
# either as a LOCAL file,
# or that an IP address can be obtained for the HOST if http://<something> ...
# 31/05/2007 - geoff mclane - geoffair.com/mperl/index.htm
use strict;
use warnings;
use File::Basename;
use Socket;
unshift(@INC, 'C:/GTools/perl');
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n";
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
   my @tmpsp = split(/\\/,$pgmname);
   $pgmname = $tmpsp[-1];
}
my $outfile = "temp.$pgmname.txt";
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );
my $recurse = 1;   # recursive
my $ignfpd = 1;      # ignore FRONTPAGE folders
my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived );
my $in_folder = "C:\\HOMEPAGE\\GeoffAir";
my @in_files = ();
my $cnt = 0;
my $file = '';
my $warnings = '';
my @httprefs = ();
my @httpsrefs = ();
my @ftprefs = ();
my @mtrefs = ();
my $hcnt = 0;
my $href = '';
my %hrefs = ();
my $val = '';
my $msg = '';
my @scripts = ();
my $scnt = 0;
# debug only bits
my $dbg1 = 0;   # show entering folder ...
my $dbg2 = 0;   # show ALL HREF entries ...
my $dbg3 = 0;   # show IP found ...
my $dbg4 = 0;   # show entered/exit script
my $dbg5 = 0;   # show 'ok' when found
my $dbg6 = 0;   # show processing lines
my $dbg7 = 0;   # show anchor count
my $dbg8 = 0;   # show unique anchor href
my $dbg9 = 0;   # show files with SCRIPTS
my $dbg10 = 0;   # show diag for get_img_srcs() ...
my $dbg11 = 1;   # show FULL filename for missing IP ...
parse_args(@ARGV);
process_folder( $in_folder );
$cnt = scalar @in_files;
prt( "Found $cnt HTML files to process ...\n" );
foreach $file (@in_files) {
   my ($nm,$dir) = fileparse($file);
   if (open INF, "<$file") {
      my @lines = <INF>;
      close INF;
      @lines = dropcomments_from_array(@lines);
      my @srcs = get_href_srcs($file, @lines);
      my $scnt = scalar @srcs;
      if ($scnt) {
         prt( "Found $scnt anchor href= in $nm ...\n" ) if ($dbg7);
         for (my $i = 0; $i < $scnt; $i++) {
            my $src = $srcs[$i][0];
            my $lnnos = $srcs[$i][1];
            if ($src =~ /^http:/i) {
               # remote HREF
               push(@httprefs, [$src, $file, $lnnos] );
            } elsif ($src =~ /^https:/i) {
               # remote HREF
               push(@httpsrefs, [$src, $file, $lnnos] );
            } elsif ($src =~ /^ftp:/i) {
               # remote HREF
               push(@ftprefs, [$src, $file, $lnnos] );
            } elsif ($src =~ /^mailto:/i) {
               # remote HREF
               push(@mtrefs, [$src, $file, $lnnos] );
            } elsif ( $src =~ /^#/ ) {
               # local in page HREF
            } elsif ( $src =~ /^javascript:/i ) {
               # a JAVASCRIPT HREF
            } else {
               my $ind = index($src,'#');
               if ( $ind != -1 ) {
                  $src = substr($src,0,$ind);
               }
               $ind = index($src,'?');
               if ( $ind != -1 ) {
                  $src = substr($src,0,$ind);
               }
               my $ff = $dir.$src;
               if ( -f $ff ) {
                  prt( "$src - ok\n" ) if ($dbg5);
               } else {
                  my $msg = "WARNING: [$src] NOT FOUND! in [$file]$lnnos";
                  $warnings .= "\n" if length($warnings);
                  $warnings .= $msg;
                  prt( "$msg\n" );
               }
            }
         }
      } else {
         prt( "Found NO HREFs in $nm ...\n" );
      }
   }
}
$hcnt = scalar @httprefs;
if ($hcnt) {
   prt( "Found $hcnt HREF entries ..." );
   for (my $i = 0; $i < $hcnt; $i++) {
      $href = $httprefs[$i][0];
      $file = $httprefs[$i][1];
      my ($nm,$dir) = fileparse($file);
      if (defined( $hrefs{$href} )) {
         $val = $hrefs{$href};
         if ($dbg11) {
            $val .= ' '.$file;
         } else {
            $val .= ' '.$nm;
         }
         $hrefs{$href} = $val;
      } else {
         if ($dbg11) {
            $val = $file;
         } else {
            $val = $nm;
         }
         $hrefs{$href} = $val;
      }
      prt( "$href in [$file]\n" ) if ($dbg2);
   }
   $hcnt = scalar keys(%hrefs);
   prt( "Found $hcnt different entries ..." );
   foreach my $key (keys %hrefs) {
      $val = $hrefs{$key};
      prt( "$key in $val\n" ) if ($dbg8);
      if ($key =~ /^http:\/\//i) {
         my $hkey = substr($key, 7);
         my @arr = split( /\//, $hkey );
         $hkey = $arr[0];
         if (showIPAddress( $hkey ) == 0) {
            $msg = "FAILED: NO IP FOR HOST [$hkey][$val]";
            $warnings .= "\n" if length($warnings);
            $warnings .= $msg;
            prt( "$msg\n" );
         }
      }
   }
}
$scnt = scalar @scripts;
if ($scnt && $dbg9) {
   prt( "Got $scnt files containing SCRIPTS ...\n" );
   # push(@scripts, [$fil, $lns]);
   for (my $i = 0; $i < $scnt; $i++) {
      $file = $scripts[$i][0];
      $val = $scripts[$i][1];
      prt( "$file $val\n" );
   }
}
if (length($warnings)) {
   prt( "\nWARNINGS FOLLOW:\n$warnings\n" );
} else {
   prt( "No warnings ...\n" );
}
close_log($outfile,1);
exit(0);
##################################
sub showIPAddress {
   my ($nm) = shift;
   my @addr = gethostbyname($nm);
   my $cnt = 0;
   if( !@addr ) {
      prt( "Can't resolve $nm: $!\n" );
      return 0;
   }
   @addr = map { inet_ntoa($_) } @addr[4 .. $#addr];
   foreach my $k (@addr) {
      $cnt++;
      prt( "$cnt: $nm resolves to IP [$k]\n" ) if ($dbg3);
   }
   return $cnt;
}
sub add_2_scripts {
   my ($fil, $lns) = @_;
   my $sc = scalar @scripts;
   for (my $i = 0; $i < $sc; $i++) {
      my $cf = $scripts[$i][0];
      if ($cf eq $fil) {
         my $lc = $scripts[$i][1];
         $lc .= ":$lns";
         $scripts[$i][1] = $lc;
         return 0;
      }
   }
   push(@scripts, [$fil, $lns]);
   return 1;
}
sub get_href_srcs {
   my ($fil, @lns) = @_;
   my $lc = scalar @lns;
   my $scnt = 0;
   my $slns = 0;   # count the SCRIPT lines
   my ($nm,$dir) = fileparse( $fil );
   prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg6);
   my @isrc = ();
   my $ln = '';
   my $bal = '';
   my $inscript = 0;
   $slns = 0;
   my $bgnln = 0;
   my $endln = 0;
   for (my $i = 0; $i < $lc; $i++) {
      $ln = $bal;
      $bal = '';
      $ln .= $lns[$i];
      chomp $ln;
      prt( "$i [$ln] ...\n" ) if ($dbg10);
      if ($inscript) {
         if ($ln =~ /<\/script>/i) {
            $inscript =0;
            prt( "EXIT a SCRIPT ...\n" ) if ($dbg4);
            add_2_scripts( $fil, $slns );
            $slns = 0;
            next;
         }
         $slns++;
         next;
      }
      if ( $ln =~ /<a\s+(.*)/i ) {
         my $iln = $1;
         prt( "Found [$iln] ...\n" ) if ($dbg10);
         $bgnln = $i;
         while ( !($iln =~ />/) && ($i < $lc)) {
            $i++;
            my $nxln = $lns[$i];
            chomp $nxln;
            prt( "Adding [$nxln] ...\n" ) if ($dbg10);
            $iln .= ' '.$nxln;
         }
         $endln = $i;
         my $ind = index($iln, '>');
         if ($ind != -1) {
            $bal = substr($iln, $ind+1);
            $iln = substr($iln, 0, $ind+1);
         }
         #if ($iln =~ /src=\"(.+)\"/i) {
         if ($iln =~ /href=\s*\"(\S+)\"/i) {
            prt( "HREF = $1\nIn line [$iln]...\n" ) if ($dbg10);
            push(@isrc, [$1, "$bgnln:$endln"] );
            $scnt++;
         } else {
            if ( $iln =~ /name=\s*\"(\S+)\"/i ) {
               # ignore BOOKMARKS
            } else {
               $msg = "WARNING: HREF NOT FOUND in [$iln]...";
               $warnings .= "\n" if length($warnings);
               $warnings .= $msg;
               prt( "$msg\n" );
            }
         }
      } elsif ( $ln =~ /<script.*>/i ) {
         $inscript = 1;
         prt( "Entered a SCRIPT ...\n" ) if ($dbg4);
         $slns = 0;
         $ln = substr($ln, 7);
         if ($ln =~ /<\/script>/i) {
            $inscript =0;
            prt( "EXIT a SCRIPT ...\n" ) if ($dbg4);
            add_2_scripts( $fil, 1 );
            $slns = 0;
         }
      }
   }
   if ($inscript) {
      $msg = "WARNING: EXIT WHILE IN SCRIPT in [$fil]...";
      $warnings .= "\n" if length($warnings);
      $warnings .= $msg;
      prt( "$msg\n" );
   }
   prt( "Returning $scnt HREF sources ...\n") if ($dbg10);
   return @isrc;
}
sub parse_args {
   my (@av) = @_;
   while (@av) {
      $in_folder = $av[0];
      shift @av;
   }
}
sub is_my_ext {
   my ($fil) = shift;
   my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
   if ((lc($ext) eq ".htm")||(lc($ext) eq ".html")) {
      return 1;
   }
   return 0;
}
# my $ignfpd = 1;   # ignore FRONTPAGE folders
sub is_fp_folder {
   my ($inf) = shift;
   foreach my $fil (@fpfolders) {
      if (lc($inf) eq lc($fil)) {
         return 1;
      }
   }
   return 0;
}
sub process_folder {
   my ($inf) = shift;
   my $fcnt = 0;
   prt( "Processing $inf folder ...\n" ) if ($dbg1);
   if ( opendir( DIR, $inf ) ) {
      my @files = readdir(DIR);
      closedir DIR;
      foreach my $fil (@files) {
         if (($fil eq ".")||($fil eq "..")) {
            next;
         }
         my $ff = $inf."\\".$fil;
         if ( -d $ff ) {
            if ($recurse) {
               if ($ignfpd && is_fp_folder($fil)) {   # ignore FRONTPAGE folders
                  next;
               }
               process_folder( $ff );
            }
         } else {
            if (is_my_ext($fil)) {
               push(@in_files, $ff);
               $fcnt++;
            }
         }
      }
      prt( "Processed $inf folder finding $fcnt HTML files ...\n" );
   } else {
      prt( "ERROR: Failed to open folder $inf ...\n" );
   }
}
# eof - chkhlinks.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional