Generated: Tue Feb 2 17:54:24 2010 from chklinks02.pl 2008/07/02 50.1 KB.
#!/perl -w # NAME: chklinks02.pl # AIM: Given a input FOLDER, check all the HTML found for a <a href="...." # reference and make sure that reference EXISTS, # either as a LOCAL file, # or that an IP address can be obtained for the HOST if http://<something> ... # AND check ALL image links <img src="..."...>, if it is a LOCAL file, # and other 'link' items, like .zip, .txt, etc. # 18/06/2007 - Add some command parameters, and help # 02/06/2007 - geoff mclane - geoffair.com/mperl/index.htm use strict; use warnings; use File::Basename; use Socket; unshift(@INC, 'C:/GTools/perl'); require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n"; # for htmltools, if functions used my @imgs = (); my @hrefs = (); # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = "temp.$pgmname.txt"; open_log($outfile); prt( "$pgmname ... Hello, World ...\n" ); # SET A DEFAULT INPUT FOLDER / FILE ###my $in_folder = "C:\\HOMEPAGE\\HOM\\test4\\index.htm"; ###my @splexcludes = qw( ok includes ); ###my $in_folder = "C:\\HOMEPAGE\\GA\\macpc\\index.htm"; ###my $in_folder = "C:\\HOMEPAGE\\Max7\\anew\\index.html"; ###my $in_folder = "C:\\HOMEPAGE\\simple\\index.htm"; ###my $in_folder = "C:\\HOMEPAGE\\GA\\index.html"; my $in_folder = "C:\\HOMEPAGE\\GA\\travel\\maroc\\index.htm"; my @splexcludes = qw( macpc ); ###my $in_folder = "C:\\HOMEPAGE\\GA\\flags\\index.htm"; ###my $in_folder = "C:\\HOMEPAGE\\GeoffAir\\welcome.html"; # some FEATURES and USER variables # my @excludes = qw( cvineng2.htm ); my @excludes = qw( desktop.ini php.ini blank.html blank.htm ); my $recurse = 1; # recursive my $ignfpd = 1; # ignore FRONTPAGE folders my $chkip = 1; # check the IP address my $showhreflinks = 0; # show a WARNING when an IMG, ICO, etc is a REMOTE link my $showlinks = 0; # show the links for each file my $showscripts = 0; # show SCRIPT files my $writeips = 1; # write IP found to a file my $refreships = 0; # if $chkip, and $writeips, re-write NEW check file my $shownohrefs = 0; # show when NO HREF found in file my $ipfile = "iplinks.txt"; my @ipsfound = (); my $verbal = 0; my @html_ext = qw( .htm .html .shtml .php ); my @graf_ext = qw( .jpg .jpeg .gif .png .bmp .ico .mpg ); my @css_ext = qw( .css ); my @script_ext = qw( .js .class .cgi ); my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived ); my @excused = ( '?dir=test', '?dir=.' ); # program variables # NOTE: Each of these is a multidimensional array - see offset below my @htm_files = (); # store files found in folder my @img_files = (); my @css_files = (); my @zip_files = (); my @txt_files = (); my @script_files = (); my @other_files = (); # offsets in above arrays my $of_ff = 0; # full file name my $of_hr = 1; # array ref of href links my $of_im = 2; # array ref of image links my $of_lk = 3; # linked count my $of_sp = 4; # spare my $of_to = 5; # links TO my $of_fm = 6; # links FROM my @donesrcs = (); my @doneimgs = (); my %ext_hash = (); my $cnt = 0; my $file = ''; my @warnings = (); # list of errors, warnings during running my @httprefs = (); # set of HREF src values my @httpsrefs = (); my @ftprefs = (); my @mtrefs = (); my $hcnt = 0; my $href = ''; my %hrefs = (); my $val = ''; my $msg = ''; my @scripts = (); my $scnt = 0; my $imgcnt = 0; my $procnt = 0; my $homefile = ''; my $total_hrefs = 0; my $total_imgs = 0; my @missed = (); my $excusecnt = 0; my $hrflnkcnt = 0; # $showhreflinks my $homeoffset = -1; my @offsdone = (); my @htmlinks = (); # debug only bits my $dbg1 = 0; # show entering folder ... my $dbg2 = 0; # show ALL HREF entries ... my $dbg3 = 0; # show IP found ... my $dbg4 = 0; # show entered/exit script my $dbg5 = 0; # show 'ok' when found my $dbg6 = 0; # show processing lines my $dbg7 = 0; # show anchor count my $dbg8 = 0; # show unique anchor href my $dbg9 = 0; # show files with SCRIPTS my $dbg10 = 0; # show diag for get_img_srcs() ... my $dbg11 = 0; # in image processing show entered/exits script my $dbg12 = 0; # in image processing show processing count my $dbg13 = 0; # in image processing show ok - found file my $dbg14 = 0; # in image processing show image count found my $dbg15 = 0; # in image processing show image count when NONE found my $dbg16 = 0; # show WARNINGS during run ... my $dbg17 = 0; # show MISSING or BLANK HREF in PHP file my $dbg18 = 0; # check_linkages: show 'ok', in 2nd link check my $dbg19 = 0; # check_local_links: show progress ... my $dbg20 = 0; # check_local_links: show ALL link COUNTS - NONE IS ALWAYS SHOWN ... my $dbg21 = 0; # check_local_links: show LINK when found ... my $dbg22 = 0; # mark_image_link: show comparing, and comparision ... my $dbg23 = 0; # mark_image_link: show count of new images marked ... my $dbg24 = 0; # show each image file being marked my $dbg25 = 0; # show NO LINK FOUND my $dbg26 = 0; # show EACH HTML FILE BEING PROCESSED my $dbg27 = 0; # show EACH extesnions, and counts my $dbg28 = 0; # show image links information ... my $dbg29 = 0; # show ZIP, TXT, CSS, SCRIPT and OTHER file links information ... my $dbg30 = 0; # show HTML HREF links information ... my $dbg31 = 0; # like $dbg20 - check_local_links: show ALL links - NONE IS ALWAYS SHOWN ... my $dbg32 = 0; # show missing during processing my $dbg33 = 0; # show HAS NO LINKS during processing parse_args(@ARGV); if (length($in_folder) == 0) { mydie( "No input folder (or file) given/found in command ...\n" ); } if (-f $in_folder) { ($homefile, $in_folder) = fileparse($in_folder); $in_folder =~ s/[\\\/]$//; } show_startup(); process_folder( $in_folder ); show_found_counts(); process_file_array(); process_host_array(); if (length($homefile)) { ###trace_from_htm( $homefile, 0 ); check_linkages( $homefile ); check_local_links( $homefile ); show_link_counts("HTML File", \@htm_files); show_link_counts("IMG Files", \@img_files); show_link_counts("CSS Files", \@css_files); show_link_counts("ZIP Files", \@zip_files); show_link_counts("TXT Files", \@txt_files); show_link_counts("Script Files", \@script_files); show_link_counts("Other Files", \@other_files); } $scnt = scalar @scripts; if ($scnt && ($dbg9 || $showscripts)) { prt( "Got $scnt files containing SCRIPTS ...\n" ); # push(@scripts, [$fil, $lns]); for (my $i = 0; $i < $scnt; $i++) { $file = $scripts[$i][0]; $val = $scripts[$i][1]; prt( "$file $val\n" ); } } ############################################################## prt( "\n###### SHOW RESULTS ########\n" ); prt( "WARNING: $hrflnkcnt images by HREF not shown! (change \$showhreflinks)\n" ) if ($hrflnkcnt); $cnt = scalar @warnings; if ($cnt) { prt( "\nWARNINGS FOLLOW ($cnt):\n" ); foreach my $w (@warnings) { prt( "$w\n" ); } } else { prt( "No warnings ...\n" ); } if (@missed) { prt( "\nMISSING FOLLOW: ".scalar @missed."\n" ); foreach $file (@missed) { prt( "$file\n"); } } prt( "###### END RESULTS ########\n" ); ############################################################## close_log($outfile,1); exit(0); ################################## sub process_file_array { my $max = scalar @htm_files; my $bgntime = time(); my $msg = ''; for (my $i = 0; $i < $max; $i++) { $file = $htm_files[$i][$of_ff]; my ($nm,$dir,$ext) = fileparse( $file, qr/\.[^.]*/ ); my $htot = 0; my $itot = 0; $procnt++; if (open INF, "<$file") { my @lines = <INF>; close INF; @lines = drop_php_from_array( @lines ) if (lc($ext) eq '.php'); # THIS IS USING htmltool.pl - get a single line of TEXT ... my $txt = join( '', @lines ); # get whole text my @is = ret_imgs_array( $txt ); my $ntxt = remove_script( $txt ); $ntxt = trimblanklines($ntxt); @hrefs = (); # clear my @hr = ret_hrefs_array( $ntxt ); ### collecthrefs( $txt, 0 ); ### collectimgs( $txt, 0 ); # bump the counts of HREF and IMGS collected $itot += scalar @is; $htot += scalar @hr; # store the references ... that is a reference to an array $htm_files[$i][$of_hr] = \@hr; $htm_files[$i][$of_im] = \@is; # new code to do some similar things, but while in the array ###@lines = drop_php_from_array( @lines ) if (lc($ext) eq '.php'); @lines = dropcomments_from_array(@lines); my @srcs = get_img_srcs($file, @lines); $imgcnt += check_images( $file, @srcs ); @srcs = get_href_srcs($file, @lines); check_hrefs( $file, @srcs ); } $total_hrefs += $htot; $total_imgs += $itot; if ((($procnt % 100) == 0)||($max < 10)) { ###local $| = 1; ###prt( "\rDone $procnt HTML files ..." ); if ($max < 10) { prt( "Done $file HTML file ... href,other($htot,$itot)\n" ); } else { $msg = "href,other ($total_hrefs,$total_imgs)"; show_time( $max, $procnt, $bgntime, $msg ); #prt( "Done $procnt HTML files ... href,other ($total_hrefs,$total_imgs)\n" ); } } } prt( "Completed $procnt HTML files ... Found $total_hrefs HREF, and $total_imgs IMG/OTHER tokens.\n" ); } sub get_href_type { my ($src) = shift; if ($src =~ /^http:/i) { #push(@httprefs, [$src, $fil, $lnnos] ); return 1; # remote HREF } elsif ($src =~ /^https:/i) { return 1; # remote HREF #push(@httpsrefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^ftp:/i) { #push(@ftprefs, [$src, $fil, $lnnos] ); return 3; # remote HREF } elsif ($src =~ /^mailto:/i) { #push(@mtrefs, [$src, $fil, $lnnos] ); return 4; # remote HREF } elsif ( $src =~ /^javascript:/i ) { return 5; # a JAVASCRIPT HREF } elsif ($src =~ /^file:/i) { return 5; # remote HREF } elsif ( substr($src,0,1) eq '#') { # local in page HREF return 6; } else { my $ind = index($src,'#'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $ind = index($src,'?'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $src =~ s/\/$//; if (length($src)) { return 7; } } return 0; } sub get_local_href { my ($src) = shift; my $ind = index($src,'#'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $ind = index($src,'?'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $src =~ s/\/$//; # remove any TRAILING '/' char # 25/07/2007 - also 'convert' '%20' to space $src =~ s/%20/ /g; return $src; } sub dos_2_unix($) { my ($du) = shift; $du =~ s/\\/\//g; return $du; } ### my @donesrcs = (); sub in_done_srcs { my ($f) = shift; foreach my $fd (@donesrcs) { if ($fd eq $f) { return 1; } } return 0; } sub in_done_imgs { my ($f) = shift; foreach my $fd (@doneimgs) { if ($fd eq $f) { return 1; } } return 0; } sub fix_rel_unix_path { my ($path) = shift; $path = dos_2_unix($path); my @a = split(/\//, $path); my $npath = ''; my $max = scalar @a; my @na = (); for (my $i = 0; $i < $max; $i++) { my $p = $a[$i]; if ($p eq '.') { # ignore this } elsif ($p eq '..') { if (@na) { pop @na; # discard previous } else { prt( "WARNING: Got relative .. without previous!!!\n" ); } } else { push(@na,$p); } } foreach my $pt (@na) { $npath .= "/" if length($npath); $npath .= $pt; } return $npath; } sub add_new_link { my ($nlnk, $lnks) = @_; my @arr = split(',', $lnks); foreach my $lk (@arr) { if ($lk eq $nlnk) { return 0; } } return 1; } sub mark_image_link { my ($fmfil, $fnd, $src, $lev) = @_; my $fcnt = scalar @img_files; my $msrc = lc(dos_2_unix($src)); my $lnks = ''; prt( "Seeking [$msrc] in $fcnt images files ...\n" ) if ($dbg22); for (my $i = 0; $i < $fcnt; $i++) { my $fil = $img_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); prt( "Comparing to $mfil ...\n" ) if ($dbg22); if ($msrc eq $mfil) { $val = $img_files[$i][$of_lk]; $val++; $img_files[$i][$of_lk] = $val; # add image file linked to from what file $lnks = $img_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } $img_files[$i][$of_fm] = $lnks; prt( "IMG src in $fmfil, of $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg28); return 0; } } prt( "$src - NOT FOUND![1]\n" ) if ($dbg21); return 1; } sub mark_other_links { my ($fmfil, $fnd, $src, $lev) = @_; my $totcnt = 0; my $msrc = lc(dos_2_unix($src)); my $fcnt = scalar @img_files; my $i = 0; my $lnks = ''; $totcnt += $fcnt; if (mark_image_link( $fmfil, $fnd, $src, $lev ) == 0) { return 0; } # maybe ZIP files $fcnt = scalar @zip_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $zip_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $zip_files[$i][$of_lk]; $val++; $zip_files[$i][$of_lk] = $val; # add zip file linked to from what file $lnks = $zip_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } $zip_files[$i][$of_fm] = $lnks; prt( "ZIP link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29); return 0; } } # maybe TXT files $fcnt = scalar @txt_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $txt_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $txt_files[$i][$of_lk]; $val++; $txt_files[$i][$of_lk] = $val; # add txt file linked to from what file $lnks = $txt_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } $txt_files[$i][$of_fm] = $lnks; prt( "TXT link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29); return 0; } } # maybe CSS files $fcnt = scalar @css_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $css_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $css_files[$i][$of_lk]; $val++; $css_files[$i][$of_lk] = $val; # add txt file linked to from what file $lnks = $css_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } $css_files[$i][$of_fm] = $lnks; prt( "CSS link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29); return 0; } } # maybe SCRIPT files $fcnt = scalar @script_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $script_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $script_files[$i][$of_lk]; $val++; $script_files[$i][$of_lk] = $val; # add script file linked to from what file $lnks = $script_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } $script_files[$i][$of_fm] = $lnks; prt( "SCRIPT link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29); return 0; } } # OK, OTHER $fcnt = scalar @other_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $other_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $other_files[$i][$of_lk]; $val++; $other_files[$i][$of_lk] = $val; # add script file linked to from what file $lnks = $other_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } $other_files[$i][$of_fm] = $lnks; prt( "OTHER link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29); return 0; } } $totcnt += 1 if ($totcnt == 0); return $totcnt; } ####################################################################### # mark a link # parameters: # LINK IS IN FILE - $fmfil # Offset in $htm_files[$fnd] # Link item source - $src sub mark_link { my ($fmfil, $fnd, $src, $lev) = @_; my $fcnt = scalar @htm_files; my $msrc = lc(dos_2_unix($src)); my $ff = $htm_files[$fnd][$of_ff]; my $i = 0; my $totcnt = $fcnt; my $fil = ''; my $mfil = ''; my $val = 0; my $lnks = ''; for ($i = 0; $i < $fcnt; $i++) { if ($i != $fnd) { $fil = $htm_files[$i][$of_ff]; $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $htm_files[$i][$of_lk]; $val++; $htm_files[$i][$of_lk] = $val; # add HTML file linked to from what file $lnks = $htm_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } $htm_files[$i][$of_fm] = $lnks; prt( "HTML link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg30); ###prt( "$ff ($fnd) linked to $fil ($i) $val\n" ) if ($dbg21); my $hr = $htm_files[$i][$of_hr]; # extract HREF ref.array my $im = $htm_files[$i][$of_im]; # extract IMAGE ref.array my $hrc = scalar @{$hr}; my $imc = scalar @{$im}; # get count of images my $j = 0; my ($itmnam, $itmdir) = fileparse($fil); # get name and path prt( "$lev [$fil] has $hrc href, $imc img/other links ..\n" ) if ($dbg19 || $dbg26); for ($j = 0; $j < $hrc; $j++) { my $hrf = ${$hr}[$j]; my $hrt = get_href_type($hrf); if ($hrt == 7) { my $nsrc = fix_rel_unix_path($itmdir.get_local_href($hrf)); if ( !in_done_srcs($nsrc) ) { push(@donesrcs, $nsrc); # put it in DONE list mark_link( $fil, $i, $nsrc, $lev + 1 ); # and MARK its links now } } } $val = 0; prt( "$fil - Checking $imc images files ...\n") if ($dbg24); for ($j = 0; $j < $imc; $j++) { # do each, in this linked file my $img = ${$im}[$j]; # get the image string my $isrc = $itmdir.$img; # join it with the path my $nisrc = fix_rel_unix_path($isrc); # fix rel, and force unix path prt( "Marking [$nisrc] - ".($j+1)." of $imc img/other links ..\n" ) if ($dbg19 || $dbg26); if ( !in_done_imgs($nisrc) ) { push(@doneimgs, $nisrc); # put it in DONE list mark_other_links( $fil, $j, $nisrc, 0 ); # and MARK the link in @img_files $val++; } else { prt( "Already IN doneimgs ...\n" ) if ($dbg19 || $dbg26); } } prt( "$fil - Marked $val of $imc images files ...\n") if ($val && $dbg24); return 0; } } } # hmmmm, LINK not found in HREF files, maybe IMAGES, zip, etc ... $val = mark_other_links( $fmfil, $fnd, $src, $lev ); if ($val) { $totcnt += $val; prt( "NO LINK FOUND HREF [$src]($msrc) in $totcnt file - $ff ($fnd) - ($lev)!\n" ) if ($dbg25); return 1; } return 0; } ########################################################################### # show link count, and links, in passed multidimensional file array # # If showlinks (or $dbg20 or $dbg31) is ON, shows internal LINKS # NOTE: Presently DOES NOT get all LINKS??? BAH!!! ########################################################################### sub show_link_counts { my ($m, $hf) = @_; my $fcnt = scalar @{$hf}; my $mcnt = 0; my $mss = "Checking LINKS for $fcnt $m files ...\n"; if ($fcnt) { for (my $i = 0; $i < $fcnt; $i++) { my $fil = ${$hf}[$i][$of_ff]; my $hrt = ${$hf}[$i][$of_lk]; if ($hrt) { if ($dbg20 || $dbg31 || $showlinks) { prt( $mss ) if (length($mss)); $mss = ''; if ($dbg31 || $showlinks) { my $lnks = ${$hf}[$i][$of_fm]; prt( "$i: $fil has $hrt links [$lnks]\n" ); } else { prt( "$i: $fil has $hrt links\n" ); } } } else { prt( $mss ) if (length($mss)); $mss = "WARNING: $fil($i) HAS NO LINKS!"; prt( "$mss\n" ) if ($dbg33); push(@warnings, $mss); $mss = ''; $mcnt++; } } if ($mcnt) { prt( "Done LINKS for $fcnt $m files ... MISSED $mcnt!!!\n" ); } } else { prt( "There are NO $m files ...\n" ) if ($dbg20); } } sub in_excused { my ($tx) = shift; foreach my $t (@excused) { if ($t eq $tx) { return 1; } } return 0; } # using the given HOME PAGE, try to TRACE ALL LINKS sub check_local_links { my ($hf) = shift; my $fcnt = scalar @htm_files; my ($hfnm,$hfdir) = fileparse($hf); my $lchf = lc($hfnm); my $fnd = get_home_offset($hf); my ($fil,$nm,$dir,$ext); if ($hfdir eq ".\\") { $hfdir = $in_folder."\\"; } my $itmdir = ''; my $itmnam = ''; my $i = 0; my $i2 = 0; prt( "Checking local links, for $fcnt files, from $hf ...\n"); if ($fnd == -1) { prt( "WARNING: check_local_links: Unable to find [$hf] ...\n" ); return 1; } # process item 1 ... $procnt = 1; my $hr = $htm_files[$fnd][$of_hr]; my $im = $htm_files[$fnd][$of_im]; my $hrc = scalar @{$hr}; my $imc = scalar @{$im}; my $hrf = ''; my $img = ''; my $hrt = 0; my $src = ''; my $nsrc = ''; my $ff = ''; my $shwerr = 0; my $emsg = ''; $fil = $htm_files[$fnd][$of_ff]; # extract FULL PATH name of file ... $htm_files[$fnd][$of_lk] = 1; ($itmnam, $itmdir) = fileparse($fil); # get name and path prt( "HOME [$fil] has $hrc href, $imc img/other links ..\n" ) if ($dbg19 || $dbg26); for ($i = 0; $i < $hrc; $i++) { $hrf = ${$hr}[$i]; $hrt = get_href_type($hrf); $i2 = $i + 1; $shwerr = 0; $emsg = "IN[$fil] $i2 HREF [$hrf]$hrt"; if ($hrt == 0) { if (in_excused($hrf)) { $excusecnt++; } else { $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[1]"; push(@warnings, "WARNING: $emsg!" ); $shwerr = 1; } } elsif ($hrt < 5) { $emsg .= " REMOTE"; } elsif ($hrt == 5) { $emsg .= " JAVASCRIPT"; } elsif ($hrt == 6) { $emsg .= " IN PAGE"; } else { my $lref = get_local_href($hrf); $src = $itmdir.$lref; my $nusrc = fix_rel_unix_path($src); ### prt( "REL PATH [$src] to UNIX PATH [$nusrc]\n" ); push(@donesrcs, $nusrc); # put it in DONE list # mark link - FROM $fil if ( mark_link( $fil, $fnd, $nusrc, 0 ) ) { $emsg .= " SITE REF [$nusrc] ***NO IN-SITE LINK***???"; $msg = "$i2 [$fil] HREF [$hrf]$hrt SITE REF [$nusrc] ***NO IN-SITE LINK***???"; if (-f $src) { $msg .= "\n*** BUT FILE EXISTS [$src] ***"; $emsg .= "\n*** BUT FILE EXISTS [$src] ***"; push(@warnings, "WARNING: Local HREF [$lref] in [$fil] OUTSIDE WEB! but EXISTS!" ); } else { push(@missed, $msg ); $shwerr = 1; } } else { $emsg .= " SITE REF [$src] ok" if ($dbg19); } } prt( "$emsg\n" ) if ($dbg19 || $shwerr); } prt( "HOME - Marking $imc images files ...\n") if ($dbg24); for ($i = 0; $i < $imc; $i++) { $img = ${$im}[$i]; $src = $itmdir.$img; $nsrc = fix_rel_unix_path($src); prt( "HOME $fil - Mark $src ($nsrc) image ...\n" ) if ($dbg24); push(@doneimgs, $nsrc); # put it in DONE list mark_other_links( $fil, $i, $nsrc, 0 ); } return 0; } sub offset_done { my ($off, @done) = @_; foreach my $num (@done) { if ($off == $num) { return 1; } } return 0; } sub trace_from_htm { my ($hf, $lev) = @_; my $fnd = get_offset_of_htm($hf); my $msg = ''; if (($fnd != -1) && !offset_done($fnd,@offsdone)) { push(@offsdone,$fnd); my $hr = $htm_files[$fnd][$of_hr]; my $hrc = scalar @{$hr}; my @offsets = (); my($itmnam, $itmdir) = fileparse($hf); # get name and path for (my $i = 0; $i < $hrc; $i++) { my $hrf = ${$hr}[$i]; my $hrt = get_href_type($hrf); if ($hrt == 7) { #my $src = fix_rel_unix_path($itmdir.get_local_href($hrf)); my $src = $itmdir.get_local_href($hrf); push(@offsets,$src); trace_from_htm($src, ($lev + 1)); } } $hrc = scalar @offsets; my $cnt = $lev; $msg = sprintf("%4d ", $lev); prt( $msg ); while($cnt) { prt( ' ' ); $msg .= ' '; $cnt--; } prt( "$hf links to $hrc files ...\n" ); $msg .= "$hf links to $hrc files ..."; push(@htmlinks, [$lev, $msg]); foreach my $fil (@offsets) { $cnt = $lev; $msg = sprintf("%4d ", $lev); prt( $msg ); while($cnt) { prt( ' ' ); $msg .= ' '; $cnt--; } prt( "$fil\n" ); $msg .= $fil; push(@htmlinks, [$lev, $msg]); } } } sub get_offset_of_htm { my ($hf) = shift; my $fcnt = scalar @htm_files; my ($hfnm,$hfdir) = fileparse($hf); if ($hfdir eq ".\\") { $hfdir = $in_folder."\\"; } my $lchf = lc($hfnm); my $fnd = -1; for (my $i = 0; $i < $fcnt; $i++) { my $fil = $htm_files[$i][$of_ff]; my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); if (lc($nm.$ext) eq $lchf) { # have at least the NAME, but maybe not the FOLDER if (lc($hfdir) eq lc($dir)) { $fnd = $i; last; } } } return $fnd; } sub get_home_offset { if ($homeoffset != -1) { return $homeoffset; } my ($hf) = shift; prt( "Getting offset of HOME file $hf ...\n"); my $fnd = get_offset_of_htm($hf); if ($fnd == -1) { prt( "WARNING: Unable to find [$hf] ...\n" ); push(@warnings, "WARNING: Unable to find [$hf] ..."); } else { prt( "Found $hf at index $fnd\n" ); } $homeoffset = $fnd; return $homeoffset; } ############################################################################# # check linkages ############################################################################# sub check_linkages { my ($hf) = shift; my $fcnt = scalar @htm_files; my ($hfnm,$hfdir) = fileparse($hf); my $lchf = lc($hfnm); my $fnd = get_home_offset($hf); my ($fil,$nm,$dir,$ext); if ($hfdir eq ".\\") { $hfdir = $in_folder."\\"; } my $itmdir = ''; my $itmnam = ''; my $i = 0; my $i2 = 0; prt( "Re-checking HREF and IMG/OTHER links, for $fcnt files ...\n"); if ($fnd == -1) { prt( "WARNING: Unable to find [$hf] ...\n" ); push(@warnings, "WARNING: Unable to find [$hf] ..."); return 1; } # process item 1 ... $procnt = 1; my $hr = $htm_files[$fnd][$of_hr]; my $im = $htm_files[$fnd][$of_im]; my $hrc = scalar @{$hr}; my $imc = scalar @{$im}; my $hrf = ''; my $img = ''; my $hrt = 0; my $src = ''; my $ff = ''; my $shwerr = 0; my $emsg = ''; $fil = $htm_files[$fnd][$of_ff]; # extract FULL PATH name of file ... ($itmnam, $itmdir) = fileparse($fil); # get name and path prt( "\n" ) if ($dbg18); prt( "$procnt [$fil] has $hrc href, and $imc image links ..\n" ) if ($dbg18); for ($i = 0; $i < $hrc; $i++) { $hrf = ${$hr}[$i]; $hrt = get_href_type($hrf); $i2 = $i + 1; $shwerr = 0; $emsg = "HH[$fil] "; $emsg .= "$i2 HREF [$hrf]$hrt"; if ($hrt == 0) { if (in_excused($hrf)) { $excusecnt++; } else { $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[2]"; push(@warnings, "WARNING: $emsg" ); $shwerr = 1; } } elsif ($hrt < 5) { $emsg .= " REMOTE"; } elsif ($hrt == 5) { $emsg .= " JAVASCRIPT"; } elsif ($hrt == 6) { $emsg .= " LOCAL"; } else { $src = $itmdir.get_local_href($hrf); if (-f $src) { $emsg .= " SITE REF [$src] ok"; } else { $emsg .= " SITE REF [$src] ***MISSING***?[1]"; push(@missed, $emsg ); $shwerr = 1; } } prt( "$emsg\n" ) if ($dbg18 || $shwerr); } # From this beginning for (my $j = 0; $j < $fcnt; $j++) { $fil = $htm_files[$j][$of_ff]; ($itmnam, $itmdir) = fileparse($fil); # get name and path if ($j != $fnd) { $procnt++; $hr = $htm_files[$j][$of_hr]; $im = $htm_files[$j][$of_im]; $hrc = scalar @{$hr}; $imc = scalar @{$im}; prt( "\n" ) if ($dbg18); prt( "$procnt [$fil] has $hrc href, and $imc image links ..\n" ) if ($dbg18); for ($i = 0; $i < $hrc; $i++) { $hrf = ${$hr}[$i]; $hrt = get_href_type($hrf); $i2 = $i + 1; $shwerr = 0; $emsg = "HF[$fil] "; $emsg .= "$i2 HREF [$hrf]$hrt"; if ($hrt == 0) { if (in_excused($hrf)) { $excusecnt++; } else { $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[3]"; push(@warnings, "WARNING: $emsg"); $shwerr = 1; } } elsif ($hrt < 5) { $emsg .= " REMOTE"; } elsif ($hrt == 5) { $emsg .= " JAVASCRIPT"; } elsif ($hrt == 6) { $emsg .= " LOCAL"; } else { $src = get_local_href($hrf); if ($src eq '.') { if (length($homefile)) { # && ($fdir eq $in_folder) $src = $homefile; # translate a DOT to HOME FILE } } $ff = $itmdir.$src; if (-f $ff) { $emsg .= " SITE REF [$ff] ok"; } else { $emsg .= " SITE REF [$src][$ff] ***MISSING***?[3]"; push(@missed, $emsg ); $shwerr = 1; } } prt( "$emsg\n" ) if ($dbg18 || ($shwerr && $dbg32)); } for (my $i = 0; $i < $imc; $i++) { $img = ${$im}[$i]; $emsg = "IF[$fil] [$img] "; if ($img =~ /^http:\/\/.*/i) { if ($showhreflinks) { push(@warnings, "WARNING: IMG link is HREF $emsg [1]"); } else { $hrflnkcnt++; } } else { # 25/07/2007 - deal with '%20' in text $img =~ s/%20/ /g; $src = $itmdir.$img; $shwerr = 0; if (-f $src) { $emsg .= " IMG ok"; } else { $emsg .= " IMG ***MISSING***?[5]"; push(@missed, $emsg ); $shwerr = 1; } } prt( "$emsg\n" ) if ($dbg18 || ($shwerr && $dbg32)); } } } return 0; } sub check_images { my ($ifile, @srcs) = @_; my ($nm, $dir) = fileparse($ifile); my $scnt = scalar @srcs; if ($scnt) { prt( "Found $scnt imgs in $nm ...\n" ) if ($dbg14); for (my $i = 0; $i < $scnt; $i++) { my $src = $srcs[$i][0]; my $lnn = $srcs[$i][1]; if ($src =~ /^http:\/\//i) { # remote HREF } else { # 25/07/2007 - deal with '%20' to space $src =~ s/%20/ /g; my $ff = $dir.$src; if ( -f $ff ) { prt( "$src - ok\n" ) if ($dbg13); } else { my $msg = "WARNING: [$src] $ifile:$lnn NOT FOUND![2]"; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } } } } else { prt( "Found NO imgs in [$ifile] ...\n" ) if ($dbg15); } return $scnt; } sub get_img_srcs { my ($fil, @lns) = @_; my $lc = scalar @lns; my $scnt = 0; my ($nm,$dir) = fileparse( $fil ); #my $sdbg12 = $dbg12; #my $sdbg11 = $dbg11; #my $sdbg16 = $dbg16; #my $sdbg10 = $dbg10; #if (lc($nm) eq 'moon.htm') { # $dbg12 = 1; # $dbg11 = 1; # $dbg16 = 1; # $dbg10 = 1; #} prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg12); my @isrc = (); my $ln = ''; my $bal = ''; my $inscript = 0; my $msg = ''; my $bgnln = 0; my $lnnos = ''; for (my $i = 0; $i < $lc; $i++) { $ln = $bal; $bal = ''; $ln .= $lns[$i]; chomp $ln; prt( "$i [$ln] ...\n" ) if ($dbg10); if ($inscript) { if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg11); } next; } if ( $ln =~ /<img\s+(.*)/i ) { my $iln = $1; if ( $ln =~ /<script.*>/i ) { $msg = "WARNING: Also found SCRIPT in IMG line ...[$ln]"; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } prt( "Found [$iln] ...\n" ) if ($dbg10); $bgnln = $i; while ( !($iln =~ />/) && ($i < $lc)) { $i++; my $nxln = $lns[$i]; chomp $nxln; prt( "Adding [$nxln] ...\n" ) if ($dbg10); $iln .= ' ' if !($iln =~ /=$/); $iln .= $nxln; } $lnnos = "$bgnln:$i"; my $ind = index($iln, '>'); if ($ind != -1) { $bal = substr($iln, $ind+1); $iln = substr($iln, 0, $ind+1); } $iln = trim_all($iln); #if ($iln =~ /src=\"(.+)\"/i) { if ($iln =~ /src=\s*\"(\S+)\"/i) { prt( "SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10); push(@isrc, [$1, $lnnos, $fil]); $scnt++; } elsif ($iln =~ /src=\s*(\S+)/i) { # without QUOTES prt( "SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10); push(@isrc, [$1, $lnnos, $fil]); $scnt++; } elsif ($iln =~ /src=\s*\'(\S+)\'/i) { # single QUOTES prt( "SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10); push(@isrc, [$1, $lnnos, $fil]); $scnt++; } else { $msg = "WARNING: SRC NOT FOUND in [$iln]$fil:$lnnos..."; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } } elsif ( $ln =~ /<script.*>/i ) { $inscript = 1; prt( "Entered a SCRIPT ...\n" ) if ($dbg11); if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg11); } } } prt( "Returning $scnt img sources ...\n") if ($dbg10); #$dbg12 = $sdbg12; #$dbg11 = $sdbg11; #$dbg16 = $sdbg16; #$dbg10 = $sdbg10; return @isrc; } sub check_hrefs { my ($fil, @srcs) = @_; my ($fnm,$fdir,$fext) = fileparse( $fil, qr/\.[^.]*/ ); my $scnt = scalar @srcs; my $isphp = (lc($fext) eq '.php'); if ($scnt) { prt( "Found $scnt anchor href= in $fnm$fext ...\n" ) if ($dbg7); for (my $i = 0; $i < $scnt; $i++) { my $orgsrc = $srcs[$i][0]; my $lnnos = $srcs[$i][1]; my $src = $orgsrc; if ($src =~ /^http:/i) { # remote HREF push(@httprefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^https:/i) { # remote HREF push(@httpsrefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^ftp:/i) { # remote HREF push(@ftprefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^mailto:/i) { # remote HREF push(@mtrefs, [$src, $fil, $lnnos] ); } elsif ( $src =~ /^#/ ) { # local in page HREF } elsif ( $src =~ /^javascript:/i ) { # a JAVASCRIPT HREF } else { my $ind = index($src,'#'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $ind = index($src,'?'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $src =~ s/\/$//; if (length($src)) { if ($src eq '.') { # HREF is just a DOT if (length($homefile)) { # && ($fdir eq $in_folder) $src = $homefile; # translate a DOT to HOME FILE } } # 25/07/2007 - deal with '%20' back to space $src =~ s/%20/ /g; my $ff = $fdir.$src; if ( -f $ff ) { prt( "$src - ok\n" ) if ($dbg5); } else { my $msg = "WARNING: href [$src] file NOT FOUND![3] in [$fil]$lnnos"; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } } else { if ($isphp) { prt( "Found BLANK HREF in PHP $fil ...\n" ) if ($dbg17); } else { $msg = "WARNING: Found BLANK HREF in $fil ..."; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } } } } } else { if ($isphp) { prt( "Found NO HREFs in PHP $fil ...\n" ) if ($dbg17); } else { prt( "NO HREF FOUND in $fil ...\n" ) if ($shownohrefs); } } } ############################################################ # Only used is $chkip = 1; # Show IP Address # uses sockets, gethostbyname # Return 0, if can NOT be resolved. # else the number of IP addresses resolved. ############################################################ sub showIPAddress { my ($nm) = shift; my @addr = gethostbyname($nm); my $cnt = 0; if( !@addr ) { prt( "Can't resolve $nm: $!\n" ); return 0; } @addr = map { inet_ntoa($_) } @addr[4 .. $#addr]; foreach my $k (@addr) { $cnt++; prt( "$cnt: $nm resolves to IP [$k]\n" ) if ($dbg3); } return $cnt; } ################################################ # Add to @scripts multidimensional array, # if NOT already in there, when on the line # numbers are added. ############################################### sub add_2_scripts { my ($fil, $lns) = @_; my $sc = scalar @scripts; for (my $i = 0; $i < $sc; $i++) { my $cf = $scripts[$i][0]; if ($cf eq $fil) { my $lc = $scripts[$i][1]; $lc .= ":$lns"; $scripts[$i][1] = $lc; return 0; } } push(@scripts, [$fil, $lns]); return 1; } #################################################### # Get HREF sources # Given an ARRAY of file lines, check for # anchor href="something" ... # Return the "something" in an array #################################################### sub get_href_srcs { my ($fil, @lns) = @_; my $lc = scalar @lns; my $scnt = 0; my $slns = 0; # count the SCRIPT lines my ($nm,$dir) = fileparse( $fil ); prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg6); my @isrc = (); my $ln = ''; my $bal = ''; my $inscript = 0; $slns = 0; my $bgnln = 0; my $endln = 0; for (my $i = 0; $i < $lc; $i++) { $ln = $bal; $bal = ''; $ln .= $lns[$i]; chomp $ln; prt( "$i [$ln] ...\n" ) if ($dbg10); if ($inscript) { if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg4); add_2_scripts( $fil, $slns ); $slns = 0; next; } $slns++; next; } if ( $ln =~ /<a\s+(.*)/i ) { my $iln = $1; prt( "Found [$iln] ...\n" ) if ($dbg10); $bgnln = $i; while ( !($iln =~ />/) && ($i < $lc)) { $i++; my $nxln = $lns[$i]; chomp $nxln; prt( "Adding [$nxln] ...\n" ) if ($dbg10); $iln .= ' ' if !($iln =~ /=$/); $iln .= $nxln; } $endln = $i; my $ind = index($iln, '>'); if ($ind != -1) { $bal = substr($iln, $ind+1); $iln = substr($iln, 0, $ind+1); } #if ($iln =~ /src=\"(.+)\"/i) { if ($iln =~ /href\s*=\s*\"(\S+)\"/i) { prt( "HREF = $1\nIn line [$iln]...\n" ) if ($dbg10); push(@isrc, [$1, "$bgnln:$endln"] ); $scnt++; } else { if (( $iln =~ /name=\s*\"(\S+)\"/i )||( $iln =~ /name=(\S+)/i )) { # ignore BOOKMARKS } else { $msg = "WARNING: HREF NOT FOUND in [$iln]..."; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } } } elsif ( $ln =~ /<script.*>/i ) { $inscript = 1; prt( "Entered a SCRIPT ...\n" ) if ($dbg4); $slns = 0; $ln = substr($ln, 7); if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg4); add_2_scripts( $fil, 1 ); $slns = 0; } } } if ($inscript) { $msg = "WARNING: EXIT WHILE IN SCRIPT in [$fil]..."; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } prt( "Returning $scnt HREF sources ...\n") if ($dbg10); return @isrc; } ######################################################### # Passed an array of extensions, # check if this is one of them? ######################################################### sub is_my_ext { my ($fil, @exts) = @_; my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); foreach my $ex (@exts) { if (lc($ex) eq lc($ext)) { return 1; } } return 0; } ############################################ # only looking for HTM, HTML, PHP, # could be extended to others maybe ... ############################################ sub is_htm_ext { my ($fil) = shift; return( is_my_ext($fil, @html_ext) ); } sub is_graphic_ext { my ($fil) = shift; return( is_my_ext($fil, @graf_ext) ); } sub is_zip_ext { my ($fil) = shift; my @arr = qw( .zip ); return( is_my_ext($fil, @arr) ); } sub is_css_ext { my ($fil) = shift; return( is_my_ext($fil, @css_ext) ); } sub is_txt_ext { my ($fil) = shift; my @arr = qw( .txt ); return( is_my_ext($fil, @arr) ); } sub is_script_ext { my ($fil) = shift; return( is_my_ext($fil, @script_ext) ); } ################################################ # my $ignfpd = 1; # ignore FRONTPAGE folders ################################################ sub is_fp_folder { my ($inf) = shift; foreach my $fil (@fpfolders) { if (lc($inf) eq lc($fil)) { return 1; } } return 0; } #################################### # Check if FILE is in EXCLUDE list #################################### sub in_excludes { my ($fil) = shift; my $lcf = lc($fil); foreach my $f (@excludes) { if (lc($f) eq $lcf) { return 1; } } return 0; } sub in_spl_excludes { my ($fldr) = shift; my $lfldr = lc($fldr); foreach my $f (@splexcludes) { if (lc($f) eq $lfldr) { return 1; } } return 0; } #################################################################### # process_folder(folder) # Main DIRECTORY processing function # # Open the FOLDER given, and collect ALL files found, # iterate into sub-directories, if $recurse is non-zero, # and it is NOT a special FRONTPAGE (hidden) FOLDER. # # Files are collected into multidemensional arrays #################################################################### sub process_folder { my ($inf) = shift; my $fcnt = 0; prt( "Processing $inf folder ...\n" ) if ($dbg1); if ( opendir( DIR, $inf ) ) { my @files = readdir(DIR); closedir DIR; foreach my $fil (@files) { if (($fil eq ".")||($fil eq "..")) { next; } my $ff = $inf."\\".$fil; if ( -d $ff ) { if ($recurse) { if ($ignfpd && is_fp_folder($fil)) { # ignore FRONTPAGE folders next; } if (@splexcludes && in_spl_excludes($fil)) { next; } process_folder( $ff ); } } else { # NOTE: multidimensional arrays pushed - offsets into arrays # my $of_ff = 0; # 1 - full file name # my $of_hr = 1; # 2 - array ref of href links # my $of_im = 2; # 3 - array ref of image links # my $of_lk = 3; # 4 - linked count # my $of_sp = 4; # 5 - spare # my $of_to = 5; # links TO # my $of_fm = 6; # links FROM if ( !in_excludes($fil) ) { # NOT in @excludes my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); my $val = 0; $val = $ext_hash{$ext} if ( defined $ext_hash{$ext} ); $val++; $ext_hash{$ext} = $val; if (is_htm_ext($fil)) { push(@htm_files, [$ff, '', '', 0, 0, '', ''] ); $fcnt++; } elsif (is_graphic_ext($fil)) { push(@img_files, [$ff, '', '', 0, 0, '', ''] ); } elsif (is_zip_ext($fil)) { push(@zip_files, [$ff, '', '', 0, 0, '', ''] ); } elsif (is_css_ext($fil)) { push(@css_files, [$ff, '', '', 0, 0, '', ''] ); } elsif (is_txt_ext($fil)) { push(@txt_files, [$ff, '', '', 0, 0, '', ''] ); } elsif (is_script_ext($fil)) { push(@script_files, [$ff, '', '', 0, 0, '', ''] ); } else { push(@other_files, [$ff, '', '', 0, 0, '', ''] ); } } } } prt( "Processed $inf folder finding $fcnt HTML files ...\n" ) if ($dbg1); } else { prt( "ERROR: Failed to open folder $inf ...\n" ); } } ############################################## # Just to show the COUNTS in the ARRAYS ############################################## sub show_found_counts { my $cnt = scalar @htm_files; prt( "Found $cnt HTML, "); $cnt = scalar @img_files; prt( "$cnt images, " ); $cnt = scalar @css_files; prt( "$cnt css, " ); $cnt = scalar @zip_files; prt( "$cnt zip, " ); $cnt = scalar @txt_files; prt( "$cnt txt, " ); $cnt = scalar @script_files; prt( "$cnt script, " ); $cnt = scalar @other_files; prt( "and $cnt others ...\n" ); $cnt = scalar keys %ext_hash; if ($dbg27 || $verbal) { prt( "$cnt extensions, and each count ...\n" ); foreach my $key (keys %ext_hash) { my $val = $ext_hash{$key}; prt( "$val $key "); } prt("\n"); } } # @ipsfound = <INF>; sub in_ips_found { my ($ip) = shift; my $lcip = lc($ip); foreach my $i (@ipsfound) { chomp $i; if (lc($i) eq $lcip) { return 1; } } return 0; } ####################################################### # Process the HTTP HREF sources # if $chkip = 1; then attempt to resolve the IP # addresses from the host name. ####################################################### sub process_host_array { $hcnt = scalar @httprefs; if ($hcnt) { prt( "Found $hcnt HREF entries ...\n" ); for (my $i = 0; $i < $hcnt; $i++) { $href = $httprefs[$i][0]; $file = $httprefs[$i][1]; my $lnn = $httprefs[$i][2]; my ($nm,$dir) = fileparse($file); if (defined( $hrefs{$href} )) { $val = $hrefs{$href}; $val .= ' '.$file; } else { $val = $file; } $val .= ":$lnn"; $hrefs{$href} = $val; prt( "$href in [$file]$lnn\n" ) if ($dbg2); } $hcnt = scalar keys(%hrefs); prt( "Found $hcnt different entries ...\n" ); if ($chkip) { my $inips = 0; prt( "Checking $hcnt IP addresses ... " ); if ( !$refreships && ( -f $ipfile)) { if (open INF, "<$ipfile") { @ipsfound = <INF>; close INF; prt( "Have ".scalar @ipsfound." in $ipfile" ); } else { prt( "Warning: Failed to open $ipfile" ); } } prt("\n"); $procnt = 0; foreach my $key (keys %hrefs) { $val = $hrefs{$key}; $procnt++; prt( "$key in $val\n" ) if ($dbg8); if ($key =~ /^http:\/\//i) { my $hkey = substr($key, 7); my @arr = split( /\//, $hkey ); $hkey = $arr[0]; if ( !in_ips_found($hkey) ) { if (showIPAddress( $hkey ) == 0) { $msg = "FAILED: NO IP FOR HOST [$hkey][$val]"; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } elsif ($writeips) { push(@ipsfound,"$hkey\n"); } } else { $inips++; } } if (($procnt % 100) == 0) { prt( "Done $procnt IP Addresses ...\n" ); } } prt( "Completed $procnt IP Addresses ... " ); if ($writeips) { $val = join("\n", sort @ipsfound); $val = trimblanklines($val); write2file($val, $ipfile); prt( "$inips in previous. Written ".scalar @ipsfound." to $ipfile" ); } prt("\n"); } } } sub show_startup { prt( "Checking $in_folder ..." ); prt( " Using HOME file $homefile ..." ) if length($homefile); prt("\n"); if ($verbal) { prt( "\nOptions:\n" ); prt( sprintf(" -checkips - HREF check %s, \n", ($chkip ? "On" : "Off")) ); prt( sprintf(" -showhreflinks - Show HREF %s, \n", ($showhreflinks ? "On" : "Off")) ); prt( sprintf(" -showlinks - Show links %s, \n", ($showlinks ? "On" : "Off")) ); prt( sprintf(" -showscripts - Show script files %s, \n", ($showscripts ? "On" : "Off")) ); prt( sprintf(" -writeips - Write HREF $ipfile %s, \n", ($writeips ? "On" : "Off")) ); prt( sprintf(" -refreships - Refresh HREF %s\n", ($refreships ? "On" : "Off")) ); prt( sprintf(" -shownohrefs - Show NO HREF found %s\n", ($shownohrefs ? "On" : "Off")) ); if (@excludes) { prt( "Have ".scalar @excludes." excluded files - " ); foreach my $ex (@excludes) { prt( "$ex " ); } prt("\n"); } } } sub show_help { prt( "$pgmname [Options] input-folder or home-file-name.\n" ); prt( "Purpose: To take a folder, or home file in a folder, and\n" ); prt( "check the assume local Web Site for internal consistency.\n" ); prt( "Options:\n" ); prt( " -checkips - Will check the IP resolution of REMOTE HREF items.\n" ); prt( " -showhreflinks - Show a WARNING when an IMG, ICO, etc is a REMOTE link\n" ); prt( " -showlinks - Show the links for each file ...\n" ); prt( " -showscripts - Show SCRIPT files ...\n" ); prt( " -writeips - Write HREF of IP found to a file ...\n" ); prt( " -refreships - If -checkips, and -writeips, re-write NEW check file...\n" ); prt( " -ipfile out-file - Set HREF output file. Default is $ipfile.\n" ); prt( " -ignore in-file - Ignore this file. Repeat for more. use '.none. to reset list.\n" ); prt( " -shownohrefs - Show when NO HREF found in a file.\n" ); prt( " -v - Set verbal on.\n" ); prt( "If an input-folder given, then no trace of internal links will be done.\n" ); prt( "If a home file name is given, the folder used will be of that file.\n" ); prt( "Following are the current default settings ...\n" ); $verbal = 1; show_startup(); mydie(" Happy link checking ;=))\n"); } # Ensure argument exists, or die. sub require_arg { my ($arg, @arglist) = @_; mydie( "ERROR: no argument given for option '$arg' ...\n" ) if ! @arglist; } ########################################################## # Parse USER input # Largerly still to be done ########################################################## sub parse_args { my (@av) = @_; while (@av) { my $arg = $av[0]; if (substr($arg,0,1) eq '-') { if (($arg eq '-h')||($arg eq '--help')||($arg eq '-?')) { show_help(); } elsif ($arg eq '-v') { $verbal = 1; prt( "Set verbal ON.\n" ); } elsif ($arg eq '-checkips') { $chkip = 1; prt( "Will check IP of REMOTE HREF items.\n" ); } elsif ($arg eq '-showhreflinks') { $showhreflinks = 1; prt( "Show a WARNING when an IMG, ICO, etc is a REMOTE link\n" ); } elsif ($arg eq '-showlinks') { $showlinks = 1; prt( "Show the links for each file ...\n" ); } elsif ($arg eq '-showscripts') { $showscripts = 1; prt( "Show SCRIPT files ...\n" ); } elsif ($arg eq '-writeips') { $writeips = 1; prt( "Write HREF of IP found to a file ...\n" ); } elsif ($arg eq '-refreships') { $refreships = 1; prt( "If -checkips, and -writeips, re-write NEW check file...\n" ); } elsif ($arg eq '-ipfile') { require_arg(@av); shift @av; $ipfile = $av[0]; prt( "HREF output set to $ipfile ...\n" ); } elsif ($arg eq '-ignore') { require_arg(@av); shift @av; $arg = $av[0]; if ($arg eq '.none.') { @excludes = (); prt( "Reset EXCLUDES array ...\n" ); } else { push(@excludes, $arg); prt( "Added file [$arg] to EXCLUDES ...\n" ); } } else { mydie( "ERROR: Unknown argument [$arg] ...\n" ); } } else { # no leading '-' $in_folder = $av[0]; prt( "Input folder set to [$in_folder]...\n" ); } shift @av; } # check the INPUT folder if ( !( (-d $in_folder) || (-f $in_folder ) ) ) { mydie( "ERROR: $in_folder is NOT VALID FOLDER OR FILE NAME\n" ); } } sub secs_2_hhmmss { my ($secs) = shift; my $rt = ''; my $mins = int($secs / 60); $secs = $secs - ($mins * 60); $secs = (int(($secs * 10) + 0.5)) / 10; if ($mins > 60) { my $hrs = int($mins / 60); $mins = $mins - ($hrs * 60); $mins = '0'.$mins if ($mins < 10); $secs = '0'.$secs if ($secs < 10); $rt = "$hrs:$mins:$secs"; } else { $mins = '0'.$mins if ($mins < 10); $secs = '0'.$secs if ($secs < 10); $rt = "$mins:$secs"; } return $rt; } sub show_time { my ($totcnt, $lncnt, $bgntime, $msg) = @_; my ($currtime, $difftime, $persec, $remains, $remsecs, $tenths, $remtm, $elapsed); $currtime = time(); $difftime = $currtime - $bgntime; $persec = $lncnt / $difftime; $remains = $totcnt - $lncnt; $remsecs = $remains / $persec; $tenths = (int(($persec * 100) + 0.05)) / 100; $remtm = secs_2_hhmmss($remsecs); $elapsed = secs_2_hhmmss($difftime); prt( "$elapsed Done $lncnt, at $tenths/sec, left $remains in $remtm - $msg\n" ); } # eof - chklinks02.pl