gethrefs02.pl to HTML.

index -|- end

Generated: Mon Aug 29 19:34:39 2016 from gethrefs02.pl 2016/08/03 33.4 KB. text copy

#!/perl -w
# NAME: gethrefs02.pl
# AIM: Parse a HTML file, and extract HREF links
# 2016-08-03 - Reciew
# 05/11/2015 - Lots of quick improvements
# 18/07/2010 - revisit and test...
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use constant {
     HRT_UNKNOWN => 0,
     HRT_LOCAL => 1,
     HRT_LINK => 2,
     HRT_SCRIPT => 4,
    HRT_FILE => 8,
     HRT_BASE => 16,
     HRT_PARAMS => 32
};
use constant {
   FT_UNKNOWN => 0,
   FT_HTML => 1,
   FT_GRAF => 2,
   FT_CSS => 3,
   FT_SCRIPT => 4,
   FT_TEXT => 5,
   FT_ZIP => 6,
    FT_BIN => 7,
    FT_CODE => 8,
    FT_DIR => 9,
    FT_HIDDEN => 10,
    FT_PARAM => 11

};

# offsets in file array
use constant {
   OF_FF => 0,   # full file name
   OF_HR => 1,   # array ref of href links
   OF_IM => 2,   # array ref of image links
   OF_LK => 3,   # linked count
   OF_SP => 4,   # spare
   OF_TO => 5,   # links TO
   OF_FM => 6, # links FROM
   OF_FT => 7   # file type
};

my $perl_root = 'C:\Gtools\perl';
unshift(@INC,$perl_root);
#require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'lib_utils.pl' or die "Unable to load lib_utils.pl ...\n";
require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n";
# for htmltools, if functions used
my @imgs = ();
my @hrefs = ();
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
   my @tmpsp = split(/\\/,$pgmname);
   $pgmname = $tmpsp[-1];
}
my $outfile = $perl_root."\\temp.$pgmname.txt";
open_log($outfile);
###prt( "$0 ... Hello, World ...\n" );
my $os = $^O;

my $in_file = '';
## my $in_file = 'C:\GTools\java\examples\JavaTech\Code_List.htm';
## my $in_file = 'C:\HOMEPAGE\GA\travel\maroc\index.htm';
## my $in_file = 'temphtml.htm';
my $usr_url = '';
my @all_hrefs = ();

my $outtemp = $perl_root."\\templist.txt";
my $show_full_list = 0;
my $show_missed_files = 0;
my $verbosity = 0;
my $load_log = 0;
my $out_file = '';
my $VERS = "0.0.5 2015-11-05";
##my $VERS = "0.0.4 2010-07-18";

sub VERB1() { return $verbosity >= 1; }
sub VERB2() { return $verbosity >= 2; }
sub VERB5() { return $verbosity >= 5; }
sub VERB9() { return $verbosity >= 9; }


# CONSTANTS
###########
# File Type Extensions
my @html_extension = qw( .htm .html .shtml .php );
my @graf_extension = qw( .jpg .jpeg .gif .png .bmp .ico .mpg );
my @css_extension  = qw( .css );
my @script_extension = qw( .js .class .cgi .java .remote );
my @zip_extension = qw( .zip .tar .gz .jar .tgz );
my @txt_extension = qw( .txt .doc .bat .cmd .old .bak .policy .pdf .cfg );
my @code_extension = qw( .c .cxx .cpp .h .hxx .hpp .idl .mak );
my @bin_extension = qw( .dat .exe .au );

# private FRONTPAGE folders
my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived );

# features
my $ignfpd = 1;   # ignore FRONTPAGE folders
my @excludes = qw( desktop.ini php.ini blank.html blank.htm );
my $recurse = 0;   # recursive
my @splexcludes = qw( macpc );

my %ext_hash = ();
my @all_files = ();
my $refcnt = 0;
my @done_files = ();
my %not_found = ();

my ($base_file,$base_dir);
my $base_href = ''; # set if <BASE href="..."> found

# DEBUG
my $dbg1 = 0;   # show discarded material
my $dbg2 = 0;   # show "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]...
my $dbg3 = 0;   # show Processing $lncnt lines from $fil ...
my $dbg4 = 0;   # show File [$name], in [$rdir] ...
my $dbg5 = 0;   # show HREF immediately
my $dbg6 = 0;   # show FOLDERS searched...

# ### DEBUG ###
my $debug_on = 0;
my $def_file = 'C:\Users\user\Downloads\temp\index.html';

my @warnings = ();

sub show_warnings($) {
    my ($val) = @_;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
        foreach my $itm (@warnings) {
           prt("$itm\n");
        }
        prt("\n");
    } else {
        prt( "\nNo warnings issued.\n\n" ) if (VERB9());
    }
}

sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}


sub prtw($) {
   my ($tx) = shift;
   $tx =~ s/\n$//;
   prt("$tx\n");
   push(@warnings,$tx);
}


##################################################################
#   OF_FF => 0,   # full file name
#   OF_HR => 1,   # array ref of href links
#   OF_IM => 2,   # array ref of image links
#   OF_LK => 3,   # linked count
#   OF_SP => 4,   # spare
#   OF_TO => 5,   # links TO
#   OF_FM => 6, # links FROM
#   OF_FT => 7   # file type
sub show_results {
   my $fcnt = scalar @all_files;
   my ($i, $ff, $ft, $cnt, $mcnt);
    $mcnt = 0;
   for ($i = 0; $i < $fcnt; $i++) {
      $ft = $all_files[$i][OF_FT];
      if ($ft == FT_HTML) {
         $cnt = $all_files[$i][OF_SP];
         if ($cnt == 0) {
            $ff = $all_files[$i][OF_FF];
            $mcnt++;    # prt( "Missed [$ff]\n" );
         }
      }
   }
    if ($mcnt) {
        prt("Got $mcnt 'missed' files... not marked...\n");
        for ($i = 0; $i < $fcnt; $i++) {
            $ft = $all_files[$i][OF_FT];
            if ($ft == FT_HTML) {
                $cnt = $all_files[$i][OF_SP];
                if ($cnt == 0) {
                    $ff = $all_files[$i][OF_FF];
                    prt( "Missed [$ff]\n" );
                }
            }
        }
    }
}

# =========================================================================
# url_parse - needs some more to remove any other post, like index.htm?a=b...
# ---------
sub url_parse($) {
    my ($url) = @_;
    my $post = '';
    my $name = '';
    my $dir  = '';
    my $ind = index($url,'#');
    if ($ind > 0) {
        $post = substr($url,$ind);
        $url = substr($url,0,$ind);
    }
    if ($url =~ /\/$/) {
        $dir = $url;
    } else {
        ($name,$dir) = fileparse($url);
        if ( !($name =~ /\./) ) {
            # without an EXTENT, assume directory
            $dir .= $name.'/';
            $name = '';
        }
    }
    return $dir,$name,$post;    # url_parse - return (dir,name,post)
}

sub uri_parse2($) {
    my ($uri) = shift;
    $uri =~ /^(([^:\/\?#]+):)?(\/\/([^\/\?#]*))?([^\?#]*)(\?([^#]*))?(#(.*))?/;
    # Then:
    my $scheme    = (defined $2) ? $2 : '';
    my $authority = (defined $4) ? $4 : '';
    my $path      = (defined $5) ? $5 : '';
    my $query     = (defined $7) ? $7 : '';
    my $fragment  = (defined $9) ? $9 : '';
    return $scheme,$authority,$path,$query,$fragment;
}

##################################################################
#########################################################
# Passed an array of extensions,
# check if this is one of them?
#########################################################
sub is_my_extension {
   my ($fil, $rexts) = @_;
   my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
    my $lcext = lc($ext);
    my ($ex);
   foreach $ex (@{$rexts}) {
      return 1 if (lc($ex) eq $lcext);
   }
   return 0;
}

############################################
# only looking for HTM, HTML, PHP,
# could be extended to others maybe ...
############################################
sub is_htm_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, \@html_extension) );
}
sub is_graphic_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, \@graf_extension) );
}
sub is_zip_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, \@zip_extension) );
}
sub is_css_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, \@css_extension) );
}
sub is_txt_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, \@txt_extension) );
}
sub is_code_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, \@code_extension) );
}
sub is_script_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, \@script_extension) );
}
sub is_bin_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, \@bin_extension) );
}

#use constant {
#   FT_UNKNOWN => 0,
##   FT_HTML => 1,
##   FT_GRAF => 2,
##   FT_CSS => 3,
##   FT_SCRIPT => 4,
##   FT_TEXT => 5,
##   FT_ZIP => 6,
##  FT_DIR => 7
#};
sub get_file_type_const($);

sub get_file_type_const($) {
   my ($fil) = shift;
   if (is_htm_extension($fil)) {
      return FT_HTML;
   } elsif (is_graphic_extension($fil)) {
      return FT_GRAF;
   } elsif (is_zip_extension($fil)) {
      return FT_ZIP;
   } elsif (is_css_extension($fil)) {
      return FT_CSS;
   } elsif (is_txt_extension($fil)) {
      return FT_TEXT;
   } elsif (is_script_extension($fil)) {
      return FT_SCRIPT;
   } elsif (is_bin_extension($fil)) {
      return FT_BIN;
   } elsif (is_code_extension($fil)) {
      return FT_CODE;
   } elsif ($fil =~ /\/$/) {
        return FT_DIR;
    } elsif ($fil =~ /#/) {
        my $ih = index($fil,'#');
        if ($ih > 0) {
            my $f2 = substr($fil,0,$ih);
            return get_file_type_const($f2);
        }
   } elsif ($fil =~ /\//) {
        return FT_DIR;  # gross assumption
    } elsif ($fil =~ /^\w+$/) {
        return FT_DIR;  # another gross assumption
    }

    return FT_HIDDEN if ($fil =~ /^\./);
    return FT_PARAM if ($fil =~ /^\?.+/);

    ### pgm_exit(1,"Why UNKNOWN for [$fil]?\n");
   return FT_UNKNOWN;
}

sub file_type_const_to_string {
   my ($ft) = shift;
   if ($ft == FT_HTML) {
      return "html";
   } elsif ($ft == FT_GRAF) {
      return "graphic";
   } elsif ($ft == FT_ZIP) {
      return "zip";
   } elsif ($ft == FT_CSS) {
      return "css";
   } elsif ($ft == FT_TEXT) {
      return "text";
   } elsif ($ft == FT_SCRIPT) {
      return "script";
   } elsif ($ft == FT_BIN) {
        return "binary";
   } elsif ($ft == FT_CODE) {
        return "code";
   } elsif ($ft == FT_DIR) {
        return "directory";
   } elsif ($ft == FT_UNKNOWN) {
      return "unknown";
   } elsif ($ft == FT_HIDDEN) {
        return "hidden";
   } elsif ($ft == FT_PARAM) {
        return "parameter";
   }
   pgm_exit(1,"***FIX ME*** uncased type [$ft]!");
    return "";
}

sub fix_rel_url($) {
   my ($path) = @_;
   my @a = split(/\//, $path);
   my $npath = '';
   my $max = scalar @a;
   my @na = ();
   for (my $i = 0; $i < $max; $i++) {
      my $p = $a[$i];
      if ($p eq '.') {
         # ignore this
      } elsif ($p eq '..') {
         if (@na) {
            pop @na;   # discard previous
         } else {
            pgm_exit(1,"ERROR: Got relative .. without previous!!! path=$path\n" );
         }
      } else {
         push(@na,$p);
      }
   }
   foreach my $pt (@na) {
      $npath .= "/" if length($npath);
      $npath .= $pt;
   }
   return $npath;
}

sub is_http_link($) {
    my ($hr) = shift;
    return 1 if ($hr =~ /^http(s*):\/\//);
    return 0;
}

sub get_full_base_href($$) {
    my ($rrhr, $bhr) = @_;
    my $rhr = ${$rrhr};
    my ($nm,$dir,$fhr);
    if ( length($bhr) && !is_http_link($rhr) && !($rhr =~ /^#/) ) {
        if ($bhr =~ /\/$/) {
            $dir = $bhr;    # assume a DIRECTORY
        } else {
            ($nm,$dir) = fileparse($bhr); # assume a FILE, so get the dir only...
        }
        $fhr = $dir.$rhr;
        $fhr = fix_rel_url($fhr);
        ${$rrhr} = $fhr;
        return 1;
    }
    return 0;
}

sub get_href_type_const($);

sub get_href_type_const($) {
   my ($hrf) = shift;
    my ($ih,$id,$is);
   if (is_http_link($hrf)) {
      return HRT_LINK;
   } elsif ($hrf =~ /^ftp:\/\//i) {
      return HRT_LINK;
   } elsif ($hrf =~ /^javascript:/i) {
      return HRT_SCRIPT;
   } elsif (substr($hrf,0,1) eq '#') {
      return HRT_LOCAL;
    }
    #if ( get_full_base_href(\$hrf,$base_href) ) {
    #    if ($hrf =~ /^http(s*):\/\//i) {
    #        return HRT_LINK;
    #    }
    #}

    $ih = index($hrf,'#');
    $id = rindex($hrf,'.');
    $is = rindex($hrf,'/');
    if ($ih > 0) {
        my $hr2 = substr($hrf,0,$ih);
        my $srt = get_href_type_const($hr2);
        $id = rindex($hr2,'.');
        $is = rindex($hr2,'/');
        if ($id > 0) {
            return ($srt | HRT_FILE);
        }
    }
    if ($id > 0) {
        # contains a DOT - assume a file
      return HRT_FILE;
   }
    if ($hrf =~ /\/$/) {
        # ends in '/', assume file - acutally directory
      return HRT_FILE;
    }
    if ($hrf =~ /\//) {
        # contains any '/', assume a file
      return HRT_FILE;
    }
    if ($hrf =~ /^\w+$/) {
        # contains any alphanumeric only, assume a file
      return HRT_FILE;
    }
    if ($hrf =~ /^\?.+/) {
        # if a href param, like '?C=N;O=D', ...
        return HRT_PARAMS;
    }
    prtw("WARNING: Why UNKNOWN on href [$hrf] ih=$ih id=$id is=$is\n");
   return HRT_UNKNOWN;
}

my %done_warning = ();
sub href_type_to_string {
   my ($hrt) = shift;
    my $ret = '';
   if ($hrt & HRT_LINK) {
      $ret .= "extern link ";
    }
   if ($hrt & HRT_SCRIPT) {
      $ret .= "script ";
    }
    if ($hrt & HRT_LOCAL) {
      $ret .= "local ";
    }
    if ($hrt & HRT_FILE) {
        $ret .= "file ";
    }
    if ($hrt & HRT_BASE) {
      $ret .= "BASE ";
    }
   if ($hrt == HRT_UNKNOWN) {
      $ret = "unknown";
   }
   if ($hrt == HRT_PARAMS) {
      $ret = "parameter";
   }
    $ret =~ s/\s+$//;

    if (length($ret) == 0) {
        my $err = "***FIX ME*** uncased type [$hrt]!";
        if (!defined $done_warning{$err}) {
            $done_warning{$err} = 1;
            prtw("WARNING: $err\n");
        }
    }

   return $ret;
}


##############################################


sub get_hrefs_from_string($) {
   my ($ln) = shift;
   my ($i, $j, $line, $ch, $ch2, $len, $tag, $disc, $hcnt);
    my ($bgn, $fhr, $hr, $tail, $max, $hrt, $ft, $ctyp);
   my ($sp,$tag2,$gottag);
   my @hrf = ();
   $ln =~ s/\n/ /g;
   $ln = trim_all($ln);
   # sub write2file {    my ($txt,$fil) = @_;
   # write2file($fulln,'tempfl.txt');
   $len = length($ln);
   $disc = '';
   $hcnt = 0;
    $base_href = '';    # assume NO <BASE href="...">
    # process single long string, char by char
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($ln,$i,1);
      if ($ch eq '<') {
         $tag = $ch; # start a tag
         $i++;
         $ch = substr($ln,$i,1);
         # could check for things like <! ... later maybe
            $tag2 = '';
            $gottag = 0;
         for (; $i < $len; $i++) {
            $ch = substr($ln,$i,1);
            $tag .= $ch;
            if ($ch eq '>') {
               last;
            }
                if (!$gottag) {
                    if ($ch =~ /\w/) {
                        $tag2 .= $ch;
                    } else {
                        $gottag = 1;
                    }
                }
         }

         if ($tag =~ /(.*\s+)href(\s*)=/i) {
            $bgn = $1;
            $sp = length($2);
            $hcnt++;
            $fhr = substr($tag,length($bgn)+5+$sp);
            $fhr = substr($fhr,1) while ($fhr =~ /^\s/); # remove all LEADING space
            $ch = substr($fhr,0,1);
                prt("$tag [$tag2] [$fhr]\n") if ($dbg5);
            if (($ch eq '"')||($ch eq "'")) {
               $max = length($fhr);
               $hr = '';
               $tail = '';
               # collect actual HREF=
               for ($j = 1; $j < $max; $j++) {
                  $ch2 = substr($fhr,$j,1);
                  if ($ch eq $ch2) {
                     $tail = substr($fhr,$j);
                     last;
                  }
                  $hr .= $ch2;
               }

                    if ($tag2 =~ /^BASE$/i) {
                        $hrt = HRT_BASE;
                        #prt("Got [$tag2] [$fhr] [$hr]\n");
                        my ($d,$n,$p) = url_parse($hr);
                        prt("Got BASE [$hr] = [$d]+[$n]+[$p]\n");
                        $base_href = $hr;
                    } else {
                        get_full_base_href(\$hr,$base_href);
                        $hrt = get_href_type_const($hr);
                    }
               $ctyp = '';
               $ft = FT_UNKNOWN;
               if ($hrt & HRT_FILE) {
                  $ft = get_file_type_const($hr);
                  $ctyp = "ext[".file_type_const_to_string($ft)."] ";
               }
               $ctyp = 'type['.href_type_to_string($hrt)."] $ctyp";
                    #prt("tag [$tag2] [$hr] $ctyp\n");
               prt( "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]\n" ) if ($dbg2);
               #          href  HRT-type FT-file
               #            0     1       2
               push(@hrf, [$hr, $hrt,    $ft]);
            } else {
               prt( "$hcnt:HREF: fhr[$fhr] pre[$disc] tag[$tag] bgn[$bgn] CHECK ME\n" );
            }
         } else {
            prt( "DISCARDED: pre[$disc] tag[$tag] ...\n" ) if ($dbg1);
         }
         $disc = '';
      } else {
         $disc .= $ch;
      }
   }
   return @hrf;
}

sub trim_href($) {
    my $fil = shift;
    my $nfil = '';
    my $len = length($fil);
    my ($i,$ch);
    for ($i = 0; $i < $len; $i++) {
        $ch = substr($fil,$i,1);
        if (($i == 0)&&($ch eq '/')) {
            next;
        }
        if (($ch eq '#')||($ch eq '?')) {
            last;
        }
        $nfil .= $ch;
    }
    return $nfil;
}

sub parse_file($$) {
    my ($bdir,$bfil) = @_;
    #if ($bdir = /^\.(\\|\/)$/) {
    #    $bdir = '';
    #}
   my $fil = $bdir.$bfil;
    prt( "Processing file '$fil' ...\n" );
   my ($lncnt, $full, $hrcnt, $i, $hfcnt, $typ, $filcnt,$linkcnt,$ra,$hr,$ci2);
   my @hrf = ();
   if ( ! open INF, "<$fil") {
        prt( "WARNING: Can NOT open file [$fil]...\n" );
       return @hrf;
    }
    my @lines = <INF>;
    close INF;
    $lncnt = scalar @lines;
    prt( "Processing $lncnt lines from [$fil] ...\n" );
    $full = join('',@lines);
    # sub write2file {    my ($txt,$fil) = @_;
    #my $scrp = return_tag($full,'script');
    ##my $scrp = get_all_tag_text($full,'script');
    ##write2file($scrp,'tempscript.txt');
    ##prt( "Got script text [$scrp]\n" );
    @hrf = get_hrefs_from_string($full);
    $hrcnt = scalar @hrf;
    prt( "Got $hrcnt HREF entries, from $fil...\n" );
    $filcnt = 0;
   #           href  HRT-type FT-file
   #             0     1       2
   # push(@hrf, [$hr, $hrt,    $ft]);
    my %hr_dupes = ();
    my $url = $usr_url; # get any user url
    my @list = ();
    if (length($url)) {
        $url .= '/' if (!($url =~ /\/$/));  # ensure ends with '/'
    }
    for ($i = 0; $i < $hrcnt; $i++) {
        $ra = $hrf[$i];
        $hr  = ${$ra}[0];
        $typ = ${$ra}[1];
        $fil = $bdir.$hr;
        $fil = trim_href($fil);
        if (defined $hr_dupes{$hr}) {
            $hr_dupes{$hr}++;
            next;
        } else {
            $ci2 = sprintf("%3d", ($i + 1));
            push(@list,"$url$hr");
            prt("$ci2: $url$hr\n") if (VERB1());
        }
        next if (-d $fil);
        if ( ($typ & HRT_FILE) && !($typ & HRT_LINK) ) {
            $filcnt++;
            if (! -f $fil) {
                if (defined $not_found{$fil}) {
                    $not_found{$fil}++;
                } else {
                    prt( "WARNING: File [$fil] NOT found ...\n" ) if (VERB2());
                    $not_found{$fil} = 1;
                }
            }
        }
    }
    my $cnt = scalar @list;
    prt( "Got $cnt diff HREF entries, from $bfil... $filcnt appear file refs...\n" );
    if ($cnt && length($out_file)) {
        $fil = join("\n",@list)."\n";
        write2file($fil,$out_file);
        prt("List written to '$out_file'...\n");
    }
    $linkcnt = 0;
    my %counted = ();
    my %by_extent = ();
    my %by_fn = ();
    my @dupes = ();
    my $msg = '';
    my ($nm,$dir,$ext,$ind,$ff);
    for ($i = 0; $i < $hrcnt; $i++) {
        $fil = $hrf[$i][0];
        $typ = $hrf[$i][1];
        if ($typ & HRT_LINK) {
            $ind = index($fil,'#');
            $fil = substr($fil,0,$ind) if ($ind > 0);
            if (defined $counted{$fil}) {
                $counted{$fil}++;
            } else {
                $counted{$fil} = 1;
                $linkcnt++;
               ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
                $by_extent{$ext} = [] if (!defined $by_extent{$ext});
                push( @{$by_extent{$ext}}, $fil );
                $ff = $nm.$ext;
                if ($ext =~ /^\.java/) {
                    if (defined $by_fn{$ff}) {
                        push(@dupes,$ff);
                        $by_fn{$ff}++;
                    } else {
                        $by_fn{$ff} = 1;
                    }
                }
            }
        }
    }
    if ($linkcnt) {
        %counted = ();
        prt("Listing $linkcnt links...\n");
        if ($show_full_list) {
            for ($i = 0; $i < $hrcnt; $i++) {
                $fil = $hrf[$i][0];
                $typ = $hrf[$i][1];
                if ($typ & HRT_LINK) {
                    $ind = index($fil,'#');
                    $fil = substr($fil,0,$ind) if ($ind > 0);
                    if (defined $counted{$fil}) {
                        $counted{$fil}++;
                    } else {
                        $counted{$fil} = 1;
                        prt("$fil\n");
                        $msg .= "$fil\n";
                    }
                }
            }
        }
        foreach $ext (keys %by_extent) {
            my $list = $by_extent{$ext};
            foreach $fil (@{$list}) {
                #prt("$fil\n");
                $msg .= "$fil\n";
            }
        }
        write2file($msg,$outtemp);
        prt("Written list to $outtemp...\n");
        if (@dupes) {
            prt("Note: ".scalar @dupes." duplicated file names...\n");
            prt( join(" ",@dupes)."\n");
        } else {
            prt("Appears NO duplicated names...\n");
        }
    #} else {
    #    prt("No link count in $fil...\n");
    }

   return @hrf;
}

####################################
####################################################################
# process_folder(folder) 
# Main DIRECTORY processing function
#
# Open the FOLDER given, and collect ALL files found,
# iterate into sub-directories, if $recurse is non-zero,
# and it is NOT a special FRONTPAGE (hidden) FOLDER.
#
# Files are collected into multidemensional arrays
####################################################################
sub process_folder {
   my ($inf) = shift;
   my ($ft,$ff,$nm,$dir,$ext,$val,$fil,$idir);
   my $fcnt = 0;
   prt( "Processing $inf folder ...\n" ) if ($dbg1 || VERB5());
   if ( opendir( DIR, $inf ) ) {
      my @files = readdir(DIR);
      closedir DIR;
        $idir = $inf;
        $idir .= "\\" if (!($idir =~ /(\\|\/)$/));
      foreach $fil (@files) {
         next if (($fil eq ".")||($fil eq ".."));
            $ff = $idir.$fil;
         if ( -d $ff ) {
            if ($recurse) {
               if ($ignfpd && is_fp_folder($fil)) {   # ignore FRONTPAGE folders
                  next;
               }
               if (@splexcludes && in_spl_excludes($fil)) {
                  next;
               }
               process_folder( $ff );
            }
         } else {
             $ft = get_file_type_const($fil);
            # NOTE: multidimensional arrays pushed - offsets into arrays
            if ( !in_excludes($fil) ) {  # NOT in @excludes
               ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
               $val = 0;
               $val = $ext_hash{$ext} if ( defined $ext_hash{$ext} );
               $val++;
               $ext_hash{$ext} = $val;
               push(@all_files, [$ff, '', '', 0, 0, '', '', $ft] );
               $fcnt++;
            }
         }
      }
      prt( "Processed $inf folder finding $fcnt files ...\n" ) if ($dbg6 || VERB2());
   } else {
      prt( "ERROR: Failed to open folder $inf ...\n" );
   }
}

################################################
# my $ignfpd = 1;   # ignore FRONTPAGE folders
################################################
sub is_fp_folder {
   my ($inf) = shift;
   foreach my $fil (@fpfolders) {
      if (lc($inf) eq lc($fil)) {
         return 1;
      }
   }
   return 0;
}

####################################
# Check if FILE is in EXCLUDE list
####################################
sub in_excludes {
   my ($fil) = shift;
   my $lcf = lc($fil);
   foreach my $f (@excludes) {
      if (lc($f) eq $lcf) {
         return 1;
      }
   }
   return 0;
}

sub in_spl_excludes {
   my ($fldr) = shift;
   my $lfldr = lc($fldr);
   foreach my $f (@splexcludes) {
      if (lc($f) eq $lfldr) {
         return 1;
      }
   }

   return 0;
}

sub set_status_case {
   my ( $ch, $pch, $inccm, $inlnc, $inqot, $qot ) = @_;
   my $ldbg2 = 0;
   if ($$inccm) {
      if (($ch eq '/')&&($pch eq '*')) {
         $$inccm = 0;
         prt( "status: End C comment /* */ ...\n" ) if ($ldbg2);
      }
   } elsif ($$inlnc ) {
      if ($ch eq "\n") {
         $$inlnc = 0;
         prt( "status: End line comment // ...\n" ) if ($ldbg2);
      }
   } elsif ($$inqot ) {
      if ($ch eq $$qot) {
         prt( "status: End quote $$qot ...\n" ) if ($ldbg2);
         $$inqot = 0;
         $$qot = '';
      }
   } else {
      if ($ch eq '/') {
         if ($pch eq '/') {
            $$inlnc = 1;
            prt( "status: Entered line comment // ...\n" ) if ($ldbg2);
         }
      } elsif ($ch eq '*') {
         if ($pch eq '/') {
            $$inccm = 1;
            prt( "status: Entered C comment /* */ ...\n" ) if ($ldbg2);
         }
      } elsif (($ch eq '"')||($ch eq "'")) {
         $$qot = $ch;
         $$inqot = 1;
         prt( "status: Entered quote $$qot ...\n" ) if ($ldbg2);
      }
   }
}


sub get_all_tag_text {
   my ($txt, $tag) = @_;
   my $len = length($txt);
   my $ldbg1 = 0;
   my $ntxt = '';
   my $ch = '';
   my $pch = '';
   my $ftag = '';
   my $nline = '';
   my $i = 0;
   my $intag = 0;
   my $incomment = 0;
   my $inqot = 0;   # in quotes ' or "
   my $qot = '';
   my $inlnc = 0;   # in line comment
   my $inccm = 0;  # in C comment
   my ($part, $shlen);
   ###prt("Processing $len chars for $tag ...\n");
   for ($i = 0; $i < $len; $i++) {
      $pch = $ch;
      $ch = substr($txt, $i, 1);
      set_status_case( $ch, $pch, \$inccm, \$inlnc, \$inqot, \$qot );
      if ($incomment) {
         $ntxt .= $ch;
         if ($ch eq '>') {
            $shlen = -15;
            if (length($ntxt) < 15) {
               $shlen = 0 - length($ntxt);
            }
            prt( "Potential close [".substr($ntxt,$shlen)."] ...($i)" ) if ($ldbg1);
            if (substr($ntxt,-3) eq '-->') {
               if (!$inqot && !$inlnc && !$inccm) {
                  prt( " Yes\n" ) if ($ldbg1);
                  $incomment = 0;   # no longer IN comment
                  prt("End comment <!-- --> ...\n") if ($ldbg1);
               } else {
                  if ($inqot) {
                     prt( " NO DUE TO IN QUOTE\n" ) if ($ldbg1);
                  } elsif ($inlnc) {
                     prt( " NO DUE TO IN LINE COMMENT\n" ) if ($ldbg1);
                  } elsif ($inccm) {
                     prt( " NO DUE TO IN C COMMENT\n" ) if ($ldbg1);
                  } else {
                     prt( " NO DUE TO SOME REASON!!! **** CHECK ME!!! ****\n" ) if ($ldbg1);
                  }
               }
            } else {
               prt( " NO!\n" ) if ($ldbg1);
            }
         }
      } elsif ($intag) {
         if ($ch eq "<") {
            ###prt("Got begin < ...\n");
            $part = substr($txt,$i,4);
            if ($part eq '<!--') {   # if a powerful comment starts
               prt("Entering comment <!-- ...\n") if ($ldbg1);
               $incomment = 1;         # unconditionally go to end of this comment
               $ntxt .= $ch;
               next;
            }
            $i++;
            $ftag = '';
            for ( ; $i < $len; $i++ ) {
               $pch = $ch;
               $ch = substr($txt, $i, 1);
               if ($ch eq '>') {
                  last;
               }
               $ftag .= $ch;
            }
            $ntxt .= '<'.$ftag;
            ###prt("Got tag [$ftag] ...\n");
            if (lc(substr($ftag,1)) eq lc($tag)) {
               $intag = 0;
            }
         } 
         $ntxt .= $ch;
      } else {
         if ($ch eq "<") {
            ###prt("Got begin < ...\n");
            $i++;
            $ftag = '';
            for ( ; $i < $len; $i++ ) {
               $pch = $ch;
               $ch = substr($txt, $i, 1);
               if (($ch eq '>')||($ch eq ' ')||($ch =~ /\s/)) {
                  last;
               }
               $ftag .= $ch;
            }
            ###prt("Got tag [$ftag] ...\n");
            if (lc($ftag) eq lc($tag)) {
               $ntxt .= '<'.$ftag.$ch;
               if (($ch eq ' ')||($ch =~ /\s/)) {
                  $i++;
                  for ( ; $i < $len; $i++ ) {
                     $pch = $ch;
                     $ch = substr($txt, $i, 1);
                     $ntxt .= $ch;
                     if ($ch eq '>') {
                        last;
                     }
                  }
               }
               ###prt( "Entered tag <$ftag...> ($tag)...\n" );
               $intag = 1;
            }
         }
      }
   }
   return $ntxt;
}

sub unix_2_dos {
   my ($f) = shift;
   $f =~ s/\//\\/g;
   return $f;
}

#   OF_FF => 0,   # full file name
#   OF_HR => 1,   # array ref of href links
#   OF_IM => 2,   # array ref of image links
#   OF_LK => 3,   # linked count
#   OF_SP => 4,   # spare
#   OF_TO => 5,   # links TO
#   OF_FM => 6, # links FROM
#   OF_FT => 7   # file type
sub mark_in_all_files {
   my ($fil, $dir) = @_;
   my $max = scalar @all_files;
   my ($i, $ff, $nm, $dr, $fnd);
   $dir = unix_2_dos($dir);
   $fnd = 0;
   for ($i = 0; $i < $max; $i++) {
      $ff = $all_files[$i][OF_FF];
      ($nm,$dr) = fileparse($ff);
      $dr = unix_2_dos($dr);
      if (($nm eq $fil)&&($dir eq $dr)) {
         $all_files[$i][OF_SP]++;
         $fnd++;
      }
   }
   if (!$fnd) {
      prt( "WARNING: [$fil] in [$dir] NOT found ...\n" );
   }
}

sub in_done_files {
   my ($nm, $dr, @dn) = @_;
   my $ct = scalar @dn;
   my ($nnm, $ndr);
   for (my $j = 0; $j < $ct; $j++) {
      $nnm = $dn[$j][0];
      $ndr = $dn[$j][1];
      if (($nnm eq $nm) && ($ndr eq $dr)) {
         return 1;
      }
   }
   return 0;
}


sub process_references {
   my ($bdir) = @_;
   my $rcnt = scalar @all_hrefs;
   my ($i, $ff, $ft, $name, $rdir, $hrt, @new_hrefs, @nxt_hrefs, $dncnt);
   $bdir = unix_2_dos($bdir);
   prt( "Processing $rcnt HREF found ...\n" );
   $dncnt = 0;
   for ($i = 0; $i < $rcnt; $i++) {
      $ff = $bdir.$all_hrefs[$i][0];
      $hrt = $all_hrefs[$i][1];
      $ft = $all_hrefs[$i][2];
      if (($hrt == HRT_FILE)&&($ft == FT_HTML)) {
         ($name,$rdir) = fileparse( $ff );
         $rdir = unix_2_dos($rdir);
         if ($ff =~ /\.\./) {
            # this back up ...
            prt( "SKIPPING file [$name], in [$rdir] ...\n" );
         } else {
            prt( "File [$name], in [$rdir] ...\n" ) if ($dbg4);
            mark_in_all_files($name, $rdir);
            @new_hrefs = parse_file($rdir, $name);
            $dncnt++;
         }
         push(@done_files, [$name, $rdir]);
      }
   }

   $rcnt = scalar @new_hrefs;
   prt( "Processed $dncnt file, for new $rcnt files ...\n" ) if ($rcnt);
   $dncnt = 0;
   while ($rcnt) {
      @nxt_hrefs = ();
      $dncnt = 0;
      prt( "Processing $rcnt NEW HREF found ...\n" );
      for ($i = 0; $i < $rcnt; $i++) {
         $ff = $bdir.$new_hrefs[$i][0];
         $hrt = $new_hrefs[$i][1];
         $ft = $new_hrefs[$i][2];
         if (($hrt == HRT_FILE)&&($ft == FT_HTML)) {
            ($name,$rdir) = fileparse( $ff );
            $rdir = unix_2_dos($rdir);
            if ( !in_done_files($name, $rdir, @done_files) ) {
               if ($ff =~ /\.\./) {
                  # this back up ...
                  prt( "SKIPPING file [$name], in [$rdir] ...\n" );
               } else {
                  prt( "File [$name], in [$rdir] ...\n" );
                  mark_in_all_files($name, $rdir);
                  @nxt_hrefs = parse_file($rdir, $name);
                  $dncnt++;
               }
               push(@done_files, [$name, $rdir]);
            }
         }
      }
      @new_hrefs = @nxt_hrefs;
      $rcnt = scalar @new_hrefs;
      prt( "Processed $dncnt file, for new $rcnt files ...\n" ) if ($rcnt);
   }
}

########################################################
### MAIN ###

parse_args(@ARGV);

prt("Processing input file '$in_file'...\n") if (VERB9());

($base_file,$base_dir) = fileparse( $in_file );
$base_dir = unix_2_dos($base_dir);

process_folder($base_dir);   # get ALL the files, in mutidemensional array

prt("Doing parse_file($base_dir, $base_file)...\n") if (VERB9());

@all_hrefs = parse_file($base_dir, $base_file);

push(@done_files, [$base_file, $base_dir]);

mark_in_all_files($base_file, $base_dir);

$refcnt = scalar @all_hrefs;

process_references($base_dir);

show_results() if ($show_missed_files);

pgm_exit(0,"");

########################################################

sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}

sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    my $verb = VERB2();
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^v/) {
                if ($sarg =~ /^v.*(\d+)$/) {
                    $verbosity = $1;
                } else {
                    while ($sarg =~ /^v/) {
                        $verbosity++;
                        $sarg = substr($sarg,1);
                    }
                }
                $verb = VERB2();
                prt("Verbosity = $verbosity\n") if ($verb);
            } elsif ($sarg =~ /^l/) {
                if ($sarg =~ /^ll/) {
                    $load_log = 2;
                } else {
                    $load_log = 1;
                }
                prt("Set to load log at end. ($load_log)\n") if ($verb);
            } elsif ($sarg =~ /^o/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                $out_file = $sarg;
                prt("Set out file to [$out_file].\n") if ($verb);

            } elsif ($sarg =~ /^u/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                $usr_url = $sarg;
                prt("Set usr url to [$usr_url].\n") if ($verb);
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input to [$in_file]\n") if ($verb);
        }
        shift @av;
    }

    if ($debug_on) {
        prtw("WARNING: DEBUG is ON!\n");
        if (length($in_file) ==  0) {
            $in_file = $def_file;
            prt("Set DEFAULT input to [$in_file]\n");
        }
    }
    if (length($in_file) ==  0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (! -f $in_file) {
        pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    }
}

sub give_help {
    prt("$pgmname: version $VERS\n");
    prt("Usage: $pgmname [options] in-file\n");
    prt("Options:\n");
    prt(" --help  (-h or -?) = This help, and exit 0.\n");
    prt(" --verb[n]     (-v) = Bump [or set] verbosity. def=$verbosity\n");
    prt(" --load        (-l) = Load LOG at end. ($outfile)\n");
    prt(" --out <file>  (-o) = Write output to this file.\n");
    prt(" --url <url>   (-u) = Add this URL to link, when shown.\n");

}

########################################################

# eof - gethrefs02.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional