gethrefs.pl to HTML.

index -|- end

Generated: Sun Apr 15 11:46:19 2012 from gethrefs.pl 2011/10/20 25.9 KB.

#!/perl -w
# NAME: gethrefs.pl
# AIM: Parse a HTML file, and extract HREF links
# 20/10/2011 - Add user options - parse_args()
# 18/07/2010 - revisit and test...
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use Cwd;
my $perl_dir = 'C:\GTools\perl';
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
###require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n";

use constant {
     HRT_UNKNOWN => 0,
     HRT_LOCAL => 1,
     HRT_LINK => 2,
     HRT_SCRIPT => 3,
    HRT_FILE => 4,
     HRT_BASE => 5
};
use constant {
   FT_UNKNOWN => 0,
   FT_HTML => 1,
   FT_GRAF => 2,
   FT_CSS => 3,
   FT_SCRIPT => 4,
   FT_TEXT => 5,
   FT_ZIP => 6,
    FT_DIR => 7
};

# offsets in file array
use constant {
   OF_FF => 0,   # full file name
   OF_HR => 1,   # array ref of href links
   OF_IM => 2,   # array ref of image links
   OF_LK => 3,   # linked count
   OF_SP => 4,   # spare
   OF_TO => 5,   # links TO
   OF_FM => 6, # links FROM
   OF_FT => 7   # file type
};

# for htmltools, if functions used
my @imgs = ();
my @hrefs = ();
# log file stuff
our ($LF);
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
    my @tmpsp = split(/(\\|\/)/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = $perl_dir."\\temp.$pgmname.txt";
open_log($outfile);

# user variables
my $VERS = "0.0.2 2011-10-20";
my $load_log = 0;

my $in_file = '';

my $debug_on = 0;
my $def_file = 'C:\GTools\java\examples\JavaTech\Code_List.htm';
##my $in_file = 'C:\HOMEPAGE\GA\travel\maroc\index.htm';
###my $in_file = 'temphtml.htm';
my @all_hrefs = ();
my $verbosity = 0;

my $splout = $perl_dir."\\tempspec.txt";

# CONSTANTS
###########
# File Type Extensions
my @html_extension = qw( .htm .html .shtml .php );
my @graf_extension = qw( .jpg .jpeg .gif .png .bmp .ico .mpg );
my @css_extension  = qw( .css );
my @script_extension = qw( .js .class .cgi .java );
my @zip_extension = qw( .zip .tar .gz .jar );
my @txt_extension = qw( .txt .doc );

# private FRONTPAGE folders
my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived );

# features
my $show_not_found = 0;
my $show_local_links = 0;   # also show INTERNAL (in page) links
my $ignfpd = 1;   # ignore FRONTPAGE folders
my @excludes = qw( desktop.ini php.ini blank.html blank.htm );
my $recurse = 1;   # recursive
my @splexcludes = qw( macpc );

my %ext_hash = ();
my @all_files = ();
my $refcnt = 0;
my @done_files = ();
my %not_found = ();

my ($base_file,$base_dir);
my $base_href = ''; # set if <BASE href="..."> found


# DEBUG
my $dbg1 = 0;   # show discarded material
my $dbg2 = 0;   # show "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]...
my $dbg3 = 0;   # show Processing $lncnt lines from $fil ...
my $dbg4 = 0;   # show File [$name], in [$rdir] ...
my $dbg5 = 0;   # show HREF immediately
my $dbg6 = 0;   # show 'tag' immediately

### program variables
my @warnings = ();
my $cwd = cwd();
my $os = $^O;

sub VERB1() { return $verbosity >= 1; }
sub VERB2() { return $verbosity >= 2; }
sub VERB5() { return $verbosity >= 5; }
sub VERB9() { return $verbosity >= 9; }

sub show_warnings($) {
    my ($val) = @_;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
        foreach my $itm (@warnings) {
           prt("$itm\n");
        }
        prt("\n");
    } else {
        ### prt( "\nNo warnings issued.\n\n" );
    }
}

sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}


sub prtw($) {
   my ($tx) = shift;
   $tx =~ s/\n$//;
   prt("$tx\n");
   push(@warnings,$tx);
}


##################################################################
#   OF_FF => 0,   # full file name
#   OF_HR => 1,   # array ref of href links
#   OF_IM => 2,   # array ref of image links
#   OF_LK => 3,   # linked count
#   OF_SP => 4,   # spare
#   OF_TO => 5,   # links TO
#   OF_FM => 6, # links FROM
#   OF_FT => 7   # file type
sub show_results {
   my $fcnt = scalar @all_files;
   my ($i, $ff, $ft, $cnt);
   for ($i = 0; $i < $fcnt; $i++) {
      $ft = $all_files[$i][OF_FT];
      if ($ft == FT_HTML) {
         $cnt = $all_files[$i][OF_SP];
         if ($cnt == 0) {
            $ff = $all_files[$i][OF_FF];
            prt( "Missed [$ff]\n" );
         }
      }
   }
}

##################################################################
sub get_href_type_const {
   my ($hrf) = shift;
   if ($hrf =~ /^http(s*):\/\//i) {
      return HRT_LINK;
   } elsif ($hrf =~ /^ftp:\/\//i) {
      return HRT_LINK;
   } elsif ($hrf =~ /^javascript:/i) {
      return HRT_SCRIPT;
   } elsif (substr($hrf,0,1) eq '#') {
      return HRT_LOCAL;
    } elsif (index($hrf,'.') >= 0) {
      return HRT_FILE;
   }
   return HRT_UNKNOWN;
}

sub href_type_to_string {
   my ($hrt) = shift;
   if ($hrt == HRT_LINK) {
      return "extern link";
   } elsif ($hrt == HRT_SCRIPT) {
      return "script";
   } elsif ($hrt == HRT_LOCAL) {
      return "local";
    } elsif ($hrt == HRT_FILE) {
      return "file";
    } elsif ($hrt == HRT_BASE) {
      return "BASE";
   } elsif ($hrt == HRT_UNKNOWN) {
      return "unknown";
   }
   return '***FIX ME*** uncased type';
}

sub get_hrefs_from_string($) {
   my ($ln) = shift;
   my ($i, $j, $line, $ch, $ch2, $len, $tag, $disc, $hcnt);
    my ($bgn, $fhr, $hr, $tail, $max, $hrt, $ft, $ctyp);
   my ($sp,$tag2,$gottag);
   my @hrf = ();
   $ln =~ s/\n/ /g;
   $ln = trim_all($ln);
   # sub write2file {    my ($txt,$fil) = @_;
   # write2file($fulln,'tempfl.txt');
   $len = length($ln);
   $disc = '';
   $hcnt = 0;
    # process single long string, char by char
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($ln,$i,1);
      if ($ch eq '<') {
         $tag = $ch; # start a tag
         $i++;
         $ch = substr($ln,$i,1);
         # could check for things like <! ... later maybe
            $tag2 = '';
            $gottag = 0;
         for (; $i < $len; $i++) {
            $ch = substr($ln,$i,1);
            $tag .= $ch;
            if ($ch eq '>') {
               last;
            }
                if (!$gottag) {
                    if ($ch =~ /\w/) {
                        $tag2 .= $ch;
                    } else {
                        $gottag = 1;
                    }
                }
         }

         if ($tag =~ /(.*\s+)href(\s*)=/i) {
            $bgn = $1;
            $sp = length($2);
            $hcnt++;
            $fhr = substr($tag,length($bgn)+5+$sp);
            $fhr = substr($fhr,1) while ($fhr =~ /^\s/); # remove all LEADING space
            $ch = substr($fhr,0,1);
                prt("$tag [$tag2] [$fhr]\n") if ($dbg5);
            if (($ch eq '"')||($ch eq "'")) {
               $max = length($fhr);
               $hr = '';
               $tail = '';
               # collect actual HREF=
               for ($j = 1; $j < $max; $j++) {
                  $ch2 = substr($fhr,$j,1);
                  if ($ch eq $ch2) {
                     $tail = substr($fhr,$j);
                     last;
                  }
                  $hr .= $ch2;
               }
                    if ($tag2 =~ /^BASE$/i) {
                        $hrt = HRT_BASE;
                        prt("Got [$tag2] [$fhr]\n");
                    } else {
                        $hrt = get_href_type_const($hr);
                    }
               $ctyp = '';
               $ft = FT_UNKNOWN;
               if ($hrt == HRT_FILE) {
                  $ft = get_file_type_const($hr);
                  $ctyp = "ext[".file_type_const_to_string($ft)."] ";
               }
               $ctyp = 'type['.href_type_to_string($hrt)."] $ctyp";
                    prt("tag [$tag2] [$hr] $ctyp\n") if ($dbg6);
                    $base_href = $hr if ($hrt == HRT_BASE);
               prt( "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]\n" ) if ($dbg2);
               #          href  HRT-type FT-file
               #            0     1       2
               push(@hrf, [$hr, $hrt,    $ft]);
            } else {
               prt( "$hcnt:HREF: fhr[$fhr] pre[$disc] tag[$tag] bgn[$bgn] CHECK ME\n" );
            }
         } else {
            prt( "DISCARDED: pre[$disc] tag[$tag] ...\n" ) if ($dbg1);
         }
         $disc = '';
      } else {
         $disc .= $ch;
      }
   }
   return @hrf;
}

sub parse_file($$) {
    my ($bdir,$bfil) = @_;
   my $fil = $bdir.$bfil;
   my ($lncnt, $full, $hrcnt, $i, $hfcnt, $typ, $filcnt);
   my @hrf = ();
   if ( ! open INF, "<$fil") {
        prt( "WARNING: Can NOT open file [$fil]...\n" );
       return \@hrf;
    }
    my @lines = <INF>;
    close INF;
    $lncnt = scalar @lines;
    prt( "Processing $lncnt lines from [$fil] ...\n" );
    $full = join('',@lines);
    # sub write2file {    my ($txt,$fil) = @_;
    #my $scrp = return_tag($full,'script');
    ##my $scrp = get_all_tag_text($full,'script');
    ##write2file($scrp,'tempscript.txt');
    ##prt( "Got script text [$scrp]\n" );
    @hrf = get_hrefs_from_string($full);
    $hrcnt = scalar @hrf;
    $filcnt = 0;
    for ($i = 0; $i < $hrcnt; $i++) {
        $fil = $bdir.$hrf[$i][0];
        $typ = $hrf[$i][1];
        if ($typ == HRT_FILE) {
            $filcnt++;
            if (! -f $fil) {
                if (defined $not_found{$fil}) {
                    $not_found{$fil}++;
                } else {
                    prt( "WARNING: File [$fil] NOT found ...\n" ) if ($show_not_found);
                    $not_found{$fil} = 1;
                }
            }
        }
    }
    prt( "Got $hrcnt HREF entries, from $bfil ... $filcnt link files\n" );
   return \@hrf;
}

sub parse_file2($$) {
    my ($bdir,$bfil) = @_;
   my $fil = $bdir.$bfil;
   my ($lncnt, $full, $hrcnt, $i, $hfcnt, $typ, $filcnt);
   my @hrf = ();
   if ( ! open INF, "<$fil") {
        prt( "WARNING: Can NOT open file [$fil]...\n" );
       return @hrf;
    }
    my @lines = <INF>;
    close INF;
    $lncnt = scalar @lines;
    prt( "Processing $lncnt lines from [$fil] ...\n" );
    $full = join('',@lines);
    # sub write2file {    my ($txt,$fil) = @_;
    #my $scrp = return_tag($full,'script');
    ##my $scrp = get_all_tag_text($full,'script');
    ##write2file($scrp,'tempscript.txt');
    ##prt( "Got script text [$scrp]\n" );
    @hrf = get_hrefs_from_string($full);
    $hrcnt = scalar @hrf;
    $filcnt = 0;
    for ($i = 0; $i < $hrcnt; $i++) {
        $fil = $bdir.$hrf[$i][0];
        $typ = $hrf[$i][1];
        if ($typ == HRT_FILE) {
            $filcnt++;
            if (! -f $fil) {
                if (defined $not_found{$fil}) {
                    $not_found{$fil}++;
                } else {
                    prt( "WARNING: File [$fil] NOT found ...\n" ) if ($show_not_found);
                    $not_found{$fil} = 1;
                }
            }
        }
    }
    prt( "Got $hrcnt HREF entries, from $bfil ... $filcnt link files\n" );
   return @hrf;
}

####################################
#########################################################
# Passed an array of extensions,
# check if this is one of them?
#########################################################
sub is_my_extension {
   my ($fil, @exts) = @_;
   my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
   foreach my $ex (@exts) {
      if (lc($ex) eq lc($ext)) {
         return 1;
      }
   }
   return 0;
}

############################################
# only looking for HTM, HTML, PHP,
# could be extended to others maybe ...
############################################
sub is_htm_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, @html_extension) );
}
sub is_graphic_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, @graf_extension) );
}
sub is_zip_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, @zip_extension) );
}
sub is_css_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, @css_extension) );
}
sub is_txt_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, @txt_extension) );
}
sub is_script_extension {
   my ($fil) = shift;
   return( is_my_extension($fil, @script_extension) );
}

#use constant {
#   FT_UNKNOWN => 0,
##   FT_HTML => 1,
##   FT_GRAF => 2,
##   FT_CSS => 3,
##   FT_SCRIPT => 4,
##   FT_TEXT => 5,
##   FT_ZIP => 6,
##  FT_DIR => 7
#};
sub get_file_type_const {
   my ($fil) = shift;
   if (is_htm_extension($fil)) {
      return FT_HTML;
   } elsif (is_graphic_extension($fil)) {
      return FT_GRAF;
   } elsif (is_zip_extension($fil)) {
      return FT_ZIP;
   } elsif (is_css_extension($fil)) {
      return FT_CSS;
   } elsif (is_txt_extension($fil)) {
      return FT_TEXT;
   } elsif (is_script_extension($fil)) {
      return FT_SCRIPT;
   } elsif ($fil =~ /\/$/) {
        return FT_DIR;
    }
   return FT_UNKNOWN;
}

sub file_type_const_to_string {
   my ($ft) = shift;
   if ($ft == FT_HTML) {
      return "html";
   } elsif ($ft == FT_GRAF) {
      return "graphic";
   } elsif ($ft == FT_ZIP) {
      return "zip";
   } elsif ($ft == FT_CSS) {
      return "css";
   } elsif ($ft == FT_TEXT) {
      return "text";
   } elsif ($ft == FT_SCRIPT) {
      return "script";
   } elsif ($ft == FT_DIR) {
        return "directory";
   } elsif ($ft == FT_UNKNOWN) {
      return "unknown";
   }
   return "***FIX ME*** uncased type $ft!";
}

##############################################
####################################################################
# process_folder(folder) 
# Main DIRECTORY processing function
#
# Open the FOLDER given, and collect ALL files found,
# iterate into sub-directories, if $recurse is non-zero,
# and it is NOT a special FRONTPAGE (hidden) FOLDER.
#
# Files are collected into multidemensional arrays
####################################################################
sub process_folder {
   my ($inf) = shift;
   my ($ft, $ff, $nm, $dir, $ext, $val, $fil);
   my $fcnt = 0;
   prt( "Processing $inf folder ...\n" ) if ($dbg1);
   if ( opendir( DIR, $inf ) ) {
      my @files = readdir(DIR);
      closedir DIR;
      foreach $fil (@files) {
         if (($fil eq ".")||($fil eq "..")) {
            next;
         }
         $ft = get_file_type_const($fil);
         $ff = $inf;
         $ff .= "\\" if !($inf =~ /[\\\/]$/);
         $ff .= $fil;
         if ( -d $ff ) {
            if ($recurse) {
               if ($ignfpd && is_fp_folder($fil)) {   # ignore FRONTPAGE folders
                  next;
               }
               if (@splexcludes && in_spl_excludes($fil)) {
                  next;
               }
               process_folder( $ff );
            }
         } else {
            # NOTE: multidimensional arrays pushed - offsets into arrays
            if ( !in_excludes($fil) ) {  # NOT in @excludes
               ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
               $val = 0;
               $val = $ext_hash{$ext} if ( defined $ext_hash{$ext} );
               $val++;
               $ext_hash{$ext} = $val;
               push(@all_files, [$ff, '', '', 0, 0, '', '', $ft] );
               $fcnt++;
            }
         }
      }
      prt( "Processed $inf folder finding $fcnt files ...\n" );
   } else {
      prt( "ERROR: Failed to open folder $inf ...\n" );
   }
}

################################################
# my $ignfpd = 1;   # ignore FRONTPAGE folders
################################################
sub is_fp_folder {
   my ($inf) = shift;
   foreach my $fil (@fpfolders) {
      if (lc($inf) eq lc($fil)) {
         return 1;
      }
   }
   return 0;
}

####################################
# Check if FILE is in EXCLUDE list
####################################
sub in_excludes {
   my ($fil) = shift;
   my $lcf = lc($fil);
   foreach my $f (@excludes) {
      if (lc($f) eq $lcf) {
         return 1;
      }
   }
   return 0;
}

sub in_spl_excludes {
   my ($fldr) = shift;
   my $lfldr = lc($fldr);
   foreach my $f (@splexcludes) {
      if (lc($f) eq $lfldr) {
         return 1;
      }
   }

   return 0;
}

sub set_status_case {
   my ( $ch, $pch, $inccm, $inlnc, $inqot, $qot ) = @_;
   my $ldbg2 = 0;
   if ($$inccm) {
      if (($ch eq '/')&&($pch eq '*')) {
         $$inccm = 0;
         prt( "status: End C comment /* */ ...\n" ) if ($ldbg2);
      }
   } elsif ($$inlnc ) {
      if ($ch eq "\n") {
         $$inlnc = 0;
         prt( "status: End line comment // ...\n" ) if ($ldbg2);
      }
   } elsif ($$inqot ) {
      if ($ch eq $$qot) {
         prt( "status: End quote $$qot ...\n" ) if ($ldbg2);
         $$inqot = 0;
         $$qot = '';
      }
   } else {
      if ($ch eq '/') {
         if ($pch eq '/') {
            $$inlnc = 1;
            prt( "status: Entered line comment // ...\n" ) if ($ldbg2);
         }
      } elsif ($ch eq '*') {
         if ($pch eq '/') {
            $$inccm = 1;
            prt( "status: Entered C comment /* */ ...\n" ) if ($ldbg2);
         }
      } elsif (($ch eq '"')||($ch eq "'")) {
         $$qot = $ch;
         $$inqot = 1;
         prt( "status: Entered quote $$qot ...\n" ) if ($ldbg2);
      }
   }
}


sub get_all_tag_text {
   my ($txt, $tag) = @_;
   my $len = length($txt);
   my $ldbg1 = 0;
   my $ntxt = '';
   my $ch = '';
   my $pch = '';
   my $ftag = '';
   my $nline = '';
   my $i = 0;
   my $intag = 0;
   my $incomment = 0;
   my $inqot = 0;   # in quotes ' or "
   my $qot = '';
   my $inlnc = 0;   # in line comment
   my $inccm = 0;  # in C comment
   my ($part, $shlen);
   ###prt("Processing $len chars for $tag ...\n");
   for ($i = 0; $i < $len; $i++) {
      $pch = $ch;
      $ch = substr($txt, $i, 1);
      set_status_case( $ch, $pch, \$inccm, \$inlnc, \$inqot, \$qot );
      if ($incomment) {
         $ntxt .= $ch;
         if ($ch eq '>') {
            $shlen = -15;
            if (length($ntxt) < 15) {
               $shlen = 0 - length($ntxt);
            }
            prt( "Potential close [".substr($ntxt,$shlen)."] ...($i)" ) if ($ldbg1);
            if (substr($ntxt,-3) eq '-->') {
               if (!$inqot && !$inlnc && !$inccm) {
                  prt( " Yes\n" ) if ($ldbg1);
                  $incomment = 0;   # no longer IN comment
                  prt("End comment <!-- --> ...\n") if ($ldbg1);
               } else {
                  if ($inqot) {
                     prt( " NO DUE TO IN QUOTE\n" ) if ($ldbg1);
                  } elsif ($inlnc) {
                     prt( " NO DUE TO IN LINE COMMENT\n" ) if ($ldbg1);
                  } elsif ($inccm) {
                     prt( " NO DUE TO IN C COMMENT\n" ) if ($ldbg1);
                  } else {
                     prt( " NO DUE TO SOME REASON!!! **** CHECK ME!!! ****\n" ) if ($ldbg1);
                  }
               }
            } else {
               prt( " NO!\n" ) if ($ldbg1);
            }
         }
      } elsif ($intag) {
         if ($ch eq "<") {
            ###prt("Got begin < ...\n");
            $part = substr($txt,$i,4);
            if ($part eq '<!--') {   # if a powerful comment starts
               prt("Entering comment <!-- ...\n") if ($ldbg1);
               $incomment = 1;         # unconditionally go to end of this comment
               $ntxt .= $ch;
               next;
            }
            $i++;
            $ftag = '';
            for ( ; $i < $len; $i++ ) {
               $pch = $ch;
               $ch = substr($txt, $i, 1);
               if ($ch eq '>') {
                  last;
               }
               $ftag .= $ch;
            }
            $ntxt .= '<'.$ftag;
            ###prt("Got tag [$ftag] ...\n");
            if (lc(substr($ftag,1)) eq lc($tag)) {
               $intag = 0;
            }
         } 
         $ntxt .= $ch;
      } else {
         if ($ch eq "<") {
            ###prt("Got begin < ...\n");
            $i++;
            $ftag = '';
            for ( ; $i < $len; $i++ ) {
               $pch = $ch;
               $ch = substr($txt, $i, 1);
               if (($ch eq '>')||($ch eq ' ')||($ch =~ /\s/)) {
                  last;
               }
               $ftag .= $ch;
            }
            ###prt("Got tag [$ftag] ...\n");
            if (lc($ftag) eq lc($tag)) {
               $ntxt .= '<'.$ftag.$ch;
               if (($ch eq ' ')||($ch =~ /\s/)) {
                  $i++;
                  for ( ; $i < $len; $i++ ) {
                     $pch = $ch;
                     $ch = substr($txt, $i, 1);
                     $ntxt .= $ch;
                     if ($ch eq '>') {
                        last;
                     }
                  }
               }
               ###prt( "Entered tag <$ftag...> ($tag)...\n" );
               $intag = 1;
            }
         }
      }
   }
   return $ntxt;
}

sub unix_2_dos {
   my ($f) = shift;
   $f =~ s/\//\\/g;
   return $f;
}

#   OF_FF => 0,   # full file name
#   OF_HR => 1,   # array ref of href links
#   OF_IM => 2,   # array ref of image links
#   OF_LK => 3,   # linked count
#   OF_SP => 4,   # spare
#   OF_TO => 5,   # links TO
#   OF_FM => 6, # links FROM
#   OF_FT => 7   # file type
sub mark_in_all_files {
   my ($fil, $dir) = @_;
   my $max = scalar @all_files;
   my ($i, $ff, $nm, $dr, $fnd);
   $dir = unix_2_dos($dir);
   $fnd = 0;
   for ($i = 0; $i < $max; $i++) {
      $ff = $all_files[$i][OF_FF];
      ($nm,$dr) = fileparse($ff);
      $dr = unix_2_dos($dr);
      if (($nm eq $fil)&&($dir eq $dr)) {
         $all_files[$i][OF_SP]++;
         $fnd++;
      }
   }
   if (!$fnd) {
      prt( "WARNING: [$fil] in [$dir] NOT found ...\n" );
   }
}

sub in_done_files {
   my ($nm, $dr, @dn) = @_;
   my $ct = scalar @dn;
   my ($nnm, $ndr);
   for (my $j = 0; $j < $ct; $j++) {
      $nnm = $dn[$j][0];
      $ndr = $dn[$j][1];
      if (($nnm eq $nm) && ($ndr eq $dr)) {
         return 1;
      }
   }
   return 0;
}


sub process_references {
   my ($bdir) = @_;
   my $rcnt = scalar @all_hrefs;
   my ($i, $ff, $ft, $name, $rdir, $hrt, @new_hrefs, @nxt_hrefs, $dncnt);
   $bdir = unix_2_dos($bdir);
   prt( "Processing $rcnt HREF found ...\n" );
   $dncnt = 0;
   for ($i = 0; $i < $rcnt; $i++) {
      $ff = $bdir.$all_hrefs[$i][0];
      $hrt = $all_hrefs[$i][1];
      $ft = $all_hrefs[$i][2];
      if (($hrt == HRT_FILE)&&($ft == FT_HTML)) {
         ($name,$rdir) = fileparse( $ff );
         $rdir = unix_2_dos($rdir);
         if ($ff =~ /\.\./) {
            # this back up ...
            prt( "SKIPPING file [$name], in [$rdir] ...\n" );
         } else {
            prt( "File [$name], in [$rdir] ...\n" ) if ($dbg4);
            mark_in_all_files($name, $rdir);
            @new_hrefs = parse_file2($rdir, $name);
            $dncnt++;
         }
         push(@done_files, [$name, $rdir]);
      }
   }

   $rcnt = scalar @new_hrefs;
   prt( "Processed $dncnt file, for new $rcnt files ...\n" );
   $dncnt = 0;
   while ($rcnt) {
      @nxt_hrefs = ();
      $dncnt = 0;
      prt( "Processing $rcnt NEW HREF found ...\n" );
      for ($i = 0; $i < $rcnt; $i++) {
         $ff = $bdir.$new_hrefs[$i][0];
         $hrt = $new_hrefs[$i][1];
         $ft = $new_hrefs[$i][2];
         if (($hrt == HRT_FILE)&&($ft == FT_HTML)) {
            ($name,$rdir) = fileparse( $ff );
            $rdir = unix_2_dos($rdir);
            if ( !in_done_files($name, $rdir, @done_files) ) {
               if ($ff =~ /\.\./) {
                  # this back up ...
                  prt( "SKIPPING file [$name], in [$rdir] ...\n" );
               } else {
                  prt( "File [$name], in [$rdir] ...\n" );
                  mark_in_all_files($name, $rdir);
                  @nxt_hrefs = parse_file2($rdir, $name);
                  $dncnt++;
               }
               push(@done_files, [$name, $rdir]);
            }
         }
      }
      @new_hrefs = @nxt_hrefs;
      $rcnt = scalar @new_hrefs;
      prt( "Processed $dncnt file, for new $rcnt files ...\n" );
   }
}

sub fits_special_filter($$) {
    my ($hr,$ra) = @_;
    # https://git.gitorious.org/flightgear-aircraft/717.git
    if ($hr =~ /^https:\/\/git\.gitorious\.org\/flightgear-aircraft\/(.+)\.git$/) {
        push(@{$ra},$1);
        return 1;
    }
    return 0;
}

sub show_href_list($) {
    my ($raorg) = @_;
    #          href  HRT-type FT-file
   #            0     1       2
   #push(@hrf, [$hr, $hrt,    $ft]);
    my $cnt = scalar @{$raorg};
    my @arr = sort @{$raorg};
    my $ra = \@arr;
    my %dupes = ();
    prt("Display of up to $cnt hrefs... eliminating duplicates\n");
    my ($i,$i2,$ccnt,$hr,$hrt,$ft,$min,$len,$shrt,$form,$splcnt);
    my @specials = ();
    $i = int($cnt / 10);
    $form = '%d';
    $i2 = 1;
    while ($i) {
        $i2++;
        $form = '%'.sprintf("%d",$i2).'d';
        $i = int($i / 10);
    }
    $min = 0;
    for ($i = 0; $i < $cnt; $i++) {
        $hr = ${$ra}[$i][0];
        $hrt = ${$ra}[$i][1];
        $ft = ${$ra}[$i][2];
        $len = length($hr);
        $min = $len if ($len > $min);
    }
    $i2 = 0;
    for ($i = 0; $i < $cnt; $i++) {
        $hr = ${$ra}[$i][0];
        $hrt = ${$ra}[$i][1];
        $ft = ${$ra}[$i][2];
        
        if (!defined $dupes{$hr}) {
            # display it
            $dupes{$hr} = 1;
            $splcnt++ if (fits_special_filter($hr,\@specials));
            $i2++;
            $ccnt = sprintf($form,$i2);
            $hr .= ' ' while (length($hr) < $min);
            $shrt = href_type_to_string($hrt);
            prt("$ccnt: $hr $shrt\n");
        }
    }
    prt("Displayed $i2 different hrefs...\n");
    if (@specials) {
        my @arr = sort @specials;
        write2file(join("\n",@arr)."\n",$splout);
        prt("Written $splcnt to 'special' list $splout\n");
    }
}

########################################################
### MAIN ###

parse_args(@ARGV);

($base_file,$base_dir) = fileparse( $in_file );
$base_dir = unix_2_dos($base_dir);

# process_folder($base_dir);   # get ALL the files, in mutidemensional array

my $ref_hrefs = parse_file($base_dir, $base_file);
show_href_list($ref_hrefs);

push(@done_files, [$base_file, $base_dir]);

# mark_in_all_files($base_file, $base_dir);
# $refcnt = scalar @all_hrefs;

# process_references($base_dir);

# show_results();

pgm_exit(0,"");

########################################################

sub give_help {
    prt("$pgmname: version $VERS\n");
    prt("Usage: $pgmname [options] in-file\n");
    prt("Options:\n");
    prt(" --help  (-h or -?) = This help, and exit 0.\n");
    prt(" --verb[n]     (-v) = Bump [or set] verbosity. def=$verbosity\n");
    prt(" --load        (-l) = Load LOG at end.\n");
}
sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}

sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^v/) {
                if ($sarg =~ /^v.*(\d+)$/) {
                    $verbosity = $1;
                } else {
                    while ($sarg =~ /^v/) {
                        $verbosity++;
                        $sarg = substr($sarg,1);
                    }
                }
                prt("Verbosity = $verbosity\n") if (VERB1());
            } elsif ($sarg =~ /^l/) {
                $load_log = 1;
                prt("Set to load log at end\n") if (VERB1());
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input to [$in_file]\n");
        }
        shift @av;
    }

    if ((length($in_file) ==  0) && $debug_on) {
        $in_file = $def_file;
    }
    if (length($in_file) ==  0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (! -f $in_file) {
        pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    }
}

# eof - gethrefs.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional