htmllib.pl to HTML.

index -|- end

Generated: Mon Aug 16 14:14:24 2010 from htmllib.pl 2010/04/14 28.1 KB.

#!/perl -w
# NAME: htmllib.pl
# AIM: To process a HTML file, parse HTML elements, and return hash reference
# 2010/04/13 - looking good...
# 2010/04/12 - geoff mclane - http://geoffair.net/mperl/
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use Cwd;

my $no_ignore_close_element = 1;   # dangerous - ignoring a close element

my @closed_tags = ( "meta", "link", "applet", "img", "input", "object", "embed", "servlet",
"br", "hr", "area", "base", "basefont", "frame", "isindex", "param", "bgsound", "embed", "keygen" );

# tags which do NOT need a closing, like </p>, tag
my @opt_tags = ( "body", "colgroup", "dd", "dt", "head", "html", "li", "optgroup", "option",
"p", "tbody", "td", "tfoot", "th", "thead", "tr", "marquee" );

### program variables
my $verbosity = 0;
my @warnings = ();
my $cwd = cwd();

# debug
my $hl_dbg01 = 0; # show each item pushed to the stack
my $hl_dbg38 = 0; # prt( "[hl_dbg38] Got [$hr2] = [$txt] [$fil]\n" ) if ($hl_dbg38);
my $hl_dbg39 = 0; # prt( "[hl_dbg39] Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ) if ($hl_dbg39);

sub HL_VERB1() { return (($verbosity > 0) ? 1 : 0); }
sub HL_VERB5() { return (($verbosity >= 5) ? 1 : 0); }
sub HL_VERB9() { return (($verbosity >= 9) ? 1 : 0); }

sub is_closed_tag($) {
    my ($tt) = shift;
    my $lctt = lc($tt);
    foreach my $tag (@closed_tags) {
        return 1 if ($tag eq $lctt);
    }
    return 0;
}

sub is_opt_tag($) {
    my ($tt) = shift;
    my $lctt = lc($tt);
    foreach my $tag (@opt_tags) {
        return 1 if ($tag eq $lctt);
    }
    return 0;
}

# $drop = can_find_this_tag($tag,\@elements);
sub can_find_this_tag($$) {
    my ($tag,$re) = @_;
    my $len = scalar @{$re};
    my $drop = 0;
    my $bu = -1;
    my $last = '';
    my $lctag = lc($tag);
    while ($len) {
        $drop++;    # can pop this one
        $last = ${$re}[$bu][0]; # get tag
        if (($last eq $tag)||(lc($last) eq $lctag)) {    # if the desired tag
            return $drop;   # return drop value
        } elsif ( ! is_opt_tag($last) ) {
            return 0;   # oop, have a non-optional tag
        }
        $bu--;  # back up one more
        $len--; # and reduce available to check
    }
    return 0;
}

sub is_all_optional($) {
    my ($re) = @_;
    my $len = scalar @{$re};
    my $bu = -1;
    my ($last);
    while ($len) {
        $last = ${$re}[$bu][0]; # get tag
        if ( ! is_opt_tag($last) ) {
            return 0;   # oop, have a non-optional tag
        }
        $bu--;  # back up one more
        $len--; # and reduce available to check
    }
    return 1;   # ALL were optiona
}

sub pop_optional_elements($) {
    my ($re) = @_;
    my $len = scalar @{$re};
    my $bu = -1;
    my $pop = 0;
    my ($last);
    while ($len--) {
        $last = ${$re}[$bu][0]; # get tag
        last if (!is_opt_tag($last));
        $pop++;
        $bu--;
    }
    return $pop;
}

sub count_optional_elements($) {
    my ($re) = @_;
    my $len = scalar @{$re};
    my $opts = 0;
    my $cnt =  0;
    my ($last);
    while ($len--) {
        $last = ${$re}[$cnt][0]; # get tag
        $opts++ if (is_opt_tag($last));
        $cnt++;
    }
    return $opts;
}

sub show_stack_elements($$$) {
    my ($tag,$rele,$rlns) = @_;
    my $cnt = scalar @{$rele};
    my $lcnt = scalar @{$rlns};
    if ($cnt) {
        prt("The stack has $cnt elements... The current closing element is [$tag]\n");
        for (my $i = 0; $i < $cnt; $i++) {
            my $e = ${$rele}[$i][0];
            my $n = ${$rele}[$i][1];
            prt("$n: element:[$e]");
            prt(" SAME as tag [$tag]!") if ($e eq $tag);
            if ($n <= $lcnt) {
                my $ln = trim_all(${$rlns}[$n-1]);
                prt(" line=[$ln]");
            }
            prt("\n");
        }
    }
}

sub get_element_chain($) {
    my ($rele) = @_;
    my $cnt = scalar @{$rele};
    my $chn = '';
    if ($cnt) {
        for (my $i = 0; $i < $cnt; $i++) {
            my $e = ${$rele}[$i][0];
            $chn .= '|' if length($chn);
            $chn .= $e;
        }
    }
    return $chn;
}

sub get_attribute_hash_ref($$$$) {
    my ($fank,$fil,$xml,$dbg) = @_;
    my %hash = ();
    my ($ank,$len,$i,$ch,$pc,$hr2,$txt,$msg);
    $ank = trim_all($fank);
    $len = length($ank);
    $ch = '';
    $hr2 = '';
    for ($i = 0; $i < $len; $i++) {
        $pc = $ch;
        $ch = substr($ank,$i,1);
        # if ($ch =~ /\w/) - this missed xml:link="abc"
        # and 'http-equiv="..."
        if ($ch =~ /(\w|:|-)/) {
            $hr2 .= $ch;   # accumulate \w chars - alphanumeric, including _
        } elsif (length($hr2)) {
            if (($ch ne '=') && ($ch =~ /\s/)) {
                $i++;
                for (; $i < $len; $i++) {
                    $ch = substr($ank,$i,1);
                    last if ($ch eq '=');
                    last if !($ch =~ /\s/);
                }
            }
            if ($ch eq '=') {
                # found our equal sign
                $i++; # move on... eat any space
                for (; $i < $len; $i++) {
                    $ch = substr($ank,$i,1);
                    last if ($ch =~ /('|")/);
                    last if !($ch =~ /\s/);
                }
                if (($ch eq '"')||($ch eq "'")) {
                    $pc = $ch;
                    $i++; # move on...
                    $txt = '';
                    for (; $i < $len; $i++) {
                        $ch = substr($ank,$i,1);
                        last if ($ch eq $pc);   # ONLY stop when chars are equal (should look for \"???)
                        $txt .= $ch;
                    }
                    if ($ch eq $pc) {
                        $hr2 = lc($hr2) if ($xml);
                        $hash{$hr2} = $txt;
                        prt( "[hl_dbg38] Got [$hr2] = [$txt] [$fil]\n" ) if ($hl_dbg38);
                    } else {
                        prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO END INVERTED COMMA! ($pc) [$fil]\n");
                        pgm_exit(1,"NEED CODE FIX!") if ($dbg);
                    }
                } else {
                    #if (($ch =~ /\w/) && (($hr2 =~ /name/i)||($hr2 =~ /id/i))) {
                    #if ($ch =~ /(\w|-)/) {
                    # collect any non-white space chars
                    if ($ch =~ /\S/) {
                        # accept ALL WITHOUT inverted comma
                        $txt = $ch; # start the text
                        $i++; # MOVING ON
                        for (; $i < $len; $i++) {
                            $ch = substr($ank,$i,1);
                            #last if !($ch =~ /\w/); # can ONLY stop on NOT alphanumeric
                            #last if !($ch =~ /(\w|-|:)/); # can ONLY stop on NOT alphanumeric or some specials
                            last if ($ch =~ /\s/); # can ONLY stop on white
                            $txt .= $ch;    # accumulate all
                        }
                        $hr2 = lc($hr2) if ($xml);
                        $hash{$hr2} = $txt;
                        prt( "[hl_dbg39] Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ) if ($hl_dbg39);
                    } else {
                        prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO START INVERTED COMMA! [$fil]\n");
                        pgm_exit(1,"NEED CODE FIX!") if ($dbg);
                    }
                }
            } else {
                $msg = "PROBLEM: NO EQUAL SIGN! got key[".$hr2."] = text[".$txt."] so far. At pos $i in \n[$ank]\n";
                $msg .= " " x ($i+1) ."^\n";
                my @arr = keys(%hash);
                if (@arr) {
                    $msg .= "Already stacked ".scalar @arr." element(s)\n";
                    foreach $hr2 (keys %hash) {
                        $txt = $hash{$hr2};
                        $msg .= " $hr2 = [".$txt."]\n"
                    }
                }
                $msg .= "File=[$fil]\n";
                prtw($msg);
                pgm_exit(1,"NEED CODE FIX!") if ($dbg);
            }
            $hr2 = '';
        }
    }
    return \%hash;
}

sub get_html_file_hash($$$) {
    my ($inf,$opt,$dbg) = @_;
    if (!open INF, "<$inf") {
        pgm_exit(1,"ERROR: Unable to open file [$inf]!\n");
    }
    my @lines = <INF>;
    close INF;
    my $lncnt = scalar @lines;
    prt("Got $lncnt lines, from file [$inf]...\n") if ($dbg & 1);
    my ($i,$line,$ch,$tag,$len,$intag,$txt,$j,$pc,$ppc,$incdata,$hadsp,$attrs);
    my ($lnn,$last,$lln,$bgnlnn,$endlnn,$clnn,$stkdep,$maxdep);
    my ($maxelement,$echn,$hr,$msg,$ctag);
    my ($incomm,$pppc,$drop,$bgncdata,$bgntag,$bgncomm,$k);
    $tag = '';
    $attrs = '';
    $intag = 0;
    $incdata = 0;
    $hadsp = 0;
    $txt = '';
    $ch = '';
    $pc = '';
    $ppc = '';
    my @elements = ();
    $lnn = 0;
    $maxdep = 0;
    $maxelement = '';
    $incomm = 0;
    $last = '';
    $lln = -1;
    my @html = ();
    my @probcloses = ();
    for ($i = 0; $i < $lncnt; $i++) {
        $line = $lines[$i];
        chomp $line;
        $len = length($line);
        $lnn++;
        $clnn = "$lnn";
        for ($j = 0; $j < $len; $j++) {
            $pppc = $ppc;
            $ppc = $pc;
            $pc = $ch;
            $ch = substr($line,$j,1);
            if ($incdata) {
                if (($ch eq '>')&&($pc eq ']')&&($ppc eq ']')) {
                    $incdata = 0;
                    prt("$clnn: End CDATA - line=[$line]\n") if (HL_VERB5() || ($dbg & 16));
                    # ==========================================
                    push(@html,[$txt,$tag,$attrs,$hr]);
                    $msg = "$lnn: Store1 {".$txt."}{".$tag."}{".$attrs."}";
                    $msg =~ s/\n/\*nl\*/g;
                    prt( "$msg\n" ) if ( $hl_dbg01 || HL_VERB9() || ($dbg & 256));
                    $hr = get_attribute_hash_ref("",$inf,1,1);
                    # ==========================================
                    prtw("WARNING: CDATA: Attribute collect has length! [$attrs]\n") if (length($attrs));
                    # reset
                    $txt = '';
                    $tag = '';
                    $attrs = '';
                    $hadsp = 0;
                    $intag = 0;
                    next;
                } elsif (($ch eq '>') && ($pc eq ']')) {
                    push(@probcloses,[$lnn,$line]);
                } elsif (($ch eq ']') && ($pc eq ']')) {
                    push(@probcloses,[$lnn,$line]);
                }
                $tag .= $ch;
                if ($tag =~ /<\!\[CDATA\[$/) {
                    # YEEK found another CDATA nested in a CDATA - NESTED CDATA sections are NOT allowed
                    $echn = get_element_chain(\@elements);
                    $msg = '';
                    if (@probcloses) {
                        $k = scalar @probcloses;
                        $msg = "INFO:$lnn: Possible partial $k close(s) was/were\n";
                        for ($k = 0; $k < scalar(@probcloses); $k++) {
                            $msg .= "INFO:".$probcloses[$k][0].": line=[".$probcloses[$k][1]."]\n";
                        }
                    }
                    $msg .= "INFO:$lnn: Stacked elements [$echn]\n";
                    pgm_exit(1,"ERROR:$lnn: In CDATA, and found NESTED CDATA - NOT ALLOWED! line=[$line]\n".
                        "INFO:$lnn: May be unclosed CDATA beginning line $bgncdata!\n$msg");
                }
            } elsif ($incomm) {
                # very specific --> exit for this tag
                if (($ch eq '>')&&($pc eq '-')&&($ppc eq '-')) {
                    $incomm = 0;
                    prt("$clnn: End comment\n") if (HL_VERB5() || ($dbg & 16));
                    # ==========================================
                    push(@html,[$txt,$tag,$attrs,$hr]);
                    $msg = "$lnn: Store2 {".$txt."}{".$tag."}{".$attrs."}";
                    $msg =~ s/\n/\*nl\*/g;
                    prt( "$msg\n" ) if ( $hl_dbg01 || HL_VERB9() || ($dbg & 256) );
                    $hr = get_attribute_hash_ref("",$inf,1,1);
                    # ==========================================
                    prtw("WARNING: end comment: Attribute collect has length! [$attrs]\n") if (length($attrs));
                    # reset
                    $txt = '';
                    $tag = '';
                    $attrs = '';
                    $hadsp = 0;
                    $intag = 0;
                    next;
                } elsif (($ch eq '-')&&($pc eq '-')) {
                    push(@probcloses,[$lnn,$line]);
                } elsif (($ch eq '>')&&($pc eq '-')) {
                    push(@probcloses,[$lnn,$line]);
                }
                $tag .= $ch;
            } elsif ($intag) {
                if ($hadsp) {
                    $attrs .= $ch if !($ch eq '>');
                } elsif ($ch =~ /\s/) {
                    $hadsp = 1;
                    $attrs .= $ch;
                } else {
                    $tag .= $ch if !($ch eq '>');
                }

                if ($ch eq '>') {
                    $intag = 0;
                    $endlnn = $lnn;
                } elsif (($ch eq '[')&&($pc eq 'A')&&($tag =~ /\!\[CDATA\[$/)) {
                    $incdata = 1;
                    prt("$clnn: Begin CDATA - line=[$line]\n") if (HL_VERB5()|| ($dbg & 16));
                    $bgncdata = $lnn;
                    @probcloses = ();    # clear probable closes
                    next;
                } elsif (($ch eq '-')&&($pc eq '-')&&($ppc eq '!')&&($pppc eq '<')) {
                    $incomm = 1;
                    prt("$clnn: Begin comment - line=[$line]\n") if (HL_VERB5()|| ($dbg & 16));
                    $bgncomm = $lnn;
                    @probcloses = ();    # clear probable closes
                    next;
                }
                if (!$intag) {
                    $tag = trim_all($tag);
                    $clnn = "$lnn";
                    $msg = "$clnn: ";
                    $msg .= "Text [".trim_all($txt)."]\n$clnn: " if (length($txt) && !($txt =~ /^\s+$/));
                    $msg .= "End tag [$tag] ";
                    $msg .= "Attrs [".trim_all($attrs)."] " if (length($attrs));
                    if ($tag =~ /^(\!|\?)/) {
                        $hr = get_attribute_hash_ref("",$inf,1,1);
                        $msg .= "Special";
                    } else {
                        # if ($attrs =~ /\/$/) but it may NOT end with '/'
                        $hr = get_attribute_hash_ref(trim_all($attrs),$inf,1,1);
                        if (($attrs =~ /\/$/) || is_closed_tag($tag)) {
                            $msg .= "self-closed";
                        } elsif ($tag =~ /^\//) {
                            $ctag = substr($tag,1);
                            $msg .= "Close";
                            if (@elements) {
                                $last = $elements[-1][0]; 
                                $lln  = $elements[-1][1]; 
                                if ($last eq $ctag) {
                                    pop @elements;
                                } else {
                                    # but may have 'opt' tags - tags that need no close on the stack, which
                                    # can be dropped to get to this tag
                                    $drop = can_find_this_tag($ctag,\@elements);
                                    if ($drop) {
                                        while($drop--) {
                                            pop @elements;
                                        }
                                    } else {
                                        if ($opt & $no_ignore_close_element) {
                                            prt("Was processing [$msg] in FILE:[$inf]\n");
                                            $msg = '';
                                            prt("\nERROR: Last [$last]$lln NE [$ctag]$lnn line=[".trim_all($line)."]\n");
                                            show_stack_elements($ctag,\@elements,\@lines);
                                            pgm_exit(1,"ERROR:[1]: It is useless to continue when the element stack is out of order!\n");
                                        } else {
                                            $echn = get_element_chain(\@elements);
                                            prtw("WARNING:$lnn: Last close [<$tag>] NOT LAST in stack [$echn] - IGNORING!\n") if ($dbg & 4);
                                        }
                                    }
                                }
                                $echn = get_element_chain(\@elements);
                                prt("$lnn: Popped element [$tag] remains [$echn]\n") if (HL_VERB9() || ($dbg & 256));
                            } else {
                                prt("$msg\n");
                                $msg = '';
                                prt("\nERROR: The stack has NO elements... The current closing element is [$ctag]\n");
                                pgm_exit(1,"ERROR:[2]: It is useless to continue when the element stack is out of order! [2]\n");
                            }
                        } else {
                            $msg .= "Open";
                            if (@elements) {
                                $last = $elements[-1][0]; 
                                $lln  = $elements[-1][1]; 
                            }
                            push(@elements,[$tag,$bgnlnn,$endlnn]);
                            $echn = get_element_chain(\@elements);
                            $stkdep = scalar @elements;
                            if ($stkdep > $maxdep) {
                                $maxdep = $stkdep;
                                $maxelement = "$clnn: $tag $bgnlnn $endlnn [$echn]";
                            }
                            prt("$lnn: Pushed element [$tag] chain=[$echn]\n") if (HL_VERB9() || ($dbg & 256));
                       }
                    }
                    prt("$msg\n") if (HL_VERB1());
                    # ==========================================
                    push(@html,[$txt,$tag,$attrs,$hr]);
                    $msg = "$lnn: Store3 {".$txt."}{".$tag."}{".$attrs."}";
                    $msg =~ s/\n/\*nl\*/g;
                    prt("$msg\n") if ( $hl_dbg01 || HL_VERB9() || ($dbg & 256) );
                    # ==========================================

                    # reset
                    $txt = '';
                    $tag = '';
                    $attrs = '';
                    $hadsp = 0;
                }
            } else {
                if ($ch eq '<') {
                    $tag = '';
                    $intag = 1;
                    $hadsp = 0;
                    $bgnlnn = $lnn;
                    $bgntag = $lnn;
                    prt("$lnn: Begin tag line=[$line]\n") if (HL_VERB9() || ($dbg & 256));
                } else {
                    $txt .= $ch;
                }
            }
        } # reached end of line - get next
        #=================================
        $ch = "\n";
        if ($incdata) {
            $tag .= $ch;
        } else {
            if ($intag) {
                if ($hadsp) {
                    $attrs .= $ch; # if (length($attrs)); # && !($attrs =~ /\s$/));
                } else {
                    $tag .= $ch; # if (length($tag)); # && !($tag =~ /\s$/));
                }
            } else {
                $txt .= $ch; # if (length($txt)); # && !($txt =~ /\s$/));
            }
        }
        $pppc = $ppc;
        $ppc = $pc;
        $pc = $ch;
    }

    # enf line by line processing
    prt("Max. element stack $maxdep...$maxelement\n") if ($dbg & 2);
    if ($incdata) {
        $echn = get_element_chain(\@elements);
        $msg = '';
        if (@probcloses) {
            $k = scalar @probcloses;
            $msg = "INFO:$lnn: Possible partial $k close(s) was/were\n";
            for ($k = 0; $k < scalar(@probcloses); $k++) {
                $msg .= "INFO:".$probcloses[$k][0].": line=[".$probcloses[$k][1]."]\n";
            }
        }
        $msg .= "INFO:$lnn: Stacked elements [$echn]\n";
        pgm_exit(1,"ERROR: File expired while in CDATA, commenced line $bgncdata\n$msg\n");
    } elsif ($incomm) {
        $echn = get_element_chain(\@elements);
        $msg = '';
        if (@probcloses) {
            $k = scalar @probcloses;
            $msg = "INFO:$lnn: Possible partial $k close(s) was/were\n";
            for ($k = 0; $k < scalar(@probcloses); $k++) {
                $msg .= "INFO:".$probcloses[$k][0].": line=[".$probcloses[$k][1]."]\n";
            }
        }
        $msg .= "INFO:$lnn: Stacked elements [$echn]\n";
        pgm_exit(1,"ERROR: File expired while in 'comment', commenced line $bgncomm\n$msg\n");
    } elsif ($intag) {
        prtw("WARNING: File expired while in 'tag', commenced line $bgntag\n");
    }
    if (@elements && !is_all_optional(\@elements)) {
        $drop = pop_optional_elements(\@elements);
        if ($drop) {
            prt("Dropping $drop optional elements from stack... ");
            while($drop--) {
                pop @elements;
            }
            $drop = scalar @elements;
            if ($drop) {
                prt("leaving $drop...");
                $drop = count_optional_elements(\@elements);
                if ($drop) {
                    prt(" $drop are optional...");
                }
            } else {
                prt("leaving none...");
            }
            prt("\n");
        }
        if (@elements) {
            show_stack_elements("At-End-of-File",\@elements,\@lines);
            pgm_exit(1,"ERROR: This file [$inf] is NOT clean!\n");
        }
    }
    prt("Done $lncnt lines... [$inf] appears ok...\n") if ($dbg & 1);
    my %hash = ();
    $hash{$inf} = [@html];
    return \%hash;
}

sub get_href_type($) {
    my ($src) = shift;
    if ($src =~ /^http:/i) {
        #push(@httprefs, [$src, $fil, $lnnos] );
        return 1; # remote HREF
    } elsif ($src =~ /^https:/i) {
        return 1; # remote HREF
        #push(@httpsrefs, [$src, $fil, $lnnos] );
    } elsif ($src =~ /^ftp:/i) {
        #push(@ftprefs, [$src, $fil, $lnnos] );
        return 3; # remote HREF
    } elsif ($src =~ /^mailto:/i) {
        #push(@mtrefs, [$src, $fil, $lnnos] );
        return 4; # remote HREF
    } elsif ( $src =~ /^javascript:/i ) {
        return 5; # a JAVASCRIPT HREF
    } elsif ($src =~ /^file:/i) {
        return 5; # remote HREF
    } elsif ( substr($src,0,1) eq '#') {
        # local in page HREF
        return 6;
    } else {
        my $ind = index($src,'#');
        $src = substr($src,0,$ind) if ( $ind != -1 );
        $ind = index($src,'?');
        $src = substr($src,0,$ind) if ( $ind != -1 );
        $src =~ s/\/$//;
        return 7 if (length($src));
    }
    return 0;
}

sub dos_2_unix($) {
    my ($du) = shift;
    $du =~ s/\\/\//g;
    return $du;
}

sub sub_common_folder_unix {
    my ($f1, $f2) = @_;
    my $df1 = dos_2_unix($f1);
    my $df2 = dos_2_unix($f2);
    if ($^O eq 'MSWin32') {
        $df1 = lc($df1);
        $df2 = lc($df2);
   }
   # paddle across, stopping at first difference
    my $off = 0;
    while ( substr($df1,$off,1) && substr($df2,$off,1) &&
            ( substr($df1,$off,1) eq substr($df2,$off,1) ) ) {
        $off++;
    }
    return substr($f1,$off);
}

sub fix_rel_unix_path($) {
    my ($path) = shift;
    $path = dos_2_unix($path);
    # pgm_exit(1,"ERROR: Passed PATH that starts relative! [$path]\n") if (($path =~ /^\.\./)||($path =~ /^\.(\\|\/)\.\./));
    my @a = split(/\//, $path);
    my $npath = '';
    my $max = scalar @a;
    my @na = ();
    for (my $i = 0; $i < $max; $i++) {
        my $p = $a[$i];
        if ($p eq '.') {
            # ignore this
        } elsif ($p eq '..') {
            if (@na) {
                pop @na;    # discard previous
            } else {
                prt( "WARNING: Got relative .. without previous!!! path=[$path]\n" );
            }
        } else {
            push(@na,$p);
        }
    }
    foreach my $pt (@na) {
        $npath .= "/" if length($npath);
        $npath .= $pt;
    }
    return $npath;
}

sub get_local_href($) {
    my ($src) = shift;
    my $ind = index($src,'#');
    $src = substr($src,0,$ind) if ( $ind != -1 );
    $ind = index($src,'?');
    $src = substr($src,0,$ind) if ( $ind != -1 );
    $src =~ s/\/$//;    # remove any TRAILING '/' char
    # 25/07/2007 - also 'convert' '%20' to space
    $src =~ s/%20/ /g;
    return $src;
}

sub find_anchor_name($$) {
    my ($nm,$rhtml) = @_;
    my $len = scalar @{$rhtml};
    for (my $i = 0; $i < $len; $i++) {
        my $tag = ${$rhtml}[$i][1];
        if ($tag =~ /^a$/i) {
            my $rah = ${$rhtml}[$i][3];
            if (defined ${$rah}{'name'}) {
                return 1 if (${$rah}{'name'} eq $nm);
            }
        }
    }
    return 0;   # NOT found
}

sub list_files_in_hash_ref($$) {
    my ($hr,$out) = @_;
    my ($fil,$rhtml,$len,$htxt,$i,$txt,$tag,$attrs,$rah,$ra);
    my ($ftit,$fdir);
    my %h = ();
    foreach $fil (keys %{$hr}) {
        ($ftit,$fdir) = fileparse($fil);
        $fdir = $cwd.'/' if ($fdir =~ /^\.(\\|\/)$/);
        $rhtml = ${$hr}{$fil};
        $len = scalar @{$rhtml};
        $htxt = '';
        for ($i = 0; $i < $len; $i++) {
            #             0    1    2      3
            # push(@html,[$txt,$tag,$attrs,$hr]);
            $txt = ${$rhtml}[$i][0];
            $tag = ${$rhtml}[$i][1];
            $attrs = ${$rhtml}[$i][2];
            $htxt .= $txt;
            $htxt .= '<'.$tag;
            $htxt .= $attrs;
            $htxt .= '>';
            # get the attribute HASH REFERENCE
            $rah = ${$rhtml}[$i][3];
            if (defined ${$rah}{'src'}) {
                $h{$tag} = [] if (!defined $h{$tag});
                $ra = $h{$tag};
                push(@{$ra},${$rah}{'src'});
            }
            if (defined ${$rah}{'href'}) {
                $h{$tag} = [] if (!defined $h{$tag});
                $ra = $h{$tag};
                push(@{$ra},${$rah}{'href'});
            }
        }

        $htxt .= "\n" if !($htxt =~ /\n$/);
        if (length($out)) {
            write2file($htxt,$out);
            prt("Written to $out file...\n");
        }
        # =======================================================
        # list what has been collected as 'src' or 'href' entries
        # -------------------------------------------------------
        my ($key,$val,$itm,$typ,$loc,$ok,$ff,$msg,$cnt);
        my $min = 65;
        prt("Link contents of $fil...\n");
        foreach $key (keys %h) {
            $val = $h{$key};
            $cnt = scalar @{$val};
            prt("$key: Has $cnt items...\n");
            foreach $itm (@{$val}) {
                $typ = get_href_type($itm);
                $msg = "[$itm]$typ";
                $msg .= ' ' while (length($msg) < $min);
                $ok = 'extern';
                if ($typ == 6) {
                    $ok = 'ok1';
                    if (length($itm) > 1) {
                        if (find_anchor_name(substr($itm,1),$rhtml)) {
                            $ok = 'ok';
                        } else {
                            $ok = 'NF';
                        }
                    }
                } elsif ($typ == 7) {
                    $loc = get_local_href($itm);
                    $ff = $fdir.$loc;
                    if (-f $ff) {
                        $ok = 'ok';
                    } elsif (-d $ff) {
                        $ok = 'okd';
                    } else {
                        $ok = 'NF';
                    }
                }
                prt(" $msg $ok\n");
            }
            prt("\n");
        }
    }
}

1;

# eof - htmllib.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional