lib_html.pl to HTML.

index -|- end

Generated: Mon Aug 29 19:34:45 2016 from lib_html.pl 2014/09/11 22.6 KB. text copy

#!/usr/bin/perl -w
#< lib_html.pl

use strict;
use warnings;

my $TAG_NORM    = 0;    # normal has open <title>, and close </title>
my $TAG_CLOSE   = 1;    # is close of item </title>
my $TAG_CLOSED  = 2;    # has xml close <br />
my $TAG_CLOSEA  = 3;    # is self closing <br>
my $TAG_SPECIAL = 4;    # special like <!DOCTYPE ...>
my $TAG_COMMENT = 5;    # comment <!-- ... -->
my $TAG_TEXT    = 6;    # just 'text'

my $ATT_NV = '<no_value>';

my $mod_name = 'lib_html';

# feature
my $show_warnings = 1;

sub set_show_warnings($) { $show_warnings = shift; }


# debug
my $lh_dbg_01 = 0; # show tags as decoded
my $lh_dbg_02 = 0; # load failure
my $lh_dbg_03 = 0;  # show counts

my %tag_type_hash = (
    $TAG_NORM => 'norm',
    $TAG_CLOSE => 'close',
    $TAG_CLOSED => 'closed',
    $TAG_CLOSEA => 'closea',
    $TAG_SPECIAL => 'special',
    $TAG_COMMENT => 'comment',
    $TAG_TEXT => 'text'
    );

sub tag_type_to_text($) {
    my $typ = shift;
    if (defined $tag_type_hash{$typ}) {
        return $tag_type_hash{$typ};
    }
    return "Type [$typ] NOT DEFINED";
}

sub get_attr_no_value() { return $ATT_NV; }
sub get_tag_normal_value() { return $TAG_NORM; };
sub get_tag_close_value() { return $TAG_CLOSE; }
sub get_tag_closed_value() { return $TAG_CLOSED; }
sub get_tag_closea_value() { return $TAG_CLOSEA; }
sub get_tag_special_value() { return $TAG_SPECIAL; }
sub get_tag_comment_value() { return $TAG_COMMENT; }
sub get_tag_text_value() { return $TAG_TEXT; }

sub get_attr_refhash($) {
    my $txt = shift;
    my %hash = ();
    my @arr = space_split($txt);
    my ($item,@arr2,$cnt,$att,$val,$j);
    foreach $item (@arr) {
        @arr2 = split("=",$item);
        $cnt = scalar @arr2;
        $att = trim_all($arr2[0]);
        $val = '';
        if ($cnt > 1) {
            if ($cnt > 2) {
                for ($j = 1; $j < $cnt; $j++) {
                    $val .= '=' if (length($val));
                    $val .= $arr2[$j];
                }
            } else {
                $val = $arr2[1];
            }
        } else {
            next if ($att eq '/');
            $val = $ATT_NV;
        }
        $hash{$att} = $val;
    }
    return \%hash;
}

sub is_auto_closed_tag($) {
    my ($tag) = @_;
    return 1 if ($tag =~ /^meta$/i);
    return 2 if ($tag =~ /^link$/i);
    return 3 if ($tag =~ /^img$/i);
    return 4 if ($tag =~ /^input$/i);
    return 5 if ($tag =~ /^br$/i);
    ####return 6 if ($tag =~ /^form$/i);
    return 0;
}

sub is_optional_auto_closed_tag($) {
    my ($tag) = @_;
    return 1 if ($tag =~ /^td/i);
    return 0;
}

sub deal_with_script($) {
    my ($content) = @_;
    my ($len,$text,$i,$inquot,$qc,$ch,$nc,$i2);
    my ($rem);
    $len = length($content);
    $text = '';
    $inquot = 0;
    for ($i = 0; $i < $len; $i++) {
        $i2 = $i + 1;
        $ch = substr($content,$i,1);
        $nc = ($i2 < $len) ? substr($content,$i2,1) : '';
        if ($inquot) {
            $text .= $ch;
            if ($ch eq $qc) {
                $inquot = 0;
            }
       } else {
           # not in QUOTE
           if (($ch eq '"') || ($ch eq "'")) {
               $inquot = 1;
               $qc = $ch;
               $text .= $ch; # accumulate the script
           } elsif (($ch eq '<')&&($nc eq '/')) {
               # potential END of SCRIPT
               $rem = substr($content,$i2+1);
               if ($rem =~ /^script>/i) {
                   return $i - 1;
               }
           } else {
               $text .= $ch; # accumulate the script
           }
        }
    }
    return $i;
}

sub deal_with_pre($$) {
    my ($content,$rtxt) = @_;
    my ($len,$text,$i,$inquot,$qc,$ch,$nc,$i2);
    my ($rem);
    $len = length($content);
    $text = '';
    $inquot = 0;
    for ($i = 0; $i < $len; $i++) {
        $i2 = $i + 1;
        $ch = substr($content,$i,1);
        $nc = ($i2 < $len) ? substr($content,$i2,1) : '';
       if (($ch eq '<')&&($nc eq '/')) {
           # potential END of PRE
           $rem = substr($content,$i2+1);
           if ($rem =~ /^pre>/i) {
               last;
           }
       } else {
           if ($ch =~ /\n/) {
               if (!($text =~ /\n$/)) {
                   $text .= $ch;
               }
           } else {
               $text .= $ch; # accumulate the pre - AS IS
           }
       }
    }
    ${$rtxt} = $text;
    return $i - 1;
}


# HTML parsing
sub get_html_refarray($) {
    my $content = shift;
    my ($len,$i,$ch,$nc,$nc2,$tag,$i2,$i3,$intag,$text);
    my ($inquot,$qc,$incomm,$rem,$lnn);
    my (@arr,$cnt,$j,$attrs,$ftag,$ptag,$rah,$typ);
    my ($msg,$tlnn,$tlnn2,$ptag2);
    $len = length($content);
    $intag = 0;
    $inquot = 0;
    $incomm = 0;
    $lnn = 1;
    $text = '';
    my @tag_stack = ();
    my @html_array = ();
    for ($i = 0; $i < $len; $i++) {
        $i2 = $i + 1;
        $i3 = $i + 2;
        $ch = substr($content,$i,1);
        $nc = ($i2 < $len) ? substr($content,$i2,1) : '';
        if ($ch eq "\n") {
            $lnn++;
            $ch = ' ';
        }
        if ($intag) {
            if ($inquot) {
                $tag .= $ch;
                if ($ch eq $qc) {
                    $inquot = 0;
                }
            } else {
                $tag .= $ch;
                if (($ch eq '"')||($ch eq "'")) {
                    $inquot = 1;
                    $qc = $ch;
                } elsif ($ch eq '>') {
                    $intag = 0;
                    $tag =~ s/^<//;
                    $tag =~ s/>$//;
                    $ftag = $tag;
                    @arr = space_split($tag);
                    $tag = $arr[0];
                    $cnt = scalar @arr;
                    $attrs = '';
                    for ($j = 1; $j < $cnt; $j++) {
                        $attrs .= ' ' if (length($attrs));
                        $attrs .= $arr[$j];
                    }
                    $rah = get_attr_refhash($attrs);
                    $msg = "$lnn: ";
                    if ($tag =~ /^!/) {
                        $tag =~ s/^!//;
                        $msg .= " spl <![$tag]";
                        $typ = $TAG_SPECIAL;
                    } elsif ($tag =~ /^\//) {
                        $tag =~ s/^\///;
                        $typ = $TAG_CLOSE# starts with slash </title>
                        $msg .= " close </[$tag]";
                        if (@tag_stack) {
                            $tlnn = $tag_stack[-1][0];
                            $ptag = $tag_stack[-1][1];
                            pop @tag_stack;
                            if (lc($tag) ne lc($ptag)) {
                                $msg = "WARNING: $mod_name: $lnn: Close TAG [$ftag], NOT PREVIOUS [$tlnn: $ptag]! ";
                                if (@tag_stack) {
                                    $tlnn2 = $tag_stack[-1][0];
                                    $ptag2 = $tag_stack[-1][1];
                                    $msg .= "(prev [$ptag2]";
                                    if (is_optional_auto_closed_tag($ptag) && ($tag eq $ptag2)) {
                                        pop @tag_stack;
                                        next; # continue - no problem
                                    }
                                } else {
                                    $msg .= "last!";
                                }
                                prtw("$msg\n") if ($show_warnings);
                            }
                        } else {
                            prtw("WARNING: $mod_name: $lnn: Close TAG [$ftag], NOT ON STACK!\n") if ($show_warnings);
                        }
                    } elsif ($ftag =~ /\/$/) {
                        $msg .= " closed <[$tag]";
                        $typ = $TAG_CLOSED;
                    } elsif (is_auto_closed_tag($tag)) {
                        $msg = " auto-closed <[$tag]";
                        $typ = $TAG_CLOSEA;
                    } else {
                        push(@tag_stack,[$lnn,$tag]);
                        $msg .= " tag <[$tag]";
                        $typ = $TAG_NORM;
                    }
                    if (length($attrs)) {
                        $msg .= " attr [$attrs]";
                    }
                    $msg .= ">";
                    prt("$msg\n") if ($lh_dbg_01);
                    push(@html_array,[$typ,$tag,$rah,$lnn]);
                    if ($typ == $TAG_NORM) {
                        # some extra checks
                        if ($tag =~ /^script$/i) {
                            # DEAL WITH SCRIPT!
                            $rem = substr($content,$i2);
                            $i += deal_with_script($rem);
                            #$rem = substr($content,$i);
                            #pgm_exit(1,"DEAL WITH SCRIPT [$rem]\n");
                        } elsif ($tag =~ /^pre$/i) {
                            $rem = substr($content,$i2);
                            $text = '';
                            $i += deal_with_pre($rem,\$text);
                            if (length($text)) {
                                $typ = $TAG_TEXT;
                                $rah = get_attr_refhash("");
                                prt("$lnn: TXT [$text]\n") if ($lh_dbg_01);
                                push(@html_array,[$typ,$text,$rah,$lnn]);
                            }
                        }
                    }
                }
            }
        } else {
            if ($inquot) {
                $text .= $ch;
                if ($ch eq $qc) {
                    $inquot = 0;
                }
            } else {
                if (($ch eq '"')||($ch eq "'")) {
                    $inquot = 1;
                    $qc = $ch;
                    $text .= $ch;
                } elsif ($ch eq '<') {
                    $typ = $TAG_TEXT;
                    $rah = get_attr_refhash("");
                     if (length($text)) {
                        prt("$lnn: TXT [$text]\n") if ($lh_dbg_01);
                        push(@html_array,[$typ,$text,$rah,$lnn]);
                     }
                    $text = '';
                    if ($nc eq '!') {
                        $rem = substr($content,$i2+1);
                        if ($rem =~ /^--/) {
                            # in a comment
                            $i = $i2 + 1;
                            $rem = "!";
                            for (; $i < $len; $i++) {
                                $i2 = $i + 1;
                                $i3 = $i + 2;
                                $ch = substr($content,$i,1);
                                if ($ch eq "\n") {
                                    $lnn++;
                                    $ch = ' ';
                                }
                                $nc = ($i2 < $len) ? substr($content,$i2,1) : '';
                                $nc2 = ($i3 < $len) ? substr($content,$i3,1) : '';
                                $lnn++ if ($ch eq "\n");
                                if (($ch eq '-')&&($nc eq '-')&&($nc2 eq '>')) {
                                    $i = $i3;
                                    $rem .= "--";
                                    last;
                                }
                                $rem .= $ch;
                            }
                            push(@html_array,[$TAG_COMMENT,$rem,$rah,$lnn]);
                            next;
                        }
                    }
                    $tag = '<';
                    $intag = 1;
                } else {
                    if ($ch =~ /\s/) {
                        if ( length($text) && !($text =~ /\s$/) ) {
                            $text .= $ch;
                        }
                    } else {
                        $text .= $ch;
                    }
                }
            }
        }
    }
    #prt("Done $lnn lines...\n");
    $len = scalar @tag_stack;
    if ($len) {
        $text = '';
        for ($i = 0; $i < $len; $i++) {
            $tlnn = $tag_stack[$i][0];
            $ptag = $tag_stack[$i][1];
            if ($lh_dbg_02) {
                # noisy output
                $text .= "\n$tlnn [$ptag]";
            } else {
                # simple list
                $text .= ' ' if (length($text));
                $text .= $ptag;
            }
        }
        prtw("WARNING: $mod_name: End of document, with $len tags on stack [$text]\n") if ($show_warnings);
    }
    return \@html_array;
}

sub show_html_refarray($) {
    my ($ra) = @_;
    my ($cnt,$typ,$tag,$rha,$i,$lnn);
    my ($hcnt,$key,$val,$att);
    $cnt = scalar @{$ra};
    prt("HTML ref array has $cnt items\n");
    for ($i = 0; $i < $cnt; $i++) {
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        $rha = ${$ra}[$i][2];
        $lnn = ${$ra}[$i][3];
        $hcnt = scalar keys(%{$rha});
        $att = '';
        foreach $key (keys %{$rha}) {
            $val = ${$rha}{$key};
            $att .= " " if (length($att));
            if ($val eq $ATT_NV) {
                $att .= $key;
            } else {
                $att .= "$key=$val";
            }
        }
        if ($typ == $TAG_NORM) {
            prt("$lnn: norm [$tag]");
        } elsif ($typ == $TAG_CLOSE) {
            prt("$lnn: close [$tag]");
        } elsif ($typ == $TAG_CLOSED) {
            prt("$lnn: closed [$tag]");
        } elsif ($typ == $TAG_CLOSEA) {
            prt("$lnn: closea [$tag]");
        } elsif ($typ == $TAG_SPECIAL) {
            prt("$lnn: spl [$tag]");
        } elsif ($typ == $TAG_COMMENT) {
            prt("$lnn: comm [$tag]");
        } elsif ($typ == $TAG_TEXT) {
            prt("$lnn: text [$tag]");
        } else {
            prt("$lnn: unknown [$tag]");
        }
        prt(" attr [$att]") if (length($att));
        prt("\n");
    }
}

sub get_html_body_only($) {
    my ($ra) = @_;
    my ($cnt,$typ,$tag,$rha,$i,$lnn);
    $cnt = scalar @{$ra};
    #prt("get_html_body_only: HTML ref array had $cnt items\n");
    my @html_array = ();
    my $inbody = 0;
    for ($i = 0; $i < $cnt; $i++) {
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        if ($inbody) {
            last if (($typ == $TAG_CLOSE)&&($tag =~ /^body$/i));
            $rha = ${$ra}[$i][2];
            $lnn = ${$ra}[$i][3];
            push(@html_array,[$typ,$tag,$rha,$lnn]);
        } else {
            $inbody = 1 if (($typ == $TAG_NORM) && ($tag =~ /^body$/i));
        }
    }
    $cnt = scalar @html_array;
    #prt("get_html_body_only: HTML returning $cnt items\n");
    return \@html_array;
}

sub drop_html_tags($$) {
    my ($ra,$rd) = @_;
    my ($cnt,$typ,$tag,$rha,$i,$lnn,$tt,$add);
    $cnt = scalar @{$ra};
    prt("HTML ref array has $cnt items\n");
    my @html_array = ();
    for ($i = 0; $i < $cnt; $i++) {
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        $rha = ${$ra}[$i][2];
        $lnn = ${$ra}[$i][3];
        $add = 1;
        foreach $tt (@{$rd}) {
            if ($tag =~ /^$tt$/i) {
                $add = 0;
                last;
            }
        }
        push(@html_array,[$typ,$tag,$rha,$lnn]) if ($add);
    }
    $cnt = scalar @html_array;
    prt("HTML returning $cnt items\n");
    return \@html_array;
}


sub drop_div_tag($) {
    my ($ra) = shift;
    my @arr = qw(div);
    return drop_html_tags($ra,\@arr);
}

sub get_tag_array($$$) {
    my ($ra,$find,$opts) = @_;
    my ($cnt,$typ,$tag,$rha,$i,$lnn,$tt,$add);
    $cnt = scalar @{$ra};
    my $dbg = ($opts & 0x8000);
    prt("HTML ref array has $cnt items. Finding [$find]\n") if ($dbg);
    my @html_array = ();
    for ($i = 0; $i < $cnt; $i++) {
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        $rha = ${$ra}[$i][2];
        $lnn = ${$ra}[$i][3];
        $add = 0;
        if ($tag =~ /^$find$/i) {
            if ($typ == $TAG_NORM) {
                $add = 1;
            } else {
                # add only if options says so
                if ($opts & 1) {
                    $add = 1;
                }
            }
        }
        push(@html_array,[$typ,$tag,$rha,$lnn]) if ($add);
    }
    $cnt = scalar @html_array;
    prt("HTML returning $cnt items\n") if ($dbg);
    return \@html_array;
}

# only for NORMAL tag - collect all items until it closes
sub get_whole_tag_array($$$) {
    my ($ra,$find,$opts) = @_;
    my ($cnt,$typ,$tag,$rha,$i,$lnn,$tt,$add,$i2);
    my ($typ2,$tag2,$rha2,$lnn2,$tnam);
    my $dbg = ($opts & 0x8000);
    $cnt = scalar @{$ra};
    prt("[dbg] get_whole_tag_array: ref has $cnt items. Finding [$find]\n") if ($dbg);
    my @html_array = ();
    my @arr = ();
    for ($i = 0; $i < $cnt; $i++) {
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        $rha = ${$ra}[$i][2];
        $lnn = ${$ra}[$i][3];
        $add = 0;
        if ($tag =~ /^$find$/i) {
            $add = 1 if ($typ == $TAG_NORM);
        }
        if ($add) {
            @arr = ();
            push(@arr,[$typ,$tag,$rha,$lnn]);
            $i2 = $i + 1;
            $add = 0;
            for (; $i2 < $cnt; $i2++) {
                $typ2 = ${$ra}[$i2][0];
                $tag2 = ${$ra}[$i2][1];
                $rha2 = ${$ra}[$i2][2];
                $lnn2 = ${$ra}[$i2][3];
                $tnam = tag_type_to_text($typ2);
                # prt("$lnn2 $tag2 $tnam \n");
                push(@arr,[$typ2,$tag2,$rha2,$lnn2]);
                if (($tag2 =~ /^$find$/i) && ($typ2 == $TAG_CLOSE)) {
                    $add = 1;
                    last;
                }
            }
            if ($add) {
                push(@html_array, @arr);
            } else {
                prt("[dbg] FAILED to get close of [$tag]\n") if ($dbg);
            }
        }
    }
    $cnt = scalar @html_array;
    prt("[dbg] get_whole_tag_array: HTML returning $cnt items\n") if ($dbg);
    return \@html_array;
}

sub get_title_text($$) {
    my ($ra,$opts) = @_;
    my @title_text = ();
    my $ta = get_whole_tag_array($ra,'title',$opts);
    my $cnt = scalar @{$ta};
    my ($i, $typ, $tag, $rha, $lnn);
    for ($i = 0; $i < $cnt; $i++) {
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        $rha = ${$ra}[$i][2];
        $lnn = ${$ra}[$i][3];
        if ($typ == $TAG_TEXT) {
            push(@title_text,$tag);
        }
    }
    return \@title_text;
}

sub get_html_title_from_file($) {
    my $ff = shift;
    if (! open INF, "<$ff") {
        return "failed";
    }
    my @lines = <INF>;
    close INF;
    my $html = join("",@lines);
    my $ra = get_html_refarray($html);
    my $ta = get_whole_tag_array($ra,'title',0);
    my $tta = get_title_text($ta,0);
    return join(" ",@{$tta});
}


sub get_a_href_array($$) {
    my ($ra,$opts) = @_;
    my $ar = get_tag_array($ra,'a',0);
    my @href_array = ();
    my ($cnt,$typ,$tag,$rha,$i,$key,$val,$lnn);
    $cnt = scalar @{$ar};
    my $dbg = ($opts & 0x8000);
    prt("HTML ref array had $cnt items. Finding [A] HREF\n") if ($dbg);
    for ($i = 0; $i < $cnt; $i++) {
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        $rha = ${$ra}[$i][2];
        $lnn = ${$ra}[$i][3];
        foreach $key (keys %{$rha}) {
            if ($key =~ /^href$/i) {
                push(@href_array, ${$rha}{$key});
            }
        }
    }
    $cnt = scalar @href_array;
    prt("Returning $cnt HREF entries...\n") if ($dbg);
    return \@href_array;
}

sub get_a_href_text_array($$) {
    my ($ra,$opts) = @_;
    my $ar = get_whole_tag_array($ra,'a',$opts);
    ### my $ar = get_tag_array($ra,'a',0);
    my @href_array = ();
    my ($cnt,$typ,$tag,$rha,$i,$key,$val,$lnn,$i2,$txt);
    $cnt = scalar @{$ar};
    my $dbg = ($opts & 0x8000);
    prt("HTML ref array had $cnt items. Finding [A] HREF\n") if ($dbg);
    for ($i = 0; $i < $cnt; $i++) {
        $i2 = $i + 1;
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        $rha = ${$ra}[$i][2];
        $lnn = ${$ra}[$i][3];
        foreach $key (keys %{$rha}) {
            if ($key =~ /^href$/i) {
                $txt = '';
                if (($i2 < $cnt)&&(${$ra}[$i2][0] == $TAG_TEXT)) {
                    $txt = ${$ra}[$i2][1]
                }
                push(@href_array, [${$rha}{$key},$txt]);    # return with TEXT
            }
        }
    }
    $cnt = scalar @href_array;
    prt("Returning $cnt HREF entries...\n") if ($dbg);
    return \@href_array;
}


sub get_html_from_refarray($$) {
    my ($ra,$opts) = @_;
    my ($cnt,$typ,$tag,$rha,$i,$lnn);
    my ($hcnt,$key,$val,$att);
    $cnt = scalar @{$ra};
    my $dbg = ($opts & 0x8000);
    prt("HTML ref array has $cnt items\n") if ($dbg);
    my $html = '';
    for ($i = 0; $i < $cnt; $i++) {
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        $rha = ${$ra}[$i][2];
        $lnn = ${$ra}[$i][3];
        $hcnt = scalar keys(%{$rha});
        $att = '';
        foreach $key (keys %{$rha}) {
            $val = ${$rha}{$key};
            $att .= " " if (length($att));
            if ($val eq $ATT_NV) {
                $att .= $key;
            } else {
                $att .= "$key=$val";
            }
        }
        if (($typ != $TAG_SPECIAL)&&($typ != $TAG_COMMENT)) {
            $tag = lc($tag) unless ($opts & 1);
        }
        if ($typ == $TAG_NORM) {
            #prt("$lnn: norm [$tag]");
            $html .= "<$tag";
            $html .= " $att" if (length($att));
            $html .= ">\n";
        } elsif ($typ == $TAG_CLOSE) {
            #prt("$lnn: close [$tag]");
            $html .= "</$tag";
            $html .= ">\n";
        } elsif ($typ == $TAG_CLOSED) {
            #prt("$lnn: closed [$tag]");
            $html .= "<$tag";
            $html .= " />\n";
        } elsif ($typ == $TAG_CLOSEA) {
            #prt("$lnn: closea [$tag]");
            $html .= "<$tag";
            $html .= ">\n";
        } elsif ($typ == $TAG_SPECIAL) {
            #prt("$lnn: spl [$tag]");
            $html .= "<$tag";
            $html .= ">\n";
        } elsif ($typ == $TAG_COMMENT) {
            #prt("$lnn: comm [$tag]");
            $html .= "<!-- $tag";
            $html .= " -->\n";
        } elsif ($typ == $TAG_TEXT) {
            #prt("$lnn: text [$tag]");
            $html .= "$tag";
            $html .= "\n";
        } else {
            prt("$lnn: unknown [$tag]");
        }
        #prt(" attr [$att]") if (length($att));
        #prt("\n");
    }
    return $html;
}

1;

# eof - lib_html.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional