#!/usr/bin/perl -w
#< lib_html.pl
use strict;
use warnings;
my $TAG_NORM = 0; # normal has open
, and close
my $TAG_CLOSE = 1; # is close of item
my $TAG_CLOSED = 2; # has xml close
my $TAG_CLOSEA = 3; # is self closing
my $TAG_SPECIAL = 4; # special like
my $TAG_COMMENT = 5; # comment
my $TAG_TEXT = 6; # just 'text'
my $ATT_NV = '';
my $mod_name = 'lib_html';
# feature
my $show_warnings = 1;
sub set_show_warnings($) { $show_warnings = shift; }
# debug
my $lh_dbg_01 = 0; # show tags as decoded
my $lh_dbg_02 = 0; # load failure
my $lh_dbg_03 = 0; # show counts
my %tag_type_hash = (
$TAG_NORM => 'norm',
$TAG_CLOSE => 'close',
$TAG_CLOSED => 'closed',
$TAG_CLOSEA => 'closea',
$TAG_SPECIAL => 'special',
$TAG_COMMENT => 'comment',
$TAG_TEXT => 'text'
);
sub tag_type_to_text($) {
my $typ = shift;
if (defined $tag_type_hash{$typ}) {
return $tag_type_hash{$typ};
}
return "Type [$typ] NOT DEFINED";
}
sub get_attr_no_value() { return $ATT_NV; }
sub get_tag_normal_value() { return $TAG_NORM; };
sub get_tag_close_value() { return $TAG_CLOSE; }
sub get_tag_closed_value() { return $TAG_CLOSED; }
sub get_tag_closea_value() { return $TAG_CLOSEA; }
sub get_tag_special_value() { return $TAG_SPECIAL; }
sub get_tag_comment_value() { return $TAG_COMMENT; }
sub get_tag_text_value() { return $TAG_TEXT; }
sub get_attr_refhash($) {
my $txt = shift;
my %hash = ();
my @arr = space_split($txt);
my ($item,@arr2,$cnt,$att,$val,$j);
foreach $item (@arr) {
@arr2 = split("=",$item);
$cnt = scalar @arr2;
$att = trim_all($arr2[0]);
$val = '';
if ($cnt > 1) {
if ($cnt > 2) {
for ($j = 1; $j < $cnt; $j++) {
$val .= '=' if (length($val));
$val .= $arr2[$j];
}
} else {
$val = $arr2[1];
}
} else {
next if ($att eq '/');
$val = $ATT_NV;
}
$hash{$att} = $val;
}
return \%hash;
}
sub is_auto_closed_tag($) {
my ($tag) = @_;
return 1 if ($tag =~ /^meta$/i);
return 2 if ($tag =~ /^link$/i);
return 3 if ($tag =~ /^img$/i);
return 4 if ($tag =~ /^input$/i);
return 5 if ($tag =~ /^br$/i);
####return 6 if ($tag =~ /^form$/i);
return 0;
}
sub is_optional_auto_closed_tag($) {
my ($tag) = @_;
return 1 if ($tag =~ /^td/i);
return 0;
}
sub deal_with_script($) {
my ($content) = @_;
my ($len,$text,$i,$inquot,$qc,$ch,$nc,$i2);
my ($rem);
$len = length($content);
$text = '';
$inquot = 0;
for ($i = 0; $i < $len; $i++) {
$i2 = $i + 1;
$ch = substr($content,$i,1);
$nc = ($i2 < $len) ? substr($content,$i2,1) : '';
if ($inquot) {
$text .= $ch;
if ($ch eq $qc) {
$inquot = 0;
}
} else {
# not in QUOTE
if (($ch eq '"') || ($ch eq "'")) {
$inquot = 1;
$qc = $ch;
$text .= $ch; # accumulate the script
} elsif (($ch eq '<')&&($nc eq '/')) {
# potential END of SCRIPT
$rem = substr($content,$i2+1);
if ($rem =~ /^script>/i) {
return $i - 1;
}
} else {
$text .= $ch; # accumulate the script
}
}
}
return $i;
}
sub deal_with_pre($$) {
my ($content,$rtxt) = @_;
my ($len,$text,$i,$inquot,$qc,$ch,$nc,$i2);
my ($rem);
$len = length($content);
$text = '';
$inquot = 0;
for ($i = 0; $i < $len; $i++) {
$i2 = $i + 1;
$ch = substr($content,$i,1);
$nc = ($i2 < $len) ? substr($content,$i2,1) : '';
if (($ch eq '<')&&($nc eq '/')) {
# potential END of PRE
$rem = substr($content,$i2+1);
if ($rem =~ /^pre>/i) {
last;
}
} else {
if ($ch =~ /\n/) {
if (!($text =~ /\n$/)) {
$text .= $ch;
}
} else {
$text .= $ch; # accumulate the pre - AS IS
}
}
}
${$rtxt} = $text;
return $i - 1;
}
# HTML parsing
sub get_html_refarray($) {
my $content = shift;
my ($len,$i,$ch,$nc,$nc2,$tag,$i2,$i3,$intag,$text);
my ($inquot,$qc,$incomm,$rem,$lnn);
my (@arr,$cnt,$j,$attrs,$ftag,$ptag,$rah,$typ);
my ($msg,$tlnn,$tlnn2,$ptag2);
$len = length($content);
$intag = 0;
$inquot = 0;
$incomm = 0;
$lnn = 1;
$text = '';
my @tag_stack = ();
my @html_array = ();
for ($i = 0; $i < $len; $i++) {
$i2 = $i + 1;
$i3 = $i + 2;
$ch = substr($content,$i,1);
$nc = ($i2 < $len) ? substr($content,$i2,1) : '';
if ($ch eq "\n") {
$lnn++;
$ch = ' ';
}
if ($intag) {
if ($inquot) {
$tag .= $ch;
if ($ch eq $qc) {
$inquot = 0;
}
} else {
$tag .= $ch;
if (($ch eq '"')||($ch eq "'")) {
$inquot = 1;
$qc = $ch;
} elsif ($ch eq '>') {
$intag = 0;
$tag =~ s/^/;
$tag =~ s/>$//;
$ftag = $tag;
@arr = space_split($tag);
$tag = $arr[0];
$cnt = scalar @arr;
$attrs = '';
for ($j = 1; $j < $cnt; $j++) {
$attrs .= ' ' if (length($attrs));
$attrs .= $arr[$j];
}
$rah = get_attr_refhash($attrs);
$msg = "$lnn: ";
if ($tag =~ /^!/) {
$tag =~ s/^!//;
$msg .= " spl
$msg .= " close [$tag]";
if (@tag_stack) {
$tlnn = $tag_stack[-1][0];
$ptag = $tag_stack[-1][1];
pop @tag_stack;
if (lc($tag) ne lc($ptag)) {
$msg = "WARNING: $mod_name: $lnn: Close TAG [$ftag], NOT PREVIOUS [$tlnn: $ptag]! ";
if (@tag_stack) {
$tlnn2 = $tag_stack[-1][0];
$ptag2 = $tag_stack[-1][1];
$msg .= "(prev [$ptag2]";
if (is_optional_auto_closed_tag($ptag) && ($tag eq $ptag2)) {
pop @tag_stack;
next; # continue - no problem
}
} else {
$msg .= "last!";
}
prtw("$msg\n") if ($show_warnings);
}
} else {
prtw("WARNING: $mod_name: $lnn: Close TAG [$ftag], NOT ON STACK!\n") if ($show_warnings);
}
} elsif ($ftag =~ /\/$/) {
$msg .= " closed <[$tag]";
$typ = $TAG_CLOSED;
} elsif (is_auto_closed_tag($tag)) {
$msg = " auto-closed <[$tag]";
$typ = $TAG_CLOSEA;
} else {
push(@tag_stack,[$lnn,$tag]);
$msg .= " tag <[$tag]";
$typ = $TAG_NORM;
}
if (length($attrs)) {
$msg .= " attr [$attrs]";
}
$msg .= ">";
prt("$msg\n") if ($lh_dbg_01);
push(@html_array,[$typ,$tag,$rah,$lnn]);
if ($typ == $TAG_NORM) {
# some extra checks
if ($tag =~ /^script$/i) {
# DEAL WITH SCRIPT!
$rem = substr($content,$i2);
$i += deal_with_script($rem);
#$rem = substr($content,$i);
#pgm_exit(1,"DEAL WITH SCRIPT [$rem]\n");
} elsif ($tag =~ /^pre$/i) {
$rem = substr($content,$i2);
$text = '';
$i += deal_with_pre($rem,\$text);
if (length($text)) {
$typ = $TAG_TEXT;
$rah = get_attr_refhash("");
prt("$lnn: TXT [$text]\n") if ($lh_dbg_01);
push(@html_array,[$typ,$text,$rah,$lnn]);
}
}
}
}
}
} else {
if ($inquot) {
$text .= $ch;
if ($ch eq $qc) {
$inquot = 0;
}
} else {
if (($ch eq '"')||($ch eq "'")) {
$inquot = 1;
$qc = $ch;
$text .= $ch;
} elsif ($ch eq '<') {
$typ = $TAG_TEXT;
$rah = get_attr_refhash("");
if (length($text)) {
prt("$lnn: TXT [$text]\n") if ($lh_dbg_01);
push(@html_array,[$typ,$text,$rah,$lnn]);
}
$text = '';
if ($nc eq '!') {
$rem = substr($content,$i2+1);
if ($rem =~ /^--/) {
# in a comment
$i = $i2 + 1;
$rem = "!";
for (; $i < $len; $i++) {
$i2 = $i + 1;
$i3 = $i + 2;
$ch = substr($content,$i,1);
if ($ch eq "\n") {
$lnn++;
$ch = ' ';
}
$nc = ($i2 < $len) ? substr($content,$i2,1) : '';
$nc2 = ($i3 < $len) ? substr($content,$i3,1) : '';
$lnn++ if ($ch eq "\n");
if (($ch eq '-')&&($nc eq '-')&&($nc2 eq '>')) {
$i = $i3;
$rem .= "--";
last;
}
$rem .= $ch;
}
push(@html_array,[$TAG_COMMENT,$rem,$rah,$lnn]);
next;
}
}
$tag = '<';
$intag = 1;
} else {
if ($ch =~ /\s/) {
if ( length($text) && !($text =~ /\s$/) ) {
$text .= $ch;
}
} else {
$text .= $ch;
}
}
}
}
}
#prt("Done $lnn lines...\n");
$len = scalar @tag_stack;
if ($len) {
$text = '';
for ($i = 0; $i < $len; $i++) {
$tlnn = $tag_stack[$i][0];
$ptag = $tag_stack[$i][1];
if ($lh_dbg_02) {
# noisy output
$text .= "\n$tlnn [$ptag]";
} else {
# simple list
$text .= ' ' if (length($text));
$text .= $ptag;
}
}
prtw("WARNING: $mod_name: End of document, with $len tags on stack [$text]\n") if ($show_warnings);
}
return \@html_array;
}
sub show_html_refarray($) {
my ($ra) = @_;
my ($cnt,$typ,$tag,$rha,$i,$lnn);
my ($hcnt,$key,$val,$att);
$cnt = scalar @{$ra};
prt("HTML ref array has $cnt items\n");
for ($i = 0; $i < $cnt; $i++) {
$typ = ${$ra}[$i][0];
$tag = ${$ra}[$i][1];
$rha = ${$ra}[$i][2];
$lnn = ${$ra}[$i][3];
$hcnt = scalar keys(%{$rha});
$att = '';
foreach $key (keys %{$rha}) {
$val = ${$rha}{$key};
$att .= " " if (length($att));
if ($val eq $ATT_NV) {
$att .= $key;
} else {
$att .= "$key=$val";
}
}
if ($typ == $TAG_NORM) {
prt("$lnn: norm [$tag]");
} elsif ($typ == $TAG_CLOSE) {
prt("$lnn: close [$tag]");
} elsif ($typ == $TAG_CLOSED) {
prt("$lnn: closed [$tag]");
} elsif ($typ == $TAG_CLOSEA) {
prt("$lnn: closea [$tag]");
} elsif ($typ == $TAG_SPECIAL) {
prt("$lnn: spl [$tag]");
} elsif ($typ == $TAG_COMMENT) {
prt("$lnn: comm [$tag]");
} elsif ($typ == $TAG_TEXT) {
prt("$lnn: text [$tag]");
} else {
prt("$lnn: unknown [$tag]");
}
prt(" attr [$att]") if (length($att));
prt("\n");
}
}
sub get_html_body_only($) {
my ($ra) = @_;
my ($cnt,$typ,$tag,$rha,$i,$lnn);
$cnt = scalar @{$ra};
#prt("get_html_body_only: HTML ref array had $cnt items\n");
my @html_array = ();
my $inbody = 0;
for ($i = 0; $i < $cnt; $i++) {
$typ = ${$ra}[$i][0];
$tag = ${$ra}[$i][1];
if ($inbody) {
last if (($typ == $TAG_CLOSE)&&($tag =~ /^body$/i));
$rha = ${$ra}[$i][2];
$lnn = ${$ra}[$i][3];
push(@html_array,[$typ,$tag,$rha,$lnn]);
} else {
$inbody = 1 if (($typ == $TAG_NORM) && ($tag =~ /^body$/i));
}
}
$cnt = scalar @html_array;
#prt("get_html_body_only: HTML returning $cnt items\n");
return \@html_array;
}
sub drop_html_tags($$) {
my ($ra,$rd) = @_;
my ($cnt,$typ,$tag,$rha,$i,$lnn,$tt,$add);
$cnt = scalar @{$ra};
prt("HTML ref array has $cnt items\n");
my @html_array = ();
for ($i = 0; $i < $cnt; $i++) {
$typ = ${$ra}[$i][0];
$tag = ${$ra}[$i][1];
$rha = ${$ra}[$i][2];
$lnn = ${$ra}[$i][3];
$add = 1;
foreach $tt (@{$rd}) {
if ($tag =~ /^$tt$/i) {
$add = 0;
last;
}
}
push(@html_array,[$typ,$tag,$rha,$lnn]) if ($add);
}
$cnt = scalar @html_array;
prt("HTML returning $cnt items\n");
return \@html_array;
}
sub drop_div_tag($) {
my ($ra) = shift;
my @arr = qw(div);
return drop_html_tags($ra,\@arr);
}
sub get_tag_array($$$) {
my ($ra,$find,$opts) = @_;
my ($cnt,$typ,$tag,$rha,$i,$lnn,$tt,$add);
$cnt = scalar @{$ra};
my $dbg = ($opts & 0x8000);
prt("HTML ref array has $cnt items. Finding [$find]\n") if ($dbg);
my @html_array = ();
for ($i = 0; $i < $cnt; $i++) {
$typ = ${$ra}[$i][0];
$tag = ${$ra}[$i][1];
$rha = ${$ra}[$i][2];
$lnn = ${$ra}[$i][3];
$add = 0;
if ($tag =~ /^$find$/i) {
if ($typ == $TAG_NORM) {
$add = 1;
} else {
# add only if options says so
if ($opts & 1) {
$add = 1;
}
}
}
push(@html_array,[$typ,$tag,$rha,$lnn]) if ($add);
}
$cnt = scalar @html_array;
prt("HTML returning $cnt items\n") if ($dbg);
return \@html_array;
}
# only for NORMAL tag - collect all items until it closes
sub get_whole_tag_array($$$) {
my ($ra,$find,$opts) = @_;
my ($cnt,$typ,$tag,$rha,$i,$lnn,$tt,$add,$i2);
my ($typ2,$tag2,$rha2,$lnn2,$tnam);
my $dbg = ($opts & 0x8000);
$cnt = scalar @{$ra};
prt("[dbg] get_whole_tag_array: ref has $cnt items. Finding [$find]\n") if ($dbg);
my @html_array = ();
my @arr = ();
for ($i = 0; $i < $cnt; $i++) {
$typ = ${$ra}[$i][0];
$tag = ${$ra}[$i][1];
$rha = ${$ra}[$i][2];
$lnn = ${$ra}[$i][3];
$add = 0;
if ($tag =~ /^$find$/i) {
$add = 1 if ($typ == $TAG_NORM);
}
if ($add) {
@arr = ();
push(@arr,[$typ,$tag,$rha,$lnn]);
$i2 = $i + 1;
$add = 0;
for (; $i2 < $cnt; $i2++) {
$typ2 = ${$ra}[$i2][0];
$tag2 = ${$ra}[$i2][1];
$rha2 = ${$ra}[$i2][2];
$lnn2 = ${$ra}[$i2][3];
$tnam = tag_type_to_text($typ2);
# prt("$lnn2 $tag2 $tnam \n");
push(@arr,[$typ2,$tag2,$rha2,$lnn2]);
if (($tag2 =~ /^$find$/i) && ($typ2 == $TAG_CLOSE)) {
$add = 1;
last;
}
}
if ($add) {
push(@html_array, @arr);
} else {
prt("[dbg] FAILED to get close of [$tag]\n") if ($dbg);
}
}
}
$cnt = scalar @html_array;
prt("[dbg] get_whole_tag_array: HTML returning $cnt items\n") if ($dbg);
return \@html_array;
}
sub get_title_text($$) {
my ($ra,$opts) = @_;
my @title_text = ();
my $ta = get_whole_tag_array($ra,'title',$opts);
my $cnt = scalar @{$ta};
my ($i, $typ, $tag, $rha, $lnn);
for ($i = 0; $i < $cnt; $i++) {
$typ = ${$ra}[$i][0];
$tag = ${$ra}[$i][1];
$rha = ${$ra}[$i][2];
$lnn = ${$ra}[$i][3];
if ($typ == $TAG_TEXT) {
push(@title_text,$tag);
}
}
return \@title_text;
}
sub get_html_title_from_file($) {
my $ff = shift;
if (! open INF, "<$ff") {
return "failed";
}
my @lines = ;
close INF;
my $html = join("",@lines);
my $ra = get_html_refarray($html);
my $ta = get_whole_tag_array($ra,'title',0);
my $tta = get_title_text($ta,0);
return join(" ",@{$tta});
}
sub get_a_href_array($$) {
my ($ra,$opts) = @_;
my $ar = get_tag_array($ra,'a',0);
my @href_array = ();
my ($cnt,$typ,$tag,$rha,$i,$key,$val,$lnn);
$cnt = scalar @{$ar};
my $dbg = ($opts & 0x8000);
prt("HTML ref array had $cnt items. Finding [A] HREF\n") if ($dbg);
for ($i = 0; $i < $cnt; $i++) {
$typ = ${$ra}[$i][0];
$tag = ${$ra}[$i][1];
$rha = ${$ra}[$i][2];
$lnn = ${$ra}[$i][3];
foreach $key (keys %{$rha}) {
if ($key =~ /^href$/i) {
push(@href_array, ${$rha}{$key});
}
}
}
$cnt = scalar @href_array;
prt("Returning $cnt HREF entries...\n") if ($dbg);
return \@href_array;
}
sub get_a_href_text_array($$) {
my ($ra,$opts) = @_;
my $ar = get_whole_tag_array($ra,'a',$opts);
### my $ar = get_tag_array($ra,'a',0);
my @href_array = ();
my ($cnt,$typ,$tag,$rha,$i,$key,$val,$lnn,$i2,$txt);
$cnt = scalar @{$ar};
my $dbg = ($opts & 0x8000);
prt("HTML ref array had $cnt items. Finding [A] HREF\n") if ($dbg);
for ($i = 0; $i < $cnt; $i++) {
$i2 = $i + 1;
$typ = ${$ra}[$i][0];
$tag = ${$ra}[$i][1];
$rha = ${$ra}[$i][2];
$lnn = ${$ra}[$i][3];
foreach $key (keys %{$rha}) {
if ($key =~ /^href$/i) {
$txt = '';
if (($i2 < $cnt)&&(${$ra}[$i2][0] == $TAG_TEXT)) {
$txt = ${$ra}[$i2][1]
}
push(@href_array, [${$rha}{$key},$txt]); # return with TEXT
}
}
}
$cnt = scalar @href_array;
prt("Returning $cnt HREF entries...\n") if ($dbg);
return \@href_array;
}
sub get_html_from_refarray($$) {
my ($ra,$opts) = @_;
my ($cnt,$typ,$tag,$rha,$i,$lnn);
my ($hcnt,$key,$val,$att);
$cnt = scalar @{$ra};
my $dbg = ($opts & 0x8000);
prt("HTML ref array has $cnt items\n") if ($dbg);
my $html = '';
for ($i = 0; $i < $cnt; $i++) {
$typ = ${$ra}[$i][0];
$tag = ${$ra}[$i][1];
$rha = ${$ra}[$i][2];
$lnn = ${$ra}[$i][3];
$hcnt = scalar keys(%{$rha});
$att = '';
foreach $key (keys %{$rha}) {
$val = ${$rha}{$key};
$att .= " " if (length($att));
if ($val eq $ATT_NV) {
$att .= $key;
} else {
$att .= "$key=$val";
}
}
if (($typ != $TAG_SPECIAL)&&($typ != $TAG_COMMENT)) {
$tag = lc($tag) unless ($opts & 1);
}
if ($typ == $TAG_NORM) {
#prt("$lnn: norm [$tag]");
$html .= "<$tag";
$html .= " $att" if (length($att));
$html .= ">\n";
} elsif ($typ == $TAG_CLOSE) {
#prt("$lnn: close [$tag]");
$html .= "$tag";
$html .= ">\n";
} elsif ($typ == $TAG_CLOSED) {
#prt("$lnn: closed [$tag]");
$html .= "<$tag";
$html .= " />\n";
} elsif ($typ == $TAG_CLOSEA) {
#prt("$lnn: closea [$tag]");
$html .= "<$tag";
$html .= ">\n";
} elsif ($typ == $TAG_SPECIAL) {
#prt("$lnn: spl [$tag]");
$html .= "<$tag";
$html .= ">\n";
} elsif ($typ == $TAG_COMMENT) {
#prt("$lnn: comm [$tag]");
$html .= "\n";
} elsif ($typ == $TAG_TEXT) {
#prt("$lnn: text [$tag]");
$html .= "$tag";
$html .= "\n";
} else {
prt("$lnn: unknown [$tag]");
}
#prt(" attr [$att]") if (length($att));
#prt("\n");
}
return $html;
}
1;
# eof - lib_html.pl