#!/usr/bin/perl -w #< lib_html.pl use strict; use warnings; my $TAG_NORM = 0; # normal has open , and close my $TAG_CLOSE = 1; # is close of item my $TAG_CLOSED = 2; # has xml close
my $TAG_CLOSEA = 3; # is self closing
my $TAG_SPECIAL = 4; # special like my $TAG_COMMENT = 5; # comment my $TAG_TEXT = 6; # just 'text' my $ATT_NV = ''; my $mod_name = 'lib_html'; # feature my $show_warnings = 1; sub set_show_warnings($) { $show_warnings = shift; } # debug my $lh_dbg_01 = 0; # show tags as decoded my $lh_dbg_02 = 0; # load failure my $lh_dbg_03 = 0; # show counts my %tag_type_hash = ( $TAG_NORM => 'norm', $TAG_CLOSE => 'close', $TAG_CLOSED => 'closed', $TAG_CLOSEA => 'closea', $TAG_SPECIAL => 'special', $TAG_COMMENT => 'comment', $TAG_TEXT => 'text' ); sub tag_type_to_text($) { my $typ = shift; if (defined $tag_type_hash{$typ}) { return $tag_type_hash{$typ}; } return "Type [$typ] NOT DEFINED"; } sub get_attr_no_value() { return $ATT_NV; } sub get_tag_normal_value() { return $TAG_NORM; }; sub get_tag_close_value() { return $TAG_CLOSE; } sub get_tag_closed_value() { return $TAG_CLOSED; } sub get_tag_closea_value() { return $TAG_CLOSEA; } sub get_tag_special_value() { return $TAG_SPECIAL; } sub get_tag_comment_value() { return $TAG_COMMENT; } sub get_tag_text_value() { return $TAG_TEXT; } sub get_attr_refhash($) { my $txt = shift; my %hash = (); my @arr = space_split($txt); my ($item,@arr2,$cnt,$att,$val,$j); foreach $item (@arr) { @arr2 = split("=",$item); $cnt = scalar @arr2; $att = trim_all($arr2[0]); $val = ''; if ($cnt > 1) { if ($cnt > 2) { for ($j = 1; $j < $cnt; $j++) { $val .= '=' if (length($val)); $val .= $arr2[$j]; } } else { $val = $arr2[1]; } } else { next if ($att eq '/'); $val = $ATT_NV; } $hash{$att} = $val; } return \%hash; } sub is_auto_closed_tag($) { my ($tag) = @_; return 1 if ($tag =~ /^meta$/i); return 2 if ($tag =~ /^link$/i); return 3 if ($tag =~ /^img$/i); return 4 if ($tag =~ /^input$/i); return 5 if ($tag =~ /^br$/i); ####return 6 if ($tag =~ /^form$/i); return 0; } sub is_optional_auto_closed_tag($) { my ($tag) = @_; return 1 if ($tag =~ /^td/i); return 0; } sub deal_with_script($) { my ($content) = @_; my ($len,$text,$i,$inquot,$qc,$ch,$nc,$i2); my ($rem); $len = length($content); $text = ''; $inquot = 0; for ($i = 0; $i < $len; $i++) { $i2 = $i + 1; $ch = substr($content,$i,1); $nc = ($i2 < $len) ? substr($content,$i2,1) : ''; if ($inquot) { $text .= $ch; if ($ch eq $qc) { $inquot = 0; } } else { # not in QUOTE if (($ch eq '"') || ($ch eq "'")) { $inquot = 1; $qc = $ch; $text .= $ch; # accumulate the script } elsif (($ch eq '<')&&($nc eq '/')) { # potential END of SCRIPT $rem = substr($content,$i2+1); if ($rem =~ /^script>/i) { return $i - 1; } } else { $text .= $ch; # accumulate the script } } } return $i; } sub deal_with_pre($$) { my ($content,$rtxt) = @_; my ($len,$text,$i,$inquot,$qc,$ch,$nc,$i2); my ($rem); $len = length($content); $text = ''; $inquot = 0; for ($i = 0; $i < $len; $i++) { $i2 = $i + 1; $ch = substr($content,$i,1); $nc = ($i2 < $len) ? substr($content,$i2,1) : ''; if (($ch eq '<')&&($nc eq '/')) { # potential END of PRE $rem = substr($content,$i2+1); if ($rem =~ /^pre>/i) { last; } } else { if ($ch =~ /\n/) { if (!($text =~ /\n$/)) { $text .= $ch; } } else { $text .= $ch; # accumulate the pre - AS IS } } } ${$rtxt} = $text; return $i - 1; } # HTML parsing sub get_html_refarray($) { my $content = shift; my ($len,$i,$ch,$nc,$nc2,$tag,$i2,$i3,$intag,$text); my ($inquot,$qc,$incomm,$rem,$lnn); my (@arr,$cnt,$j,$attrs,$ftag,$ptag,$rah,$typ); my ($msg,$tlnn,$tlnn2,$ptag2); $len = length($content); $intag = 0; $inquot = 0; $incomm = 0; $lnn = 1; $text = ''; my @tag_stack = (); my @html_array = (); for ($i = 0; $i < $len; $i++) { $i2 = $i + 1; $i3 = $i + 2; $ch = substr($content,$i,1); $nc = ($i2 < $len) ? substr($content,$i2,1) : ''; if ($ch eq "\n") { $lnn++; $ch = ' '; } if ($intag) { if ($inquot) { $tag .= $ch; if ($ch eq $qc) { $inquot = 0; } } else { $tag .= $ch; if (($ch eq '"')||($ch eq "'")) { $inquot = 1; $qc = $ch; } elsif ($ch eq '>') { $intag = 0; $tag =~ s/^$//; $ftag = $tag; @arr = space_split($tag); $tag = $arr[0]; $cnt = scalar @arr; $attrs = ''; for ($j = 1; $j < $cnt; $j++) { $attrs .= ' ' if (length($attrs)); $attrs .= $arr[$j]; } $rah = get_attr_refhash($attrs); $msg = "$lnn: "; if ($tag =~ /^!/) { $tag =~ s/^!//; $msg .= " spl $msg .= " close "; prt("$msg\n") if ($lh_dbg_01); push(@html_array,[$typ,$tag,$rah,$lnn]); if ($typ == $TAG_NORM) { # some extra checks if ($tag =~ /^script$/i) { # DEAL WITH SCRIPT! $rem = substr($content,$i2); $i += deal_with_script($rem); #$rem = substr($content,$i); #pgm_exit(1,"DEAL WITH SCRIPT [$rem]\n"); } elsif ($tag =~ /^pre$/i) { $rem = substr($content,$i2); $text = ''; $i += deal_with_pre($rem,\$text); if (length($text)) { $typ = $TAG_TEXT; $rah = get_attr_refhash(""); prt("$lnn: TXT [$text]\n") if ($lh_dbg_01); push(@html_array,[$typ,$text,$rah,$lnn]); } } } } } } else { if ($inquot) { $text .= $ch; if ($ch eq $qc) { $inquot = 0; } } else { if (($ch eq '"')||($ch eq "'")) { $inquot = 1; $qc = $ch; $text .= $ch; } elsif ($ch eq '<') { $typ = $TAG_TEXT; $rah = get_attr_refhash(""); if (length($text)) { prt("$lnn: TXT [$text]\n") if ($lh_dbg_01); push(@html_array,[$typ,$text,$rah,$lnn]); } $text = ''; if ($nc eq '!') { $rem = substr($content,$i2+1); if ($rem =~ /^--/) { # in a comment $i = $i2 + 1; $rem = "!"; for (; $i < $len; $i++) { $i2 = $i + 1; $i3 = $i + 2; $ch = substr($content,$i,1); if ($ch eq "\n") { $lnn++; $ch = ' '; } $nc = ($i2 < $len) ? substr($content,$i2,1) : ''; $nc2 = ($i3 < $len) ? substr($content,$i3,1) : ''; $lnn++ if ($ch eq "\n"); if (($ch eq '-')&&($nc eq '-')&&($nc2 eq '>')) { $i = $i3; $rem .= "--"; last; } $rem .= $ch; } push(@html_array,[$TAG_COMMENT,$rem,$rah,$lnn]); next; } } $tag = '<'; $intag = 1; } else { if ($ch =~ /\s/) { if ( length($text) && !($text =~ /\s$/) ) { $text .= $ch; } } else { $text .= $ch; } } } } } #prt("Done $lnn lines...\n"); $len = scalar @tag_stack; if ($len) { $text = ''; for ($i = 0; $i < $len; $i++) { $tlnn = $tag_stack[$i][0]; $ptag = $tag_stack[$i][1]; if ($lh_dbg_02) { # noisy output $text .= "\n$tlnn [$ptag]"; } else { # simple list $text .= ' ' if (length($text)); $text .= $ptag; } } prtw("WARNING: $mod_name: End of document, with $len tags on stack [$text]\n") if ($show_warnings); } return \@html_array; } sub show_html_refarray($) { my ($ra) = @_; my ($cnt,$typ,$tag,$rha,$i,$lnn); my ($hcnt,$key,$val,$att); $cnt = scalar @{$ra}; prt("HTML ref array has $cnt items\n"); for ($i = 0; $i < $cnt; $i++) { $typ = ${$ra}[$i][0]; $tag = ${$ra}[$i][1]; $rha = ${$ra}[$i][2]; $lnn = ${$ra}[$i][3]; $hcnt = scalar keys(%{$rha}); $att = ''; foreach $key (keys %{$rha}) { $val = ${$rha}{$key}; $att .= " " if (length($att)); if ($val eq $ATT_NV) { $att .= $key; } else { $att .= "$key=$val"; } } if ($typ == $TAG_NORM) { prt("$lnn: norm [$tag]"); } elsif ($typ == $TAG_CLOSE) { prt("$lnn: close [$tag]"); } elsif ($typ == $TAG_CLOSED) { prt("$lnn: closed [$tag]"); } elsif ($typ == $TAG_CLOSEA) { prt("$lnn: closea [$tag]"); } elsif ($typ == $TAG_SPECIAL) { prt("$lnn: spl [$tag]"); } elsif ($typ == $TAG_COMMENT) { prt("$lnn: comm [$tag]"); } elsif ($typ == $TAG_TEXT) { prt("$lnn: text [$tag]"); } else { prt("$lnn: unknown [$tag]"); } prt(" attr [$att]") if (length($att)); prt("\n"); } } sub get_html_body_only($) { my ($ra) = @_; my ($cnt,$typ,$tag,$rha,$i,$lnn); $cnt = scalar @{$ra}; #prt("get_html_body_only: HTML ref array had $cnt items\n"); my @html_array = (); my $inbody = 0; for ($i = 0; $i < $cnt; $i++) { $typ = ${$ra}[$i][0]; $tag = ${$ra}[$i][1]; if ($inbody) { last if (($typ == $TAG_CLOSE)&&($tag =~ /^body$/i)); $rha = ${$ra}[$i][2]; $lnn = ${$ra}[$i][3]; push(@html_array,[$typ,$tag,$rha,$lnn]); } else { $inbody = 1 if (($typ == $TAG_NORM) && ($tag =~ /^body$/i)); } } $cnt = scalar @html_array; #prt("get_html_body_only: HTML returning $cnt items\n"); return \@html_array; } sub drop_html_tags($$) { my ($ra,$rd) = @_; my ($cnt,$typ,$tag,$rha,$i,$lnn,$tt,$add); $cnt = scalar @{$ra}; prt("HTML ref array has $cnt items\n"); my @html_array = (); for ($i = 0; $i < $cnt; $i++) { $typ = ${$ra}[$i][0]; $tag = ${$ra}[$i][1]; $rha = ${$ra}[$i][2]; $lnn = ${$ra}[$i][3]; $add = 1; foreach $tt (@{$rd}) { if ($tag =~ /^$tt$/i) { $add = 0; last; } } push(@html_array,[$typ,$tag,$rha,$lnn]) if ($add); } $cnt = scalar @html_array; prt("HTML returning $cnt items\n"); return \@html_array; } sub drop_div_tag($) { my ($ra) = shift; my @arr = qw(div); return drop_html_tags($ra,\@arr); } sub get_tag_array($$$) { my ($ra,$find,$opts) = @_; my ($cnt,$typ,$tag,$rha,$i,$lnn,$tt,$add); $cnt = scalar @{$ra}; my $dbg = ($opts & 0x8000); prt("HTML ref array has $cnt items. Finding [$find]\n") if ($dbg); my @html_array = (); for ($i = 0; $i < $cnt; $i++) { $typ = ${$ra}[$i][0]; $tag = ${$ra}[$i][1]; $rha = ${$ra}[$i][2]; $lnn = ${$ra}[$i][3]; $add = 0; if ($tag =~ /^$find$/i) { if ($typ == $TAG_NORM) { $add = 1; } else { # add only if options says so if ($opts & 1) { $add = 1; } } } push(@html_array,[$typ,$tag,$rha,$lnn]) if ($add); } $cnt = scalar @html_array; prt("HTML returning $cnt items\n") if ($dbg); return \@html_array; } # only for NORMAL tag - collect all items until it closes sub get_whole_tag_array($$$) { my ($ra,$find,$opts) = @_; my ($cnt,$typ,$tag,$rha,$i,$lnn,$tt,$add,$i2); my ($typ2,$tag2,$rha2,$lnn2,$tnam); my $dbg = ($opts & 0x8000); $cnt = scalar @{$ra}; prt("[dbg] get_whole_tag_array: ref has $cnt items. Finding [$find]\n") if ($dbg); my @html_array = (); my @arr = (); for ($i = 0; $i < $cnt; $i++) { $typ = ${$ra}[$i][0]; $tag = ${$ra}[$i][1]; $rha = ${$ra}[$i][2]; $lnn = ${$ra}[$i][3]; $add = 0; if ($tag =~ /^$find$/i) { $add = 1 if ($typ == $TAG_NORM); } if ($add) { @arr = (); push(@arr,[$typ,$tag,$rha,$lnn]); $i2 = $i + 1; $add = 0; for (; $i2 < $cnt; $i2++) { $typ2 = ${$ra}[$i2][0]; $tag2 = ${$ra}[$i2][1]; $rha2 = ${$ra}[$i2][2]; $lnn2 = ${$ra}[$i2][3]; $tnam = tag_type_to_text($typ2); # prt("$lnn2 $tag2 $tnam \n"); push(@arr,[$typ2,$tag2,$rha2,$lnn2]); if (($tag2 =~ /^$find$/i) && ($typ2 == $TAG_CLOSE)) { $add = 1; last; } } if ($add) { push(@html_array, @arr); } else { prt("[dbg] FAILED to get close of [$tag]\n") if ($dbg); } } } $cnt = scalar @html_array; prt("[dbg] get_whole_tag_array: HTML returning $cnt items\n") if ($dbg); return \@html_array; } sub get_title_text($$) { my ($ra,$opts) = @_; my @title_text = (); my $ta = get_whole_tag_array($ra,'title',$opts); my $cnt = scalar @{$ta}; my ($i, $typ, $tag, $rha, $lnn); for ($i = 0; $i < $cnt; $i++) { $typ = ${$ra}[$i][0]; $tag = ${$ra}[$i][1]; $rha = ${$ra}[$i][2]; $lnn = ${$ra}[$i][3]; if ($typ == $TAG_TEXT) { push(@title_text,$tag); } } return \@title_text; } sub get_html_title_from_file($) { my $ff = shift; if (! open INF, "<$ff") { return "failed"; } my @lines = ; close INF; my $html = join("",@lines); my $ra = get_html_refarray($html); my $ta = get_whole_tag_array($ra,'title',0); my $tta = get_title_text($ta,0); return join(" ",@{$tta}); } sub get_a_href_array($$) { my ($ra,$opts) = @_; my $ar = get_tag_array($ra,'a',0); my @href_array = (); my ($cnt,$typ,$tag,$rha,$i,$key,$val,$lnn); $cnt = scalar @{$ar}; my $dbg = ($opts & 0x8000); prt("HTML ref array had $cnt items. Finding [A] HREF\n") if ($dbg); for ($i = 0; $i < $cnt; $i++) { $typ = ${$ra}[$i][0]; $tag = ${$ra}[$i][1]; $rha = ${$ra}[$i][2]; $lnn = ${$ra}[$i][3]; foreach $key (keys %{$rha}) { if ($key =~ /^href$/i) { push(@href_array, ${$rha}{$key}); } } } $cnt = scalar @href_array; prt("Returning $cnt HREF entries...\n") if ($dbg); return \@href_array; } sub get_a_href_text_array($$) { my ($ra,$opts) = @_; my $ar = get_whole_tag_array($ra,'a',$opts); ### my $ar = get_tag_array($ra,'a',0); my @href_array = (); my ($cnt,$typ,$tag,$rha,$i,$key,$val,$lnn,$i2,$txt); $cnt = scalar @{$ar}; my $dbg = ($opts & 0x8000); prt("HTML ref array had $cnt items. Finding [A] HREF\n") if ($dbg); for ($i = 0; $i < $cnt; $i++) { $i2 = $i + 1; $typ = ${$ra}[$i][0]; $tag = ${$ra}[$i][1]; $rha = ${$ra}[$i][2]; $lnn = ${$ra}[$i][3]; foreach $key (keys %{$rha}) { if ($key =~ /^href$/i) { $txt = ''; if (($i2 < $cnt)&&(${$ra}[$i2][0] == $TAG_TEXT)) { $txt = ${$ra}[$i2][1] } push(@href_array, [${$rha}{$key},$txt]); # return with TEXT } } } $cnt = scalar @href_array; prt("Returning $cnt HREF entries...\n") if ($dbg); return \@href_array; } sub get_html_from_refarray($$) { my ($ra,$opts) = @_; my ($cnt,$typ,$tag,$rha,$i,$lnn); my ($hcnt,$key,$val,$att); $cnt = scalar @{$ra}; my $dbg = ($opts & 0x8000); prt("HTML ref array has $cnt items\n") if ($dbg); my $html = ''; for ($i = 0; $i < $cnt; $i++) { $typ = ${$ra}[$i][0]; $tag = ${$ra}[$i][1]; $rha = ${$ra}[$i][2]; $lnn = ${$ra}[$i][3]; $hcnt = scalar keys(%{$rha}); $att = ''; foreach $key (keys %{$rha}) { $val = ${$rha}{$key}; $att .= " " if (length($att)); if ($val eq $ATT_NV) { $att .= $key; } else { $att .= "$key=$val"; } } if (($typ != $TAG_SPECIAL)&&($typ != $TAG_COMMENT)) { $tag = lc($tag) unless ($opts & 1); } if ($typ == $TAG_NORM) { #prt("$lnn: norm [$tag]"); $html .= "<$tag"; $html .= " $att" if (length($att)); $html .= ">\n"; } elsif ($typ == $TAG_CLOSE) { #prt("$lnn: close [$tag]"); $html .= "\n"; } elsif ($typ == $TAG_CLOSED) { #prt("$lnn: closed [$tag]"); $html .= "<$tag"; $html .= " />\n"; } elsif ($typ == $TAG_CLOSEA) { #prt("$lnn: closea [$tag]"); $html .= "<$tag"; $html .= ">\n"; } elsif ($typ == $TAG_SPECIAL) { #prt("$lnn: spl [$tag]"); $html .= "<$tag"; $html .= ">\n"; } elsif ($typ == $TAG_COMMENT) { #prt("$lnn: comm [$tag]"); $html .= "\n"; } elsif ($typ == $TAG_TEXT) { #prt("$lnn: text [$tag]"); $html .= "$tag"; $html .= "\n"; } else { prt("$lnn: unknown [$tag]"); } #prt(" attr [$att]") if (length($att)); #prt("\n"); } return $html; } 1; # eof - lib_html.pl