#!/perl -w
# NAME: chkhtml.pl
# AIM: Just parse HTML elements, and report any problem
# 08/01/2016 - Some UI enhancements
# 2010/04/06 - geoff mclane - http://geoffair.net/mperl/
use strict;
use warnings;
use File::Basename;
use Cwd;
unshift(@INC, 'C:\GTools\perl');
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
my @tmpsp = split(/\\/,$pgmname);
$pgmname = $tmpsp[-1];
}
my $perl_dir = 'C:\GTools\perl';
my $outfile = $perl_dir."\\temp.$pgmname.txt";
open_log($outfile);
# user variables
my $load_log = 1;
my $def_infile = 'F:\Projects\tidy-html5\test\input5\in_342-1.html';
#my $def_infile = 'U:\var\www\fg\www\Docs\getstart\getstart24.html';
#my $def_infile = 'C:\HOMEPAGE\FG\Docs\getstart\getstartch9.html';
my $in_file = '';
my $show_attributes = 0;
my $show_text = 0;
my $trim_text = 1;
my $max_txt_len = 80;
my $show_element_stack = 1;
my @closed_tags = qw( meta link area base basefont br frame hr isindex param bgsound embed keygen img );
# tags which do NOT need a closing, like
, tag
my @opt_tags = ( "body", "colgroup", "dd", "dt", "head", "html", "li", "optgroup", "option",
"p", "tbody", "td", "tfoot", "th", "thead", "tr", "marquee" );
### program variables
my $verbosity = 0;
my @warnings = ();
my $cwd = cwd();
sub pgm_exit($$) {
my ($val,$msg) = @_;
if (length($msg)) {
$msg .= "\n" if (!($msg =~ /\n$/));
prt($msg)
}
close_log($outfile,$load_log);
exit($val);
}
sub prtw($) {
my ($tx) = shift;
$tx =~ s/\n$//;
prt("$tx\n");
push(@warnings,$tx);
}
sub show_warnings() {
if (@warnings) {
prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
foreach my $itm (@warnings) {
prt("$itm\n");
}
prt("\n");
} else {
prt( "\nNo warnings issued.\n\n" );
}
}
sub is_closed_tag($) {
my ($tt) = shift;
my $lctt = lc($tt);
foreach my $tag (@closed_tags) {
return 1 if ($tag eq $lctt);
}
return 0;
}
sub is_opt_tag($) {
my ($tt) = shift;
my $lctt = lc($tt);
foreach my $tag (@opt_tags) {
return 1 if ($tag eq $lctt);
}
return 0;
}
# $drop = can_find_this_tag($tag,\@elements);
sub can_find_this_tag($$) {
my ($tag,$re) = @_;
my $len = scalar @{$re};
my $drop = 0;
my $bu = -1;
my $last = '';
while ($len) {
$drop++; # can pop this one
$last = ${$re}[$bu][0]; # get tag
if ($last eq $tag) { # if the desired tag
return $drop; # return drop value
} elsif ( ! is_opt_tag($last) ) {
return 0; # oop, have a non-optional tag
}
$bu--; # back up one more
$len--; # and reduce available to check
}
return 0;
}
sub is_all_optional($) {
my ($re) = @_;
my $len = scalar @{$re};
my $bu = -1;
my ($last);
while ($len) {
$last = ${$re}[$bu][0]; # get tag
if ( ! is_opt_tag($last) ) {
return 0; # oop, have a non-optional tag
}
$bu--; # back up one more
$len--; # and reduce available to check
}
return 1; # ALL were optiona
}
sub show_stack_elements($$$) {
my ($tag,$rele,$rlns) = @_;
my $cnt = scalar @{$rele};
my $lcnt = scalar @{$rlns};
if ($cnt) {
prt("The stack has $cnt elements... The current closing element is [$tag]\n");
for (my $i = 0; $i < $cnt; $i++) {
my $e = ${$rele}[$i][0];
my $n = ${$rele}[$i][1];
prt("$n: elelement [$e]");
prt(" SAME as tag [$tag]!") if ($e eq $tag);
if ($n <= $lcnt) {
my $ln = trim_all(${$rlns}[$n-1]);
prt(" line=[$ln]");
}
prt("\n");
}
}
}
sub get_element_chain($) {
my ($rele) = @_;
my $cnt = scalar @{$rele};
my $chn = '';
if ($cnt) {
for (my $i = 0; $i < $cnt; $i++) {
my $e = ${$rele}[$i][0];
$chn .= '|' if length($chn);
$chn .= $e;
}
}
return $chn;
}
sub process_file($) {
my ($inf) = shift;
if (!open INF, "<$inf") {
pgm_exit(1,"ERROR: Unable to open file [$inf]!\n");
}
my @lines = ;
close INF;
my $lncnt = scalar @lines;
prt("Got $lncnt lines, from $inf...\n");
my ($i,$line,$ch,$tag,$len,$intag,$txt,$j,$pc,$ppc,$incdata,$hadsp,$attrs);
my ($lnn,$last,$lln,$bgnlnn,$endlnn,$clnn,$stkdep,$maxdep);
my ($maxelement,$echn,$tag_type,$tmp);
$tag = '';
$attrs = '';
$intag = 0;
$incdata = 0;
$hadsp = 0;
$txt = '';
$ch = '';
$pc = '';
my @elements = ();
$lnn = 0;
$maxdep = 0;
$maxelement = '';
for ($i = 0; $i < $lncnt; $i++) {
$line = $lines[$i];
chomp $line;
$len = length($line);
$lnn++;
$clnn = sprintf("%3d",$lnn);
for ($j = 0; $j < $len; $j++) {
$ppc = $pc;
$pc = $ch;
$ch = substr($line,$j,1);
if ($incdata) {
$tag .= $ch;
if (($ch eq '>')&&($pc eq ']')&&($ppc eq ']')) {
$incdata = 0;
prt("$clnn: End CDATA\n");
}
} elsif ($intag) {
if ($hadsp) {
$attrs .= $ch if !($ch eq '>');
} elsif ($ch =~ /\s/) {
$hadsp = 1;
} else {
$tag .= $ch if !($ch eq '>');
}
if ($ch eq '>') {
$intag = 0;
$endlnn = $lnn;
} elsif (($ch eq '[')&&($pc eq 'A')&&($tag =~ /^<\!\[CDATA\[/)) {
$incdata = 1;
prt("\n$clnn: Begin CDATA\n");
}
if (!$intag) {
$stkdep = scalar @elements;
$tag = trim_all($tag);
$tag_type = 'Unknonw';
if ($tag =~ /^(\!|\?)/) {
$tag_type = "Special";
} elsif (($attrs =~ /\/$/) || is_closed_tag($tag)) {
$tag_type = "self-closed";
} elsif ($tag =~ /^\//) {
$tag_type = "Close";
} else {
$tag_type = "Open";
}
$clnn = sprintf("%3d",$lnn);
if ($verbosity) {
prt("$clnn: ");
if ($show_text) {
prt("Text [".trim_all($txt)."] ") if (length($txt) && !($txt =~ /^\s+$/));
}
prt("End tag [$tag] ");
if ($show_attributes) {
prt("Attrs [".trim_all($attrs)."] ") if (length($attrs));
}
}
if ($tag =~ /^(\!|\?)/) {
prt("Special") if ($verbosity);
} else {
# if ($attrs =~ /\/$/) but it may NOT end with '/'
if (($attrs =~ /\/$/) || is_closed_tag($tag)) {
prt("self-closed") if ($verbosity);
} elsif ($tag =~ /^\//) {
$tag = substr($tag,1);
prt("Close") if ($verbosity);
$echn = get_element_chain(\@elements);
if ($verbosity && $show_element_stack) {
prt(" s=$echn $stkdep");
}
if (@elements) {
$last = $elements[-1][0];
$lln = $elements[-1][1];
if ($last eq $tag) {
pop @elements;
} else {
# but may have 'opt' tags - tags that need no close on the stack, which
# can be dropped to get to this tag
my $drop = can_find_this_tag($tag,\@elements);
if ($drop) {
prt("\nWARNING: Last [$last]$lln NE [$tag]$lnn drop $drop ");
$tmp = '';
while($drop--) {
$tmp .= "|" if (length($tmp));
$tmp .= pop @elements;
}
prt("$tmp ");
} else {
prt("\nERROR: Last [$last]$lln NE [$tag]$lnn line=[".trim_all($line)."]\n");
show_stack_elements($tag,\@elements,\@lines);
pgm_exit(1,"ERROR:[1]: It is useless to continue when the element stack is out of order!\n");
}
}
} else {
prt("\nERROR: The stack has NO elements... The current closing element is [$tag]\n");
pgm_exit(1,"ERROR:[2]: It is useless to continue when the element stack is out of order! [2]\n");
}
} else {
prt("Open") if ($verbosity);
push(@elements,[$tag,$bgnlnn,$endlnn]);
$echn = get_element_chain(\@elements);
$stkdep = scalar @elements;
if ($stkdep > $maxdep) {
$maxdep = $stkdep;
$maxelement = "$clnn: $tag $bgnlnn $endlnn [$echn]";
}
if ($verbosity && $show_element_stack) {
prt(" s=$echn $stkdep");
}
}
}
prt("\n") if ($verbosity);
# reset
$txt = '';
$tag = '';
$attrs = '';
$hadsp = 0;
}
} else {
if ($ch eq '<') {
$tag = '';
$intag = 1;
$hadsp = 0;
$bgnlnn = $lnn;
} else {
$txt .= $ch;
}
}
} # reached end of line - get next
$ch = ' ';
$txt .= $ch if (length($txt) && !($txt =~ /\s$/));
if ($hadsp) {
$attrs .= $ch if (length($attrs) && !($attrs =~ /\s$/));
} else {
$tag .= $ch if (length($tag) && !($tag =~ /\s$/));
}
$ppc = $pc;
$pc = $ch;
}
if (@elements && !is_all_optional(\@elements)) {
show_stack_elements("At-end-of-file",\@elements,\@lines);
pgm_exit(1,"ERROR:[3] It is useless to continue when the element stack is out of order!\n");
}
prt("Max. element stack $maxdep...$maxelement\n");
prt("Done $lncnt lines... $inf appears ok...\n");
}
#########################################
### MAIN ###
parse_args(@ARGV);
prt( "$pgmname: in [$cwd]: Process $in_file...\n" );
process_file($in_file);
pgm_exit(0,"Normal exit(0)");
########################################
sub give_help {
prt("$pgmname: version 0.0.1 2010-04-06\n");
prt("Usage: $pgmname [options] in_file_name\n");
prt("Options:\n");
prt(" -h (or -?) = THis help, and exit 0\n");
prt(" -l = Load log file at end.\n");
prt(" -v[num] = Bump, or set verbosity to [num]\n");
prt("Parse input file, and report any problems...\n");
}
sub parse_args {
my (@av) = @_;
my ($arg,$sarg,$ch);
while (@av) {
$arg = $av[0];
if ($arg =~ /^-/) {
$sarg = substr($arg,1);
$sarg = substr($sarg,1) while ($sarg =~ /^-/);
$ch = substr($sarg,0,1);
if ($ch =~ /h/i) {
give_help();
pgm_exit(0,"Help exit");
} elsif ($ch =~ /l/i) {
$load_log = 1;
prt("Set to load log at end\n");
} elsif ($ch =~ /v/i) {
$sarg = substr($sarg,1);
if (length($sarg)) {
if ($sarg =~ /^\d+$/) {
$verbosity = $sarg;
prt("Set verbosity to [$verbosity]\n");
} else {
pgm_exit(1,"Unknown argument [$arg] - verbosity is -v[num]. Try -h for help\n");
}
} else {
$verbosity++;
prt("Bumped verbosity to [$verbosity]\n");
}
} else {
pgm_exit(1,"Unknown argument [$arg] Try -h for help\n");
}
} else {
$in_file = $arg;
prt("Set input file to [$in_file]\n");
}
shift @av;
}
if (!length($in_file)) {
$in_file = $def_infile;
$load_log = 1;
$verbosity = 9;
prt("Set DEFAULT input file to [$in_file], and set load_log=1, and verbosity=$verbosity\n");
}
}
# eof - template.pl