Generated: Sat Oct 24 16:35:30 2020 from tidyentities.pl 2017/11/15 74.4 KB. text copy
#!/usr/bin/perl -w # NAME: tidyentities.pl # AIM: VERY SPECIALIZED - Read a C table, and a JSON set of entities, and compare # 2017-11-06 - purpose extended # 08/02/2015 geoff mclane http://geoffair.net/mperl use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) use Cwd; use JSON; use Data::Dumper; use feature 'unicode_strings'; use utf8; use Text::Unidecode; my $os = $^O; my $perl_dir = '/home/geoff/bin'; my $PATH_SEP = '/'; my $temp_dir = '/tmp'; if ($os =~ /win/i) { $perl_dir = 'C:\GTools\perl'; $temp_dir = $perl_dir; $PATH_SEP = "\\"; } unshift(@INC, $perl_dir); require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n"; # log file stuff our ($LF); my $pgmname = $0; if ($pgmname =~ /(\\|\/)/) { my @tmpsp = split(/(\\|\/)/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt"; open_log($outfile); # user variables my $VERS = "0.0.6 2017-11-06"; ###my $VERS = "0.0.5 2015-01-09"; my $load_log = 0; my $in_file = ''; my $in_file1 = 'C:\Users\user\Documents\Tidy\htmlmathml.json'; my $in_file2 = 'C:\Users\user\Documents\Tidy\entities.c'; # my $in_file2 = 'F:\Projects\tidy-html5\src\entities.c'; # 2017-11-06 - add from : https://dev.w3.org/html5/html-author/charref my $in_file3 = 'C:\Users\user\Documents\Tidy\charref.html'; # from : https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references my $in_file4 = 'C:\Users\user\Documents\Tidy\wiki-list-entities.html'; # from : https://www.w3.org/TR/REC-html40/sgml/entities.html my $in_file4f = 'C:\Users\user\Documents\Tidy\html4-ents.html'; # from : https://www.w3.org/TR/html5/ # from : https://www.w3.org/TR/html5/syntax.html#named-character-references my $in_file5 = 'C:\Users\user\Documents\Tidy\syntax5-ents.html'; my $in_file5j = 'C:\Users\user\Documents\Tidy\html5-ents.json'; # from : https://www.freeformatter.com/html-entities.html - Just 238 entities # my $in_file6 = 'C:\Users\user\Documents\Tidy\html-entities.html'; my $verbosity = 0; my $out_file = $temp_dir.$PATH_SEP."tempents1.c"; my $out_file2 = $temp_dir.$PATH_SEP."tempents12.c"; my $out_file3 = $temp_dir.$PATH_SEP."tempmultents1.html"; my $out_file4 = $temp_dir.$PATH_SEP."temptidyents1.html"; my $out_file5 = $temp_dir.$PATH_SEP."tempdiffents1.html"; my $out_file6 = $temp_dir.$PATH_SEP."tempfreeents1.html"; my $out_file7 = $temp_dir.$PATH_SEP."tempallents1.html"; my $out_file8 = $temp_dir.$PATH_SEP."tempmodents1.html"; my $nocase_sort = 0; my $pad_struct = 0; # # from: https://www.w3.org/TR/REC-xml/#sec-predefined-ent - version 1.0 # Only 5 defined - amp, lt, gt, apos, quot # from: https://www.w3.org/MarkUp/html-spec/html-spec_toc.html # Hypertext Markup Language - 2.0 September 22, 1995 - below # from : https://www.w3.org/TR/REC-html32#latin1 - below # HTML5 - from : https://www.w3.org/TR/html5/syntax.html#named-character-references # <div id="named-character-references-table"> # ### DEBUG ### my $debug_on = 0; my $def_file = 'def_file'; ### program variables my @warnings = (); my $cwd = cwd(); my %entities = (); # entities found in current C code [$vers, $code, 0] my %all_ents = (); # entities found in current charref.html my %all_by_val = (); # listed in DECIMAL order, accounting for REPEATS my $max_ent = 0; my $max_ver = 0; my $max_dec = 0; my %json_ents = (); # entities found in html5-ents.json my %free_ents = (); # entities found in $in_file6 = 'C:\Users\user\Documents\Tidy\html-entities.html'; my $curr_date = lu_get_YYYYMMDD_hhmmss(time()); # Difference include - # ‌ 10184 and 10185, BUT these are - # ⟈ 10184 and ⟉ 10185 # from : https://www.w3schools.com/charsets/ref_utf_punctuation.asp # 8204 200C ‌ ZERO WIDTH NON-JOINER # BUT json contains ‌ with decimal 10184 and 10185 which are different 'glyphs'!!! # my %modified_ents = (); my %preferred_vals = ( 'phiv' => 981, 'varphi' => 981, 'OverBar' => 8254, 'UnderBar' => 95, 'epsi' => 949, 'epsiv' => 1013, 'varepsilon' => 1013, 'epsilon' => 949, 'straightepsilon' => 1013, 'strns' => 175, 'angst' => 197, 'ohm' => 937 ); my %equal_pairs = ( 'angst' => [197, 8491], 'ohm' => [937, 8486] ); my %exclude_pairs = ( 'race' => [8756, 817] # 0223D + 00331 = (8765 + 817 = 9582) ); sub VERB1() { return $verbosity >= 1; } sub VERB2() { return $verbosity >= 2; } sub VERB5() { return $verbosity >= 5; } sub VERB9() { return $verbosity >= 9; } sub show_warnings($) { my ($val) = @_; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { prt( "\nNo warnings issued.\n\n" ) if (VERB9()); } } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } show_warnings($val); close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } sub mycmp_nc_sort { return -1 if (lc($a) lt lc($b)); return 1 if (lc($a) gt lc($b)); return 0; } sub mycmp_dec { return -1 if ($a < $b); return 1 if ($a > $b); return 0; } sub get_head($) { my $title = shift; my $txt = <<EOF; <!DOCTYPE html> <html> <head> <meta charset="utf-8"> <title>$title</title> <style> table { margin-left: auto; margin-right: auto; } table, td, th { border: 1px solid gray; } .ctr { text-align: center; } .rite { text-align: right; } </style> </head> EOF return $txt; } sub write_table($$$) { my ($tit,$htm,$out) = @_; # like "entities",$htm,$out my $line = get_head($tit); $line .= "<body>\n"; $line .= "<a id='top'></a>"; $line .= "<h1>$tit</h1>\n"; $line .= "<table>\n"; $line .= $htm; $line .= "</table>\n"; $line .= "<p>Table columns headed with 'G' is a 'glyph' generated from the alphabetic entity, "; $line .= "like &AElig;. "; $line .= "Marked with an 'N' is a 'glyph' generated from a numeric entity, "; $line .= "like &#198;.</p>\n"; $line .= "<p class='ctr'><a href='#top'>top</a></p>\n"; $line .= "<p class='rite'>"; my ($name,$dir) = fileparse($out); $line .= "$name $curr_date"; $line .= "</p>\n"; $line .= "</body>\n</html>\n"; write2file($line,$out); } my %test_hash = ( 'nlE' => "\x{2266}\x{338}", 'harrcir' => "\x{2948}", 'omid' => "\x{29b6}", 'cularr' => "\x{21b6}", 'ycy' => "\x{44b}", 'ldca' => "\x{2936}" ); sub check_hash() { my $rh = \%test_hash; prt(Dumper($rh)); my @arr = keys %{$rh}; my ($key,$val,$num,$len,$i,$ch,$i2); foreach $key (@arr) { $val = ${$rh}{$key}; #prt("$key "); $num = sprintf("%s", unidecode( $val )); #$num = sprintf("%X", unidecode( $val )); #prt("\n"); prt("$key $num\n"); # $val =~ s/([^[:ascii:]]+)/unidecode($1)/ge; # $len = length($val); # $num = ''; # for ($i = 0; $i < $len; $i++) { # $i2 = $i + 1; # $ch = substr($val,$i,1); # prt("$i2: $ch "); # last if ($ch eq '{'); # } # for (; $i < $len; $i++) { # $ch = substr($val,$i,1); # last if ($ch eq '}'); # $num .= $ch; # } # #if ($val =~ /\\x\{(.+)\}/) { # # $num = $1; # # prt("$key $num\n"); # #} # #$num = chr($val); # prt("$key $num\n"); } pgm_exit(1,"TEMP EXIT\n"); } sub clear_hash_counts($) { my $rh = shift; my @arr = keys %{$rh}; # like \%entities; my ($ent,$ra); foreach $ent (@arr) { $ra = ${$rh}{$ent}; #$tvers = ${$ra}[0]; #$code = ${$ra}[1]; ${$ra}[2] = 0; } } ############################################################################# # from : http://www.w3.org/TR/xml-entity-names/#htmlmathml # DWN: htmlmathml.json # { # "characters": { # "AElig": "\u00C6", # "AMP": "\u0026", # "NotSubset": "\u2282\u20D2", # from : http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f-a.dtd # <!ENTITY NotSubset "⊄" ><!--alias ISOAMSN vnsub --> # <!ENTITY NotPrecedesEqual "⪯̸" ><!--alias ISOAMSN npre --> # "NotPrecedesEqual": "\u2AAF\u0338", sub process_in_json($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = <INF>; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$num,$lnn,$val); $line = join("",@lines); my $json = JSON->new->allow_nonref; my $rh = $json->decode( $line ); my $tag = 'characters'; if (!defined ${$rh}{$tag}) { prt("NO '$tag' defined in file $inf!\n"); return; } my $rh2 = ${$rh}{$tag}; # prt(Dumper($rh2)); $load_log = 1; my @arr = sort mycmp_nc_sort keys(%{$rh2}); my $cnt = scalar @arr; prt("Found $cnt entity keys...\n"); foreach $tag (@arr) { $val = ${$rh2}{$tag}; if ($val =~ /\\x\{([0-9a-f]+)\}/) { $num = hex($1); prt("$tag = $num\n"); } else { ###$num = chr($val); } } } ############################################################################# # static const entity entities[] = #{ # /* # ** Markup pre-defined character entities # */ # { "quot", VERS_ALL|VERS_XML, 34 }, # { "amp", VERS_ALL|VERS_XML, 38 }, # ... # { NULL, VERS_UNKNOWN, 0 } # }; # /* all W3C defined document types */ # #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50) # /* special flag */ # #define VERS_XML 65536u sub process_in_file2($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = <INF>; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$inc,$lnn,$len,@arr,$ent,$ver,$code); $lnn = 0; my $intable = 0; foreach $line (@lines) { $lnn++; chomp $line; $line = trim_all($line); $len = length($line); next if ($len == 0); if ($intable) { if ($line =~ /^\{(.+)\}/) { $inc = trim_all($1); @arr = split(',',$inc); $len = scalar @arr; if ($len == 3) { $ent = strip_double_quotes(trim_all($arr[0])); $ver = trim_all($arr[1]); $code = trim_all($arr[2]); if ($ent eq 'NULL') { prt("$lnn: $ent $ver $code - end table\n") if (VERB5()); $intable = 0; last; } $entities{$ent} = [$ver, $code, 0]; prt("$lnn: '$ent' '$ver' '$code'\n") if (VERB5()); } else { pgm_exit(1,"$lnn DID NOT SPLIT 3, got $len [$line] *** FIX ME ***\n"); } } } else { if ($line =~ /const\s+entity\s+entities/) { $intable = 1; prt("$lnn $line - start table\n") if (VERB5()); } } } @arr = sort keys %entities; $len = scalar @arr; prt("Got $len entities from $inf\n"); my ($ra); my $htm = ''; my $cols = 4; my $wrap = 0; my $head = "<th>Name</th><th>Code</th><th>G</th><th>N</th>"; $htm .= "<tr>"; for ($wrap = 0; $wrap < $cols; $wrap++) { $htm .= $head; } $htm .= "</tr>\n"; $wrap = 0; foreach $ent (@arr) { $ra = $entities{$ent}; $ver = ${$ra}[0]; $code = ${$ra}[1]; $htm .= "<tr>" if ($wrap == 0); $htm .= "<td>&$ent;</td><td>$code</td><td>&$ent;</td><td>&#$code;</td>"; $wrap++; if ($wrap == $cols) { $wrap = 0; $htm .= "</tr>\n"; } } if ($wrap) { while ($wrap < $cols) { $wrap++; $htm .= "<td> </td><td> </td><td> </td><td> </td>"; } $htm .= "</tr>\n"; } write_table("$len Tidy Entities",$htm,$out_file4); prt("Wrote $len entities to $out_file4\n"); } ############################################################################# # <table> # <tr # 0 title="U+00009 CHARACTER TABULATION" # 1 data-block="C0 Controls and Basic Latin" # 2 data-category="Cc" # 3 data-set="mmlextra"> # <td class="character"> 	 # <td class="named"><code>&Tab;</code> # <td class="hex"><code>&#x00009;</code> # <td class="dec"><code>&#9;</code> # <td class="desc">CHARACTER TABULATION # <tr title="U+0000A LINE FEED (LF)" data-block="C0 Controls and Basic Latin" data-category="Cc" data-set="mmlextra"> # <td class="character"> 
 # <td class="named"><code>&NewLine;</code> # <td class="hex"><code>&#x0000A;</code> # <td class="dec"><code>&#10;</code> # <td class="desc">LINE FEED (LF) # <tr title="U+00021 EXCLAMATION MARK" data-block="C0 Controls and Basic Latin" data-category="Po" data-set="9573-2003-isonum"> # <td class="character"> ! # <td class="named"><code>&excl;</code> # <td class="hex"><code>&#x00021;</code> # <td class="dec"><code>&#33;</code> # <td class="desc">EXCLAMATION MARK # <tr title="U+00022 QUOTATION MARK" data-block="C0 Controls and Basic Latin" data-category="Po" data-set="predefined xhtml1-special 9573-2003-isonum html5-uppercase"> # <td class="character"> " # <td class="named"><code>&quot; &QUOT;</code> # <td class="hex"><code>&#x00022;</code> # <td class="dec"><code>&#34;</code> # <td class="desc">QUOTATION MARK # ... sub process_in_file3($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = <INF>; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$inc,$lnn,$len,@arr,$ent,$ver,$code); my ($i,$ch,$cnt,@arr2,$class,$cnt2,$attr); my ($type,$value,$hexchar,$tmp,$decimal,$ra,$added); $lnn = 0; my @ents = (); my $intable = 0; my $tag = ''; my $tail = ''; my $intag = 0; my $trcnt = 0; my $entcnt = 0; my $modents = 0; $class = ''; foreach $line (@lines) { $lnn++; chomp $line; $line = trim_all($line); $len = length($line); next if ($len == 0); if ($intable) { if ($line =~ /^<\/table>/) { prt("$lnn: End table...\n") if (VERB9()); $intable = 0; } else { for ($i = 0; $i < $len; $i++) { $ch = substr($line,$i,1); if ($ch eq '<') { $tag = $ch; $i++; $intag = 1; for (; $i < $len; $i++) { $ch = substr($line,$i,1); if ($ch eq '>') { if ($intag) { $tag .= $ch; } else { # $tail .= $ch; } #prt("$tag $tail\n"); if (length($tag) || length($tail)) { if (length($tail)) { if (length($tag)) { @arr = space_split($tail); $cnt = scalar @arr; if ($tag =~ /^<tr/i) { # 0 title="U+00009 CHARACTER TABULATION" # 1 data-block="C0 Controls and Basic Latin" # 2 data-category="Cc" # 3 data-set="mmlextra"> foreach $attr (@arr) { @arr2 = split("=",$attr); $cnt2 = scalar @arr2; if ($cnt2 == 2) { $type = $arr2[0]; $value = strip_quotes($arr2[1]); if ($type eq 'title') { # title="U+00009 CHARACTER TABULATION" } elsif ($type eq 'data-block') { # data-block="C0 Controls and Basic Latin" } elsif ($type eq 'data-category') { # data-category="Cc" } elsif ($type eq 'data-set') { # data-set="mmlextra"> } else { pgm_exit(1,"Failed with '$attr'. type '$type', value '$value' - FIX ME\n"); } } else { pgm_exit(1,"Failed with '$attr'\n"); } } prt("Tag: TR with $cnt attrs\n") if (VERB9()) ; $trcnt++; } elsif ($cnt == 1) { @arr2 = split("=",$arr[0]); $cnt2 = scalar @arr2; if (($cnt2 == 2) && ($arr2[0] =~ /class/i)) { $class = strip_quotes($arr2[1]); # prt("Tag: $tag with class '$class'\n"); if ($class eq 'character') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } elsif ($class eq 'named') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } elsif ($class eq 'hex') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } elsif ($class eq 'dec') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } elsif ($class eq 'desc') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } else { pgm_exit(1,"Error: Unknown class '$class' - FIX ME\n"); } } else { prtw("Warning: Tag: $tag with $cnt attrs, $tail - FIX ME\n"); } } else { prt("Tag: $tag with $cnt attrs, $tail\n"); } } else { prt("Tail: $tail\n"); } } else { if ($tag =~ /<code/) { # skip this } elsif ($tag =~ /<\/code>/) { # skip this } else { prt("$tag ONLY\n"); } } } $tag = ''; $tail = ''; $intag = 0; } elsif ($ch eq '<') { # start of another tag closes this... $tail = trim_all($tail); if (length($tag) || length($tail)) { if (length($tail)) { if (length($tag)) { @arr = space_split($tail); $cnt = scalar @arr; prt("tag: $tag with $cnt attrs CHECK ME\n"); } else { #prt("tail: $tail, class $class\n"); if ($class eq 'character') { #prt("tail: $tail, class $class\n"); if ($tail =~ /^\&#x([0-9A-F]+);$/) { $hexchar = $1; prt("C: $class = 0x$hexchar\n") if (VERB2()); } else { pgm_exit(1,"Failed with tail '$tail', class '$class'\n"); } } elsif ($class eq 'named') { #prt("tail: $tail, class $class\n"); @arr2 = split(/\s+/,$tail); @ents = (); foreach $tmp (@arr2) { if ($tmp =~ /^\&(\w+);$/) { $ent = $1; # $entities{$ent} = [$ver, $code, 0]; if (defined $entities{$ent}) { $ra = $entities{$ent}; $ver = ${$ra}[0]; $code = ${$ra}[1]; # maybe check the 'code' value $added = ${$ra}[2]; # check if a duplicate ${$ra}[2]++; prtw("Warning: $ent, $ver, $code already added! CHECK ME!\n") if ($added); } else { $ver = 'VERS_CHECK'; } prt("E: $class = '$ent', vers '$ver'\n") if (VERB2()); ################################### push(@ents,[$ent,$ver]); ################################### # $entcnt++; } else { pgm_exit(1,"Failed with tail '$tail', class '$class'\n"); } } } elsif ($class eq 'hex') { prt("tail: $tail, class $class\n") if (VERB9()); } elsif ($class eq 'dec') { # tail: &#120170;, class dec # prt("tail: $tail, class $class\n"); if ($tail =~ /^\&\#(\d+);$/) { $decimal = $1; prt("D: $class = '$decimal'\n") if (VERB2()); } else { pgm_exit(1,"Failed with tail '$tail', class '$class'\n"); } } elsif ($class eq 'desc') { prt("tail: $tail, class $class\n") if (VERB9()); } else { pgm_exit(1,"Error: Unknown class '$class' - FIX ME\n"); } } } else { if ($tag =~ /<code/) { # skip this } elsif ($tag =~ /<\/code>/) { # skip this } else { prt("$tag ONLY\n"); } } } # start of NEXT tag $tag = $ch; $tail = ''; $intag = 1; } elsif ($intag) { if ($ch =~ /\s/) { $intag = 0; $tail = ''; } else { $tag .= $ch; } } else { $tail .= $ch; } } } } # for the line # at end of line if (length($tag) || length($tail)) { if (length($tail)) { if (length($tag)) { @arr = space_split($tail); $cnt = scalar @arr; prt("tag: $tag with $cnt attrs\n"); } else { prt("tail: $tail, class $class\n") if (VERB5()); } } else { if ($tag =~ /<code/) { # skip this } elsif ($tag =~ /<\/code>/) { # skip this } else { prtw("Warning: $tag ONLY - CHECK ME\n"); } } } $tag = ''; $tail = ''; if (defined $ent && defined $ver && defined $decimal && @ents) { #my %all_ents = (); # entities found in current charref.html #push(@ents,[$ent,$ver]); my ($ra2); foreach $ra (@ents) { $ent = ${$ra}[0]; $ver = ${$ra}[1]; ####################################### if (defined $preferred_vals{$ent}) { $code = $preferred_vals{$ent}; if ($code != $decimal) { prt("Modifying $ent code $decimal to $code\n"); $modified_ents{$ent} = [$code, $decimal]; $decimal = $code; $modents++; } } ####################################### if (defined $exclude_pairs{$ent}) { $ra = $exclude_pairs{$ent}; $ver = ${$ra}[0]; $code = ${$ra}[1]; prt("Excluding $ent, a 'surrogate' pair $ver:$code, dec $decimal!\n"); } else { ####################################### if (defined $all_ents{$ent}) { $ra2 = $all_ents{$ent}; my $pver = ${$ra2}[0]; my $pcod = ${$ra2}[1]; if (($pver ne $ver) || ($pcod != $decimal)) { pgm_exit(1,"Already have $ent, $pcod, $pver, NOW $decimal, $ver!\n"); } } else { $entcnt++; $all_ents{$ent} = [$ver, $decimal, 0]; } ####################################### ### Prepare list by VALUE ### NOTE: can be repeated values ### NOTE: this MISSSES the numeric equivalent ABOVE, ### but not ALL are exact equivalents! $all_by_val{$decimal} = [] if (!defined $all_by_val{$decimal}); $ra2 = $all_by_val{$decimal}; push(@{$ra2}, [$ent,$ver]); ####################################### $len = length($ent); $max_ent = $len if ($len > $max_ent); $len = length($ver); $max_ver = $len if ($len > $max_ver); $len = length($decimal); $max_dec = $len if ($len > $max_dec); } } $ent = undef; $ver = undef; $decimal = undef; @ents = (); } } } else { if ($line =~ /^<table>/) { prt("$lnn: Start table...\n") if (VERB9()); $intable = 1; } } } ####################################################### # $all_ents{$ent} = [$ver, $decimal, 0]; ####################################################### @arr = keys %all_ents; $cnt = scalar @arr; prt("Found $trcnt lines... $entcnt entities, $cnt keys...\n"); @arr = keys %entities; foreach $ent (@arr) { $ra = $entities{$ent}; $ver = ${$ra}[0]; $decimal = ${$ra}[1]; $added = ${$ra}[2]; if ($added == 0) { prtw("Warning: Seem to have missed $ent, $ver, $decimal!\n"); } } ###################################################### ### write html table sorted by VALUE # push(@{$ra2}, [$ent,$ver]); @arr = sort mycmp_dec keys(%all_by_val); $cnt = scalar @arr; # just count of VALUES stored, each could be more than 1 entity prt("Got $cnt decimal value... Write to a html file...\n"); my $htm = ''; my $cols = 4; my $wrap = 0; my $entries = 0; my ($ra2); $htm .= "<tr>"; for ($wrap = 0; $wrap < $cols; $wrap++) { $htm .= "<th>Name</th>"; $htm .= "<th>Code</th>"; $htm .= "<th>G</th>"; $htm .= "<th>N</th>"; } $htm .= "</tr>\n"; $wrap = 0; foreach $code (@arr) { $ra = $all_by_val{$code}; foreach $ra2 (@{$ra}) { $entries++; $ent = ${$ra2}[0]; $ver = ${$ra2}[1]; $htm .= "<tr>" if ($wrap == 0); $htm .= "<td>&$ent;</td>"; $htm .= "<td>$code</td>"; $htm .= "<td>&$ent;</td>"; $htm .= "<td>&#$code;</td>"; $wrap++; if ($wrap == $cols) { $wrap = 0; $htm .= "</tr>\n"; } } } if ($wrap) { while ($wrap < $cols) { $wrap++; $htm .= "<td> </td>"; $htm .= "<td> </td>"; $htm .= "<td> </td>"; $htm .= "<td> </td>"; } $htm .= "</tr>\n"; } my ($name,$dir) = fileparse($inf); # my $out_file7 = $temp_dir.$PATH_SEP."tempallents1.html"; write_table("$entries entries, $entcnt entities, $cnt values, from $name",$htm,$out_file7); prt("$entries entries, $cnt values, $entcnt entities, written to $out_file7...\n"); ###################################################### ### TODO: Write out modified ents # my $out_file8 = $temp_dir.$PATH_SEP."tempmodents1.html"; # $modified_ents{$ent} = [$code, $decimal]; @arr = sort keys( %modified_ents ); $cnt = scalar @arr; if ($cnt != $modents) { pgm_exit(1,"Error: Need to store mods in an array of arrays!\n"); } prt("Write out $cnt ($modents) modified entities...\n"); $htm = ""; $wrap = 0; $htm .= "<tr>"; $htm .= "<th>Name</th>"; $htm .= "<th>G</th>"; $htm .= "<th>Code1</th>"; $htm .= "<th>N</th>"; $htm .= "<th>Code2</th>"; $htm .= "<th>N</th>"; $htm .= "</tr>\n"; foreach $ent (@arr) { $ra2 = $modified_ents{$ent}; $code = ${$ra2}[0]; $ver = ${$ra2}[1]; $htm .= "<tr>"; $htm .= "<td>&$ent;</td>"; $htm .= "<td>&$ent;</td>"; $htm .= "<td>$code</td>"; $htm .= "<td>&#$code;</td>"; $htm .= "<td>$ver</td>"; $htm .= "<td>&#$ver;</td>"; $htm .= "</tr>\n"; } write_table("$cnt modified entities, 2 values",$htm,$out_file8); prt("$cnt entities, written to $out_file8...\n"); ###################################################### # $load_log = 1; # pgm_exit(1,"TEMP EXIT 2\n"); } ############################################################################# # list of some 253 entities, up to HTML 4 it seems #<table class="wikitable sortable"> #<tr style="background:#efefef"> #<th>Name</th> #<th>Character</th> #<th>Unicode code point (decimal)</th> #<th>Standard</th> #<th>DTD<sup id="cite_ref-DTD_1-0" class="reference"><a href="#cite_note-DTD-1">[a]</a></sup></th> #<th>Old ISO subset<sup id="cite_ref-ISOsubset_2-0" class="reference"><a href="#cite_note-ISOsubset-2">[b]</a></sup></th> #<th>Description<sup id="cite_ref-Description_3-0" class="reference"><a href="#cite_note-Description-3">[c]</a></sup></th> #</tr> #<tr> #<td>quot</td> #<td>"</td> #<td>U+0022 (34)</td> #<td>HTML 2.0</td> #<td>HTMLspecial</td> #<td>ISOnum</td> #<td><a href="/wiki/Quotation_mark" title="Quotation mark">quotation mark</a> <i>(APL quote)</i></td> #</tr> #<tr> #<td>amp</td> #<td>&</td> #<td>U+0026 (38)</td> #<td>Original html specification(html 1.0) and HTML 2.0</td> #<td>HTMLspecial and <a rel="nofollow" class="external free" href="http://info.cern.ch/MarkUp/html-spec/html.dtd">http://info.cern.ch/MarkUp/html-spec/html.dtd</a> (originally)</td> #<td>ISOnum</td> #<td><a href="/wiki/Ampersand" title="Ampersand">ampersand</a></td> #</tr> #<tr> #<td>apos</td> #<td>'</td> #<td>U+0027 (39)</td> #<td>XHTML 1.0</td> #<td>HTMLspecial</td> #<td>ISOnum</td> #<td><a href="/wiki/Apostrophe" title="Apostrophe">apostrophe</a> <i>(apostrophe-quote)</i>; see <a href="#Entities_representing_special_characters_in_XHTML">below</a></td> #</tr> sub process_in_file4($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = <INF>; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$ent,$dec,$ver,$ra); my $lnn = 0; my $intable = 0; my $trcnt = 0; my $trcnt2 = 0; my $intr = 0; my $tdcnt = 0; my $tvers = 'N/A'; my $code = "Unk"; my $verx = ''; foreach $line (@lines) { chomp $line; $lnn++; if (!$intable) { if ($line =~ /<table\s+class=\"wikitable\s+sortable\">/) { $intable = 1; prt("$lnn: Start table $line\n"); } next; } if ($line =~ /<\/table>/) { $intable = 0; prt("$lnn: End table $line\n"); } if ($line =~ /^<tr>/) { $trcnt++; $intr = 1; $tdcnt = 0; next; } if ($line =~ /^<\/tr>/) { $intr = 0; if ($trcnt2) { if (defined $ent && defined $dec && defined $ver) { prt("$lnn: '$ent', dec $dec, ver$verx $ver $tvers $code\n"); $ent = undef; $dec = undef; $ver = undef; } else { pgm_exit(1,"$lnn: Failed to get threesome...\n"); } } $trcnt2++; next; } # 422 '<td style="background:#ddd"> </td> if ($line =~ /^<td(>|\s+)/) { $tdcnt++; if ($tdcnt == 1) { if ($line =~ /<td>(\w+)<\/td>/) { $ent = $1; $tvers = 'N/A'; $code = 'Unk'; # $entities{$ent} = [$ver, $code, 0]; if (defined $entities{$ent}) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $code = ${$ra}[1]; # maybe check the 'code' value } prt("$lnn: $ent\n") if (VERB9()); } else { pgm_exit(1,"$lnn: First TD failed '$line'\n"); } } elsif ($tdcnt == 3) { if ($line =~ /<td>U.+\((\d+)\)<\/td>/) { $dec = $1; } else { pgm_exit(1,"$lnn: Third TD failed '$line'\n"); } } elsif ($tdcnt == 4) { $verx = ''; if ($line =~ /^<td>HTML\s+(.+)<\/td>/) { $ver = $1; } elsif ($line =~ /Original\s+html\h+specification\(html 1.0\)/) { $ver = '1.0'; } elsif ($line =~ /^<td>XHTML\s+(.+)<\/td>/) { $ver = $1; $verx = 'X'; } else { pgm_exit(1,"$lnn: Forth TD failed '$line'\n"); } } } } prt("$lnn: EOF - tr count $trcnt\n"); $load_log = 1; pgm_exit(1,"TEMP EXIT 2\n"); } # ======================================================================== # HTML 4 # https://www.w3.org/TR/REC-html40/sgml/entities.html # <!ENTITY nbsp CDATA " " -- no-break space = non-breaking space, # U+00A0 ISOnum --> # <!ENTITY lang CDATA "〈" -- left-pointing angle bracket = bra, # U+2329 ISOtech --> #<!-- lang is NOT the same character as U+003C 'less than' # or U+2039 'single left-pointing angle quotation mark' --> #<!ENTITY rang CDATA "〉" -- right-pointing angle bracket = ket, # U+232A ISOtech --> #<!-- rang is NOT the same character as U+003E 'greater than' # or U+203A 'single right-pointing angle quotation mark' --> # to # <!ENTITY euro CDATA "€" -- euro sign, U+20AC NEW --> # In comparison with Tidy # Got 3 WARNINGS... # Warning: 391: ent lang, code 9001 vs 10216, vers VERS_FROM40 # Warning: 395: ent rang, code 9002 vs 10217, vers VERS_FROM40 # Warning: Seem to have missed apos, VERS_FROM40|VERS_XML, 39! # https://www.w3.org/TR/html5/syntax.html#named-character-references # lang = U+027E8 = 10216 # rang = U+027E9 = 10217 # and # apos = U+00027 = 39 # ======================================================================== sub process_in_file4f($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = <INF>; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$ent,$val,$code,$tvers,$tcode,$ra); my $lnn = 0; my $entcnt = 0; foreach $line (@lines) { chomp $line; $lnn++; if ($line =~ /<!ENTITY\s+/) { $entcnt++; if ($line =~ /<!ENTITY\s+(\w+)\s+CDATA\s+\"(.+)\"\s+/) { $ent = $1; $val = $2; if ($val =~ /^&\#(\d+);/) { $code = $1; } else { pgm_exit(1,"$lnn: Failed value '$val', line '$line'\n"); } if (defined $entities{$ent}) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $tcode = ${$ra}[1]; # maybe check the 'code' value ${$ra}[2]++; } else { pgm_exit(1,"$lnn: Failed ent '$ent', '$line'\n"); } if ($code == $tcode) { prt("$lnn: ent $ent, code $code, vers $tvers\n"); } else { prtw("Warning: $lnn: ent $ent, code $code vs $tcode, vers $tvers\n"); } } else { pgm_exit(1,"$lnn: Failed decode '$line'\n"); } } } prt("Found $entcnt ENTITY...\n"); my @arr = keys %entities; my ($added); foreach $ent (@arr) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $code = ${$ra}[1]; $added = ${$ra}[2]; if ($added == 0) { prtw("Warning: Seem to have missed $ent, $tvers, $code!\n"); } } pgm_exit(1,"TEMP EXIT 3\n"); } # 2031 entities written to C:\GTools\perl\tempents.c... # 2031 entities written to C:\GTools\perl\tempents.c.html... # 2125 entities written to C:\GTools\perl\tempents2.c... # 2125 entities written to C:\GTools\perl\tempents2.c.html... # $all_ents{$ent} = [$ver, $decimal, 0]; sub do_output($$$) { my ($rh,$out,$inf) = @_; # like \%all_ents my $struc = 'static const entity entities[] ='; my ($key,$ra,$ver,$dec,$line,$ent,@arr); my $td = ''; my $htm = ''; my $cols = 4; my $wrap = 0; my %decimals = (); if ($nocase_sort) { @arr = sort mycmp_nc_sort keys(%{$rh}); } else { @arr = sort keys(%{$rh}); } my @lines = (); push(@lines,$struc); push(@lines,"{"); my $entcnt = 0; # $all_ents{$ent} = [$ver, $decimal, 0]; $max_ent += 3; $max_ver += 1; # add thead my $head = "<th>Name</th><th>Code</th><th>G</th><th>N</th>"; $htm .= "<tr>"; for ($wrap = 0; $wrap < $cols; $wrap++) { $htm .= $head; } $htm .= "</tr>\n"; $wrap = 0; foreach $key (@arr) { $ra = ${$rh}{$key}; $ver = ${$ra}[0]; $dec = ${$ra}[1]; if (defined $decimals{$dec}) { $decimals{$dec}++; # is case sensitive, so is NORMAL to have repeated values # prtw("Warning: REPEATED decimal $ent $ver $dec\n"); } else { $decimals{$dec} = 1; } # got ent=$key,$dec,$ver $td = "<td>&$key;</td><td>$dec</td><td>&$key;</td><td>&#$dec;</td>"; $htm .= "<tr>" if ($wrap == 0); $htm .= $td; $wrap++; if ($wrap == $cols) { $htm .= "</tr>\n"; $wrap = 0; } $ent = "\"$key\","; $ver .= ','; if ($pad_struct) { $ent .= ' ' while (length($ent) < $max_ent); $ver .= ' ' while (length($ver) < $max_ver); $dec = ' '.$dec while (length($dec) < $max_dec); $line = " { $ent$ver$dec },"; } else { $line = " { $ent $ver $dec },"; } push(@lines,$line); $entcnt++; } # { NULL, VERS_UNKNOWN, 0 } $ent = 'NULL,'; $ver = 'VERS_UNKNOWN'; $ver .= ','; $dec = 0; if ($pad_struct) { $ent .= ' ' while (length($ent) < $max_ent); $ver .= ' ' while (length($ver) < $max_ver); $dec = ' '.$dec while (length($dec) < $max_dec); $line = " { $ent$ver$dec }"; } else { $line = " { $ent $ver $dec },"; } push(@lines,$line); $line = "};"; push(@lines,$line); $line = join("\n",@lines)."\n"; write2file($line,$out); prt("$entcnt entities written to $out...\n"); # my ($n,$d,$e) = fileparse($out, qr/\.[^.]*/); if ($wrap) { #$td = "<td>&$key;</td><td>$dec</td><td>&$key;</td><td>&#$dec;</td>"; $td = "<td> </td><td>&ndsp;</td><td> </td><td> </td>"; while ($wrap < $cols) { $htm .= $td; $wrap++; } $htm .= "</tr>\n"; } my ($name,$dir) = fileparse($inf); $out .= ".html"; write_table("$entcnt entities from $name",$htm,$out); prt("$entcnt entities written to $out...\n"); } # '∨' => { # 'codepoints' => [ # 8744 # ], # 'characters' => "\x{2228}" # }, # A problem with 'zwnj'? # "‌": { "codepoints": [8204], "characters": "\u200C" } # sub process_in_json2($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = <INF>; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$num,$lnn,$val,$rh2,$ent,$key); my ($ra,$cnt,$tvers,$tcode,$ra2,$i,$msg,$ver); $line = join("",@lines); my $json = JSON->new->allow_nonref; my $rh = $json->decode( $line ); #prt(Dumper($rh)); #$load_log = 1; my @arr = sort keys %{$rh}; $num = scalar @arr; prt("Got $num keys, from json input...\n"); clear_hash_counts(\%entities); my %plus1 = (); my $keycnt = 0; my $rpts = 0; foreach $key (@arr) { $rh2 = ${$rh}{$key}; $ent = $key; $ent =~ s/^&//; $ent =~ s/;$//; #if (defined ${$rh2}{codepoints} && defined ${$rh2}{characters}) { if (defined ${$rh2}{codepoints}) { $ra2 = ${$rh2}{codepoints}; $cnt = scalar @{$ra2}; # first codepoint $val = ${$ra2}[0]; if (defined $entities{$ent}) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $tcode = ${$ra}[1]; ${$ra}[2]++; } else { $tvers = 'VERS_CHECK'; $tcode = $val; } if ($cnt == 2) { # MULTIPLE codepoints # 0 1 2 3 4 $plus1{$ent} = [$tvers,$tcode,0,$cnt,$ra2]; } elsif ($cnt == 1) { if (defined $json_ents{$ent}) { $ra = $json_ents{$ent}; $ver = ${$ra}[0]; $val = ${$ra}[1]; if (($ver ne $tvers) || ($val != $tcode)) { pgm_exit(1,"json_ents $ent ALREADY defined as [$ver, $val], now [$tvers, $tcode]\n"); } $rpts++; } else { $keycnt++; $json_ents{$ent} = [$tvers, $tcode, 0]; } } else { prt(Dumper($ra2)); pgm_exit(1,"Reference array $cnt - not 1 or 2!\n"); } } else { prt(Dumper($rh2)); pgm_exit(1,"Reference hash does not contain 'codepoints'!\n"); } } ########################################################## ### deal with those with two... my $htm = ''; my $tmp = ''; @arr = sort keys %plus1; $cnt = scalar @arr; $tmp = $keycnt + $cnt + $rpts; prt("Added $keycnt entities, with code, to json_ents hash... hv $rpts rpts, $cnt with 2, tot $tmp\n"); my $head = "<th>#</th><th>Name</th><th>G</th><th>Code</th><th>N</th><th>Code</th><th>N</th><th>NN</th>"; $tmp = ''; if ($cnt) { $verbosity = 1; prt("Note: $cnt entities, have more than one codepoint!\n"); ###prt(join(", ", @arr)."\n"); $lnn = 0; $htm .= "<tr>"; $htm .= $head; $htm .= "</tr>\n"; foreach $ent (@arr) { $lnn++; $ra = $plus1{$ent}; $tvers = ${$ra}[0]; $tcode = ${$ra}[1]; $cnt = $plus1{$ent}[3]; $ra2 = $plus1{$ent}[4]; $msg = "$lnn: $ent, $tvers, $tcode - "; $htm .= "<tr><td>$lnn</td><td>&$ent;</td><td>&$ent;</td>"; $tmp = ''; for ($i = 0; $i < $cnt; $i++) { $val = ${$ra2}[$i]; $msg .= "$val "; $htm .= "<td>$val</td><td>&#$val;</td>"; $tmp .= "&#$val;"; } $htm .= "<td>$tmp</td>"; prt("$msg\n") if (VERB5()); $htm .= "</tr>\n"; } $cnt = scalar @arr; prt("Display of $cnt entities, with more than one codepoint!\n"); write_table("$cnt Multi-codepoints entities",$htm,$out_file3); prt("HTML table written to $out_file3...\n"); } } sub compare_ent_hashes($$) { my ($rh1,$rh2) = @_; my @arr1 = sort keys(%{$rh1}); my @arr2 = sort keys(%{$rh2}); my $cnt1 = scalar @arr1; my $cnt2 = scalar @arr2; if (!$cnt1) { prt("Warning: First hash has NO keys...\n"); return; } if (!$cnt2) { prt("Warning: Second hash has NO keys...\n"); return; } prt("Comparing 2 hashes: $cnt1 and $cnt2 keys respectively..\n"); my ($key,$ra1,$vers1,$code1,$ra2,$vers2,$code2,$ent); my $samecnt = 0; my $diffcnt = 0; my $misscnt1 = 0; my $misscnt2 = 0; my $htm = "<tr>"; $htm .= "<th>#</th><th>Name</th><th>G</th>"; $htm .= "<th>Code</th><th>N</th>"; $htm .= "<th>Code</th><th>N</th>"; $htm .= "</tr>\n"; foreach $key (@arr1) { $ent = $key; $ra1 = ${$rh1}{$key}; $vers1 = ${$ra1}[0]; $code1 = ${$ra1}[1]; if (defined ${$rh2}{$key}) { $ra2 = ${$rh2}{$key}; $vers2 = ${$ra2}[0]; $code2 = ${$ra2}[1]; if (($vers1 eq $vers2) && ($code1 == $code2)) { # the SAME - no interest... $samecnt++; } else { prt("Diff: ent $key - rh1 $vers1, $code1 vs rh2 $vers2, $code2\n") if (VERB2()); $diffcnt++; $htm .= "<tr>"; $htm .= "<td>D: $diffcnt</td>"; $htm .= "<td>&$ent;</td><td>&$ent;</td>"; $htm .= "<td>$code1</td><td>&#$code1;</td>"; $htm .= "<td>$code2</td><td>&#$code2;</td>"; $htm .= "</tr>\n" } # mark BOTH as compared ${$ra1}[2] = 1; ${$ra2}[2] = 1; } else { prt("Missed: ent $key $vers1, $code1 ONLY in rh1\n") if (VERB2()); $misscnt1++; $htm .= "<tr>"; $htm .= "<td>M1 $misscnt1</td>"; $htm .= "<td>&$ent;</td><td>&$ent;</td>"; $htm .= "<td>$code1</td><td>&#$code1;</td>"; $htm .= "<td> </td><td> </td>"; $htm .= "</tr>\n" } } foreach $key (@arr2) { $ent = $key; $ra2 = ${$rh2}{$key}; $vers2 = ${$ra2}[0]; $code2 = ${$ra2}[1]; next if (${$ra2}[2]); prt("Missed: ent $key $vers2, $code2 ONLY in rh2\n") if (VERB2()); $misscnt2++; $htm .= "<tr>"; $htm .= "<td>M2 $misscnt2</td>"; $htm .= "<td>&$ent;</td><td>&$ent;</td>"; $htm .= "<td> </td><td> </td>"; $htm .= "<td>$code2</td><td>&#$code2;</td>"; $htm .= "</tr>\n" } prt("Done 2 hashes: $cnt1 and $cnt2 keys - same $samecnt, diff $diffcnt, miss1 $misscnt1, miss2 $misscnt2...\n"); write_table("Hash Differences - same $samecnt, diff $diffcnt, miss1 $misscnt1, miss2 $misscnt2",$htm,$out_file5); prt("Has differences written to $out_file5\n"); } sub show_array_ref($) { my $ra = shift; my ($tmp,$cnt); $cnt = scalar @{$ra}; prt("Show of ra of $cnt values...\n"); $cnt = 0; foreach $tmp (@{$ra}) { $cnt++; prt("$cnt: '$tmp'\n"); } } # Character Entity Name Entity Number Description # 1 2 3 4 # <tr><td> </td><td></td> <td>&#32;</td><td>Space</td> </tr> # <tr><td>!</td> <td></td> <td>&#33;</td><td>Exclamation mark</td></tr> # <tr><td>&</td> <td>&amp;</td><td>&#38;</td><td>Ampersand</td> </tr> # tmp '♦</td>' # tmp '&diams;</td>' # tmp '&#9830;</td>' # tmp 'Diamond</td>' sub get_cols($$$$$) { my ($line,$rnum,$rent,$rdec,$rdesc) = @_; $line =~ s/^<tr>//; $line =~ s/<\/tr>$//; my $len = length($line); my @arr = split(/<td>/,$line); my $cnt = scalar @arr; #prt("cnt = $cnt, line '$line'\n"); my ($tmp); my ($i,$num,$ent,$dec,$desc); my @arr2 = (); $i = 0; foreach $tmp (@arr) { $i++; # prt("$i: tmp '$tmp'\n"); next if ($tmp =~ /^\s*$/); next if (length($tmp) == 0); push(@arr2,$tmp); #prt("$i: tmp '$tmp'\n"); } $cnt = scalar @arr2; if ($cnt == 4) { # extract each column value $i = 0; # for ($i = 0; $i < $cnt; $i++) { foreach $tmp (@arr2) { # $tmp = $arr2[$i]; if ($tmp =~ /<\/td>$/) { $tmp =~ s/<\/td>$//; } if ($i == 0) { if ($tmp =~ /^&\#(\d+);$/) { $num = $1; } elsif ($tmp eq ' ') { $num = 160; } else { prt("$i: tmp '$tmp'\n"); show_array_ref(\@arr2); return 0; } } elsif ($i == 1) { if ($tmp =~ /^&(\w+);$/) { $ent = $1; } elsif ($tmp =~ /^\s*$/) { $ent = ''; } else { prt("$i: tmp '$tmp'\n"); return 0; } } elsif ($i == 2) { if ($tmp =~ /^&\#(\d+);$/) { $dec = $1; } else { prt("$i: tmp '$tmp'\n"); return 0; } } elsif ($i == 3) { $desc = $tmp; } $i++; } ${$rnum} = $num; ${$rent} = $ent; ${$rdec} = $dec; ${$rdesc} = $desc; return 1; } return 0; } # Just 238 entities # my $in_file6 = 'C:\Users\user\Documents\Tidy\html-entities.html'; # 'C:\Users\user\Documents\Tidy\html-entities.html'; # from : https://www.freeformatter.com/html-entities.html # <table class="bordered-table zebra-striped" style="font-size:11px;"> # HTML entities.</p><table class="bordered-table zebra-striped" style="font-size:11px;"> sub process_in_file6($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = <INF>; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$len,$num,$ent,$dec,$desc,$ra,$tvers,$tcode); my $lnn = 0; my $intable = 0; my $intbody = 0; my $tblcnt = 0; my $entcnt = 0; my $htm = ''; clear_hash_counts(\%entities); foreach $line (@lines) { chomp $line; $line = trim_all($line); $lnn++; $len = length($line); next if ($len == 0); if ($line =~ /^<table/) { pgm_exit(1,"$lnn: Already IN table! '$line'\n") if ($intable); prt("$lnn: Start table '$line'\n") if (VERB9()); $intable = 1; $tblcnt++; next; } elsif ($line =~ /^<\/table/) { pgm_exit(1,"$lnn: NOT IN table! '$line'\n") if (!$intable); prt("$lnn: End table '$line'\n") if (VERB9()); $intable = 0; next; } if ($intable) { if ($line =~ /<tbody>/) { pgm_exit(1,"$lnn: Already IN tbody! '$line'\n") if ($intbody); prt("$lnn: Start tbody '$line'\n") if (VERB9()); $intbody = 1; next; } elsif ($line =~ /<\/tbody>/) { pgm_exit(1,"$lnn: NOT IN tbody! '$line'\n") if (!$intbody); prt("$lnn: End tbody '$line'\n") if (VERB9()); $intbody = 0; next; } if ($intbody) { if ($line =~ /^<tr>/) { $ent = ''; if (get_cols($line,\$num,\$ent,\$dec,\$desc)) { $len = length($ent); next if ($len == 0); $entcnt++; if (defined $entities{$ent}) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $tcode = ${$ra}[1]; ${$ra}[2]++; prtw("Warning: free_ents $ent $tvers, tidy $tcode vs $dec\n") if ($tcode != $dec); } else { $tvers = 'VERS_CHECK'; $tcode = $dec; } $free_ents{$ent} = [$tvers, $tcode, 0]; $htm .= "<tr><td>$entcnt</td><td>&$ent;</td><td>$dec</td><td>&$ent;</td>"; $htm .= "<td>&#$dec;</td>"; $htm .= "<td>$tvers</td>"; $htm .= "<td>$desc</td>"; $htm .= "</tr>\n"; } else { pgm_exit(1,"$lnn: TR NOT HANDLED! '$line'\n"); } } else { pgm_exit(1,"$lnn: NOT HANDLED! '$line'\n"); } } } } my @arr = keys %entities; my ($added); my $missed = 0; foreach $ent (@arr) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $tcode = ${$ra}[1]; $added = ${$ra}[2]; $dec = $tcode; if ($added == 0) { prtw("Warning: Seem to have missed $ent, $tvers, $tcode!\n") if (VERB9()); $htm .= "<tr><td colspan='7'>*** NOT FOUND IN FreeFormatter.com TABLES ***</td></tr>\n" if ($missed == 0); $missed++; $htm .= "<tr><td>$missed</td><td>&$ent;</td><td>$dec</td><td>&$ent;</td>"; $htm .= "<td>&#$dec;</td>"; $htm .= "<td>$tvers</td>"; $htm .= "<td>NOT FOUND in tables</td>"; $htm .= "</tr>\n"; } } write_table("$entcnt FreeFormatter.com, missing $missed",$htm,$out_file6); prt("$tblcnt tables, found $entcnt entities... written to $out_file6\n"); clear_hash_counts(\%entities); # pgm_exit(1,"TEMP EXIT 6\n"); } ######################################### ### MAIN ### ###parse_args(@ARGV); ### check_hash(); process_in_file2($in_file2); # 253 - 'C:\Users\user\Documents\Tidy\entities.c'; #process_in_file6($in_file6); # 238 'C:\Users\user\Documents\Tidy\html-entities.html'; #process_in_file4f($in_file4f); # 'C:\Users\user\Documents\Tidy\html4-ents.html'; #process_in_file4($in_file4); # 253 - 'C:\Users\user\Documents\Tidy\wiki-list-entities.html'; process_in_file3($in_file3); # 'C:\Users\user\Documents\Tidy\charref.html'; => %all_ents do_output(\%all_ents,$out_file,$in_file3); # process_in_json($in_file1); # 'C:\Users\user\Documents\Tidy\htmlmathml.json'; process_in_json2($in_file5j); # = 'C:\Users\user\Documents\Tidy\html5-ents.json'; do_output(\%json_ents,$out_file2,$in_file5j); compare_ent_hashes(\%all_ents,\%json_ents); pgm_exit(0,""); ######################################## sub need_arg { my ($arg,@av) = @_; pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av); } sub parse_args { my (@av) = @_; my ($arg,$sarg); my $verb = VERB2(); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); if (($sarg =~ /^h/i)||($sarg eq '?')) { give_help(); pgm_exit(0,"Help exit(0)"); } elsif ($sarg =~ /^v/) { if ($sarg =~ /^v.*(\d+)$/) { $verbosity = $1; } else { while ($sarg =~ /^v/) { $verbosity++; $sarg = substr($sarg,1); } } $verb = VERB2(); prt("Verbosity = $verbosity\n") if ($verb); } elsif ($sarg =~ /^l/) { if ($sarg =~ /^ll/) { $load_log = 2; } else { $load_log = 1; } prt("Set to load log at end. ($load_log)\n") if ($verb); } elsif ($sarg =~ /^o/) { need_arg(@av); shift @av; $sarg = $av[0]; $out_file = $sarg; prt("Set out file to [$out_file].\n") if ($verb); } else { pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n"); } } else { $in_file = $arg; prt("Set input to [$in_file]\n") if ($verb); } shift @av; } if ($debug_on) { prtw("WARNING: DEBUG is ON!\n"); if (length($in_file) == 0) { $in_file = $def_file; prt("Set DEFAULT input to [$in_file]\n"); } } if (length($in_file) == 0) { pgm_exit(1,"ERROR: No input files found in command!\n"); } if (! -f $in_file) { pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n"); } } sub give_help { prt("$pgmname: version $VERS\n"); prt("Usage: $pgmname [options] in-file\n"); prt("Options:\n"); prt(" --help (-h or -?) = This help, and exit 0.\n"); prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n"); prt(" --load (-l) = Load LOG at end. ($outfile)\n"); prt(" --out <file> (-o) = Write output to this file.\n"); } # Hypertext Markup Language - 2.0 September 22, 1995 # https://www.w3.org/MarkUp/html-spec/html-spec_9.html#SEC9.7 # <!-- Modified for use in HTML # $Id: ISOlat1.sgml,v 1.2 1994/11/30 23:45:12 connolly Exp $ --> sub get_html2_ents() { my $txt = <<EOF; <!ENTITY AElig CDATA "Æ" -- capital AE diphthong (ligature) --> <!ENTITY Aacute CDATA "Á" -- capital A, acute accent --> <!ENTITY Acirc CDATA "Â" -- capital A, circumflex accent --> <!ENTITY Agrave CDATA "À" -- capital A, grave accent --> <!ENTITY Aring CDATA "Å" -- capital A, ring --> <!ENTITY Atilde CDATA "Ã" -- capital A, tilde --> <!ENTITY Auml CDATA "Ä" -- capital A, dieresis or umlaut mark --> <!ENTITY Ccedil CDATA "Ç" -- capital C, cedilla --> <!ENTITY ETH CDATA "Ð" -- capital Eth, Icelandic --> <!ENTITY Eacute CDATA "É" -- capital E, acute accent --> <!ENTITY Ecirc CDATA "Ê" -- capital E, circumflex accent --> <!ENTITY Egrave CDATA "È" -- capital E, grave accent --> <!ENTITY Euml CDATA "Ë" -- capital E, dieresis or umlaut mark --> <!ENTITY Iacute CDATA "Í" -- capital I, acute accent --> <!ENTITY Icirc CDATA "Î" -- capital I, circumflex accent --> <!ENTITY Igrave CDATA "Ì" -- capital I, grave accent --> <!ENTITY Iuml CDATA "Ï" -- capital I, dieresis or umlaut mark --> <!ENTITY Ntilde CDATA "Ñ" -- capital N, tilde --> <!ENTITY Oacute CDATA "Ó" -- capital O, acute accent --> <!ENTITY Ocirc CDATA "Ô" -- capital O, circumflex accent --> <!ENTITY Ograve CDATA "Ò" -- capital O, grave accent --> <!ENTITY Oslash CDATA "Ø" -- capital O, slash --> <!ENTITY Otilde CDATA "Õ" -- capital O, tilde --> <!ENTITY Ouml CDATA "Ö" -- capital O, dieresis or umlaut mark --> <!ENTITY THORN CDATA "Þ" -- capital THORN, Icelandic --> <!ENTITY Uacute CDATA "Ú" -- capital U, acute accent --> <!ENTITY Ucirc CDATA "Û" -- capital U, circumflex accent --> <!ENTITY Ugrave CDATA "Ù" -- capital U, grave accent --> <!ENTITY Uuml CDATA "Ü" -- capital U, dieresis or umlaut mark --> <!ENTITY Yacute CDATA "Ý" -- capital Y, acute accent --> <!ENTITY aacute CDATA "á" -- small a, acute accent --> <!ENTITY acirc CDATA "â" -- small a, circumflex accent --> <!ENTITY aelig CDATA "æ" -- small ae diphthong (ligature) --> <!ENTITY agrave CDATA "à" -- small a, grave accent --> <!ENTITY aring CDATA "å" -- small a, ring --> <!ENTITY atilde CDATA "ã" -- small a, tilde --> <!ENTITY auml CDATA "ä" -- small a, dieresis or umlaut mark --> <!ENTITY ccedil CDATA "ç" -- small c, cedilla --> <!ENTITY eacute CDATA "é" -- small e, acute accent --> <!ENTITY ecirc CDATA "ê" -- small e, circumflex accent --> <!ENTITY egrave CDATA "è" -- small e, grave accent --> <!ENTITY eth CDATA "ð" -- small eth, Icelandic --> <!ENTITY euml CDATA "ë" -- small e, dieresis or umlaut mark --> <!ENTITY iacute CDATA "í" -- small i, acute accent --> <!ENTITY icirc CDATA "î" -- small i, circumflex accent --> <!ENTITY igrave CDATA "ì" -- small i, grave accent --> <!ENTITY iuml CDATA "ï" -- small i, dieresis or umlaut mark --> <!ENTITY ntilde CDATA "ñ" -- small n, tilde --> <!ENTITY oacute CDATA "ó" -- small o, acute accent --> <!ENTITY ocirc CDATA "ô" -- small o, circumflex accent --> <!ENTITY ograve CDATA "ò" -- small o, grave accent --> <!ENTITY oslash CDATA "ø" -- small o, slash --> <!ENTITY otilde CDATA "õ" -- small o, tilde --> <!ENTITY ouml CDATA "ö" -- small o, dieresis or umlaut mark --> <!ENTITY szlig CDATA "ß" -- small sharp s, German (sz ligature) --> <!ENTITY thorn CDATA "þ" -- small thorn, Icelandic --> <!ENTITY uacute CDATA "ú" -- small u, acute accent --> <!ENTITY ucirc CDATA "û" -- small u, circumflex accent --> <!ENTITY ugrave CDATA "ù" -- small u, grave accent --> <!ENTITY uuml CDATA "ü" -- small u, dieresis or umlaut mark --> <!ENTITY yacute CDATA "ý" -- small y, acute accent --> <!ENTITY yuml CDATA "ÿ" -- small y, dieresis or umlaut mark --> EOF return $txt; } # from : https://www.w3.org/TR/REC-html32#latin1 sub get_html3_ents() { my $txt = <<EOF; <!ENTITY nbsp CDATA " " -- no-break space --> <!ENTITY iexcl CDATA "¡" -- inverted exclamation mark --> <!ENTITY cent CDATA "¢" -- cent sign --> <!ENTITY pound CDATA "£" -- pound sterling sign --> <!ENTITY curren CDATA "¤" -- general currency sign --> <!ENTITY yen CDATA "¥" -- yen sign --> <!ENTITY brvbar CDATA "¦" -- broken (vertical) bar --> <!ENTITY sect CDATA "§" -- section sign --> <!ENTITY uml CDATA "¨" -- umlaut (dieresis) --> <!ENTITY copy CDATA "©" -- copyright sign --> <!ENTITY ordf CDATA "ª" -- ordinal indicator, feminine --> <!ENTITY laquo CDATA "«" -- angle quotation mark, left --> <!ENTITY not CDATA "¬" -- not sign --> <!ENTITY shy CDATA "­" -- soft hyphen --> <!ENTITY reg CDATA "®" -- registered sign --> <!ENTITY macr CDATA "¯" -- macron --> <!ENTITY deg CDATA "°" -- degree sign --> <!ENTITY plusmn CDATA "±" -- plus-or-minus sign --> <!ENTITY sup2 CDATA "²" -- superscript two --> <!ENTITY sup3 CDATA "³" -- superscript three --> <!ENTITY acute CDATA "´" -- acute accent --> <!ENTITY micro CDATA "µ" -- micro sign --> <!ENTITY para CDATA "¶" -- pilcrow (paragraph sign) --> <!ENTITY middot CDATA "·" -- middle dot --> <!ENTITY cedil CDATA "¸" -- cedilla --> <!ENTITY sup1 CDATA "¹" -- superscript one --> <!ENTITY ordm CDATA "º" -- ordinal indicator, masculine --> <!ENTITY raquo CDATA "»" -- angle quotation mark, right --> <!ENTITY frac14 CDATA "¼" -- fraction one-quarter --> <!ENTITY frac12 CDATA "½" -- fraction one-half --> <!ENTITY frac34 CDATA "¾" -- fraction three-quarters --> <!ENTITY iquest CDATA "¿" -- inverted question mark --> <!ENTITY Agrave CDATA "À" -- capital A, grave accent --> <!ENTITY Aacute CDATA "Á" -- capital A, acute accent --> <!ENTITY Acirc CDATA "Â" -- capital A, circumflex accent --> <!ENTITY Atilde CDATA "Ã" -- capital A, tilde --> <!ENTITY Auml CDATA "Ä" -- capital A, dieresis or umlaut mark --> <!ENTITY Aring CDATA "Å" -- capital A, ring --> <!ENTITY AElig CDATA "Æ" -- capital AE diphthong (ligature) --> <!ENTITY Ccedil CDATA "Ç" -- capital C, cedilla --> <!ENTITY Egrave CDATA "È" -- capital E, grave accent --> <!ENTITY Eacute CDATA "É" -- capital E, acute accent --> <!ENTITY Ecirc CDATA "Ê" -- capital E, circumflex accent --> <!ENTITY Euml CDATA "Ë" -- capital E, dieresis or umlaut mark --> <!ENTITY Igrave CDATA "Ì" -- capital I, grave accent --> <!ENTITY Iacute CDATA "Í" -- capital I, acute accent --> <!ENTITY Icirc CDATA "Î" -- capital I, circumflex accent --> <!ENTITY Iuml CDATA "Ï" -- capital I, dieresis or umlaut mark --> <!ENTITY ETH CDATA "Ð" -- capital Eth, Icelandic --> <!ENTITY Ntilde CDATA "Ñ" -- capital N, tilde --> <!ENTITY Ograve CDATA "Ò" -- capital O, grave accent --> <!ENTITY Oacute CDATA "Ó" -- capital O, acute accent --> <!ENTITY Ocirc CDATA "Ô" -- capital O, circumflex accent --> <!ENTITY Otilde CDATA "Õ" -- capital O, tilde --> <!ENTITY Ouml CDATA "Ö" -- capital O, dieresis or umlaut mark --> <!ENTITY times CDATA "×" -- multiply sign --> <!ENTITY Oslash CDATA "Ø" -- capital O, slash --> <!ENTITY Ugrave CDATA "Ù" -- capital U, grave accent --> <!ENTITY Uacute CDATA "Ú" -- capital U, acute accent --> <!ENTITY Ucirc CDATA "Û" -- capital U, circumflex accent --> <!ENTITY Uuml CDATA "Ü" -- capital U, dieresis or umlaut mark --> <!ENTITY Yacute CDATA "Ý" -- capital Y, acute accent --> <!ENTITY THORN CDATA "Þ" -- capital THORN, Icelandic --> <!ENTITY szlig CDATA "ß" -- small sharp s, German (sz ligature) --> <!ENTITY agrave CDATA "à" -- small a, grave accent --> <!ENTITY aacute CDATA "á" -- small a, acute accent --> <!ENTITY acirc CDATA "â" -- small a, circumflex accent --> <!ENTITY atilde CDATA "ã" -- small a, tilde --> <!ENTITY auml CDATA "ä" -- small a, dieresis or umlaut mark --> <!ENTITY aring CDATA "å" -- small a, ring --> <!ENTITY aelig CDATA "æ" -- small ae diphthong (ligature) --> <!ENTITY ccedil CDATA "ç" -- small c, cedilla --> <!ENTITY egrave CDATA "è" -- small e, grave accent --> <!ENTITY eacute CDATA "é" -- small e, acute accent --> <!ENTITY ecirc CDATA "ê" -- small e, circumflex accent --> <!ENTITY euml CDATA "ë" -- small e, dieresis or umlaut mark --> <!ENTITY igrave CDATA "ì" -- small i, grave accent --> <!ENTITY iacute CDATA "í" -- small i, acute accent --> <!ENTITY icirc CDATA "î" -- small i, circumflex accent --> <!ENTITY iuml CDATA "ï" -- small i, dieresis or umlaut mark --> <!ENTITY eth CDATA "ð" -- small eth, Icelandic --> <!ENTITY ntilde CDATA "ñ" -- small n, tilde --> <!ENTITY ograve CDATA "ò" -- small o, grave accent --> <!ENTITY oacute CDATA "ó" -- small o, acute accent --> <!ENTITY ocirc CDATA "ô" -- small o, circumflex accent --> <!ENTITY otilde CDATA "õ" -- small o, tilde --> <!ENTITY ouml CDATA "ö" -- small o, dieresis or umlaut mark --> <!ENTITY divide CDATA "÷" -- divide sign --> <!ENTITY oslash CDATA "ø" -- small o, slash --> <!ENTITY ugrave CDATA "ù" -- small u, grave accent --> <!ENTITY uacute CDATA "ú" -- small u, acute accent --> <!ENTITY ucirc CDATA "û" -- small u, circumflex accent --> <!ENTITY uuml CDATA "ü" -- small u, dieresis or umlaut mark --> <!ENTITY yacute CDATA "ý" -- small y, acute accent --> <!ENTITY thorn CDATA "þ" -- small thorn, Icelandic --> <!ENTITY yuml CDATA "ÿ" -- small y, dieresis or umlaut mark --> EOF return $txt; } # eof - tidyentities.pl