#!/usr/bin/perl -w # NAME: tidyentities.pl # AIM: VERY SPECIALIZED - Read a C table, and a JSON set of entities, and compare # 2017-11-06 - purpose extended # 08/02/2015 geoff mclane http://geoffair.net/mperl use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) use Cwd; use JSON; use Data::Dumper; use feature 'unicode_strings'; use utf8; use Text::Unidecode; my $os = $^O; my $perl_dir = '/home/geoff/bin'; my $PATH_SEP = '/'; my $temp_dir = '/tmp'; if ($os =~ /win/i) { $perl_dir = 'C:\GTools\perl'; $temp_dir = $perl_dir; $PATH_SEP = "\\"; } unshift(@INC, $perl_dir); require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n"; # log file stuff our ($LF); my $pgmname = $0; if ($pgmname =~ /(\\|\/)/) { my @tmpsp = split(/(\\|\/)/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt"; open_log($outfile); # user variables my $VERS = "0.0.6 2017-11-06"; ###my $VERS = "0.0.5 2015-01-09"; my $load_log = 0; my $in_file = ''; my $in_file1 = 'C:\Users\user\Documents\Tidy\htmlmathml.json'; my $in_file2 = 'C:\Users\user\Documents\Tidy\entities.c'; # my $in_file2 = 'F:\Projects\tidy-html5\src\entities.c'; # 2017-11-06 - add from : https://dev.w3.org/html5/html-author/charref my $in_file3 = 'C:\Users\user\Documents\Tidy\charref.html'; # from : https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references my $in_file4 = 'C:\Users\user\Documents\Tidy\wiki-list-entities.html'; # from : https://www.w3.org/TR/REC-html40/sgml/entities.html my $in_file4f = 'C:\Users\user\Documents\Tidy\html4-ents.html'; # from : https://www.w3.org/TR/html5/ # from : https://www.w3.org/TR/html5/syntax.html#named-character-references my $in_file5 = 'C:\Users\user\Documents\Tidy\syntax5-ents.html'; my $in_file5j = 'C:\Users\user\Documents\Tidy\html5-ents.json'; # from : https://www.freeformatter.com/html-entities.html - Just 238 entities # my $in_file6 = 'C:\Users\user\Documents\Tidy\html-entities.html'; my $verbosity = 0; my $out_file = $temp_dir.$PATH_SEP."tempents1.c"; my $out_file2 = $temp_dir.$PATH_SEP."tempents12.c"; my $out_file3 = $temp_dir.$PATH_SEP."tempmultents1.html"; my $out_file4 = $temp_dir.$PATH_SEP."temptidyents1.html"; my $out_file5 = $temp_dir.$PATH_SEP."tempdiffents1.html"; my $out_file6 = $temp_dir.$PATH_SEP."tempfreeents1.html"; my $out_file7 = $temp_dir.$PATH_SEP."tempallents1.html"; my $out_file8 = $temp_dir.$PATH_SEP."tempmodents1.html"; my $nocase_sort = 0; my $pad_struct = 0; # # from: https://www.w3.org/TR/REC-xml/#sec-predefined-ent - version 1.0 # Only 5 defined - amp, lt, gt, apos, quot # from: https://www.w3.org/MarkUp/html-spec/html-spec_toc.html # Hypertext Markup Language - 2.0 September 22, 1995 - below # from : https://www.w3.org/TR/REC-html32#latin1 - below # HTML5 - from : https://www.w3.org/TR/html5/syntax.html#named-character-references #
# ### DEBUG ### my $debug_on = 0; my $def_file = 'def_file'; ### program variables my @warnings = (); my $cwd = cwd(); my %entities = (); # entities found in current C code [$vers, $code, 0] my %all_ents = (); # entities found in current charref.html my %all_by_val = (); # listed in DECIMAL order, accounting for REPEATS my $max_ent = 0; my $max_ver = 0; my $max_dec = 0; my %json_ents = (); # entities found in html5-ents.json my %free_ents = (); # entities found in $in_file6 = 'C:\Users\user\Documents\Tidy\html-entities.html'; my $curr_date = lu_get_YYYYMMDD_hhmmss(time()); # Difference include - # ‌ 10184 and 10185, BUT these are - # ⟈ 10184 and ⟉ 10185 # from : https://www.w3schools.com/charsets/ref_utf_punctuation.asp # 8204 200C ‌ ZERO WIDTH NON-JOINER # BUT json contains ‌ with decimal 10184 and 10185 which are different 'glyphs'!!! # my %modified_ents = (); my %preferred_vals = ( 'phiv' => 981, 'varphi' => 981, 'OverBar' => 8254, 'UnderBar' => 95, 'epsi' => 949, 'epsiv' => 1013, 'varepsilon' => 1013, 'epsilon' => 949, 'straightepsilon' => 1013, 'strns' => 175, 'angst' => 197, 'ohm' => 937 ); my %equal_pairs = ( 'angst' => [197, 8491], 'ohm' => [937, 8486] ); my %exclude_pairs = ( 'race' => [8756, 817] # 0223D + 00331 = (8765 + 817 = 9582) ); sub VERB1() { return $verbosity >= 1; } sub VERB2() { return $verbosity >= 2; } sub VERB5() { return $verbosity >= 5; } sub VERB9() { return $verbosity >= 9; } sub show_warnings($) { my ($val) = @_; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { prt( "\nNo warnings issued.\n\n" ) if (VERB9()); } } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } show_warnings($val); close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } sub mycmp_nc_sort { return -1 if (lc($a) lt lc($b)); return 1 if (lc($a) gt lc($b)); return 0; } sub mycmp_dec { return -1 if ($a < $b); return 1 if ($a > $b); return 0; } sub get_head($) { my $title = shift; my $txt = < $title EOF return $txt; } sub write_table($$$) { my ($tit,$htm,$out) = @_; # like "entities",$htm,$out my $line = get_head($tit); $line .= "\n"; $line .= ""; $line .= "

$tit

\n"; $line .= "\n"; $line .= $htm; $line .= "
\n"; $line .= "

Table columns headed with 'G' is a 'glyph' generated from the alphabetic entity, "; $line .= "like &AElig;. "; $line .= "Marked with an 'N' is a 'glyph' generated from a numeric entity, "; $line .= "like &#198;.

\n"; $line .= "

top

\n"; $line .= "

"; my ($name,$dir) = fileparse($out); $line .= "$name $curr_date"; $line .= "

\n"; $line .= "\n\n"; write2file($line,$out); } my %test_hash = ( 'nlE' => "\x{2266}\x{338}", 'harrcir' => "\x{2948}", 'omid' => "\x{29b6}", 'cularr' => "\x{21b6}", 'ycy' => "\x{44b}", 'ldca' => "\x{2936}" ); sub check_hash() { my $rh = \%test_hash; prt(Dumper($rh)); my @arr = keys %{$rh}; my ($key,$val,$num,$len,$i,$ch,$i2); foreach $key (@arr) { $val = ${$rh}{$key}; #prt("$key "); $num = sprintf("%s", unidecode( $val )); #$num = sprintf("%X", unidecode( $val )); #prt("\n"); prt("$key $num\n"); # $val =~ s/([^[:ascii:]]+)/unidecode($1)/ge; # $len = length($val); # $num = ''; # for ($i = 0; $i < $len; $i++) { # $i2 = $i + 1; # $ch = substr($val,$i,1); # prt("$i2: $ch "); # last if ($ch eq '{'); # } # for (; $i < $len; $i++) { # $ch = substr($val,$i,1); # last if ($ch eq '}'); # $num .= $ch; # } # #if ($val =~ /\\x\{(.+)\}/) { # # $num = $1; # # prt("$key $num\n"); # #} # #$num = chr($val); # prt("$key $num\n"); } pgm_exit(1,"TEMP EXIT\n"); } sub clear_hash_counts($) { my $rh = shift; my @arr = keys %{$rh}; # like \%entities; my ($ent,$ra); foreach $ent (@arr) { $ra = ${$rh}{$ent}; #$tvers = ${$ra}[0]; #$code = ${$ra}[1]; ${$ra}[2] = 0; } } ############################################################################# # from : http://www.w3.org/TR/xml-entity-names/#htmlmathml # DWN: htmlmathml.json # { # "characters": { # "AElig": "\u00C6", # "AMP": "\u0026", # "NotSubset": "\u2282\u20D2", # from : http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f-a.dtd # # # "NotPrecedesEqual": "\u2AAF\u0338", sub process_in_json($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$num,$lnn,$val); $line = join("",@lines); my $json = JSON->new->allow_nonref; my $rh = $json->decode( $line ); my $tag = 'characters'; if (!defined ${$rh}{$tag}) { prt("NO '$tag' defined in file $inf!\n"); return; } my $rh2 = ${$rh}{$tag}; # prt(Dumper($rh2)); $load_log = 1; my @arr = sort mycmp_nc_sort keys(%{$rh2}); my $cnt = scalar @arr; prt("Found $cnt entity keys...\n"); foreach $tag (@arr) { $val = ${$rh2}{$tag}; if ($val =~ /\\x\{([0-9a-f]+)\}/) { $num = hex($1); prt("$tag = $num\n"); } else { ###$num = chr($val); } } } ############################################################################# # static const entity entities[] = #{ # /* # ** Markup pre-defined character entities # */ # { "quot", VERS_ALL|VERS_XML, 34 }, # { "amp", VERS_ALL|VERS_XML, 38 }, # ... # { NULL, VERS_UNKNOWN, 0 } # }; # /* all W3C defined document types */ # #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50) # /* special flag */ # #define VERS_XML 65536u sub process_in_file2($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$inc,$lnn,$len,@arr,$ent,$ver,$code); $lnn = 0; my $intable = 0; foreach $line (@lines) { $lnn++; chomp $line; $line = trim_all($line); $len = length($line); next if ($len == 0); if ($intable) { if ($line =~ /^\{(.+)\}/) { $inc = trim_all($1); @arr = split(',',$inc); $len = scalar @arr; if ($len == 3) { $ent = strip_double_quotes(trim_all($arr[0])); $ver = trim_all($arr[1]); $code = trim_all($arr[2]); if ($ent eq 'NULL') { prt("$lnn: $ent $ver $code - end table\n") if (VERB5()); $intable = 0; last; } $entities{$ent} = [$ver, $code, 0]; prt("$lnn: '$ent' '$ver' '$code'\n") if (VERB5()); } else { pgm_exit(1,"$lnn DID NOT SPLIT 3, got $len [$line] *** FIX ME ***\n"); } } } else { if ($line =~ /const\s+entity\s+entities/) { $intable = 1; prt("$lnn $line - start table\n") if (VERB5()); } } } @arr = sort keys %entities; $len = scalar @arr; prt("Got $len entities from $inf\n"); my ($ra); my $htm = ''; my $cols = 4; my $wrap = 0; my $head = "NameCodeGN"; $htm .= ""; for ($wrap = 0; $wrap < $cols; $wrap++) { $htm .= $head; } $htm .= "\n"; $wrap = 0; foreach $ent (@arr) { $ra = $entities{$ent}; $ver = ${$ra}[0]; $code = ${$ra}[1]; $htm .= "" if ($wrap == 0); $htm .= "&$ent;$code&$ent;&#$code;"; $wrap++; if ($wrap == $cols) { $wrap = 0; $htm .= "\n"; } } if ($wrap) { while ($wrap < $cols) { $wrap++; $htm .= "    "; } $htm .= "\n"; } write_table("$len Tidy Entities",$htm,$out_file4); prt("Wrote $len entities to $out_file4\n"); } ############################################################################# # # # # # # foreach $attr (@arr) { @arr2 = split("=",$attr); $cnt2 = scalar @arr2; if ($cnt2 == 2) { $type = $arr2[0]; $value = strip_quotes($arr2[1]); if ($type eq 'title') { # title="U+00009 CHARACTER TABULATION" } elsif ($type eq 'data-block') { # data-block="C0 Controls and Basic Latin" } elsif ($type eq 'data-category') { # data-category="Cc" } elsif ($type eq 'data-set') { # data-set="mmlextra"> } else { pgm_exit(1,"Failed with '$attr'. type '$type', value '$value' - FIX ME\n"); } } else { pgm_exit(1,"Failed with '$attr'\n"); } } prt("Tag: TR with $cnt attrs\n") if (VERB9()) ; $trcnt++; } elsif ($cnt == 1) { @arr2 = split("=",$arr[0]); $cnt2 = scalar @arr2; if (($cnt2 == 2) && ($arr2[0] =~ /class/i)) { $class = strip_quotes($arr2[1]); # prt("Tag: $tag with class '$class'\n"); if ($class eq 'character') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } elsif ($class eq 'named') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } elsif ($class eq 'hex') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } elsif ($class eq 'dec') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } elsif ($class eq 'desc') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } else { pgm_exit(1,"Error: Unknown class '$class' - FIX ME\n"); } } else { prtw("Warning: Tag: $tag with $cnt attrs, $tail - FIX ME\n"); } } else { prt("Tag: $tag with $cnt attrs, $tail\n"); } } else { prt("Tail: $tail\n"); } } else { if ($tag =~ //) { # skip this } else { prt("$tag ONLY\n"); } } } $tag = ''; $tail = ''; $intag = 0; } elsif ($ch eq '<') { # start of another tag closes this... $tail = trim_all($tail); if (length($tag) || length($tail)) { if (length($tail)) { if (length($tag)) { @arr = space_split($tail); $cnt = scalar @arr; prt("tag: $tag with $cnt attrs CHECK ME\n"); } else { #prt("tail: $tail, class $class\n"); if ($class eq 'character') { #prt("tail: $tail, class $class\n"); if ($tail =~ /^\&#x([0-9A-F]+);$/) { $hexchar = $1; prt("C: $class = 0x$hexchar\n") if (VERB2()); } else { pgm_exit(1,"Failed with tail '$tail', class '$class'\n"); } } elsif ($class eq 'named') { #prt("tail: $tail, class $class\n"); @arr2 = split(/\s+/,$tail); @ents = (); foreach $tmp (@arr2) { if ($tmp =~ /^\&(\w+);$/) { $ent = $1; # $entities{$ent} = [$ver, $code, 0]; if (defined $entities{$ent}) { $ra = $entities{$ent}; $ver = ${$ra}[0]; $code = ${$ra}[1]; # maybe check the 'code' value $added = ${$ra}[2]; # check if a duplicate ${$ra}[2]++; prtw("Warning: $ent, $ver, $code already added! CHECK ME!\n") if ($added); } else { $ver = 'VERS_CHECK'; } prt("E: $class = '$ent', vers '$ver'\n") if (VERB2()); ################################### push(@ents,[$ent,$ver]); ################################### # $entcnt++; } else { pgm_exit(1,"Failed with tail '$tail', class '$class'\n"); } } } elsif ($class eq 'hex') { prt("tail: $tail, class $class\n") if (VERB9()); } elsif ($class eq 'dec') { # tail: &#120170;, class dec # prt("tail: $tail, class $class\n"); if ($tail =~ /^\&\#(\d+);$/) { $decimal = $1; prt("D: $class = '$decimal'\n") if (VERB2()); } else { pgm_exit(1,"Failed with tail '$tail', class '$class'\n"); } } elsif ($class eq 'desc') { prt("tail: $tail, class $class\n") if (VERB9()); } else { pgm_exit(1,"Error: Unknown class '$class' - FIX ME\n"); } } } else { if ($tag =~ //) { # skip this } else { prt("$tag ONLY\n"); } } } # start of NEXT tag $tag = $ch; $tail = ''; $intag = 1; } elsif ($intag) { if ($ch =~ /\s/) { $intag = 0; $tail = ''; } else { $tag .= $ch; } } else { $tail .= $ch; } } } } # for the line # at end of line if (length($tag) || length($tail)) { if (length($tail)) { if (length($tag)) { @arr = space_split($tail); $cnt = scalar @arr; prt("tag: $tag with $cnt attrs\n"); } else { prt("tail: $tail, class $class\n") if (VERB5()); } } else { if ($tag =~ //) { # skip this } else { prtw("Warning: $tag ONLY - CHECK ME\n"); } } } $tag = ''; $tail = ''; if (defined $ent && defined $ver && defined $decimal && @ents) { #my %all_ents = (); # entities found in current charref.html #push(@ents,[$ent,$ver]); my ($ra2); foreach $ra (@ents) { $ent = ${$ra}[0]; $ver = ${$ra}[1]; ####################################### if (defined $preferred_vals{$ent}) { $code = $preferred_vals{$ent}; if ($code != $decimal) { prt("Modifying $ent code $decimal to $code\n"); $modified_ents{$ent} = [$code, $decimal]; $decimal = $code; $modents++; } } ####################################### if (defined $exclude_pairs{$ent}) { $ra = $exclude_pairs{$ent}; $ver = ${$ra}[0]; $code = ${$ra}[1]; prt("Excluding $ent, a 'surrogate' pair $ver:$code, dec $decimal!\n"); } else { ####################################### if (defined $all_ents{$ent}) { $ra2 = $all_ents{$ent}; my $pver = ${$ra2}[0]; my $pcod = ${$ra2}[1]; if (($pver ne $ver) || ($pcod != $decimal)) { pgm_exit(1,"Already have $ent, $pcod, $pver, NOW $decimal, $ver!\n"); } } else { $entcnt++; $all_ents{$ent} = [$ver, $decimal, 0]; } ####################################### ### Prepare list by VALUE ### NOTE: can be repeated values ### NOTE: this MISSSES the numeric equivalent ABOVE, ### but not ALL are exact equivalents! $all_by_val{$decimal} = [] if (!defined $all_by_val{$decimal}); $ra2 = $all_by_val{$decimal}; push(@{$ra2}, [$ent,$ver]); ####################################### $len = length($ent); $max_ent = $len if ($len > $max_ent); $len = length($ver); $max_ver = $len if ($len > $max_ver); $len = length($decimal); $max_dec = $len if ($len > $max_dec); } } $ent = undef; $ver = undef; $decimal = undef; @ents = (); } } } else { if ($line =~ /^
# &Tab; # &#x00009; # &#9; # CHARACTER TABULATION #
# &NewLine; # &#x0000A; # &#10; # LINE FEED (LF) #
! # &excl; # &#x00021; # &#33; # EXCLAMATION MARK #
" # &quot; &QUOT; # &#x00022; # &#34; # QUOTATION MARK # ... sub process_in_file3($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$inc,$lnn,$len,@arr,$ent,$ver,$code); my ($i,$ch,$cnt,@arr2,$class,$cnt2,$attr); my ($type,$value,$hexchar,$tmp,$decimal,$ra,$added); $lnn = 0; my @ents = (); my $intable = 0; my $tag = ''; my $tail = ''; my $intag = 0; my $trcnt = 0; my $entcnt = 0; my $modents = 0; $class = ''; foreach $line (@lines) { $lnn++; chomp $line; $line = trim_all($line); $len = length($line); next if ($len == 0); if ($intable) { if ($line =~ /^<\/table>/) { prt("$lnn: End table...\n") if (VERB9()); $intable = 0; } else { for ($i = 0; $i < $len; $i++) { $ch = substr($line,$i,1); if ($ch eq '<') { $tag = $ch; $i++; $intag = 1; for (; $i < $len; $i++) { $ch = substr($line,$i,1); if ($ch eq '>') { if ($intag) { $tag .= $ch; } else { # $tail .= $ch; } #prt("$tag $tail\n"); if (length($tag) || length($tail)) { if (length($tail)) { if (length($tag)) { @arr = space_split($tail); $cnt = scalar @arr; if ($tag =~ /^
/) { prt("$lnn: Start table...\n") if (VERB9()); $intable = 1; } } } ####################################################### # $all_ents{$ent} = [$ver, $decimal, 0]; ####################################################### @arr = keys %all_ents; $cnt = scalar @arr; prt("Found $trcnt lines... $entcnt entities, $cnt keys...\n"); @arr = keys %entities; foreach $ent (@arr) { $ra = $entities{$ent}; $ver = ${$ra}[0]; $decimal = ${$ra}[1]; $added = ${$ra}[2]; if ($added == 0) { prtw("Warning: Seem to have missed $ent, $ver, $decimal!\n"); } } ###################################################### ### write html table sorted by VALUE # push(@{$ra2}, [$ent,$ver]); @arr = sort mycmp_dec keys(%all_by_val); $cnt = scalar @arr; # just count of VALUES stored, each could be more than 1 entity prt("Got $cnt decimal value... Write to a html file...\n"); my $htm = ''; my $cols = 4; my $wrap = 0; my $entries = 0; my ($ra2); $htm .= ""; for ($wrap = 0; $wrap < $cols; $wrap++) { $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; } $htm .= "\n"; $wrap = 0; foreach $code (@arr) { $ra = $all_by_val{$code}; foreach $ra2 (@{$ra}) { $entries++; $ent = ${$ra2}[0]; $ver = ${$ra2}[1]; $htm .= "" if ($wrap == 0); $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $wrap++; if ($wrap == $cols) { $wrap = 0; $htm .= "\n"; } } } if ($wrap) { while ($wrap < $cols) { $wrap++; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; } $htm .= "\n"; } my ($name,$dir) = fileparse($inf); # my $out_file7 = $temp_dir.$PATH_SEP."tempallents1.html"; write_table("$entries entries, $entcnt entities, $cnt values, from $name",$htm,$out_file7); prt("$entries entries, $cnt values, $entcnt entities, written to $out_file7...\n"); ###################################################### ### TODO: Write out modified ents # my $out_file8 = $temp_dir.$PATH_SEP."tempmodents1.html"; # $modified_ents{$ent} = [$code, $decimal]; @arr = sort keys( %modified_ents ); $cnt = scalar @arr; if ($cnt != $modents) { pgm_exit(1,"Error: Need to store mods in an array of arrays!\n"); } prt("Write out $cnt ($modents) modified entities...\n"); $htm = ""; $wrap = 0; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= "\n"; foreach $ent (@arr) { $ra2 = $modified_ents{$ent}; $code = ${$ra2}[0]; $ver = ${$ra2}[1]; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= "\n"; } write_table("$cnt modified entities, 2 values",$htm,$out_file8); prt("$cnt entities, written to $out_file8...\n"); ###################################################### # $load_log = 1; # pgm_exit(1,"TEMP EXIT 2\n"); } ############################################################################# # list of some 253 entities, up to HTML 4 it seems #
NameCodeGN
&$ent;$code&$ent;&#$code;
    
NameGCode1NCode2N
&$ent;&$ent;$code&#$code;$ver&#$ver;
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub process_in_file4($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$ent,$dec,$ver,$ra); my $lnn = 0; my $intable = 0; my $trcnt = 0; my $trcnt2 = 0; my $intr = 0; my $tdcnt = 0; my $tvers = 'N/A'; my $code = "Unk"; my $verx = ''; foreach $line (@lines) { chomp $line; $lnn++; if (!$intable) { if ($line =~ //) { $intable = 1; prt("$lnn: Start table $line\n"); } next; } if ($line =~ /<\/table>/) { $intable = 0; prt("$lnn: End table $line\n"); } if ($line =~ /^/) { $trcnt++; $intr = 1; $tdcnt = 0; next; } if ($line =~ /^<\/tr>/) { $intr = 0; if ($trcnt2) { if (defined $ent && defined $dec && defined $ver) { prt("$lnn: '$ent', dec $dec, ver$verx $ver $tvers $code\n"); $ent = undef; $dec = undef; $ver = undef; } else { pgm_exit(1,"$lnn: Failed to get threesome...\n"); } } $trcnt2++; next; } # 422 ' if ($line =~ /^|\s+)/) { $tdcnt++; if ($tdcnt == 1) { if ($line =~ /"; $htm .= ""; for ($wrap = 0; $wrap < $cols; $wrap++) { $htm .= $head; } $htm .= "\n"; $wrap = 0; foreach $key (@arr) { $ra = ${$rh}{$key}; $ver = ${$ra}[0]; $dec = ${$ra}[1]; if (defined $decimals{$dec}) { $decimals{$dec}++; # is case sensitive, so is NORMAL to have repeated values # prtw("Warning: REPEATED decimal $ent $ver $dec\n"); } else { $decimals{$dec} = 1; } # got ent=$key,$dec,$ver $td = ""; $htm .= "" if ($wrap == 0); $htm .= $td; $wrap++; if ($wrap == $cols) { $htm .= "\n"; $wrap = 0; } $ent = "\"$key\","; $ver .= ','; if ($pad_struct) { $ent .= ' ' while (length($ent) < $max_ent); $ver .= ' ' while (length($ver) < $max_ver); $dec = ' '.$dec while (length($dec) < $max_dec); $line = " { $ent$ver$dec },"; } else { $line = " { $ent $ver $dec },"; } push(@lines,$line); $entcnt++; } # { NULL, VERS_UNKNOWN, 0 } $ent = 'NULL,'; $ver = 'VERS_UNKNOWN'; $ver .= ','; $dec = 0; if ($pad_struct) { $ent .= ' ' while (length($ent) < $max_ent); $ver .= ' ' while (length($ver) < $max_ver); $dec = ' '.$dec while (length($dec) < $max_dec); $line = " { $ent$ver$dec }"; } else { $line = " { $ent $ver $dec },"; } push(@lines,$line); $line = "};"; push(@lines,$line); $line = join("\n",@lines)."\n"; write2file($line,$out); prt("$entcnt entities written to $out...\n"); # my ($n,$d,$e) = fileparse($out, qr/\.[^.]*/); if ($wrap) { #$td = ""; $td = ""; while ($wrap < $cols) { $htm .= $td; $wrap++; } $htm .= "\n"; } my ($name,$dir) = fileparse($inf); $out .= ".html"; write_table("$entcnt entities from $name",$htm,$out); prt("$entcnt entities written to $out...\n"); } # '∨' => { # 'codepoints' => [ # 8744 # ], # 'characters' => "\x{2228}" # }, # A problem with 'zwnj'? # "‌": { "codepoints": [8204], "characters": "\u200C" } # sub process_in_json2($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$num,$lnn,$val,$rh2,$ent,$key); my ($ra,$cnt,$tvers,$tcode,$ra2,$i,$msg,$ver); $line = join("",@lines); my $json = JSON->new->allow_nonref; my $rh = $json->decode( $line ); #prt(Dumper($rh)); #$load_log = 1; my @arr = sort keys %{$rh}; $num = scalar @arr; prt("Got $num keys, from json input...\n"); clear_hash_counts(\%entities); my %plus1 = (); my $keycnt = 0; my $rpts = 0; foreach $key (@arr) { $rh2 = ${$rh}{$key}; $ent = $key; $ent =~ s/^&//; $ent =~ s/;$//; #if (defined ${$rh2}{codepoints} && defined ${$rh2}{characters}) { if (defined ${$rh2}{codepoints}) { $ra2 = ${$rh2}{codepoints}; $cnt = scalar @{$ra2}; # first codepoint $val = ${$ra2}[0]; if (defined $entities{$ent}) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $tcode = ${$ra}[1]; ${$ra}[2]++; } else { $tvers = 'VERS_CHECK'; $tcode = $val; } if ($cnt == 2) { # MULTIPLE codepoints # 0 1 2 3 4 $plus1{$ent} = [$tvers,$tcode,0,$cnt,$ra2]; } elsif ($cnt == 1) { if (defined $json_ents{$ent}) { $ra = $json_ents{$ent}; $ver = ${$ra}[0]; $val = ${$ra}[1]; if (($ver ne $tvers) || ($val != $tcode)) { pgm_exit(1,"json_ents $ent ALREADY defined as [$ver, $val], now [$tvers, $tcode]\n"); } $rpts++; } else { $keycnt++; $json_ents{$ent} = [$tvers, $tcode, 0]; } } else { prt(Dumper($ra2)); pgm_exit(1,"Reference array $cnt - not 1 or 2!\n"); } } else { prt(Dumper($rh2)); pgm_exit(1,"Reference hash does not contain 'codepoints'!\n"); } } ########################################################## ### deal with those with two... my $htm = ''; my $tmp = ''; @arr = sort keys %plus1; $cnt = scalar @arr; $tmp = $keycnt + $cnt + $rpts; prt("Added $keycnt entities, with code, to json_ents hash... hv $rpts rpts, $cnt with 2, tot $tmp\n"); my $head = ""; $tmp = ''; if ($cnt) { $verbosity = 1; prt("Note: $cnt entities, have more than one codepoint!\n"); ###prt(join(", ", @arr)."\n"); $lnn = 0; $htm .= ""; $htm .= $head; $htm .= "\n"; foreach $ent (@arr) { $lnn++; $ra = $plus1{$ent}; $tvers = ${$ra}[0]; $tcode = ${$ra}[1]; $cnt = $plus1{$ent}[3]; $ra2 = $plus1{$ent}[4]; $msg = "$lnn: $ent, $tvers, $tcode - "; $htm .= ""; $tmp = ''; for ($i = 0; $i < $cnt; $i++) { $val = ${$ra2}[$i]; $msg .= "$val "; $htm .= ""; $tmp .= "&#$val;"; } $htm .= ""; prt("$msg\n") if (VERB5()); $htm .= "\n"; } $cnt = scalar @arr; prt("Display of $cnt entities, with more than one codepoint!\n"); write_table("$cnt Multi-codepoints entities",$htm,$out_file3); prt("HTML table written to $out_file3...\n"); } } sub compare_ent_hashes($$) { my ($rh1,$rh2) = @_; my @arr1 = sort keys(%{$rh1}); my @arr2 = sort keys(%{$rh2}); my $cnt1 = scalar @arr1; my $cnt2 = scalar @arr2; if (!$cnt1) { prt("Warning: First hash has NO keys...\n"); return; } if (!$cnt2) { prt("Warning: Second hash has NO keys...\n"); return; } prt("Comparing 2 hashes: $cnt1 and $cnt2 keys respectively..\n"); my ($key,$ra1,$vers1,$code1,$ra2,$vers2,$code2,$ent); my $samecnt = 0; my $diffcnt = 0; my $misscnt1 = 0; my $misscnt2 = 0; my $htm = ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= "\n"; foreach $key (@arr1) { $ent = $key; $ra1 = ${$rh1}{$key}; $vers1 = ${$ra1}[0]; $code1 = ${$ra1}[1]; if (defined ${$rh2}{$key}) { $ra2 = ${$rh2}{$key}; $vers2 = ${$ra2}[0]; $code2 = ${$ra2}[1]; if (($vers1 eq $vers2) && ($code1 == $code2)) { # the SAME - no interest... $samecnt++; } else { prt("Diff: ent $key - rh1 $vers1, $code1 vs rh2 $vers2, $code2\n") if (VERB2()); $diffcnt++; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= "\n" } # mark BOTH as compared ${$ra1}[2] = 1; ${$ra2}[2] = 1; } else { prt("Missed: ent $key $vers1, $code1 ONLY in rh1\n") if (VERB2()); $misscnt1++; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= "\n" } } foreach $key (@arr2) { $ent = $key; $ra2 = ${$rh2}{$key}; $vers2 = ${$ra2}[0]; $code2 = ${$ra2}[1]; next if (${$ra2}[2]); prt("Missed: ent $key $vers2, $code2 ONLY in rh2\n") if (VERB2()); $misscnt2++; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= "\n" } prt("Done 2 hashes: $cnt1 and $cnt2 keys - same $samecnt, diff $diffcnt, miss1 $misscnt1, miss2 $misscnt2...\n"); write_table("Hash Differences - same $samecnt, diff $diffcnt, miss1 $misscnt1, miss2 $misscnt2",$htm,$out_file5); prt("Has differences written to $out_file5\n"); } sub show_array_ref($) { my $ra = shift; my ($tmp,$cnt); $cnt = scalar @{$ra}; prt("Show of ra of $cnt values...\n"); $cnt = 0; foreach $tmp (@{$ra}) { $cnt++; prt("$cnt: '$tmp'\n"); } } # Character Entity Name Entity Number Description # 1 2 3 4 # # # # tmp '♦' # tmp '&diams;' # tmp '&#9830;' # tmp 'Diamond' sub get_cols($$$$$) { my ($line,$rnum,$rent,$rdec,$rdesc) = @_; $line =~ s/^//; $line =~ s/<\/tr>$//; my $len = length($line); my @arr = split(/
NameCharacterUnicode code point (decimal)StandardDTD[a]Old ISO subset[b]Description[c]
quot"U+0022 (34)HTML 2.0HTMLspecialISOnumquotation mark (APL quote)
amp&U+0026 (38)Original html specification(html 1.0) and HTML 2.0HTMLspecial and http://info.cern.ch/MarkUp/html-spec/html.dtd (originally)ISOnumampersand
apos'U+0027 (39)XHTML 1.0HTMLspecialISOnumapostrophe (apostrophe-quote); see below
 (\w+)<\/td>/) { $ent = $1; $tvers = 'N/A'; $code = 'Unk'; # $entities{$ent} = [$ver, $code, 0]; if (defined $entities{$ent}) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $code = ${$ra}[1]; # maybe check the 'code' value } prt("$lnn: $ent\n") if (VERB9()); } else { pgm_exit(1,"$lnn: First TD failed '$line'\n"); } } elsif ($tdcnt == 3) { if ($line =~ /U.+\((\d+)\)<\/td>/) { $dec = $1; } else { pgm_exit(1,"$lnn: Third TD failed '$line'\n"); } } elsif ($tdcnt == 4) { $verx = ''; if ($line =~ /^HTML\s+(.+)<\/td>/) { $ver = $1; } elsif ($line =~ /Original\s+html\h+specification\(html 1.0\)/) { $ver = '1.0'; } elsif ($line =~ /^XHTML\s+(.+)<\/td>/) { $ver = $1; $verx = 'X'; } else { pgm_exit(1,"$lnn: Forth TD failed '$line'\n"); } } } } prt("$lnn: EOF - tr count $trcnt\n"); $load_log = 1; pgm_exit(1,"TEMP EXIT 2\n"); } # ======================================================================== # HTML 4 # https://www.w3.org/TR/REC-html40/sgml/entities.html # # # # # # to # # In comparison with Tidy # Got 3 WARNINGS... # Warning: 391: ent lang, code 9001 vs 10216, vers VERS_FROM40 # Warning: 395: ent rang, code 9002 vs 10217, vers VERS_FROM40 # Warning: Seem to have missed apos, VERS_FROM40|VERS_XML, 39! # https://www.w3.org/TR/html5/syntax.html#named-character-references # lang = U+027E8 = 10216 # rang = U+027E9 = 10217 # and # apos = U+00027 = 39 # ======================================================================== sub process_in_file4f($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$ent,$val,$code,$tvers,$tcode,$ra); my $lnn = 0; my $entcnt = 0; foreach $line (@lines) { chomp $line; $lnn++; if ($line =~ /NameCodeGN
&$key;$dec&$key;&#$dec;
&$key;$dec&$key;&#$dec; &ndsp;  
#NameGCodeNCodeNNN
$lnn&$ent;&$ent;$val&#$val;$tmp
#NameGCodeNCodeN
D: $diffcnt&$ent;&$ent;$code1&#$code1;$code2&#$code2;
M1 $misscnt1&$ent;&$ent;$code1&#$code1;  
M2 $misscnt2&$ent;&$ent;  $code2&#$code2;
  &#32;Space
! &#33;Exclamation mark
& &amp;&#38;Ampersand
/,$line); my $cnt = scalar @arr; #prt("cnt = $cnt, line '$line'\n"); my ($tmp); my ($i,$num,$ent,$dec,$desc); my @arr2 = (); $i = 0; foreach $tmp (@arr) { $i++; # prt("$i: tmp '$tmp'\n"); next if ($tmp =~ /^\s*$/); next if (length($tmp) == 0); push(@arr2,$tmp); #prt("$i: tmp '$tmp'\n"); } $cnt = scalar @arr2; if ($cnt == 4) { # extract each column value $i = 0; # for ($i = 0; $i < $cnt; $i++) { foreach $tmp (@arr2) { # $tmp = $arr2[$i]; if ($tmp =~ /<\/td>$/) { $tmp =~ s/<\/td>$//; } if ($i == 0) { if ($tmp =~ /^&\#(\d+);$/) { $num = $1; } elsif ($tmp eq ' ') { $num = 160; } else { prt("$i: tmp '$tmp'\n"); show_array_ref(\@arr2); return 0; } } elsif ($i == 1) { if ($tmp =~ /^&(\w+);$/) { $ent = $1; } elsif ($tmp =~ /^\s*$/) { $ent = ''; } else { prt("$i: tmp '$tmp'\n"); return 0; } } elsif ($i == 2) { if ($tmp =~ /^&\#(\d+);$/) { $dec = $1; } else { prt("$i: tmp '$tmp'\n"); return 0; } } elsif ($i == 3) { $desc = $tmp; } $i++; } ${$rnum} = $num; ${$rent} = $ent; ${$rdec} = $dec; ${$rdesc} = $desc; return 1; } return 0; } # Just 238 entities # my $in_file6 = 'C:\Users\user\Documents\Tidy\html-entities.html'; # 'C:\Users\user\Documents\Tidy\html-entities.html'; # from : https://www.freeformatter.com/html-entities.html # # HTML entities.

sub process_in_file6($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$len,$num,$ent,$dec,$desc,$ra,$tvers,$tcode); my $lnn = 0; my $intable = 0; my $intbody = 0; my $tblcnt = 0; my $entcnt = 0; my $htm = ''; clear_hash_counts(\%entities); foreach $line (@lines) { chomp $line; $line = trim_all($line); $lnn++; $len = length($line); next if ($len == 0); if ($line =~ /^
/) { pgm_exit(1,"$lnn: Already IN tbody! '$line'\n") if ($intbody); prt("$lnn: Start tbody '$line'\n") if (VERB9()); $intbody = 1; next; } elsif ($line =~ /<\/tbody>/) { pgm_exit(1,"$lnn: NOT IN tbody! '$line'\n") if (!$intbody); prt("$lnn: End tbody '$line'\n") if (VERB9()); $intbody = 0; next; } if ($intbody) { if ($line =~ /^/) { $ent = ''; if (get_cols($line,\$num,\$ent,\$dec,\$desc)) { $len = length($ent); next if ($len == 0); $entcnt++; if (defined $entities{$ent}) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $tcode = ${$ra}[1]; ${$ra}[2]++; prtw("Warning: free_ents $ent $tvers, tidy $tcode vs $dec\n") if ($tcode != $dec); } else { $tvers = 'VERS_CHECK'; $tcode = $dec; } $free_ents{$ent} = [$tvers, $tcode, 0]; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= "\n"; } else { pgm_exit(1,"$lnn: TR NOT HANDLED! '$line'\n"); } } else { pgm_exit(1,"$lnn: NOT HANDLED! '$line'\n"); } } } } my @arr = keys %entities; my ($added); my $missed = 0; foreach $ent (@arr) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $tcode = ${$ra}[1]; $added = ${$ra}[2]; $dec = $tcode; if ($added == 0) { prtw("Warning: Seem to have missed $ent, $tvers, $tcode!\n") if (VERB9()); $htm .= "\n" if ($missed == 0); $missed++; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= "\n"; } } write_table("$entcnt FreeFormatter.com, missing $missed",$htm,$out_file6); prt("$tblcnt tables, found $entcnt entities... written to $out_file6\n"); clear_hash_counts(\%entities); # pgm_exit(1,"TEMP EXIT 6\n"); } ######################################### ### MAIN ### ###parse_args(@ARGV); ### check_hash(); process_in_file2($in_file2); # 253 - 'C:\Users\user\Documents\Tidy\entities.c'; #process_in_file6($in_file6); # 238 'C:\Users\user\Documents\Tidy\html-entities.html'; #process_in_file4f($in_file4f); # 'C:\Users\user\Documents\Tidy\html4-ents.html'; #process_in_file4($in_file4); # 253 - 'C:\Users\user\Documents\Tidy\wiki-list-entities.html'; process_in_file3($in_file3); # 'C:\Users\user\Documents\Tidy\charref.html'; => %all_ents do_output(\%all_ents,$out_file,$in_file3); # process_in_json($in_file1); # 'C:\Users\user\Documents\Tidy\htmlmathml.json'; process_in_json2($in_file5j); # = 'C:\Users\user\Documents\Tidy\html5-ents.json'; do_output(\%json_ents,$out_file2,$in_file5j); compare_ent_hashes(\%all_ents,\%json_ents); pgm_exit(0,""); ######################################## sub need_arg { my ($arg,@av) = @_; pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av); } sub parse_args { my (@av) = @_; my ($arg,$sarg); my $verb = VERB2(); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); if (($sarg =~ /^h/i)||($sarg eq '?')) { give_help(); pgm_exit(0,"Help exit(0)"); } elsif ($sarg =~ /^v/) { if ($sarg =~ /^v.*(\d+)$/) { $verbosity = $1; } else { while ($sarg =~ /^v/) { $verbosity++; $sarg = substr($sarg,1); } } $verb = VERB2(); prt("Verbosity = $verbosity\n") if ($verb); } elsif ($sarg =~ /^l/) { if ($sarg =~ /^ll/) { $load_log = 2; } else { $load_log = 1; } prt("Set to load log at end. ($load_log)\n") if ($verb); } elsif ($sarg =~ /^o/) { need_arg(@av); shift @av; $sarg = $av[0]; $out_file = $sarg; prt("Set out file to [$out_file].\n") if ($verb); } else { pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n"); } } else { $in_file = $arg; prt("Set input to [$in_file]\n") if ($verb); } shift @av; } if ($debug_on) { prtw("WARNING: DEBUG is ON!\n"); if (length($in_file) == 0) { $in_file = $def_file; prt("Set DEFAULT input to [$in_file]\n"); } } if (length($in_file) == 0) { pgm_exit(1,"ERROR: No input files found in command!\n"); } if (! -f $in_file) { pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n"); } } sub give_help { prt("$pgmname: version $VERS\n"); prt("Usage: $pgmname [options] in-file\n"); prt("Options:\n"); prt(" --help (-h or -?) = This help, and exit 0.\n"); prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n"); prt(" --load (-l) = Load LOG at end. ($outfile)\n"); prt(" --out (-o) = Write output to this file.\n"); } # Hypertext Markup Language - 2.0 September 22, 1995 # https://www.w3.org/MarkUp/html-spec/html-spec_9.html#SEC9.7 # sub get_html2_ents() { my $txt = < EOF return $txt; } # from : https://www.w3.org/TR/REC-html32#latin1 sub get_html3_ents() { my $txt = < EOF return $txt; } # eof - tidyentities.pl
$entcnt&$ent;$dec&$ent;&#$dec;$tvers$desc
*** NOT FOUND IN FreeFormatter.com TABLES ***
$missed&$ent;$dec&$ent;&#$dec;$tversNOT FOUND in tables