# ### DEBUG ### my $debug_on = 0; my $def_file = 'def_file'; ### program variables my @warnings = (); my $cwd = cwd(); my %entities = (); # entities found in current C code [$vers, $code, 0] my %all_ents = (); # entities found in current charref.html my %all_by_val = (); # listed in DECIMAL order, accounting for REPEATS my $max_ent = 0; my $max_ver = 0; my $max_dec = 0; my %json_ents = (); # entities found in html5-ents.json my %free_ents = (); # entities found in $in_file6 = 'C:\Users\user\Documents\Tidy\html-entities.html'; my $curr_date = lu_get_YYYYMMDD_hhmmss(time()); # Difference include - # ‌ 10184 and 10185, BUT these are - # ⟈ 10184 and ⟉ 10185 # from : https://www.w3schools.com/charsets/ref_utf_punctuation.asp # 8204 200C ‌ ZERO WIDTH NON-JOINER # BUT json contains ‌ with decimal 10184 and 10185 which are different 'glyphs'!!! # my %modified_ents = (); my %preferred_vals = ( 'phiv' => 981, 'varphi' => 981, 'OverBar' => 8254, 'UnderBar' => 95, 'epsi' => 949, 'epsiv' => 1013, 'varepsilon' => 1013, 'epsilon' => 949, 'straightepsilon' => 1013, 'strns' => 175, 'angst' => 197, 'ohm' => 937 ); my %equal_pairs = ( 'angst' => [197, 8491], 'ohm' => [937, 8486] ); my %exclude_pairs = ( 'race' => [8756, 817] # 0223D + 00331 = (8765 + 817 = 9582) ); sub VERB1() { return $verbosity >= 1; } sub VERB2() { return $verbosity >= 2; } sub VERB5() { return $verbosity >= 5; } sub VERB9() { return $verbosity >= 9; } sub show_warnings($) { my ($val) = @_; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { prt( "\nNo warnings issued.\n\n" ) if (VERB9()); } } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } show_warnings($val); close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } sub mycmp_nc_sort { return -1 if (lc($a) lt lc($b)); return 1 if (lc($a) gt lc($b)); return 0; } sub mycmp_dec { return -1 if ($a < $b); return 1 if ($a > $b); return 0; } sub get_head($) { my $title = shift; my $txt = < $title EOF return $txt; } sub write_table($$$) { my ($tit,$htm,$out) = @_; # like "entities",$htm,$out my $line = get_head($tit); $line .= "\n"; $line .= ""; $line .= "

$tit

\n"; $line .= "\n"; $line .= $htm; $line .= "

\n"; $line .= "

Table columns headed with 'G' is a 'glyph' generated from the alphabetic entity, "; $line .= "like Æ. "; $line .= "Marked with an 'N' is a 'glyph' generated from a numeric entity, "; $line .= "like Æ.

\n"; $line .= "

top

\n"; $line .= "

"; my ($name,$dir) = fileparse($out); $line .= "$name $curr_date"; $line .= "

\n"; $line .= "\n\n"; write2file($line,$out); } my %test_hash = ( 'nlE' => "\x{2266}\x{338}", 'harrcir' => "\x{2948}", 'omid' => "\x{29b6}", 'cularr' => "\x{21b6}", 'ycy' => "\x{44b}", 'ldca' => "\x{2936}" ); sub check_hash() { my $rh = \%test_hash; prt(Dumper($rh)); my @arr = keys %{$rh}; my ($key,$val,$num,$len,$i,$ch,$i2); foreach $key (@arr) { $val = ${$rh}{$key}; #prt("$key "); $num = sprintf("%s", unidecode( $val )); #$num = sprintf("%X", unidecode( $val )); #prt("\n"); prt("$key $num\n"); # $val =~ s/([^[:ascii:]]+)/unidecode($1)/ge; # $len = length($val); # $num = ''; # for ($i = 0; $i < $len; $i++) { # $i2 = $i + 1; # $ch = substr($val,$i,1); # prt("$i2: $ch "); # last if ($ch eq '{'); # } # for (; $i < $len; $i++) { # $ch = substr($val,$i,1); # last if ($ch eq '}'); # $num .= $ch; # } # #if ($val =~ /\\x\{(.+)\}/) { # # $num = $1; # # prt("$key $num\n"); # #} # #$num = chr($val); # prt("$key $num\n"); } pgm_exit(1,"TEMP EXIT\n"); } sub clear_hash_counts($) { my $rh = shift; my @arr = keys %{$rh}; # like \%entities; my ($ent,$ra); foreach $ent (@arr) { $ra = ${$rh}{$ent}; #$tvers = ${$ra}[0]; #$code = ${$ra}[1]; ${$ra}[2] = 0; } } ############################################################################# # from : http://www.w3.org/TR/xml-entity-names/#htmlmathml # DWN: htmlmathml.json # { # "characters": { # "AElig": "\u00C6", # "AMP": "\u0026", # "NotSubset": "\u2282\u20D2", # from : http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f-a.dtd # # # "NotPrecedesEqual": "\u2AAF\u0338", sub process_in_json($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$num,$lnn,$val); $line = join("",@lines); my $json = JSON->new->allow_nonref; my $rh = $json->decode( $line ); my $tag = 'characters'; if (!defined ${$rh}{$tag}) { prt("NO '$tag' defined in file $inf!\n"); return; } my $rh2 = ${$rh}{$tag}; # prt(Dumper($rh2)); $load_log = 1; my @arr = sort mycmp_nc_sort keys(%{$rh2}); my $cnt = scalar @arr; prt("Found $cnt entity keys...\n"); foreach $tag (@arr) { $val = ${$rh2}{$tag}; if ($val =~ /\\x\{([0-9a-f]+)\}/) { $num = hex($1); prt("$tag = $num\n"); } else { ###$num = chr($val); } } } ############################################################################# # static const entity entities[] = #{ # /* # ** Markup pre-defined character entities # */ # { "quot", VERS_ALL|VERS_XML, 34 }, # { "amp", VERS_ALL|VERS_XML, 38 }, # ... # { NULL, VERS_UNKNOWN, 0 } # }; # /* all W3C defined document types */ # #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50) # /* special flag */ # #define VERS_XML 65536u sub process_in_file2($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$inc,$lnn,$len,@arr,$ent,$ver,$code); $lnn = 0; my $intable = 0; foreach $line (@lines) { $lnn++; chomp $line; $line = trim_all($line); $len = length($line); next if ($len == 0); if ($intable) { if ($line =~ /^\{(.+)\}/) { $inc = trim_all($1); @arr = split(',',$inc); $len = scalar @arr; if ($len == 3) { $ent = strip_double_quotes(trim_all($arr[0])); $ver = trim_all($arr[1]); $code = trim_all($arr[2]); if ($ent eq 'NULL') { prt("$lnn: $ent $ver $code - end table\n") if (VERB5()); $intable = 0; last; } $entities{$ent} = [$ver, $code, 0]; prt("$lnn: '$ent' '$ver' '$code'\n") if (VERB5()); } else { pgm_exit(1,"$lnn DID NOT SPLIT 3, got $len [$line] *** FIX ME ***\n"); } } } else { if ($line =~ /const\s+entity\s+entities/) { $intable = 1; prt("$lnn $line - start table\n") if (VERB5()); } } } @arr = sort keys %entities; $len = scalar @arr; prt("Got $len entities from $inf\n"); my ($ra); my $htm = ''; my $cols = 4; my $wrap = 0; my $head = "NameCodeGN"; $htm .= ""; for ($wrap = 0; $wrap < $cols; $wrap++) { $htm .= $head; } $htm .= "\n"; $wrap = 0; foreach $ent (@arr) { $ra = $entities{$ent}; $ver = ${$ra}[0]; $code = ${$ra}[1]; $htm .= "" if ($wrap == 0); $htm .= "&$ent;$code&$ent;&#$code;"; $wrap++; if ($wrap == $cols) { $wrap = 0; $htm .= "\n"; } } if ($wrap) { while ($wrap < $cols) { $wrap++; $htm .= " "; } $htm .= "\n"; } write_table("$len Tidy Entities",$htm,$out_file4); prt("Wrote $len entities to $out_file4\n"); } ############################################################################# # # # # # # foreach $attr (@arr) { @arr2 = split("=",$attr); $cnt2 = scalar @arr2; if ($cnt2 == 2) { $type = $arr2[0]; $value = strip_quotes($arr2[1]); if ($type eq 'title') { # title="U+00009 CHARACTER TABULATION" } elsif ($type eq 'data-block') { # data-block="C0 Controls and Basic Latin" } elsif ($type eq 'data-category') { # data-category="Cc" } elsif ($type eq 'data-set') { # data-set="mmlextra"> } else { pgm_exit(1,"Failed with '$attr'. type '$type', value '$value' - FIX ME\n"); } } else { pgm_exit(1,"Failed with '$attr'\n"); } } prt("Tag: TR with $cnt attrs\n") if (VERB9()) ; $trcnt++; } elsif ($cnt == 1) { @arr2 = split("=",$arr[0]); $cnt2 = scalar @arr2; if (($cnt2 == 2) && ($arr2[0] =~ /class/i)) { $class = strip_quotes($arr2[1]); # prt("Tag: $tag with class '$class'\n"); if ($class eq 'character') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } elsif ($class eq 'named') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } elsif ($class eq 'hex') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } elsif ($class eq 'dec') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } elsif ($class eq 'desc') { prt("Tag: $tag with class '$class'\n") if (VERB5()); } else { pgm_exit(1,"Error: Unknown class '$class' - FIX ME\n"); } } else { prtw("Warning: Tag: $tag with $cnt attrs, $tail - FIX ME\n"); } } else { prt("Tag: $tag with $cnt attrs, $tail\n"); } } else { prt("Tail: $tail\n"); } } else { if ($tag =~ /

/) {
                                            # skip this
                                        } else {
                                            prt("$tag ONLY\n");
                                        }
                                    }
                                }
                                $tag = '';
                                $tail = '';
                                $intag = 0;
                            } elsif ($ch eq '<') {
                                # start of another tag closes this...
                                $tail = trim_all($tail);
                                if (length($tag) || length($tail)) {
                                    if (length($tail)) {
                                        if (length($tag)) {
                                            @arr = space_split($tail);
                                            $cnt = scalar @arr;
                                            prt("tag: $tag with $cnt attrs CHECK ME\n");
                                        } else {
                                            #prt("tail: $tail, class $class\n");
                                            if ($class eq 'character') {
                                                #prt("tail: $tail, class $class\n");
                                                if ($tail =~ /^\&#x([0-9A-F]+);$/) {
                                                    $hexchar = $1;
                                                    prt("C: $class = 0x$hexchar\n") if (VERB2());
                                                } else {
                                                    pgm_exit(1,"Failed with tail '$tail', class '$class'\n");
                                                }
                                            } elsif ($class eq 'named') {
                                                #prt("tail: $tail, class $class\n");
                                                @arr2 = split(/\s+/,$tail);
                                                @ents = ();
                                                foreach $tmp (@arr2) {
                                                    if ($tmp =~ /^\&(\w+);$/) {
                                                        $ent = $1;
                                                        # $entities{$ent} = [$ver, $code, 0];
                                                        if (defined $entities{$ent}) {
                                                            $ra = $entities{$ent};
                                                            $ver = ${$ra}[0];
                                                            $code = ${$ra}[1];  # maybe check the 'code' value
                                                            $added = ${$ra}[2]; # check if a duplicate
                                                            ${$ra}[2]++;
                                                            prtw("Warning: $ent, $ver, $code already added! CHECK ME!\n") if ($added);
                                                        } else {
                                                            $ver = 'VERS_CHECK';
                                                        }
                                                        prt("E: $class = '$ent', vers '$ver'\n") if (VERB2());
                                                        ###################################
                                                        push(@ents,[$ent,$ver]);
                                                        ###################################
                                                        # $entcnt++;
                                                    } else {
                                                        pgm_exit(1,"Failed with tail '$tail', class '$class'\n");
                                                    }
                                                }
                                            } elsif ($class eq 'hex') {
                                                prt("tail: $tail, class $class\n") if (VERB9());
                                            } elsif ($class eq 'dec') {
                                                # tail: &#120170;, class dec
                                                # prt("tail: $tail, class $class\n");
                                                if ($tail =~ /^\&\#(\d+);$/) {
                                                    $decimal = $1;
                                                    prt("D: $class = '$decimal'\n") if (VERB2());
                                                } else {
                                                    pgm_exit(1,"Failed with tail '$tail', class '$class'\n");
                                                }
                                            } elsif ($class eq 'desc') {
                                                prt("tail: $tail, class $class\n") if (VERB9());
                                            } else {
                                                pgm_exit(1,"Error: Unknown class '$class' - FIX ME\n");
                                            }
                                        }
                                    } else {
                                        if ($tag =~ //) {
                                            # skip this
                                        } else {
                                            prt("$tag ONLY\n");
                                        }
                                    }
                                }
                                # start of NEXT tag
                                $tag = $ch;
                                $tail = '';
                                $intag = 1;
                            } elsif ($intag) {
                                if ($ch =~ /\s/) {
                                    $intag = 0;
                                    $tail = '';
                                } else {
                                    $tag .= $ch;
                                }
                            } else {
                                $tail .= $ch;
                            }
                        }
                    }
                }   # for the line
                # at end of line
                if (length($tag) || length($tail)) {
                    if (length($tail)) {
                        if (length($tag)) {
                            @arr = space_split($tail);
                            $cnt = scalar @arr;
                            prt("tag: $tag with $cnt attrs\n");
                        } else {
                            prt("tail: $tail, class $class\n") if (VERB5());
                        }
                    } else {
                        if ($tag =~ //) {
                            # skip this
                        } else {
                            prtw("Warning: $tag ONLY - CHECK ME\n");
                        }
                    }
                }
                $tag = '';
                $tail = '';
                if (defined $ent && defined $ver && defined $decimal && @ents) {
                    #my %all_ents = ();  # entities found in current charref.html
                    #push(@ents,[$ent,$ver]);
                    my ($ra2);
                    foreach $ra (@ents) {
                        $ent = ${$ra}[0];
                        $ver = ${$ra}[1];
                        #######################################
                        if (defined $preferred_vals{$ent}) {
                            $code = $preferred_vals{$ent};
                            if ($code != $decimal) {
                                prt("Modifying $ent code $decimal to $code\n");
                                $modified_ents{$ent} = [$code, $decimal];
                                $decimal = $code;
                                $modents++;
                            }
                        }
                        #######################################
                        if (defined $exclude_pairs{$ent}) {
                            $ra = $exclude_pairs{$ent};
                            $ver = ${$ra}[0];
                            $code = ${$ra}[1];
                            prt("Excluding $ent, a 'surrogate' pair $ver:$code, dec $decimal!\n");
                        } else {
                            #######################################
                            if (defined $all_ents{$ent}) {
                                $ra2 = $all_ents{$ent};
                                my $pver = ${$ra2}[0];
                                my $pcod = ${$ra2}[1];
                                if (($pver ne $ver) || ($pcod != $decimal)) {
                                    pgm_exit(1,"Already have $ent, $pcod, $pver, NOW $decimal, $ver!\n");
                                }
                            } else {
                                $entcnt++;
                                $all_ents{$ent} = [$ver, $decimal, 0];
                            }
                            #######################################
                            ### Prepare list by VALUE
                            ### NOTE: can be repeated values
                            ### NOTE: this MISSSES the numeric equivalent ABOVE, 
                            ###       but not ALL are exact equivalents!
                            $all_by_val{$decimal} = [] if (!defined $all_by_val{$decimal});
                            $ra2 = $all_by_val{$decimal};
                            push(@{$ra2}, [$ent,$ver]);
                            #######################################
                            $len = length($ent);
                            $max_ent = $len if ($len > $max_ent);
                            $len = length($ver);
                            $max_ver = $len if ($len > $max_ver);
                            $len = length($decimal);
                            $max_dec = $len if ($len > $max_dec);
                        }
                    }
                    $ent = undef;
                    $ver = undef;
                    $decimal = undef;
                    @ents = ();
                }

            }
        } else {
            if ($line =~ /^

#	`&Tab;` #	` ` #	` ` #	CHARACTER TABULATION #
#	`&NewLine;` #	` ` #	` ` #	LINE FEED (LF) #
! #	`&excl;` #	`!` #	`!` #	EXCLAMATION MARK #
" #	`" &QUOT;` #	`"` #	`"` #	QUOTATION MARK # ... sub process_in_file3($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$inc,$lnn,$len,@arr,$ent,$ver,$code); my ($i,$ch,$cnt,@arr2,$class,$cnt2,$attr); my ($type,$value,$hexchar,$tmp,$decimal,$ra,$added); $lnn = 0; my @ents = (); my $intable = 0; my $tag = ''; my $tail = ''; my $intag = 0; my $trcnt = 0; my $entcnt = 0; my $modents = 0; $class = ''; foreach $line (@lines) { $lnn++; chomp $line; $line = trim_all($line); $len = length($line); next if ($len == 0); if ($intable) { if ($line =~ /^<\/table>/) { prt("$lnn: End table...\n") if (VERB9()); $intable = 0; } else { for ($i = 0; $i < $len; $i++) { $ch = substr($line,$i,1); if ($ch eq '<') { $tag = $ch; $i++; $intag = 1; for (; $i < $len; $i++) { $ch = substr($line,$i,1); if ($ch eq '>') { if ($intag) { $tag .= $ch; } else { # $tail .= $ch; } #prt("$tag $tail\n"); if (length($tag) \|\| length($tail)) { if (length($tail)) { if (length($tag)) { @arr = space_split($tail); $cnt = scalar @arr; if ($tag =~ /^

/) {
                prt("$lnn: Start table...\n") if (VERB9());
                $intable = 1;
            }
        }
    }
    #######################################################
    # $all_ents{$ent} = [$ver, $decimal, 0];
    #######################################################
    @arr = keys %all_ents;
    $cnt = scalar @arr;
    prt("Found $trcnt lines... $entcnt entities, $cnt keys...\n");
    @arr = keys %entities;
    foreach $ent (@arr) {
        $ra = $entities{$ent};
        $ver = ${$ra}[0];
        $decimal = ${$ra}[1];
        $added = ${$ra}[2];
        if ($added == 0) {
            prtw("Warning: Seem to have missed $ent, $ver, $decimal!\n");
        }
    }
    ######################################################
    ### write html table sorted by VALUE
    # push(@{$ra2}, [$ent,$ver]);
    @arr = sort mycmp_dec keys(%all_by_val);
    $cnt = scalar @arr; # just count of VALUES stored, each could be more than 1 entity
    prt("Got $cnt decimal value... Write to a html file...\n");
    my $htm = '';
    my $cols = 4;
    my $wrap = 0;
    my $entries = 0;
    my ($ra2);
    $htm .= "

";
    for ($wrap = 0; $wrap < $cols; $wrap++) {
        $htm .= "

";
        $htm .= "

";
        $htm .= "

";
        $htm .= "

";
    }
    $htm .= "

\n";
    $wrap = 0;
    foreach $code (@arr) {
        $ra = $all_by_val{$code};
        foreach $ra2 (@{$ra}) {
            $entries++;
            $ent = ${$ra2}[0];
            $ver = ${$ra2}[1];
            $htm .= "

" if ($wrap == 0);
            $htm .= "

";
            $htm .= "

";
            $htm .= "

";
            $htm .= "

";
            $wrap++;
            if ($wrap == $cols) {
                $wrap = 0;
                $htm .= "

\n";
            }
        }
    }
    if ($wrap) {
        while ($wrap < $cols) {
            $wrap++;
            $htm .= "

";
            $htm .= "

";
            $htm .= "

";
            $htm .= "

";
        }
        $htm .= "

\n";
    }
    my ($name,$dir) = fileparse($inf);
    # my $out_file7 = $temp_dir.$PATH_SEP."tempallents1.html";
    write_table("$entries entries, $entcnt entities, $cnt values, from $name",$htm,$out_file7);
    prt("$entries entries, $cnt values, $entcnt entities, written to $out_file7...\n");
    ######################################################
    ### TODO: Write out modified ents
    # my $out_file8 = $temp_dir.$PATH_SEP."tempmodents1.html";
    # $modified_ents{$ent} = [$code, $decimal];
    @arr = sort keys( %modified_ents );
    $cnt = scalar @arr;
    if ($cnt != $modents) {
        pgm_exit(1,"Error: Need to store mods in an array of arrays!\n");
    }
    prt("Write out $cnt ($modents) modified entities...\n");
    $htm = "";
    $wrap = 0;
    $htm .= "

";
    $htm .= "

";
    $htm .= "

";
    $htm .= "

";
    $htm .= "

";
    $htm .= "

";
    $htm .= "

";
    $htm .= "

\n";
    foreach $ent (@arr) {
        $ra2 = $modified_ents{$ent};
        $code = ${$ra2}[0];
        $ver  = ${$ra2}[1];
        $htm .= "

";
        $htm .= "

";
        $htm .= "

";
        $htm .= "

";
        $htm .= "

";
        $htm .= "

";
        $htm .= "

";
        $htm .= "

\n";
    }
    write_table("$cnt modified entities, 2 values",$htm,$out_file8);
    prt("$cnt entities, written to $out_file8...\n");

    ######################################################
    # $load_log = 1;
    # pgm_exit(1,"TEMP EXIT 2\n");
}

#############################################################################
# list of some 253 entities, up to HTML 4 it seems
#

Name	Code	G	N
&$ent;	$code	&$ent;	&#$code;

Name	G	Code1	N	Code2	N
&$ent;	&$ent;	$code	&#$code;	$ver	&#$ver;



sub process_in_file4($) {
    my ($inf) = @_;
    if (! open INF, "<$inf") {
        pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); 
    }
    my @lines = ;
    close INF;
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");
    my ($line,$ent,$dec,$ver,$ra);
    my $lnn = 0;
    my $intable = 0;
    my $trcnt = 0;
    my $trcnt2 = 0;
    my $intr = 0;
    my $tdcnt = 0;
    my $tvers = 'N/A';
    my $code = "Unk";
    my $verx = '';
    foreach $line (@lines) {
        chomp $line;
        $lnn++;
        if (!$intable) {
            if ($line =~ //) {
                $intable = 1;
                prt("$lnn: Start table $line\n");
            }
            next;
        }
        if ($line =~ /<\/table>/) {
            $intable = 0;
            prt("$lnn: End table $line\n");
        }
        if ($line =~ /^

/) {
            $trcnt++;
            $intr = 1;
            $tdcnt = 0;
            next;
        }
        if ($line =~ /^<\/tr>/) {
            $intr = 0;
            if ($trcnt2) {
                if (defined $ent && defined $dec && defined $ver) {
                    prt("$lnn: '$ent', dec $dec, ver$verx $ver $tvers $code\n");
                    $ent = undef;
                    $dec = undef;
                    $ver = undef;
                } else {
                    pgm_exit(1,"$lnn: Failed to get threesome...\n");
                }
            }
            $trcnt2++;
            next;
        }
        # 422 '


        if ($line =~ /^|\s+)/) {
            $tdcnt++;
            if ($tdcnt == 1) {
                if ($line =~ /

";
    $htm .= "

";
    for ($wrap = 0; $wrap < $cols; $wrap++) {
        $htm .= $head;
    }
    $htm .= "

\n";
    $wrap = 0;

    foreach $key (@arr) {
        $ra = ${$rh}{$key};
        $ver = ${$ra}[0];
        $dec = ${$ra}[1];
        if (defined $decimals{$dec}) {
            $decimals{$dec}++;
            # is case sensitive, so is NORMAL to have repeated values
            # prtw("Warning: REPEATED decimal $ent $ver $dec\n");
        } else {
            $decimals{$dec} = 1;
        }
        # got ent=$key,$dec,$ver
        $td = "

";
        $htm .= "

" if ($wrap == 0);
        $htm .= $td;
        $wrap++;
        if ($wrap == $cols) {
            $htm .= "

\n";
            $wrap = 0;
        }
        $ent = "\"$key\",";
        $ver .= ',';
        if ($pad_struct) {
            $ent .= ' ' while (length($ent) < $max_ent);
            $ver .= ' ' while (length($ver) < $max_ver);
            $dec = ' '.$dec while (length($dec) < $max_dec);
            $line = "    { $ent$ver$dec },";
        } else {
            $line = "    { $ent $ver $dec },";
        }
        push(@lines,$line);
        $entcnt++;
    }
    #     { NULL,       VERS_UNKNOWN, 0 }
    $ent = 'NULL,';
    $ver = 'VERS_UNKNOWN';
    $ver .= ',';
    $dec = 0;
    if ($pad_struct) {
        $ent .= ' ' while (length($ent) < $max_ent);
        $ver .= ' ' while (length($ver) < $max_ver);
        $dec = ' '.$dec while (length($dec) < $max_dec);
        $line = "    { $ent$ver$dec }";
    } else {
        $line = "    { $ent $ver $dec },";
    }
    push(@lines,$line);
    $line = "};";
    push(@lines,$line);
    $line = join("\n",@lines)."\n";
    write2file($line,$out);
    prt("$entcnt entities written to $out...\n");
    # my ($n,$d,$e) = fileparse($out, qr/\.[^.]*/);
    if ($wrap) {
        #$td = "

";
        $td = "

";
        while ($wrap < $cols) {
            $htm .= $td;
            $wrap++;
        }
        $htm .= "

\n";
    }
    my ($name,$dir) = fileparse($inf);
    $out .= ".html";
    write_table("$entcnt entities from $name",$htm,$out);
    prt("$entcnt entities written to $out...\n");
}


#          '∨' => {
#                       'codepoints' => [
#                                         8744
#                                       ],
#                       'characters' => "\x{2228}"
#                     },
# A problem with 'zwnj'?
# "‌": { "codepoints": [8204], "characters": "\u200C" }
# 
sub process_in_json2($) {
    my ($inf) = @_;
    if (! open INF, "<$inf") {
        pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); 
    }
    my @lines = ;
    close INF;
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");
    my ($line,$num,$lnn,$val,$rh2,$ent,$key);
    my ($ra,$cnt,$tvers,$tcode,$ra2,$i,$msg,$ver);
    $line = join("",@lines);
    my $json = JSON->new->allow_nonref;
    my $rh = $json->decode( $line );
    #prt(Dumper($rh));
    #$load_log = 1;
    my @arr = sort keys %{$rh};
    $num = scalar @arr;
    prt("Got $num keys, from json input...\n");
    clear_hash_counts(\%entities);
    my %plus1 = ();
    my $keycnt = 0;
    my $rpts = 0;
    foreach $key (@arr) {
        $rh2 = ${$rh}{$key};
        $ent = $key;
        $ent =~ s/^&//;
        $ent =~ s/;$//;
        #if (defined ${$rh2}{codepoints} && defined ${$rh2}{characters}) {
        if (defined ${$rh2}{codepoints}) {
            $ra2 = ${$rh2}{codepoints};
            $cnt = scalar @{$ra2};
            # first codepoint
            $val = ${$ra2}[0];
            if (defined $entities{$ent}) {
                $ra = $entities{$ent};
                $tvers = ${$ra}[0];
                $tcode = ${$ra}[1];
                ${$ra}[2]++;
            } else {
                $tvers = 'VERS_CHECK';
                $tcode = $val;
            }
            if ($cnt == 2) {
                # MULTIPLE codepoints
                #               0      1      2 3    4
                $plus1{$ent} = [$tvers,$tcode,0,$cnt,$ra2];
            } elsif ($cnt == 1) {
                if (defined $json_ents{$ent}) {
                    $ra = $json_ents{$ent};
                    $ver = ${$ra}[0];
                    $val = ${$ra}[1];
                    if (($ver ne $tvers) || ($val != $tcode)) {
                        pgm_exit(1,"json_ents $ent ALREADY defined as [$ver, $val], now [$tvers, $tcode]\n");
                    }
                    $rpts++;
                } else {
                    $keycnt++;
                    $json_ents{$ent} = [$tvers, $tcode, 0];
                }
            } else {
                prt(Dumper($ra2));
                pgm_exit(1,"Reference array $cnt - not 1 or 2!\n");
            }
        } else {
            prt(Dumper($rh2));
            pgm_exit(1,"Reference hash does not contain 'codepoints'!\n");
        }
    }
    ##########################################################
    ### deal with those with two...
    my $htm = '';
    my $tmp = '';
    @arr = sort keys %plus1;
    $cnt = scalar @arr;
    $tmp = $keycnt + $cnt + $rpts;
    prt("Added $keycnt entities, with code, to json_ents hash... hv $rpts rpts, $cnt with 2, tot $tmp\n");
    my $head = "

";
    $tmp = '';
    if ($cnt) {
        $verbosity = 1;
        prt("Note: $cnt entities, have more than one codepoint!\n");
        ###prt(join(", ", @arr)."\n");
        $lnn = 0;
        $htm .= "

";
        $htm .= $head;
        $htm .= "

\n";
        foreach $ent (@arr) {
            $lnn++;
            $ra = $plus1{$ent};
            $tvers = ${$ra}[0];
            $tcode = ${$ra}[1];
            $cnt = $plus1{$ent}[3];
            $ra2 = $plus1{$ent}[4];
            $msg = "$lnn: $ent, $tvers, $tcode - ";
            $htm .= "

";
            $tmp = '';
            for ($i = 0; $i < $cnt; $i++) {
                $val = ${$ra2}[$i];
                $msg .= "$val ";
                $htm .= "

";
                $tmp .= "&#$val;";
            }
            $htm .= "

";
            prt("$msg\n") if (VERB5());
            $htm .= "

\n";
        }
        $cnt = scalar @arr;
        prt("Display of $cnt entities, with more than one codepoint!\n");
        write_table("$cnt Multi-codepoints entities",$htm,$out_file3);
        prt("HTML table written to $out_file3...\n");
    }
}

sub compare_ent_hashes($$) {
    my ($rh1,$rh2) = @_;
    my @arr1 = sort keys(%{$rh1});
    my @arr2 = sort keys(%{$rh2});
    my $cnt1 = scalar @arr1;
    my $cnt2 = scalar @arr2;
    if (!$cnt1) {
        prt("Warning: First hash has NO keys...\n");
        return;
    }
    if (!$cnt2) {
        prt("Warning: Second hash has NO keys...\n");
        return;
    }
    prt("Comparing 2 hashes: $cnt1 and $cnt2 keys respectively..\n");
    my ($key,$ra1,$vers1,$code1,$ra2,$vers2,$code2,$ent);
    my $samecnt = 0;
    my $diffcnt = 0;
    my $misscnt1 = 0;
    my $misscnt2 = 0;
    my $htm = "

";
    $htm .= "

";
    $htm .= "

";
    $htm .= "

";
    $htm .= "

\n";
    foreach $key (@arr1) {
        $ent = $key;
        $ra1 = ${$rh1}{$key};
        $vers1 = ${$ra1}[0];
        $code1 = ${$ra1}[1];
        if (defined ${$rh2}{$key}) {
            $ra2 = ${$rh2}{$key};
            $vers2 = ${$ra2}[0];
            $code2 = ${$ra2}[1];
            if (($vers1 eq $vers2) && ($code1 == $code2)) {
                # the SAME - no interest...
                $samecnt++;
            } else {
                prt("Diff: ent $key - rh1 $vers1, $code1 vs rh2 $vers2, $code2\n") if (VERB2());
                $diffcnt++;
                $htm .= "

";
                $htm .= "

";
                $htm .= "

";
                $htm .= "

";
                $htm .= "

";
                $htm .= "

\n"
            }
            # mark BOTH as compared
            ${$ra1}[2] = 1;
            ${$ra2}[2] = 1;
        } else {
            prt("Missed: ent $key $vers1, $code1 ONLY in rh1\n") if (VERB2());
            $misscnt1++;
            $htm .= "

";
            $htm .= "

";
            $htm .= "

";
            $htm .= "

";
            $htm .= "

";
            $htm .= "

\n"
        }
    }
    foreach $key (@arr2) {
        $ent = $key;
        $ra2 = ${$rh2}{$key};
        $vers2 = ${$ra2}[0];
        $code2 = ${$ra2}[1];
        next if (${$ra2}[2]);
        prt("Missed: ent $key $vers2, $code2 ONLY in rh2\n") if (VERB2());
        $misscnt2++;
        $htm .= "

";
        $htm .= "

";
        $htm .= "

";
        $htm .= "

";
        $htm .= "

";
        $htm .= "

\n"
    }
    prt("Done 2 hashes: $cnt1 and $cnt2 keys - same $samecnt, diff $diffcnt, miss1 $misscnt1, miss2 $misscnt2...\n");
    write_table("Hash Differences - same $samecnt, diff $diffcnt, miss1 $misscnt1, miss2 $misscnt2",$htm,$out_file5);
    prt("Has differences written to $out_file5\n");
}

sub show_array_ref($) {
    my $ra = shift;
    my ($tmp,$cnt);
    $cnt = scalar @{$ra};
    prt("Show of ra of $cnt values...\n");
    $cnt = 0;
    foreach $tmp (@{$ra}) {
        $cnt++;
        prt("$cnt: '$tmp'\n");
    }
}

# Character	            Entity Name	       Entity Number     Description
#         1             2                  3                 4  
#


# tmp '♦'
# tmp '&diams;'
# tmp '&#9830;'
# tmp 'Diamond'
sub get_cols($$$$$) {
    my ($line,$rnum,$rent,$rdec,$rdesc) = @_;
    $line =~ s/^

//;
    $line =~ s/<\/tr>$//;
    my $len = length($line);
    my @arr = split(/

Name

Character

Unicode code point (decimal)

Standard

DTD^[a]

Old ISO subset^[b]

Description^[c]

quot

U+0022 (34)

HTML 2.0

HTMLspecial

ISOnum

quotation mark (APL quote)

amp

U+0026 (38)

Original html specification(html 1.0) and HTML 2.0

HTMLspecial and http://info.cern.ch/MarkUp/html-spec/html.dtd (originally)

ISOnum

ampersand

apos

U+0027 (39)

XHTML 1.0

HTMLspecial

ISOnum

apostrophe (apostrophe-quote); see below

(\w+)<\/td>/) { $ent = $1; $tvers = 'N/A'; $code = 'Unk'; # $entities{$ent} = [$ver, $code, 0]; if (defined $entities{$ent}) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $code = ${$ra}[1]; # maybe check the 'code' value } prt("$lnn: $ent\n") if (VERB9()); } else { pgm_exit(1,"$lnn: First TD failed '$line'\n"); } } elsif ($tdcnt == 3) { if ($line =~ /

U.+$(\d+)$<\/td>/) { $dec = $1; } else { pgm_exit(1,"$lnn: Third TD failed '$line'\n"); } } elsif ($tdcnt == 4) { $verx = ''; if ($line =~ /^

HTML\s+(.+)<\/td>/) { $ver = $1; } elsif ($line =~ /Original\s+html\h+specification$html 1.0$/) { $ver = '1.0'; } elsif ($line =~ /^

XHTML\s+(.+)<\/td>/) { $ver = $1; $verx = 'X'; } else { pgm_exit(1,"$lnn: Forth TD failed '$line'\n"); } } } } prt("$lnn: EOF - tr count $trcnt\n"); $load_log = 1; pgm_exit(1,"TEMP EXIT 2\n"); } # ======================================================================== # HTML 4 # https://www.w3.org/TR/REC-html40/sgml/entities.html # # # # # # to # # In comparison with Tidy # Got 3 WARNINGS... # Warning: 391: ent lang, code 9001 vs 10216, vers VERS_FROM40 # Warning: 395: ent rang, code 9002 vs 10217, vers VERS_FROM40 # Warning: Seem to have missed apos, VERS_FROM40|VERS_XML, 39! # https://www.w3.org/TR/html5/syntax.html#named-character-references # lang = U+027E8 = 10216 # rang = U+027E9 = 10217 # and # apos = U+00027 = 39 # ======================================================================== sub process_in_file4f($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$ent,$val,$code,$tvers,$tcode,$ra); my $lnn = 0; my $entcnt = 0; foreach $line (@lines) { chomp $line; $lnn++; if ($line =~ /Name

Code

&$key;

$dec

&$key;

&#$dec;

&$key;

$dec

&$key;

&#$dec;

&ndsp;

Name

Code

$lnn

&$ent;

$val

&#$val;

$tmp

Name

Code

D: $diffcnt

&$ent;

$code1

&#$code1;

$code2

&#$code2;

M1 $misscnt1

&$ent;

$code1

&#$code1;

M2 $misscnt2

&$ent;

$code2

&#$code2;

Space

Exclamation mark

Ampersand

/,$line); my $cnt = scalar @arr; #prt("cnt = $cnt, line '$line'\n"); my ($tmp); my ($i,$num,$ent,$dec,$desc); my @arr2 = (); $i = 0; foreach $tmp (@arr) { $i++; # prt("$i: tmp '$tmp'\n"); next if ($tmp =~ /^\s*$/); next if (length($tmp) == 0); push(@arr2,$tmp); #prt("$i: tmp '$tmp'\n"); } $cnt = scalar @arr2; if ($cnt == 4) { # extract each column value $i = 0; # for ($i = 0; $i < $cnt; $i++) { foreach $tmp (@arr2) { # $tmp = $arr2[$i]; if ($tmp =~ /<\/td>$/) { $tmp =~ s/<\/td>$//; } if ($i == 0) { if ($tmp =~ /^&\#(\d+);$/) { $num = $1; } elsif ($tmp eq ' ') { $num = 160; } else { prt("$i: tmp '$tmp'\n"); show_array_ref(\@arr2); return 0; } } elsif ($i == 1) { if ($tmp =~ /^&(\w+);$/) { $ent = $1; } elsif ($tmp =~ /^\s*$/) { $ent = ''; } else { prt("$i: tmp '$tmp'\n"); return 0; } } elsif ($i == 2) { if ($tmp =~ /^&\#(\d+);$/) { $dec = $1; } else { prt("$i: tmp '$tmp'\n"); return 0; } } elsif ($i == 3) { $desc = $tmp; } $i++; } ${$rnum} = $num; ${$rent} = $ent; ${$rdec} = $dec; ${$rdesc} = $desc; return 1; } return 0; } # Just 238 entities # my $in_file6 = 'C:\Users\user\Documents\Tidy\html-entities.html'; # 'C:\Users\user\Documents\Tidy\html-entities.html'; # from : https://www.freeformatter.com/html-entities.html # # HTML entities.

sub process_in_file6($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$len,$num,$ent,$dec,$desc,$ra,$tvers,$tcode); my $lnn = 0; my $intable = 0; my $intbody = 0; my $tblcnt = 0; my $entcnt = 0; my $htm = ''; clear_hash_counts(\%entities); foreach $line (@lines) { chomp $line; $line = trim_all($line); $lnn++; $len = length($line); next if ($len == 0); if ($line =~ /^

/) { pgm_exit(1,"$lnn: Already IN tbody! '$line'\n") if ($intbody); prt("$lnn: Start tbody '$line'\n") if (VERB9()); $intbody = 1; next; } elsif ($line =~ /<\/tbody>/) { pgm_exit(1,"$lnn: NOT IN tbody! '$line'\n") if (!$intbody); prt("$lnn: End tbody '$line'\n") if (VERB9()); $intbody = 0; next; } if ($intbody) { if ($line =~ /^/) { $ent = ''; if (get_cols($line,\$num,\$ent,\$dec,\$desc)) { $len = length($ent); next if ($len == 0); $entcnt++; if (defined $entities{$ent}) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $tcode = ${$ra}[1]; ${$ra}[2]++; prtw("Warning: free_ents $ent $tvers, tidy $tcode vs $dec\n") if ($tcode != $dec); } else { $tvers = 'VERS_CHECK'; $tcode = $dec; } $free_ents{$ent} = [$tvers, $tcode, 0]; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= "\n"; } else { pgm_exit(1,"$lnn: TR NOT HANDLED! '$line'\n"); } } else { pgm_exit(1,"$lnn: NOT HANDLED! '$line'\n"); } } } } my @arr = keys %entities; my ($added); my $missed = 0; foreach $ent (@arr) { $ra = $entities{$ent}; $tvers = ${$ra}[0]; $tcode = ${$ra}[1]; $added = ${$ra}[2]; $dec = $tcode; if ($added == 0) { prtw("Warning: Seem to have missed $ent, $tvers, $tcode!\n") if (VERB9()); $htm .= "\n" if ($missed == 0); $missed++; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= ""; $htm .= "\n"; } } write_table("$entcnt FreeFormatter.com, missing $missed",$htm,$out_file6); prt("$tblcnt tables, found $entcnt entities... written to $out_file6\n"); clear_hash_counts(\%entities); # pgm_exit(1,"TEMP EXIT 6\n"); } ######################################### ### MAIN ### ###parse_args(@ARGV); ### check_hash(); process_in_file2($in_file2); # 253 - 'C:\Users\user\Documents\Tidy\entities.c'; #process_in_file6($in_file6); # 238 'C:\Users\user\Documents\Tidy\html-entities.html'; #process_in_file4f($in_file4f); # 'C:\Users\user\Documents\Tidy\html4-ents.html'; #process_in_file4($in_file4); # 253 - 'C:\Users\user\Documents\Tidy\wiki-list-entities.html'; process_in_file3($in_file3); # 'C:\Users\user\Documents\Tidy\charref.html'; => %all_ents do_output(\%all_ents,$out_file,$in_file3); # process_in_json($in_file1); # 'C:\Users\user\Documents\Tidy\htmlmathml.json'; process_in_json2($in_file5j); # = 'C:\Users\user\Documents\Tidy\html5-ents.json'; do_output(\%json_ents,$out_file2,$in_file5j); compare_ent_hashes(\%all_ents,\%json_ents); pgm_exit(0,""); ######################################## sub need_arg { my ($arg,@av) = @_; pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av); } sub parse_args { my (@av) = @_; my ($arg,$sarg); my $verb = VERB2(); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); if (($sarg =~ /^h/i)||($sarg eq '?')) { give_help(); pgm_exit(0,"Help exit(0)"); } elsif ($sarg =~ /^v/) { if ($sarg =~ /^v.*(\d+)$/) { $verbosity = $1; } else { while ($sarg =~ /^v/) { $verbosity++; $sarg = substr($sarg,1); } } $verb = VERB2(); prt("Verbosity = $verbosity\n") if ($verb); } elsif ($sarg =~ /^l/) { if ($sarg =~ /^ll/) { $load_log = 2; } else { $load_log = 1; } prt("Set to load log at end. ($load_log)\n") if ($verb); } elsif ($sarg =~ /^o/) { need_arg(@av); shift @av; $sarg = $av[0]; $out_file = $sarg; prt("Set out file to [$out_file].\n") if ($verb); } else { pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n"); } } else { $in_file = $arg; prt("Set input to [$in_file]\n") if ($verb); } shift @av; } if ($debug_on) { prtw("WARNING: DEBUG is ON!\n"); if (length($in_file) == 0) { $in_file = $def_file; prt("Set DEFAULT input to [$in_file]\n"); } } if (length($in_file) == 0) { pgm_exit(1,"ERROR: No input files found in command!\n"); } if (! -f $in_file) { pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n"); } } sub give_help { prt("$pgmname: version $VERS\n"); prt("Usage: $pgmname [options] in-file\n"); prt("Options:\n"); prt(" --help (-h or -?) = This help, and exit 0.\n"); prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n"); prt(" --load (-l) = Load LOG at end. ($outfile)\n"); prt(" --out (-o) = Write output to this file.\n"); } # Hypertext Markup Language - 2.0 September 22, 1995 # https://www.w3.org/MarkUp/html-spec/html-spec_9.html#SEC9.7 # sub get_html2_ents() { my $txt = < EOF return $txt; } # from : https://www.w3.org/TR/REC-html32#latin1 sub get_html3_ents() { my $txt = < EOF return $txt; } # eof - tidyentities.pl

$entcnt	&$ent;	$dec	&$ent;	&#$dec;	$tvers	$desc
* NOT FOUND IN FreeFormatter.com TABLES *
$missed	&$ent;	$dec	&$ent;	&#$dec;	$tvers	NOT FOUND in tables