imgalt01.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:42 2010 from imgalt01.pl 2006/10/27 13.7 KB.

#!/Perl
# imgalt01.pl - 2006.10.24 - geoff mclane (geoffmclane.com)
# AIM: To extract the <img alt="..." atribute for translation
# If $addtr is 1, then a search and load current 'tranlation'
# which is added to the table ...
# =====================================================================
use strict;
require 'logfile.pl' or die "ERROR: Can NOT load logfile.pl ...\n";
require 'htmltools.pl' or die "ERROR: Can NOT load htmltools.pl ...\n";
# log file stuff
my ($LF);
my $outfile = 'temp.'.$0.'.txt';
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );
# user variable
my $def_folder = 'C:\HOMEPAGE\P26\travel'; 
my $def_input = $def_folder . '\tunisia.htm';
my $def_output = 'tempalt2.htm';
my $addtr = 1;
# from file
my $tr_file = $def_folder . '\tempalt.htm';
my $dosubs = 1; # modify in3 file, changing the alt text, and write out3
my $def_in3 = $def_folder . '\tunisfr2.htm';
my $def_out3 = $def_folder . '\tempalt3.htm';
my @trtable = ();
my @tlines = ();
my @langarr = ();
# debug
my $dbg1 = 1; # show length after adjustments
my $dbg2 = 0; # show 'other' tags
my $dbg3 = 0; # show collections phase
my $dbg4 = 0; # show sub collection phase
my $dbg5 = 0; # show the text collection
my $dbg6 = 0; # show substitution
# program variables
my $line = '';
my @lines = ();
my @frlines = ();
my $cnt = 0;
my $txt = '';
my $ccnt = 0;
my $newtxt = '';
my @attlist = ();
my @altlist = ();
my $in_file = $def_input;
my $out_file = $def_output;
my $htm_head = <<"EOF";
<html>
<head>
<title>Alt List</title>
</head>
<body>
<table border="2">
EOF
my $htm_tail = <<"EOF";
</table>
</body>
</html>
EOF
$in_file = pop @ARGV if (@ARGV);
$out_file = pop @ARGV if (@ARGV);
prt( "Got input from [$in_file], output to [$out_file] ...\n" );
if ( ! -f $in_file) {
   mydie("OOPS: Can NOT locate [$in_file] ...\n");
}
if ($addtr) {
   load_existing_table($tr_file);
}
open IF, "<$in_file" or mydie("OOPS: Can NOT open [$in_file] ...\n");
@lines = <IF>;   # slurp it all in
close IF;
$cnt = scalar @lines;
prt("Processing $cnt lines from [$in_file] ...\n");
$txt = join("\n", @lines);
$ccnt = length($txt);
prt("Or $ccnt characters from [$in_file] ...\n");
extract_img_alts( $txt );
show_att_list();
out_alt_list( $out_file );
if ($dosubs && @langarr) {
   open IFF, "<$def_in3" or mydie( "OOPS: Can not open file $def_in3 ... $! ...\n" );
   @frlines = <IFF>;
   close IFF;
   prt( "Process " . scalar @frlines . " lines from [$def_in3] ...\n" );
   $txt = do_substitution();
   open OFF, ">$def_out3" or mydie( "YEEK! Unable to create [$def_out3] ... $! ...\n" );
   print OFF $txt;
   close OFF;
   system( $def_out3 );
}
#$ccnt = length($newtxt);
#write_out_file($newtxt, $out_file);
#system($out_file);
close_log($outfile,1);
exit(0);
# ###############################################
# all subs below
# ##############
sub do_substitution {
   my $lc = scalar @langarr;
   my ($i, $img, $eng, $fr, $j, $c, $d, $imtag, $im2);
   my $frhtm = join('', @frlines);
   my $tl = length($frhtm);
   prt( "Attempting $lc substitutions ... in $tl htm chars...\n" );
   my $fnd = 0;
   my $newfr = ''; # accumulate into here
   for ($i = 0; $i < $lc; $i++) {
      $img = $langarr[$i][0];
      $eng = $langarr[$i][1];
      $fr  = $langarr[$i][2];
      $imtag = '';
      $d = '';
      $fnd = 0;
      $newfr = '';
      $tl = length($frhtm);
      prt( "\nText length now $tl characters ...\n" ) if ($dbg6);
      for ($j = 0; $j < $tl; $j++) {
         $c = substr($frhtm,$j,1);
         if ($d eq '<') {
            if ($c eq "\n") {
               if (substr($imtag,-1) =~ /\s/) {
                  $c = '';
               } else {
                  $c = ' ';
               }
            }
            $imtag .= $c;
            if ($c eq '>') {
               $d = $c;
               if ($imtag =~ /^<img.+/) {
                  $imtag = trimall($imtag);
                  if ($imtag =~ /src=['"](.+?)['"]/i) {
                     $im2 = $1;
                     if ($im2 eq $img) {
                        if ($imtag =~ /alt=['"](.+?)['"]/i) {
                           substr($imtag, index($imtag,$1),length($1),$fr);
                           prt( "Change [$1] to [$fr] ..\n" ) if ($dbg6);
                           prt( "$imtag\n" ) if ($dbg6);
                           $fnd = 1;
                        }
                        $newfr .= $imtag; # add in this block
                        last;
                     }
                  }
               }
               $newfr .= $imtag; # add in this block
            }
         } elsif ($c eq '<') {
            $imtag = $c;
            $d = $c;
         } else {
            $newfr .= $c;
         }
      }
      ##############################################################
      if (!$fnd) {
         prt( "Did not find [$img] ...\n" );
      } else {
         $j++ if ($j < $tl);
         $newfr .= substr($frhtm, $j) if ($j < $tl); # use the NEW text
         $frhtm = $newfr;
      }
   }
   return $frhtm;
}
sub get_table_block {
   my ($tn) = shift; # table number
   my $lc = scalar @tlines;
   my ($l, $i, $c, $tg, $d, $ln, $ll);
   my $tbl = '';
   my $tc = 0;
   my $in_tbl = 0;
   $d = '';
   for ($l = 0; $l < $lc; $l++) {
      $ln = $tlines[$l]; # entract a line
      $ln = trimall($ln); # clean it up
      $ll = length($ln);
      if ($ll && $in_tbl && (length($tbl))) {
         $c = substr($tbl,-1);
         if ( !(($c =~ /\s/)||($c eq '>')) ) {
            $tbl .= ' ';
         }
      }
      for ($i = 0; $i < $ll; $i++) {
         $c = substr($ln,$i,1);
         $tbl .= $c if ($in_tbl);
         if ($d eq '<') {
            $tg .= $c;
            if ($c eq '>') {
               # got a tag
               if ($tg =~ /<table.*?>/i) {
                  $tc++;
                  if ($tn == $tc) {
                     $in_tbl = 1;
                  }
               } elsif ($tg =~ /<\/table>/i) {
                  if ($in_tbl) {
                     $tbl = substr($tbl, 0, length($tbl) - length($tg));
                  }
                  $in_tbl = 0;
               }
               $d = '';
            }
         } elsif ($c eq '<') {
            $tg = $c;
            $d = $c;
         }
      }
   }
   return $tbl;
}
sub load_existing_table {
   my ($fil) = shift;
   my $ln = '';
   my $rows = 0;
   my $cols = 0;
   my $in_row = 0;
   my $in_td = 0;
   my $img = '';
   my $eng = '';
   my $fr = '';
   if ( ! -f $fil) {
      mydie( "ERROR: Unable to locate exisitng [$fil] file ... $! ...\n" );
   }
   open INF, "<$fil" or mydie( "ERROR: Unable to OPEN exisitng [$fil] file ... $! ...\n" );
   @tlines = <INF>;
   close INF;
   prt( "Got " . scalar @tlines . " lines from file [$fil] ...\n" );
   my $tt = get_table_block(1);
   ##prt( "Table block = [$tt]\n" );
   #$tt = tag2newline($tt, 'caption');
   #$tt = tag2newline($tt, 'tr');
   #$tt = tag2newline($tt, 'th');
   #$tt = tag2newline($tt, 'td');
   #$tt = trimblanklines($tt);
   #prt( "\nTable block 2 = \n[$tt]\n" );
   $tt = alltags2newline($tt);
   ##prt( "\nTable block 3 = \n[$tt]\n" );
   @tlines = split("\n",$tt);
   prt( "Got " . scalar @tlines . " table lines ...\n" );
   foreach $ln (@tlines) {
      $ln = trimall($ln);
      if ($ln =~ /<tr.*>/i) {
         $rows++;
         $in_row = 1;
         $cols = 0;
      } elsif ($ln =~ /<th.*>/i) {
         # ignore these
         $cols = 0;
      } elsif ($ln =~ /<caption.*>/i) {
         # ignore
         $cols = 0;
      } elsif ($ln =~ /<td.*>/i) {
         $cols++;
         $in_td = 1;
      } elsif ($ln =~ /<\/caption>/i) {
         # ignore this
         $cols = 0;
      } elsif ($ln =~ /<\/th>/i) {
         # ignore
         $cols = 0;
      } elsif ($ln =~ /<\/tr>/i) {
         $in_row = 0;
         $cols = 0;
      } elsif ($ln =~ /<\/td>/i) {
         $in_td = 0;
      } else {
         # should be a text entry
         if ($in_td) {
            if ($cols == 1) {
               $img = $ln;
               prt( "img=[$ln]\n" ) if ($dbg5);
            } elsif ($cols == 2) {
               $eng = $ln;
               prt( "eng=[$ln]\n" ) if ($dbg5);
            } elsif ($cols == 3) {
               $fr = $ln;
               prt( "fr=[$ln]\n" ) if ($dbg5);
               push(@langarr, [$img, $eng, $fr]);
            }
         }
      }
   }
}
sub alltags2newline {
   my ($tx) = shift;
   my $tl = length($tx);
   my ($i, $c, $d);
   my $nt = '';
   $d = '';
   for ($i = 0; $i < $tl; $i++) {
      $c = substr($tx,$i,1);
      if ($c eq '<') {
         if (length($nt) && (substr($nt,-1) ne "\n")) {
            $nt .= "\n";
         }
      } 
      if (($d eq '>')&&($c ne "\n")) {
         if (length($nt) && (substr($nt,-1) ne "\n")) {
            $nt .= "\n";
         }
      }
      $nt .= $c;
      $d = $c;
   }
   return $nt;
}
sub short_text {
   my ($tx, $len) = @_;
   my $ln = length($tx);
   my $ntx = $tx;
   if ($ln > ($len + 3)) {
      my $hl = int( $len / 2 );
      $ntx = substr($tx,0,$hl);
      $ntx .= '...';
      $hl = $len - $hl;
      $ntx .= substr($tx, $ln - $hl);
   }
   return $ntx;
}
sub write_out_file {
   my ($tx, $fil) = @_;
   open OF, ">$fil" or mydie("YEEK! Can NOT create [$fil] ...\n");
   print OF $tx;
   close OF;
   prt("Written " . length($tx) . " characters to [$fil]...\n");
}
sub get_tag {
   my ($t) = shift;
   my $m = length($t);
   my ($j, $c);
   my $tg = '';
   for ($j = 0; $j < $m; $j++) {
      $c = substr($t,$j,1);
      if ($c eq '<') {
         $tg = $c;
         $j++;
         for ( ; $j < $m; $j++) {
            $c = substr($t,$j,1);
            $tg .= $c;
            if ($c eq '>') {
               last;
            }
         }
         last;
      }
   }
   return $tg;
}
sub get_att_hash {
   my ($tg) = shift;
   $tg =~ s/\n/ /gm;
   $tg =~ s/\r/ /gm;
   my $ml = length($tg);
   my ($i, $c, $d);
   my $tag = '';
   my $att = '';
   my $val = '';
   my %h = ();
   for ($i = 0; $i < $ml; $i++) {
      $c = substr($tg,$i,1);
      if ($c eq '<') {
         $i++;
         for ( ; $i < $ml; $i++) {
            $c = substr($tg,$i,1);
            if (($c =~ /\s/)||($c eq '>')) {
               last;
            }
            $tag .= $c;
         }
         # got the tag, now the attributes, if any
         prt( "tag=[$tag]\n" ) if ($dbg4);
         while (($c =~ /\s/)&&(($i + 1) < $ml)) {
            while (($c =~ /\s/)&&(($i + 1) < $ml)) {
               $i++;
               $c = substr($tg,$i,1);
            }
            $att = '';
            $val = '';
            if ( !($c =~ /\s/) && ($c ne '>')) {
               $att = $c; # start attribute
               $i++;
               for ( ; $i < $ml; $i++) {
                  $c = substr($tg,$i,1);
                  if ($c eq '=') {
                     last;
                  }
                  $att .= $c;
               }
               if (($c eq '=')&&(($i + 1) < $ml)) {
                  $i++;
                  $d = substr($tg,$i,1);
                  if (($d eq '"')||($d eq "'")) {
                     $val = $d; # keep the inverted comma
                  } else {
                     $val = $d; # keep first item
                     $d = ' ';
                  }
                  $i++;
                  for ( ; $i < $ml; $i++) {
                     $c = substr($tg,$i,1);
                     if ($c eq '>') {
                        last;
                     } elsif ($c eq $d) {
                        if ($c ne ' ') {
                           $val .= $c;
                           if (($i + 1) < $ml) {
                              $i++;
                              $c = substr($tg,$i,1);
                           }
                        }
                        last;
                     }
                     $val .= $c;
                  }
               }
               if (length($att) && length($val)) {
                  prt( "att=[$att] value=[$val] c=[$c]\n" ) if ($dbg4);
                  if (defined $h{$att}) {
                     prt("Duplicate attribute!!! [$att] val1=[" . $h{$att} . "] adding [$val] ...\n" );
                     if ($h{$att} ne $val) {
                        $h{$att} .= '|' . $val;
                     }
                  } else {
                     $h{$att} = $val;
                  }
               } else {
                  prt( "Warning: failed to get att=[$att] value=[$val] c=[$c]\n" );
               }
            }
         } # end while 
         #############################
         push(@attlist, [$tag, \%h]);
      }
   }
}
sub trim_tail {
   my ($ln) = shift;
   while ($ln =~ /\s$/m) {
      $ln = substr($ln,0, length($ln) - 1);
   }
   return $ln;
}
sub strip_quotes {
   my ($tx) = shift;
   $tx =~ s/^('|")//;
   $tx =~ s/('|")$//;
   return $tx;
}
sub show_att_list {
   my $ac = scalar @attlist;
   prt( "Got $ac entries in attlist ...\n" );
   my ($i, $src, $alt);
   for ($i = 0; $i < $ac; $i++) {
      my $tg = $attlist[$i][0];
      my %th = $attlist[$i][1];
      prt( "TAG=[$tg]\n" ) if ($dbg4);
      ##foreach my $k (keys(%th)) {
      ##   my $v = $th{$k};
      ##   prt( "k=[$k] v=[$v]\n" );
      ##}
      $src = '';
      $alt = '';
      foreach my $k (keys(%{$attlist[$i][1]})) {
         my $v = ${$attlist[$i][1]}{$k};
         prt( "k=[$k] v=[$v]\n" ) if ($dbg4);
         if ($k =~ /^src$/i) {
            $src = strip_quotes($v);
         } elsif ($k =~ /^alt$/) {
            $alt = strip_quotes($v);
         }
      }
      if (length($src) && length($alt)) {
         push(@altlist, [$src, $alt]);
      } else {
         prt( "WARNING: Failed to find src and alt ...\n" );
      }
   }
}
sub get_fr {
   my ($ig) = shift;
   my ($img, $eng, $fr, $i);
   my $icnt = scalar @langarr;
   for ($i = 0; $i < $icnt; $i++) {
      $img = $langarr[$i][0];
      $eng = $langarr[$i][1];
      $fr = $langarr[$i][2];
      if ($img eq $ig) {
         return $fr;
      }
   }
   return '&nbsp;';
}
sub out_alt_list {
   my ($fil) = shift;
   my $ct = scalar @altlist;
   if ($ct) {
      my ($i, $sr, $at, $msg);
      prt( "Outputting $ct alt list entries to $fil ...\n" );
      open OTF, ">$fil" or mydie( "ERROR: Unable to open $fil file ... $! \n" );
      print OTF $htm_head;
      for ($i = 0; $i < $ct; $i++) {
         $sr = $altlist[$i][0];
         $at = $altlist[$i][1];
         $msg = "<tr>\n";
         $msg .= "<td>\n";
         ##$msg .= $sr;
         $msg .= '<img src="' . $def_folder . '/' . $sr . '" width="60" height="40">';
         $msg .= "</td>\n";
         $msg .= "<td>\n";
         $msg .= $at;
         $msg .= "</td>\n";
         $msg .= "<td>\n";
         $msg .= get_fr($sr);
         $msg .= "</td>\n";
         $msg .= "</tr>\n";
         print OTF $msg;
      }
      print OTF $htm_tail;
      close OTF;
      ###system($fil);
   } else {
      prt( "WARNING: Did not find any src/alt sets ...\n" );
   }
}
sub extract_img_alts {
   my ($tx) = shift;
   my $tl = length($tx);
   my ($i);
   my $ch = '';
   my $nt = '';
   my $tag = '';
   my $att = '';
   my $tgl = '';
   my $intd = 0;
   my $ntag = '';
   for ($i = 0; $i < $tl; $i++) {
      $ch = substr($tx,$i,1);
      if ($ch eq '<') {
         $tag = get_tag( substr($tx,$i) );
         $i += (length($tag) - 1) if (length($tag));
         $tgl = $tag;
         $tgl =~ s/\n/ /g;
         $tgl =~ s/\r/ /g;
         if ($tgl =~ /<img(.*)>/im) {
            $att = $1;
            prt( "IMG tag [$tag]...\n" ) if ($dbg3);
            get_att_hash($tag);
         } elsif ((length($tag) > 4)&&(substr($tag,0,4) eq '<!--')) {
            prt( "Got comment ...\n" ) if ($dbg2);
         } else {
            prt( "other tag [$tag] ...\n" ) if ($dbg2);
         }
         $nt .= $tag;
      } else {
         $nt .= $ch;
      }
   }
   $tl = length($nt);
   prt("Now returning $tl characters ...\n") if $dbg1;
   return $nt;
}
sub trimall {
   my ($ln) = shift;
   chomp $ln;
   $ln =~ s/\r$//;
   $ln =~ s/\t/ /g;
   while ($ln =~ /\s\s/) {
      $ln =~ s/\s\s/ /g;
   }
   while ($ln =~ /^\s/) {
      $ln = substr($ln,1);
   }
   while ($ln =~ /\s$/) {
      $ln = substr($ln,0, length($ln) - 1);
   }
   return $ln;
}
# eof - imgalt01.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional