cleantd01.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:26 2010 from cleantd01.pl 2006/10/24 5.6 KB.

#!/Perl
# cleantd01.pl - 2006.10.24 - geoff mclane (geoffmclane.com)
# AIM: To clean certain items from a HTML document ...
# specifically target microsoft word 'filtered' output.
# Search for 'tables', and remove 'style' attribute from <td>,
# and '<p ...></p><o:p></o:p> and <span> from within ...
# =====================================================================
use strict;
require 'logfile.pl' or die "ERROR: Can NOT load logfile.pl ...\n";
require 'htmltools.pl' or die "ERROR: Can NOT load htmltools.pl ...\n";
# log file stuff
my ($LF);
my $outfile = 'temp.'.$0.'.txt';
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );
# user variable
my $def_input = 'C:\HOMEPAGE\P26\travel\tunisfr2.htm';
my $def_output = 'temptunis.htm';
# debug
my $dbg1 = 1; # show length after adjustments
# program variables
my $line = '';
my $word = 0;
my @lines = ();
my $cnt = 0;
my $txt = '';
my $ccnt = 0;
my $newtxt = '';
my $in_file = $def_input;
my $out_file = $def_output;
$in_file = pop @ARGV if (@ARGV);
$out_file = pop @ARGV if (@ARGV);
prt( "Got input from [$in_file], output to [$out_file] ...\n" );
if ( ! -f $in_file) {
   mydie("OOPS: Can NOT locate [$in_file] ...\n");
}
open IF, "<$in_file" or mydie("OOPS: Can NOT open [$in_file] ...\n");
@lines = <IF>;   # slurp it all in
close IF;
$word = check_for_word();
$cnt = scalar @lines;
prt("Processing $cnt lines from [$in_file] ... " . ($word ? 'is word' : 'not word') . "\n");
$txt = join("\n", @lines);
$ccnt = length($txt);
prt("Or $ccnt characters from [$in_file] ...\n");
$newtxt = make_adjustments( $txt );
$ccnt = length($newtxt);
write_out_file($newtxt, $out_file);
system($out_file);
close_log($outfile,1);
exit(0);
# ###############################################
# all subs below
# ##############
sub short_text {
   my ($tx, $len) = @_;
   my $ln = length($tx);
   my $ntx = $tx;
   if ($ln > ($len + 3)) {
      my $hl = int( $len / 2 );
      $ntx = substr($tx,0,$hl);
      $ntx .= '...';
      $hl = $len - $hl;
      $ntx .= substr($tx, $ln - $hl);
   }
   return $ntx;
}
sub write_out_file {
   my ($tx, $fil) = @_;
   open OF, ">$fil" or mydie("YEEK! Can NOT create [$fil] ...\n");
   print OF $tx;
   close OF;
   prt("Written " . length($tx) . " characters to [$fil]...\n");
}
sub check_for_word {
   my $lc = scalar @lines;
   prt( "Processing $lc lines ... seeking MS Word meta ...\n" );
   my $isword = 0;
   my ($cont);
   foreach $line (@lines) {
      chomp $line;
      $line =~ s/\r$//;
      ## <meta name="Generator" content="Microsoft Word 10 (filtered)">
      if ($line =~ /<meta\s+name="?Generator"?\s+?content="?(.*)"?>/i) {
         $cont = $1;
         if ($cont =~ /Microsoft/i) {
            prt( "Found [$cont] [$line] ...\n" );
            if ($cont =~ /Word/i) {
               $isword = 1;
               prt( "Found WORD signature ...\n" );
               last;
            }
         }
      }
   }
   return $isword;
}
sub get_tag {
   my ($t) = shift;
   my $m = length($t);
   my ($j, $c);
   my $tg = '';
   for ($j = 0; $j < $m; $j++) {
      $c = substr($t,$j,1);
      if ($c eq '<') {
         $tg = $c;
         $j++;
         for ( ; $j < $m; $j++) {
            $c = substr($t,$j,1);
            ##if (($c eq "\n")||($c eq "\r")) {
            ##   $c = ' ';
            ##}
            $tg .= $c;
            if ($c eq '>') {
               last;
            }
         }
         last;
      }
   }
   return $tg;
}
sub trim_tail {
   my ($ln) = shift;
   while ($ln =~ /\s$/m) {
      $ln = substr($ln,0, length($ln) - 1);
   }
   return $ln;
}
sub del_td_style {
   my ($td) = shift;
   my $mx = length($td);
   my ($j, $c, $d);
   my $ntd = '';
   my $hds = 0;
   my $ss = '';
   $d = '';
   for ($j = 0; $j < $mx; $j++) {
      $c = substr($td,$j,1);
      if ($hds && ($c =~ /s/i) && (($mx - $j) > 7)) {
         $ss = substr($td,$j); # get balance
         if ($ss =~ /^style=(.*)/) {
            $j += 6;
            $d = substr($td,$j,1); # get " or '
            if (($d eq '"')||($d eq "'")) {
               $j++;
            } else {
               $d = ' ';
            }
            for ( ; $j < $mx; $j++) {
               $c = substr($td,$j,1);
               if (($c eq $d)||($c eq '>')) {
                  last;
               }
            }
         }
      }
      if ($c =~ /\s/) {
         $hds = 1;
      } else {
         $hds = 0;
      }
      if ($c ne $d) {
         if ($c eq '>') {
            $ntd = trim_tail($ntd);
         }
         $ntd .= $c;
      }
      $d = '';
   }
   return $ntd;
}
sub make_adjustments {
   my ($tx) = shift;
   my $tl = length($tx);
   my ($i);
   my $ch = '';
   my $nt = '';
   my $tag = '';
   my $att = '';
   my $tgl = '';
   my $intd = 0;
   my $ntag = '';
   for ($i = 0; $i < $tl; $i++) {
      $ch = substr($tx,$i,1);
      if ($ch eq '<') {
         $tag = get_tag( substr($tx,$i) );
         $i += (length($tag) - 1) if (length($tag));
         $tgl = $tag;
         $tgl =~ s/\n/ /g;
         $tgl =~ s/\r/ /g;
         if ($tgl =~ /<td(.*)>/im) {
            $intd = 1;
            $att = $1;
            prt( "TD tag [$att] [$tag]...\n" );
            if ($tgl =~ /<td\s+?(.+)>/im) {
               $att = $1;
               if ($tgl =~ /style=/i) {
                  prt( "Is TD with STYLE attrib [$att] ...\n" );
                  $tag = del_td_style($tag);
                  prt( "New tag [$tag]\n" );
               } else {
                  prt( "Is TD with attrib [$att] ...\n" );
               }
            } else {
               prt( "Is simple TD tag ...\n" );
            }
         } elsif ((length($tag) > 4)&&(substr($tag,0,4) eq '<!--')) {
            prt( "Got comment ...\n" );
         } elsif ($tag =~ /<\/td>/) {
            prt( "Close TD [$tag]\n" );
            $intd = 0;
         } else {
            prt( "other tag [$tag] ...\n" );
         }
         $nt .= $tag;
      } else {
         $nt .= $ch;
      }
   }
   $tl = length($nt);
   prt("Now returning $tl characters ...\n") if $dbg1;
   return $nt;
}
sub trimall {
   my ($ln) = shift;
   chomp $ln;
   $ln =~ s/\r$//;
   $ln =~ s/\t/ /g;
   while ($ln =~ /\s\s/) {
      $ln =~ s/\s\s/ /g;
   }
   while ($ln =~ /^\s/) {
      $ln = substr($ln,1);
   }
   while ($ln =~ /\s$/) {
      $ln = substr($ln,0, length($ln) - 1);
   }
   return $ln;
}
# eof - cleantd01.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional