Generated: Tue Feb 2 17:54:26 2010 from cleantd01.pl 2006/10/24 5.6 KB.
#!/Perl # cleantd01.pl - 2006.10.24 - geoff mclane (geoffmclane.com) # AIM: To clean certain items from a HTML document ... # specifically target microsoft word 'filtered' output. # Search for 'tables', and remove 'style' attribute from <td>, # and '<p ...></p><o:p></o:p> and <span> from within ... # ===================================================================== use strict; require 'logfile.pl' or die "ERROR: Can NOT load logfile.pl ...\n"; require 'htmltools.pl' or die "ERROR: Can NOT load htmltools.pl ...\n"; # log file stuff my ($LF); my $outfile = 'temp.'.$0.'.txt'; open_log($outfile); prt( "$0 ... Hello, World ...\n" ); # user variable my $def_input = 'C:\HOMEPAGE\P26\travel\tunisfr2.htm'; my $def_output = 'temptunis.htm'; # debug my $dbg1 = 1; # show length after adjustments # program variables my $line = ''; my $word = 0; my @lines = (); my $cnt = 0; my $txt = ''; my $ccnt = 0; my $newtxt = ''; my $in_file = $def_input; my $out_file = $def_output; $in_file = pop @ARGV if (@ARGV); $out_file = pop @ARGV if (@ARGV); prt( "Got input from [$in_file], output to [$out_file] ...\n" ); if ( ! -f $in_file) { mydie("OOPS: Can NOT locate [$in_file] ...\n"); } open IF, "<$in_file" or mydie("OOPS: Can NOT open [$in_file] ...\n"); @lines = <IF>; # slurp it all in close IF; $word = check_for_word(); $cnt = scalar @lines; prt("Processing $cnt lines from [$in_file] ... " . ($word ? 'is word' : 'not word') . "\n"); $txt = join("\n", @lines); $ccnt = length($txt); prt("Or $ccnt characters from [$in_file] ...\n"); $newtxt = make_adjustments( $txt ); $ccnt = length($newtxt); write_out_file($newtxt, $out_file); system($out_file); close_log($outfile,1); exit(0); # ############################################### # all subs below # ############## sub short_text { my ($tx, $len) = @_; my $ln = length($tx); my $ntx = $tx; if ($ln > ($len + 3)) { my $hl = int( $len / 2 ); $ntx = substr($tx,0,$hl); $ntx .= '...'; $hl = $len - $hl; $ntx .= substr($tx, $ln - $hl); } return $ntx; } sub write_out_file { my ($tx, $fil) = @_; open OF, ">$fil" or mydie("YEEK! Can NOT create [$fil] ...\n"); print OF $tx; close OF; prt("Written " . length($tx) . " characters to [$fil]...\n"); } sub check_for_word { my $lc = scalar @lines; prt( "Processing $lc lines ... seeking MS Word meta ...\n" ); my $isword = 0; my ($cont); foreach $line (@lines) { chomp $line; $line =~ s/\r$//; ## <meta name="Generator" content="Microsoft Word 10 (filtered)"> if ($line =~ /<meta\s+name="?Generator"?\s+?content="?(.*)"?>/i) { $cont = $1; if ($cont =~ /Microsoft/i) { prt( "Found [$cont] [$line] ...\n" ); if ($cont =~ /Word/i) { $isword = 1; prt( "Found WORD signature ...\n" ); last; } } } } return $isword; } sub get_tag { my ($t) = shift; my $m = length($t); my ($j, $c); my $tg = ''; for ($j = 0; $j < $m; $j++) { $c = substr($t,$j,1); if ($c eq '<') { $tg = $c; $j++; for ( ; $j < $m; $j++) { $c = substr($t,$j,1); ##if (($c eq "\n")||($c eq "\r")) { ## $c = ' '; ##} $tg .= $c; if ($c eq '>') { last; } } last; } } return $tg; } sub trim_tail { my ($ln) = shift; while ($ln =~ /\s$/m) { $ln = substr($ln,0, length($ln) - 1); } return $ln; } sub del_td_style { my ($td) = shift; my $mx = length($td); my ($j, $c, $d); my $ntd = ''; my $hds = 0; my $ss = ''; $d = ''; for ($j = 0; $j < $mx; $j++) { $c = substr($td,$j,1); if ($hds && ($c =~ /s/i) && (($mx - $j) > 7)) { $ss = substr($td,$j); # get balance if ($ss =~ /^style=(.*)/) { $j += 6; $d = substr($td,$j,1); # get " or ' if (($d eq '"')||($d eq "'")) { $j++; } else { $d = ' '; } for ( ; $j < $mx; $j++) { $c = substr($td,$j,1); if (($c eq $d)||($c eq '>')) { last; } } } } if ($c =~ /\s/) { $hds = 1; } else { $hds = 0; } if ($c ne $d) { if ($c eq '>') { $ntd = trim_tail($ntd); } $ntd .= $c; } $d = ''; } return $ntd; } sub make_adjustments { my ($tx) = shift; my $tl = length($tx); my ($i); my $ch = ''; my $nt = ''; my $tag = ''; my $att = ''; my $tgl = ''; my $intd = 0; my $ntag = ''; for ($i = 0; $i < $tl; $i++) { $ch = substr($tx,$i,1); if ($ch eq '<') { $tag = get_tag( substr($tx,$i) ); $i += (length($tag) - 1) if (length($tag)); $tgl = $tag; $tgl =~ s/\n/ /g; $tgl =~ s/\r/ /g; if ($tgl =~ /<td(.*)>/im) { $intd = 1; $att = $1; prt( "TD tag [$att] [$tag]...\n" ); if ($tgl =~ /<td\s+?(.+)>/im) { $att = $1; if ($tgl =~ /style=/i) { prt( "Is TD with STYLE attrib [$att] ...\n" ); $tag = del_td_style($tag); prt( "New tag [$tag]\n" ); } else { prt( "Is TD with attrib [$att] ...\n" ); } } else { prt( "Is simple TD tag ...\n" ); } } elsif ((length($tag) > 4)&&(substr($tag,0,4) eq '<!--')) { prt( "Got comment ...\n" ); } elsif ($tag =~ /<\/td>/) { prt( "Close TD [$tag]\n" ); $intd = 0; } else { prt( "other tag [$tag] ...\n" ); } $nt .= $tag; } else { $nt .= $ch; } } $tl = length($nt); prt("Now returning $tl characters ...\n") if $dbg1; return $nt; } sub trimall { my ($ln) = shift; chomp $ln; $ln =~ s/\r$//; $ln =~ s/\t/ /g; while ($ln =~ /\s\s/) { $ln =~ s/\s\s/ /g; } while ($ln =~ /^\s/) { $ln = substr($ln,1); } while ($ln =~ /\s$/) { $ln = substr($ln,0, length($ln) - 1); } return $ln; } # eof - cleantd01.pl