Generated: Tue Feb 2 17:54:57 2010 from stripms5.pl 2005/11/23 16.3 KB.
#!/Perl ## ## geoffair _at_ hotmail _dot_ com ## just looking for a way to clean up 'complicated' HTML files, and build ## a 'simpler' version. Naturally, some of the 'look and feel' MAY BE lost, ## but the information is ALL there ... plus some ... ## use HTML::Parser (); use Data::Dump (); ### use URI::URL; ### use APR::URI (); my $program = "stripms5"; ### 2005.05.11 - 2005.07.25 clean up mainly ... ## user feature variables ## my $dodebug = 1; my $bf = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/"; my $definp = "$bf/Uwe/Uwe-13cg.htm"; my $defout = "$bf/Uwe/temphtm3.htm"; # $definp = "$bf/Louis/lou-031-bps01.htm"; # $defout = "temphtm3.htm"; ##my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Russel/Russ-29-sc01.htm"; ##my $defout = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Russel/temphtm2.htm"; ###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Wednesday.htm"; ###my $defout = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/temphtml.htm"; ###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/moon-01.htm"; ###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Russel/Russ-04.htm"; ###my $defout = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Russel/temphtml.htm"; ##my $definp = "C:/HOMEPAGE/P26/compgr.htm"; ##my $defout = "C:/HOMEPAGE/P26/temphtml.htm"; my $deflog = "temphtml.txt"; # output log file ... more if $dodebug = 1! my $defskip = "tempskip.txt"; # view what has been REJECTED, DELETED, CHOPPED my $WEBVERS = "P26.2005.07.25"; my $addcode = "<!-- $WEBVERS - geoffmclane.com - stripms.pl -->"; my $clearhtml = 1; # clear HTML attributes ## paragraph handling my $clearop = 1; # clear MS o:p paragraph thingy my $clearpatts = 0; # clear P paragraph attributes my $clearplang = 1; # modify P lang attribute my $defnorm1 = 'MsoNormal'; my $defnorm2 = 'MsoPlainText'; my $postpara = 1; # handle paragraphs post </p>, to allow delete of only para my @paraarr = (); my $innsep = 0; # got C '<!', [ ], in not support empty paras = '<![if !supportEmptyParas]>' my $nsepif = '<![if !supportEmptyParas]>'; my $delpara = 0; my $clearhstyl = 1; # no SYTLE statment in head - include through file, if required ... *TBD* my $cleartdsty = 0; # clear TD attributes my $fiximg = 1; # modify the IMG tag my $clearhlink = 1; # clear a LINK REL statement my $clearspan = 1; # remove all SPAN tags my $cleardiv = 1; # remove all DIV tags my $clearmeta = 1; # remove META (head) tag ## BODY actions my $clearbsyle = 0; # no BODY attributes my $fixblstyle = 1; # modify body language, if given my $deflang = 'en-au'; # use English (Austrlian) my $delayclose = 1; # close after PARSE is DONE ie no E body, or html ... ## A tag HREF hyperlink my $keeplinks = 1; # store links found my @links; # store of links my $actlink = ""; my $act1 = ""; ## program variables ## my $WHITE_PATTERN2 = "^[ \t\n\r]*\$"; # spacey if ($var =~ /$WHITE_PATTERN2/o ) { ...} my ($FH, $HH, $CH); # run log, html and strip log ... my $doout = 1; # do the OUTPUT, but can be off'ed ... my $inpfil = ""; my $subok = 0; my $msg = ""; # used to build a message, for multiple output my $inhtml = 0; # in document my $inpara = 0; # in paragraph tag my $inhead = 0; # processing header my $inbody = 0; # body processing my $instyle = 0; # style processing my $start_time = time(); open $HH, ">$defout" or die "No HTML output file ... [$defout]!\n"; open $FH, ">$deflog" or die "No OUT LOG file ...\n"; open $CH, ">$defskip" or die "No SKIP file ...\n"; # create the parser my $p = HTML::Parser->new(api_version => 3); # set the default function handler $p->handler(default => \&hand, "event, line, column, text, tagname, attr"); # $p->parse_file(@ARGV ? shift : die "No input given ....\n"); parse_args(@ARGV); # if we did NOT get an INPUT file, what to DO ... if ( !length($inpfil) ) { if ($dodebug) { $inpfil = $definp; } else { die "No input file given ...\n"; } } $msg = "$program: Started on " . localtime($start_time) ; ### . " in $cwdir ...\n" if $shwtm; print "$msg\n"; print $FH "$msg\n"; $p->parse_file($inpfil); # do the ACTION my $cnt = @links; if ($cnt > 0) { my %links1 = (); $msg = "Found $cnt hyperlinks ..."; prt2 ("$msg\n"); $i = 0; foreach $actlink (@links) { $i++; $msg = "$i [$actlink]"; prt2 ( "$msg\n" ); local $lnk = $actlink; $lnk =~ s,^http://,,i; # remove HTTP:// start, if any $msg = "$i [$lnk]"; prt2 ("$msg\n"); ## split URL from HREF of A tag local @ar = split(/\?/, $lnk); local $param = $ar[1]; # parameter, if any local @ar1 = split(/\//, $ar[0]); local $ar1cnt = @ar1; # get count of items $act1 = uc($ar1[0]); # always a first local $page = $ar1[1]; # if more pages ... chomp $page; if (length($page)) { local $i2 = 0; for ($i2 = 2; $i2 < $ar1cnt; $i2++) { chomp $ar1[$i2]; if (length($ar1[$i2])) { $page .= "/$ar1[$i2]"; } } } ## combine the value $msg = "$page"; if (length($param)) { $msg .= "?$param"; } if ( exists $links1{$act1} ) { local $val2 = $links1{$act1}; chomp $val2; if ( length($val2) ) { if ( $val2 =~ "/$msg/" ) { ## already exists in message == a REPEAT param prt2 ("Avoided adding $msg\n"); } else { $links1{$act1} .= " " . $msg; ### ADD "$page?$param"; ## $ar[1]; } } else { $links1{$act1} = $msg; ### ADD "$page?$param"; ## $ar[1]; } } else { $links1{$act1} = $msg; ### "$page?$param"; ## $ar[1]; } ### log message $msg = "$i "; if ($act1 eq $actlink) { $msg .= ' same'; } else { $msg .= " site[$act1]"; } prt2( "$msg\n"); } $msg = "lISTED $cnt hyperlinks ..."; prt2 ( "$msg\n"); my @urlkeys = %links1; # get the key list $cnt = @urlkeys / 2; # pairs key,value $msg = "Found $cnt DOMAINS ..."; prt2 ( "$msg\n" ); print $HH "<p>$msg<br>\n"; $i = 0; for my $url ( sort keys %links1 ) { $i++; $msg = "url[$url]"; if ( length( $links1{$url} ) ) { $msg .= "/$links1{$url}"; } prt2("$msg\n"); print $HH "Link $i: <A HREF=\"http://$url\"><b>$url</b></A><br>\n"; } print $HH "Listed $i DOMAINS</p>\n"; } ### end of if ($cnt) print $HH "</body>\n"; print $HH "</html>\n"; if ($cnt > 0) { # print @urlkeys; $msg = "List of $cnt DOMAINS ..."; prt2( "$msg\n" ); } $msg = "$program: Ending on " . localtime(time()); prt2( "$msg\n" ); close $FH; # log file output close $HH; # ouput HTML file close $CH; # log of discarded items my $bakname = getback($inpfil); # get back up name # print "\fFrom [$inpfil] got [$bakname] ... \n"; system $defout; # run the HTML file if ($subok) { open $HH, "<$defout" or die "No new HTML input file ... [$defout]!\n"; ##my $bakname = getback($inpfil); # get back up name open $FH, "<$inpfil" or die "No re-open of the source ... [$inpfil]!\n"; my @infil = <$HH>; # slurp the file, from the disk my @outfil = <$FH>; # slurp the original source close $FH; # log file output close $HH; # ouput HTML file open $HH, ">$defout" or die "No HTML input file ... [$defout]!\n"; open $FH, ">$inpfil" or die "No re-open of the source ... [$inpfil]!\n"; print $HH @outfil; print $FH @infil; close $FH; # log file output close $HH; # ouput HTML file } ## Event table ## ["S", $tag, $attr, $attrseq, $text] ## ["E", $tag, $text] ## ["T", $text, $is_data] ## ["C", $text] ## ["D", $text] ## ["PI", $token0, $text] sub hand { my($event, $line, $column, $text, $tagname, $attr) = @_; my $typ = uc(substr($event,0,1)); ## get TYPE my @d = "$typ L$line C$column"; #substr($text, 40) = "..." if length($text) > 40; push(@d, $text); push(@d, $tagname) if defined $tagname; push(@d, $attr) if $attr; my $otxt = Data::Dump::dump(@d); #print $FH Data::Dump::dump(@d), "\n"; #print Data::Dump::dump(@d), "\n"; print "$otxt\n"; # now process the data ... my $locout = 1; # one time only output flag my $i; my $tag = '*NO_TAG*'; if (defined $tagname) { $tag = uc($tagname); } # Event table ######################################################################## if ($typ eq 'S') { # START OF TAG, and possible ATTRIBUTES ## ["S", $tag, $attr, $attrseq, $text] if ($tag eq 'HTML') { $inhtml = 1; if ($clearhtml) { $text = '<html>'; } if (defined $addcode) { ## = "<!-- $WEBVERS - geoffmclane.com - stripms.pl -->"; $text .= "\n"; $text .= "$addcode"; } } elsif ($tag eq 'A') { # a hyperlink - collect, at least ... if ($keeplinks) { my %att = %$attr; # copy the HASH, to do modifications $i = 0; prt( "Checking A attribs ...\n" ); foreach $key (keys %att) { prt ( "Checking A attrib $key ...\n" ); if ($key eq 'href') { prt ( "Found and saving $key=$att{$key} ...\n" ); push(@links, $att{$key}); } } } } elsif ($tag eq 'P') { $inpara = 1; @paraarr = (); # clear paragraph accumulator if ($clearpatts) { print $FH "Paragraph from $text to <$tagname> ...\n"; $text = "<$tagname>"; } elsif ($clearplang) { # modify P lang= attribute # use my $deflang = 'EN-AU'; # use English (Austrlian) my %att = %$attr; # copy the HASH, to do modifications $i = 0; prt( "Checking P attrib ...\n" ); foreach $key (keys %att) { prt ( "Checking attrib $key ...\n" ); if ($key eq 'lang') { prt ( "Found $key=$att{$key} ...\n" ); if ($att{$key} ne $deflang) { prt ( "Modifying $key=$att{$key} to [$deflang] ...\n" ); $att{$key} = $deflang; } $i++; } elsif ($key eq 'class') { ##my $defnorm1 = 'MsoNormal'; ##my $defnorm2 = 'MsoPlainText'; prt ( "Found $key=[" . $att{$key} . "]\n" ); if (($att{$key} eq $defnorm1)||($att{$key} eq $defnorm2) ) { prtd ( "Deleting $key=$att{$key} ...\n" ); #$att{$key} = $deflang; delete $att{$key}; # remove this MS reference $i++; } } } if ($i) { # ok, change output, re-run to build new HTML $text = "<$tagname"; # start tag again while (($key,$value) = each %att) { $text .= " $key=$value"; } $text .= '>'; # close tag } } } elsif ($tag eq 'HEAD') { $inhead = 1; } elsif ($tag eq 'BODY') { $inbody = 1; if ($clearbsyle) { # no BODY attributes $text = '<body>'; } elsif ($fixblstyle) { # modify body language, if given # use my $deflang = 'EN-AU'; # use English (Austrlian) my %att = %$attr; # copy the HASH, to do modifications $i = 0; prt( "Checking BODY attrib ...\n" ); foreach $key (keys %att) { prt ( "Checking attrib $key ...\n" ); if ($key eq 'lang') { prt ( "Found lang=$key ...\n" ); if ($att{$key} ne $deflang) { prt ( "Modifying 'lang=$att{$key} to [$deflang] ...\n" ); $att{$key} = $deflang; } $i++; } } if ($i) { # ok, change output, re-run to build new HTML $text = "<$tag"; # start tag again while (($key,$value) = each %att) { $text .= " $key=$value"; } $text .= '>'; # close tag } } } elsif ($tag eq 'STYLE') { $instyle =1; if ($clearhstyl) { # in head - close out S style to E sytle if ($inhead) { $doout = 0; # CLOSE output } } } elsif ($tag eq 'SPAN') { if ($clearspan) { # remove all SPAN tags $locout = 0; } } elsif ($tag eq 'O:P') { if ($clearop) { # clear MS o:p paragraph thingy $locout = 0; } } elsif ($tag eq 'LINK') { if ($clearhlink) { $locout = 0; } } elsif ($tag eq 'DIV') { if ($cleardiv) { $locout = 0; } } elsif ($tag eq 'TD') { if ($cleartdsty) { $text = '<td>'; } } elsif ($tag eq 'IMG') { if ($fiximg) { # modify the IMG tag my %att = %$attr; # copy the HASH, to do modifications $i = 0; foreach $key (keys %att) { if ($key eq 'v:shapes') { prtd ( "Deleting attrib $key ...\n" ); delete $att{$key}; # remove this MS reference $i++; } } if ($i) { $text = '<IMG'; # start IMG tag again while (($key,$value) = each %att) { $text .= " $key=$value"; } $text .= '>'; # close IMG tag } } } elsif ($tag eq 'META') { if ($clearmeta) { # remove META (head) tag $locout = 0; } } ######################################################################## } elsif ($typ eq 'E') { ## ["E", $tag, $text] if ($tag eq 'HTML') { if ($delayclose) { # close after PARSE is DONE ie no E body, or html ... $locout = 0; } else { $inhtml = 0; } } elsif ($tag eq 'P') { $inpara = 0; if ($postpara) { $locout = 0; # clear any output reqd push(@paraarr,$text); # do it all here $i = @paraarr; # count item count if ($delpara) { # if a DUMMY thing prtd ("Para components $i [ @paraarr ]\n"); prtd ("This NON-BLAMK-SPACE dummy paragraph has been deleted ...\n"); } else { # send em out, line by line ... $i = 0; foreach $msg (@paraarr) { $i++; prt ( "$msg\n" ); print $HH "$msg\n"; # out to HTML file } } } $delpara = 0; ####### E = clear DELETE OF PARAGRAPH ####### } elsif ($tag eq 'HEAD') { $inhead = 0; } elsif ($tag eq 'BODY') { if ($delayclose) { # close after PARSE is DONE ie no E body, or html ... $locout = 0; } else { $inbody = 0; } } elsif ($tag eq 'STYLE') { $instyle = 0; if ($clearhstyl) { # in head - close out S style to E sytle if ($inhead) { $doout = 1; # OPEN output $locout = 0; # but NOT for this style one } } } elsif ($tag eq 'SPAN') { if ($clearspan) { # remove all SPAN tags $locout = 0; } } elsif ($tag eq 'O:P') { if ($clearop) { # clear MS o:p paragraph thingy $locout = 0; } } elsif ($tag eq 'LINK') { if ($clearhlink) { $locout = 0; } } elsif ($tag eq 'DIV') { if ($cleardiv) { $locout = 0; } } } elsif ($typ eq 'T') { ## ["T", $text, $is_data] if ($text eq ' ') { if ($innsep) { $delpara = 1; prt ("Should DELETE this 'dummy' paragraph...\n"); } } } elsif ($typ eq 'C') { ## ["C", $text] $locout = 0; # toss all CODE if ($text eq $nsepif ) { #'<![if !supportEmptyParas]>' $innsep = 1; # got C '<!', [ ], in not support empty paras } else { $innsep = 0; } } elsif ($typ eq 'D') { ## ["D", $text] } elsif ($typ eq 'P') { ## ["PI", $token0, $text] } ### end event table ########################################################### if ($text =~ /$WHITE_PATTERN2/o) { print $CH "ws[$otxt]\n"; print $CH "ws[$text]\n"; } else { if ($doout && $locout) { if ($postpara && $inpara) { push(@paraarr, $text); # store paragraph components } else { print $FH "$otxt\n"; print $HH "$text\n"; } } else { print $CH "$otxt\n"; print $CH "$text\n"; } } } sub parse_args { my (@av) = @_; # get stack while (@av) { my $arg = uc($av[0]); if ($arg =~ /^-/) { if ($arg eq '-V') { print "Version: 0.0.2 - July 2005\n"; } elsif (($arg eq '-H') || ($arg eq '-?')) { die "stripms infile [options]\n"; } else { die "ERROR: Unknown option [$arg]\n"; } } else { if (length($inpfil)) { die "ERROR: Can not handle two input files ...\n"; } $inpfil = $arg; if ( !(-f $inpfil) ) { die "ERROR: Can NOT locate file [$inpfil] ...\n"; } } shift @av; } } sub prt { if ($dodebug) { print $FH @_; } } sub prt2 { if ($dodebug) { prt(@_); print @_; } } sub prtd { if ($dodebug) { prt(@_); print $CH @_; } } sub getback { local ($ff) = @_; $ff =~ s/\\/\//g; # sub/ensure *nix path separators ie c:\usr becomes c:/usr ... local $nf = getfn($ff); $nf .= '.bak'; return $nf; } sub dirname { # passed a path, './dir1/dir2/file.name' returns './dir1/dir2/ local ($file) = @_; local ($sub); ($sub = $file) =~ s,/+[^/]+$,,g; $sub = '.' if $sub eq $file; return $sub; } sub getfn { local ($ff) = @_; local $dn = dirname($ff); # get the directory/path name ... local $nf; ($nf = $ff) =~ s,^$dn,,; $nf =~ s,^/,,; return $nf; } # EOF