wordindex01.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:55:00 2010 from wordindex01.pl 2007/08/22 5 KB.

#!/perl -w
# NAME: wordindex01.pl
# AIM: Given a FILE, load it in WORD, extract the text, and build an alphabetic
# index of words ...
# Uses Word OLE engine
# see http://www.ngbdigital.com/perl_ole_word.html
# 21/08/2007 geoff mclane - http://geoffair.net/mperl
#
use strict;
use warnings;
use Win32::OLE;
use Win32::OLE qw(in with);
use Win32::OLE::Variant;
use Win32::OLE::Const 'Microsoft Word'; 
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
# log file stuff
my ($LF);
my $outfile = 'temp.'.$0.'.txt';
if ($0 =~ /\w{1}:\\.*/) {
   my @tmpsp = split(/\\/,$0);
   $outfile = 'temp.'.($tmpsp[-1]).'.txt';
}
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );
my $in_file = 'C:\Documents and Settings\Geoff McLane\My Documents\Tidy\Php-01.doc';
my @common = qw( am as be br but by can do eof etc for from get got has
have hi if in it its may my no not now of or re see so some an on such 
sure at to too us is was with you );
sub in_common {
   my ($tx) = shift;
   foreach my $t (@common) {
      if ($t eq $tx) {
         return 1;
      }
   }
   return 0;
}
# debug
my $dbg1 = 0;   # show stored value
my $dbg2 = 0;   # show REPEATED words
my $dbg3 = 0;   # show progress each 100 words
my $dbg9 = 1;   # show actions sent to prtv9 ...
my %distinct = (); # TO HOLD THE FINAL LIST
enumerate_doc( $in_file );
my $wcnt = keys( %distinct );
prt( "Showing sorted output per $wcnt HASH keys ... and the count for each ...\n" );
my $cnt = 0;
foreach my $wort (sort keys %distinct){
   $cnt++;
   if ($cnt < 10) {
      prt("  $cnt ");
   } elsif ($cnt < 100) {
      prt(" $cnt ");
   } else {
      prt("$cnt ");
   }
   prt( "[$wort] $distinct{$wort}\n" ); 
} 
prt( "Done $cnt output of sorted keys, with count ...\n" );
close_log($outfile,1);
exit(0);
#####################################
###### subs
sub Quit {
   my( $Obj ) = @_;
   $Obj->Quit();
}
sub enumerate_doc {
   my ($infile) = shift;
   my $wdcnt = 0;
   my $lcword = '';
   my $newcnt = 0;
   # a stepped approach to openning, or loading Microsoft Word
   prt( "Attaching to Word application ...\n" );
   my $Word = Win32::OLE->GetActiveObject('Word.Application');
   if ($Word) {
      prt( "Using existing running Word application ...\n" );
   } else {
      prt( "Starting NEW Word application ...\n" );
      $Word = Win32::OLE->new('Word.Application', 'Quit'); 
      if ($Word) {
         prt("New application running ...\n");
      } else {
         mydie( "ERROR: Failed to load Word application ...\n" );
      }
   }
   $Word->{'Visible'}     = 0;
   $Word->{DisplayAlerts} = 0;
   # Load the application with the document
   prt( "Openning document $infile ...\n" );
   my $Doc = $Word->Documents->Open($infile) 
   || mydie("Unable to open [$infile] document!\nError: ". Win32::OLE->LastError() . "\n"); 
   prt( "Getting contents of the ActiveDocument ... wait ...\n" );
   my $myRange = $Word->ActiveDocument->Content; 
   # Collections - Characters Words Sentences Paragraphs Sections HeadersFooters 
   foreach my $word (in $myRange->Words){ 
      $wdcnt++;
      if (($wdcnt % 100) == 0) {
         prt( "Processed $wdcnt words ...\n" ) if ($dbg3);
      }
      $lcword = lc($word->{Text});   # extract the 'word', in lowercase
      # try to trim it up a bit
      chomp $lcword; # remove trailing \n char, if any ...
      $lcword = replace_hibits($lcword);   # have seen 0xA0 in string - replace with SPACE
      $lcword = remove_quotes($lcword);   # remove any QUOTES, " or ' at begin/end
      $lcword = trim_all($lcword);      # trim it up
      $lcword = remove_quotes($lcword);   # remove any onner quotes
      $lcword = trim_all($lcword);      # and trim AGAIN
      ###next if not $lcword =~ m/^[a-z]{2,}/; # forget it if not start with 2 alpha 
      next if ( !($lcword =~ /^\w{2}/) );   # forget it if not start with 2 alphanumeric
      if ($lcword =~ /^\d+$/) {
         next if (length($lcword) < 4);   # dump numbers less than length 4
      }
      next if (in_common($lcword));      # exclude a bumch of 'common' words
      if (length($lcword) > 3) {         # tried to exclude plurals, but mainly FAILED
         if (substr($lcword,length($lcword)-1) eq 's') {
            my $tmp = substr($lcword,0,length($lcword)-1);
            next if (defined $distinct{$tmp} );
         }
      }
      # keep count of words collected
      if (defined $distinct{$lcword} ) {
         $distinct{$lcword}++;
         prtv9( "[$lcword] bumped count to $distinct{$lcword}...\n" ) if ($dbg2);
      } else {
         prtv9( "[$lcword] stored ...\n" ) if ($dbg1);
         $distinct{$lcword} = 1;
         $newcnt++;
      }
   } 
   prt( "Processed $wdcnt words ... collected $newcnt ...\n" );
}
sub prtv9 {
   my ($txt) = shift;
   prt( "$txt" ) if ($dbg9);
}
sub remove_quotes {
   my ($tx) = shift;
   $tx =~ s/^'//; # remove any beginning single quotes
   $tx =~ s/^"//g; # remove any beginning double quotes
   $tx =~ s/'$//; # remove any ending single quotes
   $tx =~ s/"$//g; # remove any ending double quotes
   return $tx;
}
sub replace_hibits {
   my ($tx) = shift;
   my $mx = length($tx);
   my $ntx = '';
   my ($ch, $val);
   for (my $i = 0; $i < $mx; $i++) {
      $ch = substr($tx,$i,1);
      $val = ord($ch);
      if ($val > 127) {
         $ch = ' ';
      }
      $ntx .= $ch;
   }
   return $ntx;
}
# eof - wordindex01.htm

index -|- top

checked by tidy  Valid HTML 4.01 Transitional