#!/usr/bin/perl -w
# NAME: listhrefs.pl
# AIM: Given a html file, output a list of hrefs
# 22/12/2013 geoff mclane http://geoffair.net/mperl
#
# Scans the input file character by character, collects the href attribute of
# every <AREA ...> tag, and prints the sorted list through prt().  With -o the
# same list is also written to the named file.
#
# NOTE(review): prt(), open_log(), close_log(), trim_all(), strip_quotes(),
# get_attr_refhash(), get_html_refarray() and get_whole_tag_array() are not
# defined in this file; they are presumably supplied by lib_utils.pl and
# lib_html.pl - confirm against those libraries.
use strict;
use warnings;
use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use Cwd;

my $os = $^O;
my $perl_dir = '/home/geoff/bin';
my $PATH_SEP = '/';
my $temp_dir = '/tmp';
# Match only real Windows-style platforms.  A bare /win/i test also matches
# 'darwin' (macOS), which would wrongly select the C:\ paths below.
if ($os =~ /^(?:MSWin|cygwin)/i) {
    $perl_dir = 'C:\GTools\perl';
    $temp_dir = $perl_dir;
    $PATH_SEP = "\\";
}
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
require 'lib_html.pl' or die "Unable to load 'lib_html.pl' Check paths in \@INC...\n";

# log file stuff
our ($LF);
my $pgmname = $0;
if ($pgmname =~ /[\\\/]/) {
    # keep only the last path component, whichever separator was used
    my @tmpsp = split(/[\\\/]/, $pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt";
open_log($outfile);

# user variables
my $VERS = "0.0.1 2013-12-22";
my $load_log = 0;
my $in_file = '';
my $verbosity = 0;
my $out_file = '';

# ### DEBUG ###
my $debug_on = 1;
my $def_file = 'F:\data\DEM\map.html';

### program variables
my @warnings = ();
my $cwd = cwd();

# Verbosity predicates: true when $verbosity has reached the given level.
sub VERB1() { return $verbosity >= 1; }
sub VERB2() { return $verbosity >= 2; }
sub VERB5() { return $verbosity >= 5; }
sub VERB9() { return $verbosity >= 9; }

# Print every warning collected by prtw(), or (only at verbosity 9) a note
# that there were none.  The argument is accepted but not used.
sub show_warnings($) {
    my ($val) = @_;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
        foreach my $itm (@warnings) {
            prt("$itm\n");
        }
        prt("\n");
    } else {
        prt( "\nNo warnings issued.\n\n" ) if (VERB9());
    }
}

# Print an optional final message, show the collected warnings, close the log
# and terminate the process with exit status $val.
sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if ($msg !~ /\n$/);
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}

# Print a warning immediately and remember it for the summary at exit.
sub prtw($) {
    my ($tx) = shift;
    $tx =~ s/\n$//;
    prt("$tx\n");
    push(@warnings,$tx);
}

# Read all lines of $inf and return them as a list.
# Exits through pgm_exit(1, ...) when the file cannot be opened.
sub _read_lines {
    my ($inf) = @_;
    my $fh;
    # three-argument open: the file name can never be read as a mode
    if (!open($fh, '<', $inf)) {
        pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
    }
    my @lines = <$fh>;
    close($fh);
    return @lines;
}

# Write the href list (array ref), one per line, to $file.
# Returns 1 on success; on failure records a warning and returns 0 so the
# listing that was already printed is not lost.
sub _write_hrefs_file {
    my ($file,$rhrefs) = @_;
    my $fh;
    if (!open($fh, '>', $file)) {
        prtw("WARNING: Unable to create out file [$file]: $!\n");
        return 0;
    }
    print {$fh} join("\n",@{$rhrefs})."\n";
    if (!close($fh)) {
        prtw("WARNING: Failed closing out file [$file]: $!\n");
        return 0;
    }
    prt("Written ".scalar(@{$rhrefs})." hrefs to [$file]\n");
    return 1;
}

# Scan $inf for <AREA ...> tags and list their href attribute values.
#
# The file is walked one character at a time with a small state machine:
#   $intag - currently between '<' and '>'
#   $hadsp - whitespace seen inside the tag, so the tag name is complete and
#            following characters belong to the attribute text
# Quotes are not tracked, so a '>' inside a quoted attribute value ends the
# tag early.  Sets $load_log = 1 before returning.
sub process_in_file($) {
    my ($inf) = @_;
    my @lines = _read_lines($inf);
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");

    my $intag = 0;
    my $hadsp = 0;
    my $tag   = '';
    my $attrs = '';
    my $txt   = '';
    my @hrefs = ();

    foreach my $line (@lines) {
        chomp $line;
        $line = trim_all($line);
        my $len = length($line);
        for (my $i = 0; $i < $len; $i++) {
            my $ch = substr($line,$i,1);
            my $show = 0;   # set when the current tag has just ended
            if ($intag) {
                if ($ch eq '>') {
                    $show = 1;
                } elsif ($ch eq '<') {
                    # The pending tag is flushed; this '<' itself is dropped.
                    prt("Got OPEN before CLOSE!\n");
                    $show = 1;
                } elsif ($hadsp) {
                    $attrs .= $ch;
                } elsif ($ch =~ /\s/) {
                    $hadsp = 1;
                } else {
                    $tag .= $ch;
                }
            } elsif ($ch eq '<') {
                $intag = 1;
                $hadsp = 0;
            } else {
                $txt .= $ch;
            }

            next if (!$show);

            # got a CLOSE
            if ($tag =~ /^\//) {
                # just a close tag
            } elsif ($tag =~ /^AREA$/i) {
                my $msg = '';
                $msg .= "$txt " if (length($txt));
                $msg .= "<$tag";
                if (length($attrs)) {
                    $msg .= " $attrs";
                    my $rha = get_attr_refhash($attrs);
                    foreach my $key (keys %{$rha}) {
                        # exact name only, so 'hreflang' is not taken as href
                        if ($key =~ /^href$/i) {
                            my $val = strip_quotes(${$rha}{$key});
                            push(@hrefs,$val);
                        }
                    }
                }
                $msg .= ">\n";
                prt($msg) if (VERB5());
            }
            $txt = '';
            $tag = '';
            $intag = 0;
            $attrs = '';
            $hadsp = 0;
        }

        # A line break counts as whitespace for whatever is being collected.
        if ($intag) {
            if ($hadsp) {
                $attrs .= ' ' if (length($attrs));
            } elsif (length($tag)) {
                # The line ended right after the tag name: the name is
                # complete, so what follows is attribute text.
                $hadsp = 1;
            }
        } else {
            $txt .= ' ' if (length($txt));
        }
    }

    my $cnt = scalar @hrefs;
    if ($cnt) {
        @hrefs = sort @hrefs;
        prt(join("\n",@hrefs)."\n");
        prt("Listed $cnt AREA hrefs in $inf.\n");
        # honour the -o option
        _write_hrefs_file($out_file,\@hrefs) if (length($out_file));
    } else {
        prt("No AREA referecnes found in $inf.\n");
    }
    $load_log = 1;
}

# Alternative implementation built on get_html_refarray() and
# get_whole_tag_array().  It is not called from the main flow below.
# NOTE(review): the _FAILED suffix suggests this approach was abandoned -
# confirm before reviving or removing it.
sub process_in_file_FAILED($) {
    my ($inf) = @_;
    my @lines = _read_lines($inf);
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");
    my $text = join("",@lines);
    my $ra = get_html_refarray($text);
    my $opts = -1;
    my $rarea = get_whole_tag_array($ra,'area',$opts);
    my $cnt = scalar @{$rarea};
    for (my $i = 0; $i < $cnt; $i++) {
        my $rha = ${$rarea}[$i][2]; # get attribute ref hash
        foreach my $key (keys %{$rha}) {
            if ($key =~ /^href$/i) {
                my $item = ${$rha}{$key};
                prt("$item\n");
            }
        }
    }
}

#########################################
### MAIN ###
parse_args(@ARGV);
process_in_file($in_file);
pgm_exit(0,"");
########################################

# Exit with an error unless at least one argument follows option $arg.
# Called with the remaining argument list, whose first element is the option.
sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}

# Parse the command line into $in_file, $out_file, $verbosity and $load_log.
# Exits through pgm_exit() on help, on an unknown option, or when no usable
# input file results.
sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            # strip every leading '-' so -x and --x are treated alike
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^v/) {
                # \D* rather than .* so a multi-digit level such as -v12 is
                # captured whole instead of only its last digit
                if ($sarg =~ /^v\D*(\d+)$/) {
                    $verbosity = $1;
                } else {
                    # -v, -vv, -vvv ... : one level per 'v'
                    while ($sarg =~ /^v/) {
                        $verbosity++;
                        $sarg = substr($sarg,1);
                    }
                }
                prt("Verbosity = $verbosity\n") if (VERB1());
            } elsif ($sarg =~ /^l/) {
                if ($sarg =~ /^ll/) {
                    $load_log = 2;
                } else {
                    $load_log = 1;
                }
                prt("Set to load log at end. ($load_log)\n") if (VERB1());
            } elsif ($sarg =~ /^o/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                $out_file = $sarg;
                prt("Set out file to [$out_file].\n") if (VERB1());
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input to [$in_file]\n") if (VERB1());
        }
        shift @av;
    }

    if ($debug_on) {
        prtw("WARNING: DEBUG is ON!\n");
        if (length($in_file) == 0) {
            $in_file = $def_file;
            prt("Set DEFAULT input to [$in_file]\n");
        }
    }
    if (length($in_file) == 0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (! -f $in_file) {
        pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    }
}

# Print the usage summary.
sub give_help {
    prt("$pgmname: version $VERS\n");
    prt("Usage: $pgmname [options] in-file\n");
    prt("Options:\n");
    prt(" --help (-h or -?) = This help, and exit 0.\n");
    prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n");
    prt(" --load (-l) = Load LOG at end. ($outfile)\n");
    prt(" --out (-o) = Write output to this file.\n");
}

# eof - listhrefs.pl