Generated: Sun Apr 15 11:46:20 2012 from getimg4htmurl.pl 2012/02/26 23.1 KB.
#!/usr/bin/perl -w # NAME: getimg4htmurl.pl # AIM: Read a HTML file, and get the <img, and fetch the image via HTTP, and save... # 23/02/2012 geoff mclane http://geoffair.net/mperl use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) use File::stat; use Cwd; use LWP::Simple; my $os = $^O; my $perl_dir = '/home/geoff/bin'; my $PATH_SEP = '/'; my $temp_dir = '/tmp'; if ($os =~ /win/i) { $perl_dir = 'C:\GTools\perl'; $temp_dir = $perl_dir; $PATH_SEP = "\\"; } unshift(@INC, $perl_dir); require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n"; require 'lib_css.pl' or die "Unable to load 'lib_css.pl' Check paths in \@INC...\n"; # log file stuff our ($LF); my $pgmname = $0; if ($pgmname =~ /(\\|\/)/) { my @tmpsp = split(/(\\|\/)/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt"; open_log($outfile); # user variables my $VERS = "0.0.1 2012-02-23"; my $load_log = 0; my $in_file = ''; my $verbosity = 0; my $out_dir = ''; ### program variables my @warnings = (); my $cwd = cwd(); sub VERB1() { return $verbosity >= 1; } sub VERB2() { return $verbosity >= 2; } sub VERB5() { return $verbosity >= 5; } sub VERB9() { return $verbosity >= 9; } # my @noclose = qw(img link meta); my @noclose = qw( meta link area base basefont br frame hr isindex param bgsound embed keygen img ); # tags which do NOT need a closing, like </p>, tag my @optclose = ( "body", "colgroup", "dd", "dt", "head", "html", "li", "optgroup", "option", "p", "tbody", "td", "tfoot", "th", "thead", "tr", "marquee" ); # my @optclose = qw(li); my @html_ext = qw( .htm .html .shtml .php ); my @graf_ext = qw( .jpg .jpeg .gif .png .bmp .ico .mpg .tif ); # debug my $debug_on = 1; my $def_file = 'C:\HOMEPAGE\GA\fg\release260.htm'; my $def_out = 'C:\GTools\perl\temp'; my $do_fetch = 0; # switch off file fetch my $dbg_css = 0; # 1,031 bridge.jpg my $test_url = "http://geoffair.org/images/bridge.jpg"; #my $test_url = "http://geoffair.org/images/spacer.gif"; my $test_out = $temp_dir.$PATH_SEP."tempimg.jpg"; sub show_warnings($) { my ($val) = @_; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { prt( "\nNo warnings issued.\n\n" ) if (VERB9()); } } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } show_warnings($val); close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } sub is_noclose_tag($) { my ($tag) = @_; my $lctag = lc($tag); my ($tst); foreach $tst (@noclose) { return 1 if ($tst eq $tag); } return 0; } sub is_optclose_tag($) { my ($tag) = @_; my $lctag = lc($tag); my ($tst); foreach $tst (@optclose) { return 1 if ($tst eq $tag); } return 0; } sub is_ext_html($) { my $ext = shift; $ext = lc($ext); my ($tst); foreach $tst (@html_ext) { return 1 if ($tst eq $ext); } return 0; } sub is_ext_graf($) { my $ext = shift; $ext = lc($ext); my ($tst); foreach $tst (@graf_ext) { return 1 if ($tst eq $ext); } return 0; } sub file_has_htm_ext($) { my $fn = shift; my ($nm,$dir,$ext) = fileparse($fn, qr/\.[^.]*/); if ($ext && length($ext)) { return 4 if ($ext eq '.css'); return 3 if (is_ext_graf($ext)); return 2 if (is_ext_html($ext)); return 1; } return 0; } sub fetch_url { my ($url,$fil) = @_; prt( "Fetching: $url...\n" ); my $img = get($url); my ($sb,$sz,$isz); if ($img) { $isz = length($img); prt("Writing $isz to $fil...\n"); if (open OF, ">$fil") { binmode OF; print OF $img; close OF; if ($sb = stat($fil)) { $sz = $sb->size; prt("Written $sz bytes to $fil\n"); } } else { pgm_exit(1,"ERROR: Unable to create file [$fil]\n"); } } else { prtw("WARNING: get of URL '$url' FAILED!\n"); } } sub get_attr_hash($) { my $ra = shift; my %hash = (); my ($itm,$key,$val,$len); foreach $itm (@{$ra}) { if ($itm =~ /^(\w+)=/) { $key = lc($1); $len = length($key) + 1; $val = substr($itm,$len); $val = strip_quotes($val); $hash{$key} = $val; } } return \%hash; } sub sanetise_filename($) { my $rn = shift; ${$rn} =~ s/\?/_/g; ${$rn} =~ s/\*/_/g; ${$rn} =~ s/&/&/g; ${$rn} =~ s/%3D/=/g; ${$rn} =~ s/:/_/g; } # $hash{'REF_HASH'} = $rhash; # $hash{'H_CLASSES'} = \%h_classes; # $hash{'H_NAMES'} = \%h_names; # $hash{'H_ELEMENTS'} = \%h_elements; # $hash{'H_ELECLASS'} = \%h_eleclass; # $hash{'H_ELEHASH'} = \%h_elehash; # $hash{'H_ELECOLON'} = \%h_elecolon; # $hash{'H_ELEPLUS'} = \%h_eleplus; # $hash{'H_OTHERS'} = \%h_others; sub check_class_list($$) { my ($rcc,$rclasses) = @_; if (!defined ${$rcc}{'REF_HASH'}) { prtw("WARNING: Has passed does NOT contain main ref hash!\n"); return; } my $rhash = ${$rcc}{'REF_HASH'}; my $rh_classes = ${$rcc}{'H_CLASSES'}; # = \%h_classes; begin with '.' my $rh_names = ${$rcc}{'H_NAMES'}; # = \%h_names; begin with '#' my $rh_elements = ${$rcc}{'H_ELEMENTS'}; # = \%h_elements; raw elements my $rh_eleclass = ${$rcc}{'H_ELECLASS'}; # = \%h_eleclass; element.class my $rh_elehash = ${$rcc}{'H_ELEHASH'}; # = \%h_elehash; element#class my $rh_elecolon = ${$rcc}{'H_ELECOLON'}; # = \%h_elecolon; element:class my $rh_eleplus = ${$rcc}{'H_ELEPLUS'}; # = \%h_eleplus; element+element my $rh_others = ${$rcc}{'H_OTHERS'}; # = \%h_others; NOT any of the above??? my ($class,$val,$cont,$rh2,$key2,$done,$val2); foreach $class (sort keys %{$rclasses}) { $val = ${$rclasses}{$class}; # this is only the count of useage if (defined ${$rh_classes}{$class}) { $class = '.'.$class; if (defined ${$rhash}{$class}) { $rh2 = ${$rhash}{$class}; $cont = ''; foreach $key2 (sort keys %{$rh2}) { $val = ${$rh2}{$key2}; $cont .= "$key2:$val;" } prt("$class { $cont }\n"); } else { prt("$class NOT FOUND! dot.class format\n"); } } elsif (defined ${$rh_names}{$class}) { $class = '#'.$class; if (defined ${$rhash}{$class}) { $rh2 = ${$rhash}{$class}; $cont = ''; foreach $key2 (sort keys %{$rh2}) { $val = ${$rh2}{$key2}; $cont .= "$key2:$val;" } prt("$class { $cont }\n"); } else { prt("$class NOT FOUND! hash.class format\n"); } } else { $done = 0; foreach $key2 (keys %{$rh_eleclass}) { $val = ${$rh_eleclass}{$key2}; foreach $val2 (@{$val}) { ###prt("Checking $key2.$val2\n"); if ($val2 eq $class) { $class = $key2.".".$class; if (defined ${$rhash}{$class}) { $rh2 = ${$rhash}{$class}; $cont = ''; foreach $key2 (sort keys %{$rh2}) { $val = ${$rh2}{$key2}; $cont .= "$key2:$val;" } prt("$class { $cont }\n"); } else { prt("$class NOT FOUND! element.class format\n"); } $done = 1; last; } } } if ($done == 0) { prt("$class NOT FOUND!\n"); } } } ### pgm_exit(1,"TEMP EXIT 2"); } sub process_html_file($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = <INF>; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($file_name,$file_dir) = fileparse($inf); my ($line,$inc,$lnn); my ($inquot,$intag,$i,$len,$ch,$qc,$tag,@arr,$ptag,$cnt,$rh,$val,$fn,$of,$res); my ($tmp,$msg,$tscnt,$ff,$nf,$rcss,$ra,$rceh); $lnn = 0; $intag = 0; $tag = ''; my @tagstack = (); my %classes = (); my %class_element = (); my %elements = (); my %element_list = (); my %local_jumps = (); my %local_names = (); my %local_id = (); my %file_srcs = (); my %file_hrefs = (); my %remote_srcs = (); my %remote_hrefs =(); my %css_hash = (); foreach $line (@lines) { chomp $line; $lnn++; $line = trim_all($line); $len = length($line); next if ($len == 0); $inquot = 0; for ($i = 0; $i < $len; $i++) { $ch = substr($line,$i,1); if ($intag) { $tag .= $ch; if ($inquot) { $inquot = 0 if ($ch eq $qc); prt("$lnn: End quote...($ch)\n") if (($inquot == 0) && VERB9()); } elsif (($ch eq '"')||($ch eq "'")) { $inquot = 1; $qc = $ch; prt("$lnn: Begin quote...($ch)\n") if (VERB9()); } else { if ($ch eq '>') { $intag = 0; $tag =~ s/^<(.*)>$/$1/; #@arr = split(/\s+/,$tag); @arr = space_split($tag); $cnt = scalar @arr; $rh = get_attr_hash(\@arr); $tag = $arr[0]; if ($tag =~ /^\//) { # CLOSE TAG ie </element> $tag = substr($tag,1); $tscnt = scalar @tagstack; if ($tscnt) { $ptag = $tagstack[-1]; # get last if (lc($tag) eq lc($ptag)) { $ptag = pop @tagstack; $tscnt = scalar @tagstack; prt("$lnn: Close tag [$tag] [$ptag]$tscnt...\n") if (VERB2()); } else { if (is_optclose_tag($ptag) && ($tscnt > 1)) { } prtw("WARNING:$lnn: Close tag [$tag] ne [$ptag], but NOT last on STACK!\n"); } } else { prtw("WARNING:$lnn: Close tag [$tag], but NONE on STACK!\n"); } } elsif ($tag =~ /^!/) { # SPECIAL TAG - <!DOCTYPE...> prt("$lnn: Special tag [$tag]...\n") if (VERB9()); } elsif (is_noclose_tag($tag)) { # ELEMENTS WHICH DO NOT HAVE A CLOSE prt("$lnn: Noclose tag [$tag]...\n") if (VERB5()); } else { # normal ELEMENT push(@tagstack,$tag); $element_list{$tag} = 1; $elements{$tag} = [ ] if (!defined $elements{$tag}); $ra = $elements{$tag}; push(@{$ra},$rh); # store the attribute hash for the element $msg = ''; foreach $tmp (@tagstack) { $msg .= ":" if (length($msg)); $msg .= $tmp; } $tscnt = scalar @tagstack; prt("$lnn: Push tag [$tag]$tscnt... $msg\n") if (VERB2()); if (VERB9()) { @arr = sort keys(%{$rh}); $msg = ''; foreach $tmp (@arr) { $val = ${$rh}{$tmp}; $msg .= ' ' if (length($msg)); $msg .= "$tmp=\"$val\""; } prt("$lnn: Attributes: $msg\n") if (length($msg)); } } if (defined ${$rh}{'src'}) { $val = ${$rh}{'src'}; if ($val =~ /^http:\/\//i) { @arr = split(/\//,$val); $fn = $arr[-1]; prt("$lnn: src = [$val] file [$fn]\n") if (VERB1()); $of = $out_dir.$PATH_SEP.$fn; fetch_url($val,$of) if ($do_fetch); $remote_srcs{$val} = $of; } else { $ff = $file_dir.$val; $nf = 1; if (-f $ff) { $msg = 'ok'; $nf = 0; } else { $msg = "NOT FOUND!"; } prt("$lnn: src = [$val] $msg\n") if ($nf || VERB1()); $file_srcs{$val} = $ff; } } if (defined ${$rh}{'href'}) { $val = ${$rh}{'href'}; if ($val =~ /^http:\/\//i) { @arr = split(/\//,$val); $fn = $arr[-1]; sanetise_filename(\$fn); $res = file_has_htm_ext($fn); if ($res == 0) { $fn .= ".htm"; } elsif ($res == 1) { $fn .= ".html"; } prt("$lnn: src = [$val] file [$fn]\n") if (VERB1()); $of = $out_dir.$PATH_SEP.$fn; fetch_url($val,$of) if ($do_fetch); $remote_hrefs{$val} = $of; } elsif ($val =~ /^\#/) { # local jump target $val = substr($val,1); $local_jumps{$val} = 1; } else { $ff = $file_dir.$val; $nf = 1; if (-f $ff) { $msg = 'ok'; $nf = 0; if (lc($tag) eq 'link') { $rcss = read_css_file(\%css_hash,$ff,$dbg_css); ### pgm_exit(1,"TEMP EXIT"); } } else { $msg = "NOT FOUND!"; } prt("$lnn: href = [$val] $msg\n") if ($nf || VERB1()); $file_hrefs{$val} = $ff; } } if (defined ${$rh}{'class'}) { $val = ${$rh}{'class'}; $val = strip_quotes($val); @arr = split(/\s+/,$val); foreach $tmp (@arr) { if (defined $classes{$tmp}) { $classes{$tmp}++; } else { $classes{$tmp} = 1; } $class_element{$tmp} = { } if (!defined $class_element{$tmp}); $rceh = $class_element{$tmp}; ${$rceh}{$tag} = 1; } } if (defined ${$rh}{'name'}) { $val = ${$rh}{'name'}; $val = strip_quotes($val); if (defined $local_names{$val}) { prtw("WARNING:$lnn: Attribute name [$val] REPEATED!\n"); } else { $local_names{$val} = 1; } } if (defined ${$rh}{'id'}) { $val = ${$rh}{'id'}; $val = strip_quotes($val); if (defined $local_id{$val}) { prtw("WARNING:$lnn: Attribute name [$val] REPEATED!\n"); } else { $local_id{$val} = 1; } } $tag = ''; # kill this tag } } } else { if ($ch eq '<') { $intag = 1; prt("$lnn: Begin tag...\n") if (VERB9()); $tag = $ch; } } } # for this line $tag .= ' ' if (length($tag)); if ($inquot) { prtw("WARNING:$lnn: Ended line in QUOTES!\n"); $inquot = 0; } } @arr = sort keys(%local_jumps); $tscnt = scalar @arr; if ($tscnt) { $msg = ''; foreach $tmp (@arr) { $msg .= ' ' if (length($msg)); $msg .= "$tmp"; if (! defined $local_names{$tmp}) { $msg .= " NOT FOUND!"; } } prt("Found $tscnt jumps: $msg\n"); } my $rcc = get_class_counts(\%css_hash); @arr = sort keys(%classes); $tscnt = scalar @arr; if ($tscnt) { $msg = ''; foreach $tmp (@arr) { $val = $classes{$tmp}; $msg .= ' ' if (length($msg)); $msg .= "$tmp($val)" } prt("Found $tscnt classes: $msg\n"); check_class_list($rcc,\%classes) if (VERB9()); } $tscnt = scalar @tagstack; if ($tscnt) { $msg = ''; foreach $tmp (@tagstack) { $msg .= ":" if (length($msg)); $msg .= $tmp; } prtw("WARNING: $lnn: End of file [$inf], with tag stack [$msg] NOT cleared.\n"); } else { prt("$lnn: End of file [$inf], with tag stack cleared.\n"); } my %hash = (); $hash{'INPUT_FILE'} = $inf; $hash{'CLASSES'} = \%classes; $hash{'ELEMENTS'} = \%elements; $hash{'CLASS_ELEMENT'} = \%class_element; $hash{'ELEMENT_LIST'} = \%element_list; $hash{'LOCAL_JUMPS'} = \%local_jumps; $hash{'LOCAL_NAMES'} = \%local_names; $hash{'LOCAL_ID'} = \%local_id; $hash{'FILE_SRCS'} = \%file_srcs; $hash{'FILE_HREFS'} = \%file_hrefs; $hash{'REMOTE_SRCS'} = \%remote_srcs; $hash{'REMOTE_HREFS'} = \%remote_hrefs; $hash{'CSS_HAHS'} = \%css_hash; return \%hash; } sub process_in_file($) { my $file = shift; my $ref_hash = process_html_file($file); } ######################################### ### MAIN ### #fetch_url($test_url,$test_out); #pgm_exit(1,"EOP"); parse_args(@ARGV); process_in_file($in_file); pgm_exit(0,""); ######################################## sub give_help { prt("$pgmname: version $VERS\n"); prt("Usage: $pgmname -o dir [options] in-file\n"); prt("Options:\n"); prt(" --help (-h or -?) = This help, and exit 0.\n"); prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n"); prt(" --load (-l) = Load LOG at end. ($outfile)\n"); prt(" --out <dir> (-o) = Write output to this directory.\n"); prt(" Read the input HTML page, and fetch any remote href or src items,\n"); prt(" and write them to the output directory.\n"); } sub need_arg { my ($arg,@av) = @_; pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av); } sub parse_args { my (@av) = @_; my ($arg,$sarg); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); if (($sarg =~ /^h/i)||($sarg eq '?')) { give_help(); pgm_exit(0,"Help exit(0)"); } elsif ($sarg =~ /^v/) { if ($sarg =~ /^v.*(\d+)$/) { $verbosity = $1; } else { while ($sarg =~ /^v/) { $verbosity++; $sarg = substr($sarg,1); } } prt("Verbosity = $verbosity\n") if (VERB1()); } elsif ($sarg =~ /^l/) { $load_log = 1; prt("Set to load log at end.\n") if (VERB1()); } elsif ($sarg =~ /^o/) { need_arg(@av); shift @av; $sarg = $av[0]; $out_dir = $sarg; prt("Set out directory to [$out_dir].\n") if (VERB1()); } else { pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n"); } } else { $in_file = $arg; prt("Set input to [$in_file]\n") if (VERB1()); } shift @av; } if ((length($in_file) == 0) && $debug_on) { $in_file = $def_file; prt("Set DEFAULT input to [$in_file]\n"); $out_dir = $def_out; $load_log = 1; #$verbosity = 1; } if (length($in_file) == 0) { pgm_exit(1,"ERROR: No input file found in command!\n"); } if (! -f $in_file) { pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n"); } if (length($out_dir) == 0) { pgm_exit(1,"ERROR: No output directory found in command!\n"); } if (! -d $out_dir) { pgm_exit(1,"ERROR: Unable to find output directory [$out_dir]!\n"); } } # eof - template.pl