Generated: Sun Aug 21 11:11:04 2011 from getfgxlist.pl 2011/07/09 15.7 KB.
#!/usr/bin/perl -w
# NAME: getfgxlist.pl
# AIM: Get google FGx list...
# 01/07/2011 geoff mclane http://geoffair.net/mperl
#
# Overview: fetches the FGx issue list page, walks the parsed HTML looking
# for 'detail...' links, and for each one fetches the issue page and prints
# the text found inside its <pre> elements.
#
# The helpers prt, open_log, close_log, write2file, trim_all, space_split,
# strip_quotes, get_YYYYMMDD, get_html_refarray, get_html_body_only and the
# get_tag_*_value / get_attr_no_value accessors are NOT defined in this file;
# they are expected to come from lib_utils.pl / lib_html.pl loaded below.
#
# NOTE: the sub prototypes are kept as they were so existing calls parse
# exactly as before.
use strict;
use warnings;
use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use LWP::Simple;
use Cwd;
my $perl_dir = 'C:\GTools\perl';
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' ... Check \@INC values...\n";
require 'lib_html.pl' or die "Unable to load 'lib_html.pl' ... Check \@INC values...\n";

# log file stuff
our ($LF);
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
    # keep only the last path component of the script name
    my @tmpsp = split(/(\\|\/)/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = $perl_dir."\\temp.$pgmname.txt";
open_log($outfile);

# user variables
my $load_log = 1;
my $in_file = '';
my $out_url2 = $perl_dir."\\tempurl2.txt";
my $out_url3 = $perl_dir."\\tempurl3.txt";
my $use_html_lib = 1;
my $get_new_file = 1; # force to fetch a NEW file from the WEB
my $debug_on = 0;
my $def_file = 'def_file';

### program variables
my @warnings = ();
my $cwd = cwd();
my $os = $^O;
#my $TAG_NORM = 0;
#my $TAG_CLOSE = 1;
#my $TAG_CLOSED = 2;
#my $TAG_CLOSEA = 3;
#my $TAG_SPECIAL = 4;
#my $TAG_COMMENT = 5;
#my $TAG_TEXT = 6;
#my $ATT_NV = '<no_value>';
my $git = 'http://code.google.com';
my $fgx = '/p/fgx/issues/list';
my $fgx_base = '/p/fgx/issues/';
my $chk_page = 'detail?id=42';

# debug
my $dbg_01 = 0; # show tags as decoded
my $dbg_02 = 0; # extra debug output

# show_warnings($val)
# Print every message collected in @warnings (if any).
# $val is accepted for call compatibility but is not used.
sub show_warnings($) {
    my ($val) = @_;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
        foreach my $itm (@warnings) {
            prt("$itm\n");
        }
        prt("\n");
    } else {
        ###prt( "\nNo warnings issued.\n\n" );
    }
}

# pgm_exit($val,$msg)
# Print $msg (newline-terminated if it is not already), show any collected
# warnings, close the log and exit the program with status $val.
sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}

# prtw($tx)
# Print a warning line now and remember it in @warnings so that
# show_warnings() can repeat it at exit.
sub prtw($) {
    my ($tx) = shift;
    $tx =~ s/\n$//;
    prt("$tx\n");
    push(@warnings,$tx);
}

# process_in_file($inf)
# Read file $inf and print, with its line number, the remainder of every
# line that matches '# include <something>'. Exits via pgm_exit(1,...) if
# the file can not be opened.
sub process_in_file($) {
    my ($inf) = @_;
    my $fh;
    if (! open($fh, '<', $inf)) {
        pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
    }
    my @lines = <$fh>;
    close $fh;
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");
    my ($line,$inc,$lnn);
    $lnn = 0;
    foreach $line (@lines) {
        chomp $line;
        $lnn++;
        if ($line =~ /\s*#\s*include\s+(.+)$/) {
            $inc = $1;
            prt("$lnn: $inc\n");
        }
    }
}

# get_attr_rhash($txt)
# Split $txt with space_split() and build a hash of attribute => value from
# the 'name=value' items. Items without '=' get the get_attr_no_value()
# marker, and a lone '/' item is skipped. Returns a reference to the hash.
sub get_attr_rhash($) {
    my $txt = shift;
    my %hash = ();
    my @arr = space_split($txt);
    my ($item,@arr2,$cnt,$att,$val,$j);
    foreach $item (@arr) {
        @arr2 = split("=",$item);
        $cnt = scalar @arr2;
        $att = trim_all($arr2[0]);
        $val = '';
        if ($cnt > 1) {
            if ($cnt > 2) {
                # the value itself contained '=' - put the pieces back together
                for ($j = 1; $j < $cnt; $j++) {
                    $val .= '=' if (length($val));
                    $val .= $arr2[$j];
                }
            } else {
                $val = $arr2[1];
            }
        } else {
            next if ($att eq '/');
            $val = get_attr_no_value(); # $ATT_NV;
        }
        $hash{$att} = $val;
    }
    return \%hash;
}

# get_url()
# Fetch the issue list page ($git.$fgx) and write it to $out_url2.
# Exits via pgm_exit(1,...) if the fetch fails.
sub get_url() {
    my $URL = $git.$fgx;
    my ($content);
    prt("Fetching content form [$URL]...\n");
    unless (defined ($content = get($URL))) {
        pgm_exit(1,"ERROR: could not get $URL\n");
    }
    write2file("$content\n",$out_url2);
}

# get_html()
# Make sure $out_url2 exists (fetching it again when $get_new_file is set),
# read it, and return whatever get_html_refarray() produces for its content.
sub get_html() {
    if ($get_new_file || (! -f $out_url2)) {
        get_url();
    }
    if (! -f $out_url2) {
        pgm_exit(1,"ERROR: No URL file [$out_url2]!\n");
    }
    my $fh;
    if (! open($fh, '<', $out_url2)) {
        pgm_exit(1,"ERROR: Failed to open file [$out_url2]!\n");
    }
    my @lines = <$fh>;
    close $fh;
    my $cnt = scalar @lines;
    my $content = join("",@lines);
    my $ra = get_html_refarray($content);
    prt("Done $cnt lines, from [$out_url2] file...\n");
    return $ra;
}

my @ignore_tags = qw( div script tr th td span u img small form input tbody option table pre );

# ignore_tag($tag)
# Return 1 if $tag BEGINS with (case-insensitive) any entry of @ignore_tags,
# else 0. Note this is a prefix match, so e.g. 'u' also matches 'ul'.
sub ignore_tag($) {
    my ($tag) = @_;
    my ($tt);
    foreach $tt (@ignore_tags) {
        return 1 if ($tag =~ /^$tt/i);
    }
    return 0;
}

my @ignore_text = qw( ID Type Status Priority Platform Branch Milestone Owner Area BlockedOn Enhancement Accepted High OSX Master 2.4.0 y...@sablonier.ch Defect All Medium p...@freeflightsim.org );

# 427: text [Summary + Labels]
# ignore_text($tag)
# Return 1 if the text $tag is exactly one of the fixed strings that are
# not wanted in the output, else 0.
sub ignore_text($) {
    my ($tag) = @_;
    my ($tt);
    return 1 if ($tag eq 'Summary + Labels');
    return 1 if ($tag eq 'My favorites');
    foreach $tt (@ignore_text) {
        return 1 if ($tt eq $tag);
    }
    return 0;
}

# remove_html_entities($txt)
# Return $txt with the basic HTML character entities decoded.
# FIX: the patterns previously matched the literal characters '"', '>' and
# '<' (and swapped the angle brackets) instead of the entities, so entities
# were never decoded and real angle brackets were corrupted.
sub remove_html_entities($) {
    my ($txt) = @_;
    $txt =~ s/\&quot;/"/gm;
    $txt =~ s/\&lt;/</gm;
    $txt =~ s/\&gt;/>/gm;
    # '&amp;' is decoded last so that '&amp;lt;' yields the text '&lt;'
    $txt =~ s/\&amp;/&/gm;
    return $txt;
}

# get_text_in_tag($ra,$txt)
# $ra  - reference to an array of [type, tag-or-text, attr-hash-ref, line]
#        records (this is how the elements are indexed below).
# $txt - tag name to look for, compared case-insensitively.
# Collect every text record found between an opening $txt tag and its
# closing tag, skipping 'No comment was entered for this change' texts.
# Issues a warning (via prtw) if no opening $txt tag was seen at all.
# Returns a reference to the array of collected records.
sub get_text_in_tag($$) {
    my ($ra,$txt) = @_;
    my ($cnt,$typ,$tag,$rha,$i,$lnn);
    $cnt = scalar @{$ra};
    #prt("get_text_in_tag: HTML ref array had $cnt items\n");
    my @html_array = ();
    my $intag = 0;
    my $tagcnt = 0;
    my $tag_close = get_tag_close_value();
    my $tag_text = get_tag_text_value();
    my $tag_norm = get_tag_normal_value();
    my %found = ();
    for ($i = 0; $i < $cnt; $i++) {
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        $rha = ${$ra}[$i][2];
        $lnn = ${$ra}[$i][3];
        $found{$tag} = 1;
        if ($intag) {
            # \Q..\E so the tag name is always matched literally
            if (($typ == $tag_close) && ($tag =~ /^\Q$txt\E$/i)) {
                $intag = 0;
                #prt("$lnn: End tag [$txt]\n");
            } elsif ($typ == $tag_text) {
                #next if ($tag eq '(No comment was entered for this change.)');
                next if ($tag =~ /No\s+comment\s+was\s+entered\s+for\s+this\s+change/i);
                push(@html_array,[$typ,$tag,$rha,$lnn]);
            }
        } else {
            if (($typ == $tag_norm) && ($tag =~ /^\Q$txt\E$/i)) {
                $intag = 1;
                #prt("$lnn: Begin tag [$txt]\n");
                $tagcnt++;
            }
        }
    }
    if ($tagcnt == 0) {
        # report how many distinct tag/text values were seen instead
        $tagcnt = scalar keys(%found);
        #pgm_exit(1,"ERROR: Tag [$txt] NOT found in array of $cnt items... $tagcnt tags...\n");
        prtw("WARNING: Tag [$txt] NOT found in array of $cnt items... $tagcnt tags...\n");
    }
    $cnt = scalar @html_array;
    #prt("get_text_in_tag: HTML returning $cnt items\n");
    return \@html_array;
}

# show_text_item($ra)
# $ra - reference to an array of records as returned by get_text_in_tag().
# Print the text of each record once (duplicates are dropped, both exact
# and after trim/lower-casing), with entities decoded, runs of white space
# collapsed, and a line break taken at the first space after a line has
# grown beyond $max characters.
sub show_text_item($) {
    my ($ra) = @_;
    my ($typ,$tag,$cnt,$i,$ttxt);
    $cnt = scalar @{$ra};
    #prt("HTML ref array had $cnt items\n");
    my $msg = '';
    my $max = 76;
    my $line = '';
    my ($len,$ch,$j);
    my %dupes = ();
    my %dupes2 = ();
    for ($i = 0; $i < $cnt; $i++) {
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        next if (defined $dupes{$tag});
        $dupes{$tag} = 1;
        $ttxt = lc(trim_all($tag));
        next if (defined $dupes2{$ttxt});
        $dupes2{$ttxt} = 1;
        $tag = remove_html_entities($tag);
        #prt("$lnn: text [$tag]");
        $len = length($tag); # get line length
        for ($j = 0; $j < $len; $j++) {
            $ch = substr($tag,$j,1); # get char
            if ($ch =~ /\s/) { # deal with spaces
                if ($ch =~ /\n/) {
                    $msg .= "$line\n" if (length($line)); # add this line
                    $line = ''; # and start again
                } else { # not a newline char
                    if ($line =~ /\s$/) {
                        # discard extra spaces
                    } elsif (length($line)) {
                        if (length($line) > $max) {
                            $msg .= "$line\n";
                            $line = '';
                        } else {
                            $line .= ' ';
                        }
                    }
                }
            } else {
                $line .= $ch;
            }
        }
        $msg .= "$line\n" if (length($line));
        $line = '';
    }
    $msg =~ s/\n$//;
    #prt("Content\n$msg\n");
    prt("$msg\n");
}

# process_page_content($content)
# Parse the HTML text $content, keep only the body part, and print the
# text found inside its 'pre' tags.
sub process_page_content($) {
    my ($content) = shift;
    my $ra = get_html_refarray($content);
    my $ra2 = get_html_body_only($ra);
    #show_html_refarray($ra2);
    #my $ra3 = drop_div_tag($ra2);
    #my $ra3 = drop_html_tags($ra2,\@ignore_tags);
    my $ra3 = get_text_in_tag($ra2,"pre");
    #show_html_refarray($ra3);
    show_text_item($ra3);
}

# get_this_page($href)
# Fetch $git.$fgx_base.$href, save a copy in $out_url3 and print its
# 'pre' text. Exits via pgm_exit(1,...) if the fetch fails.
sub get_this_page($) {
    my ($href) = @_;
    my $URL = $git.$fgx_base.$href;
    my ($content);
    unless (defined ($content = get($URL))) {
        pgm_exit(1,"ERROR: could not get $URL\n");
    }
    write2file("$content\n",$out_url3);
    #prt("Contents of $href, written to $out_url3\n");
    process_page_content($content);
    #pgm_exit(1,"CHECK EXIT\n");
}

# check_a_page($file)
# Debug aid: run process_page_content() on a page previously saved in $file.
# Exits via pgm_exit(1,...) if the file can not be opened.
sub check_a_page($) {
    my ($file) = shift;
    my $fh;
    if (! open($fh, '<', $file)) {
        pgm_exit(1,"ERROR: Can NOT open file [$file]\n");
    }
    my @lines = <$fh>;
    close $fh;
    my $content = join("",@lines);
    process_page_content($content);
}

# show_html_ra($ra)
# $ra - reference to an array of [type, tag-or-text, attr-hash-ref, line]
#       records for the issue list page.
# Walk the records. Nothing is acted on until the body tag has been entered
# AND the text 'Loading...' has been seen; the walk stops at the text 'CSV'.
# Text is accumulated in $hrtext as the label of the most recent link. When
# an 'a' tag with a new href arrives, and the previous href began with
# 'detail' and has a label, that previous page is fetched and printed
# between two banner lines.
sub show_html_ra($) {
    my ($ra) = @_;
    my ($cnt,$typ,$tag,$rha,$i,$lnn);
    my ($key,$val,$att,$isnorm);
    my ($lckey,$isclose,$istext,$msg);
    my $inbody = 0;
    my $chref = '';
    my $phref = '';     # previous href seen
    my $hrtext = '';    # text collected since that href
    my $hadload = 0;
    my %lchash = ();
    my $max = 85;       # banner lines are padded with '=' to this width
    $cnt = scalar @{$ra};
    prt("HTML ref array had $cnt items\n");
    for ($i = 0; $i < $cnt; $i++) {
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        $rha = ${$ra}[$i][2];
        $lnn = ${$ra}[$i][3];
        $att = '';
        next if (ignore_tag($tag));
        $isnorm = ($typ == get_tag_normal_value()) ? 1 : 0;
        $isclose = ($typ == get_tag_close_value()) ? 1 : 0;
        $istext = ($typ == get_tag_text_value()) ? 1 : 0;
        if ($istext) {
            $tag = trim_all($tag);
            # FIX: this substitution had become 'space -> space' (a no-op);
            # presumably it is meant to turn non-breaking-space entities
            # into plain spaces before the second trim.
            $tag =~ s/\&nbsp;/ /g;
            $tag = trim_all($tag);
            next if (length($tag) == 0);
            next if ($tag =~ /^\&.+;$/);    # text that is only an entity
            next if ($tag =~ /^\W+$/);
            next if (ignore_text($tag));
            next if ($tag =~ /^\d+$/); # ignore all digit text
            last if ($tag eq 'CSV');
            $hadload = 1 if ($tag eq 'Loading...');
        }
        if ($inbody && $hadload) {
            # build the attribute display string and a lower-case key map
            %lchash = ();
            foreach $key (keys %{$rha}) {
                $lckey = lc($key);
                next if ($key =~ /^onclick$/i);
                next if ($key =~ /^style$/i);
                next if ($key =~ /^class$/i);
                $val = ${$rha}{$key};
                $att .= " " if (length($att));
                if ($val eq get_attr_no_value()) {
                    $att .= $key;
                } else {
                    $att .= "$key=$val";
                }
                if ($key =~ /^href$/i) {
                    $chref = strip_quotes($val);
                }
                $lchash{$lckey} = $val;
            }
            # if ($typ == get_tag_normal_value()) {
            if ($isnorm) {
                if ($tag =~ /^a$/i) {
                    if (defined $lchash{'href'}) {
                        $val = strip_quotes($lchash{'href'});
                        next if ($val eq '#');
                        next if ($val =~ /^\#/);
                        next if ($val =~ /^\/p\/fgx/i);
                        next if ($val eq $phref); # only show NEW HREFS
                        if (length($hrtext) && length($phref) && ($phref =~ /^detail/) ) {
                            $msg = "==bug== [$phref] [$hrtext] ";
                            $msg .= '=' while (length($msg) < $max);
                            prt("$msg\n");
                            get_this_page($phref);
                            $msg = "=== HREF DONE [$phref] [$hrtext] ===";
                            $msg .= '=' while (length($msg) < $max);
                            prt("$msg\n\n");
                        }
                        $phref = $val;
                        $hrtext = '';
                    }
                }
                if ($dbg_02) {
                    prt("$lnn: norm [$tag]");
                } else {
                    next;
                }
            # } elsif ($typ == get_tag_close_value()) {
            } elsif ($isclose) {
                if ($tag =~ /^body/i) {
                    $inbody = 0;
                    prt("$lnn: exit BODY\n");
                    next;
                }
                #prt("$lnn: close [$tag]");
                next;
            } elsif ($typ == get_tag_closed_value()) {
                prt("$lnn: closed [$tag]");
            } elsif ($typ == get_tag_closea_value()) {
                prt("$lnn: closea [$tag]");
            } elsif ($typ == get_tag_special_value()) {
                prt("$lnn: spl [$tag]");
            } elsif ($typ == get_tag_comment_value()) {
                # prt("$lnn: comm [$tag]");
                next;
            #} elsif ($typ == get_tag_text_value()) {
            } elsif ($istext) {
                $hrtext .= ' ' if (length($hrtext));
                $hrtext .= remove_html_entities($tag);
                if ($dbg_02) {
                    prt("$lnn: text [$tag]");
                } else {
                    next;
                }
            } else {
                prt("$lnn: unknown [$tag]");
            }
            prt(" attr [$att]") if (length($att));
            prt("\n");
        } else {
            # not yet in body
            if ($isnorm && ($tag =~ /^body/i)) {
                $inbody = 1;
                prt("$lnn: entered BODY\n");
            }
        }
    }
}

#########################################
### MAIN ###
#parse_args(@ARGV);
#prt( "$pgmname: in [$cwd]: Hello, World...\n" );
#process_in_file($in_file);
#get_this_page($chk_page);
#check_a_page($out_url3);
#pgm_exit(0,"");
prt("$pgmname: get FGx issues list as of ".get_YYYYMMDD(time())."\n");
my $ref_arr = get_html();
show_html_ra($ref_arr);
pgm_exit(0,"");
########################################

# give_help()
# Print the usage text.
sub give_help {
    prt("$pgmname: version 0.0.1 2010-09-11\n");
    prt("Usage: $pgmname [options] in-file\n");
    prt("Options:\n");
    prt(" --help (-h or -?) = This help, and exit 0.\n");
}

# need_arg($arg,@av)
# Exit with an error if option $arg is not followed by another argument,
# i.e. if the remaining argument list @av is empty.
sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have following argument!\n") if (!@av);
}

# parse_args(@av)
# Process the command line: any option starting with 'h' (or '?') shows the
# help and exits 0, any other option is an error, and a bare argument sets
# $in_file. Exits with an error if no existing input file results.
# (Not called at present - see the commented-out call in MAIN.)
sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            # strip all leading '-' characters
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input to [$in_file]\n");
        }
        shift @av;
    }
    if ((length($in_file) == 0) && $debug_on) {
        $in_file = $def_file;
    }
    if (length($in_file) == 0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (! -f $in_file) {
        pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    }
}

# eof - getfgxlist.pl