#!/perl -w # NAME: gethrefs02.pl # AIM: Parse a HTML file, and extract HREF links # 2016-08-03 - Reciew # 05/11/2015 - Lots of quick improvements # 18/07/2010 - revisit and test... use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) use constant { HRT_UNKNOWN => 0, HRT_LOCAL => 1, HRT_LINK => 2, HRT_SCRIPT => 4, HRT_FILE => 8, HRT_BASE => 16, HRT_PARAMS => 32 }; use constant { FT_UNKNOWN => 0, FT_HTML => 1, FT_GRAF => 2, FT_CSS => 3, FT_SCRIPT => 4, FT_TEXT => 5, FT_ZIP => 6, FT_BIN => 7, FT_CODE => 8, FT_DIR => 9, FT_HIDDEN => 10, FT_PARAM => 11 }; # offsets in file array use constant { OF_FF => 0, # full file name OF_HR => 1, # array ref of href links OF_IM => 2, # array ref of image links OF_LK => 3, # linked count OF_SP => 4, # spare OF_TO => 5, # links TO OF_FM => 6, # links FROM OF_FT => 7 # file type }; my $perl_root = 'C:\Gtools\perl'; unshift(@INC,$perl_root); #require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; require 'lib_utils.pl' or die "Unable to load lib_utils.pl ...\n"; require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n"; # for htmltools, if functions used my @imgs = (); my @hrefs = (); # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = $perl_root."\\temp.$pgmname.txt"; open_log($outfile); ###prt( "$0 ... Hello, World ...\n" ); my $os = $^O; my $in_file = ''; ## my $in_file = 'C:\GTools\java\examples\JavaTech\Code_List.htm'; ## my $in_file = 'C:\HOMEPAGE\GA\travel\maroc\index.htm'; ## my $in_file = 'temphtml.htm'; my $usr_url = ''; my @all_hrefs = (); my $outtemp = $perl_root."\\templist.txt"; my $show_full_list = 0; my $show_missed_files = 0; my $verbosity = 0; my $load_log = 0; my $out_file = ''; my $VERS = "0.0.5 2015-11-05"; ##my $VERS = "0.0.4 2010-07-18"; sub VERB1() { return $verbosity >= 1; } sub VERB2() { return $verbosity >= 2; } sub VERB5() { return $verbosity >= 5; } sub VERB9() { return $verbosity >= 9; } # CONSTANTS ########### # File Type Extensions my @html_extension = qw( .htm .html .shtml .php ); my @graf_extension = qw( .jpg .jpeg .gif .png .bmp .ico .mpg ); my @css_extension = qw( .css ); my @script_extension = qw( .js .class .cgi .java .remote ); my @zip_extension = qw( .zip .tar .gz .jar .tgz ); my @txt_extension = qw( .txt .doc .bat .cmd .old .bak .policy .pdf .cfg ); my @code_extension = qw( .c .cxx .cpp .h .hxx .hpp .idl .mak ); my @bin_extension = qw( .dat .exe .au ); # private FRONTPAGE folders my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived ); # features my $ignfpd = 1; # ignore FRONTPAGE folders my @excludes = qw( desktop.ini php.ini blank.html blank.htm ); my $recurse = 0; # recursive my @splexcludes = qw( macpc ); my %ext_hash = (); my @all_files = (); my $refcnt = 0; my @done_files = (); my %not_found = (); my ($base_file,$base_dir); my $base_href = ''; # set if found # DEBUG my $dbg1 = 0; # show discarded material my $dbg2 = 0; # show "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]... my $dbg3 = 0; # show Processing $lncnt lines from $fil ... my $dbg4 = 0; # show File [$name], in [$rdir] ... my $dbg5 = 0; # show HREF immediately my $dbg6 = 0; # show FOLDERS searched... # ### DEBUG ### my $debug_on = 0; my $def_file = 'C:\Users\user\Downloads\temp\index.html'; my @warnings = (); sub show_warnings($) { my ($val) = @_; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { prt( "\nNo warnings issued.\n\n" ) if (VERB9()); } } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } show_warnings($val); close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } ################################################################## # OF_FF => 0, # full file name # OF_HR => 1, # array ref of href links # OF_IM => 2, # array ref of image links # OF_LK => 3, # linked count # OF_SP => 4, # spare # OF_TO => 5, # links TO # OF_FM => 6, # links FROM # OF_FT => 7 # file type sub show_results { my $fcnt = scalar @all_files; my ($i, $ff, $ft, $cnt, $mcnt); $mcnt = 0; for ($i = 0; $i < $fcnt; $i++) { $ft = $all_files[$i][OF_FT]; if ($ft == FT_HTML) { $cnt = $all_files[$i][OF_SP]; if ($cnt == 0) { $ff = $all_files[$i][OF_FF]; $mcnt++; # prt( "Missed [$ff]\n" ); } } } if ($mcnt) { prt("Got $mcnt 'missed' files... not marked...\n"); for ($i = 0; $i < $fcnt; $i++) { $ft = $all_files[$i][OF_FT]; if ($ft == FT_HTML) { $cnt = $all_files[$i][OF_SP]; if ($cnt == 0) { $ff = $all_files[$i][OF_FF]; prt( "Missed [$ff]\n" ); } } } } } # ========================================================================= # url_parse - needs some more to remove any other post, like index.htm?a=b... # --------- sub url_parse($) { my ($url) = @_; my $post = ''; my $name = ''; my $dir = ''; my $ind = index($url,'#'); if ($ind > 0) { $post = substr($url,$ind); $url = substr($url,0,$ind); } if ($url =~ /\/$/) { $dir = $url; } else { ($name,$dir) = fileparse($url); if ( !($name =~ /\./) ) { # without an EXTENT, assume directory $dir .= $name.'/'; $name = ''; } } return $dir,$name,$post; # url_parse - return (dir,name,post) } sub uri_parse2($) { my ($uri) = shift; $uri =~ /^(([^:\/\?#]+):)?(\/\/([^\/\?#]*))?([^\?#]*)(\?([^#]*))?(#(.*))?/; # Then: my $scheme = (defined $2) ? $2 : ''; my $authority = (defined $4) ? $4 : ''; my $path = (defined $5) ? $5 : ''; my $query = (defined $7) ? $7 : ''; my $fragment = (defined $9) ? $9 : ''; return $scheme,$authority,$path,$query,$fragment; } ################################################################## ######################################################### # Passed an array of extensions, # check if this is one of them? ######################################################### sub is_my_extension { my ($fil, $rexts) = @_; my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); my $lcext = lc($ext); my ($ex); foreach $ex (@{$rexts}) { return 1 if (lc($ex) eq $lcext); } return 0; } ############################################ # only looking for HTM, HTML, PHP, # could be extended to others maybe ... ############################################ sub is_htm_extension { my ($fil) = shift; return( is_my_extension($fil, \@html_extension) ); } sub is_graphic_extension { my ($fil) = shift; return( is_my_extension($fil, \@graf_extension) ); } sub is_zip_extension { my ($fil) = shift; return( is_my_extension($fil, \@zip_extension) ); } sub is_css_extension { my ($fil) = shift; return( is_my_extension($fil, \@css_extension) ); } sub is_txt_extension { my ($fil) = shift; return( is_my_extension($fil, \@txt_extension) ); } sub is_code_extension { my ($fil) = shift; return( is_my_extension($fil, \@code_extension) ); } sub is_script_extension { my ($fil) = shift; return( is_my_extension($fil, \@script_extension) ); } sub is_bin_extension { my ($fil) = shift; return( is_my_extension($fil, \@bin_extension) ); } #use constant { # FT_UNKNOWN => 0, ## FT_HTML => 1, ## FT_GRAF => 2, ## FT_CSS => 3, ## FT_SCRIPT => 4, ## FT_TEXT => 5, ## FT_ZIP => 6, ## FT_DIR => 7 #}; sub get_file_type_const($); sub get_file_type_const($) { my ($fil) = shift; if (is_htm_extension($fil)) { return FT_HTML; } elsif (is_graphic_extension($fil)) { return FT_GRAF; } elsif (is_zip_extension($fil)) { return FT_ZIP; } elsif (is_css_extension($fil)) { return FT_CSS; } elsif (is_txt_extension($fil)) { return FT_TEXT; } elsif (is_script_extension($fil)) { return FT_SCRIPT; } elsif (is_bin_extension($fil)) { return FT_BIN; } elsif (is_code_extension($fil)) { return FT_CODE; } elsif ($fil =~ /\/$/) { return FT_DIR; } elsif ($fil =~ /#/) { my $ih = index($fil,'#'); if ($ih > 0) { my $f2 = substr($fil,0,$ih); return get_file_type_const($f2); } } elsif ($fil =~ /\//) { return FT_DIR; # gross assumption } elsif ($fil =~ /^\w+$/) { return FT_DIR; # another gross assumption } return FT_HIDDEN if ($fil =~ /^\./); return FT_PARAM if ($fil =~ /^\?.+/); ### pgm_exit(1,"Why UNKNOWN for [$fil]?\n"); return FT_UNKNOWN; } sub file_type_const_to_string { my ($ft) = shift; if ($ft == FT_HTML) { return "html"; } elsif ($ft == FT_GRAF) { return "graphic"; } elsif ($ft == FT_ZIP) { return "zip"; } elsif ($ft == FT_CSS) { return "css"; } elsif ($ft == FT_TEXT) { return "text"; } elsif ($ft == FT_SCRIPT) { return "script"; } elsif ($ft == FT_BIN) { return "binary"; } elsif ($ft == FT_CODE) { return "code"; } elsif ($ft == FT_DIR) { return "directory"; } elsif ($ft == FT_UNKNOWN) { return "unknown"; } elsif ($ft == FT_HIDDEN) { return "hidden"; } elsif ($ft == FT_PARAM) { return "parameter"; } pgm_exit(1,"***FIX ME*** uncased type [$ft]!"); return ""; } sub fix_rel_url($) { my ($path) = @_; my @a = split(/\//, $path); my $npath = ''; my $max = scalar @a; my @na = (); for (my $i = 0; $i < $max; $i++) { my $p = $a[$i]; if ($p eq '.') { # ignore this } elsif ($p eq '..') { if (@na) { pop @na; # discard previous } else { pgm_exit(1,"ERROR: Got relative .. without previous!!! path=$path\n" ); } } else { push(@na,$p); } } foreach my $pt (@na) { $npath .= "/" if length($npath); $npath .= $pt; } return $npath; } sub is_http_link($) { my ($hr) = shift; return 1 if ($hr =~ /^http(s*):\/\//); return 0; } sub get_full_base_href($$) { my ($rrhr, $bhr) = @_; my $rhr = ${$rrhr}; my ($nm,$dir,$fhr); if ( length($bhr) && !is_http_link($rhr) && !($rhr =~ /^#/) ) { if ($bhr =~ /\/$/) { $dir = $bhr; # assume a DIRECTORY } else { ($nm,$dir) = fileparse($bhr); # assume a FILE, so get the dir only... } $fhr = $dir.$rhr; $fhr = fix_rel_url($fhr); ${$rrhr} = $fhr; return 1; } return 0; } sub get_href_type_const($); sub get_href_type_const($) { my ($hrf) = shift; my ($ih,$id,$is); if (is_http_link($hrf)) { return HRT_LINK; } elsif ($hrf =~ /^ftp:\/\//i) { return HRT_LINK; } elsif ($hrf =~ /^javascript:/i) { return HRT_SCRIPT; } elsif (substr($hrf,0,1) eq '#') { return HRT_LOCAL; } #if ( get_full_base_href(\$hrf,$base_href) ) { # if ($hrf =~ /^http(s*):\/\//i) { # return HRT_LINK; # } #} $ih = index($hrf,'#'); $id = rindex($hrf,'.'); $is = rindex($hrf,'/'); if ($ih > 0) { my $hr2 = substr($hrf,0,$ih); my $srt = get_href_type_const($hr2); $id = rindex($hr2,'.'); $is = rindex($hr2,'/'); if ($id > 0) { return ($srt | HRT_FILE); } } if ($id > 0) { # contains a DOT - assume a file return HRT_FILE; } if ($hrf =~ /\/$/) { # ends in '/', assume file - acutally directory return HRT_FILE; } if ($hrf =~ /\//) { # contains any '/', assume a file return HRT_FILE; } if ($hrf =~ /^\w+$/) { # contains any alphanumeric only, assume a file return HRT_FILE; } if ($hrf =~ /^\?.+/) { # if a href param, like '?C=N;O=D', ... return HRT_PARAMS; } prtw("WARNING: Why UNKNOWN on href [$hrf] ih=$ih id=$id is=$is\n"); return HRT_UNKNOWN; } my %done_warning = (); sub href_type_to_string { my ($hrt) = shift; my $ret = ''; if ($hrt & HRT_LINK) { $ret .= "extern link "; } if ($hrt & HRT_SCRIPT) { $ret .= "script "; } if ($hrt & HRT_LOCAL) { $ret .= "local "; } if ($hrt & HRT_FILE) { $ret .= "file "; } if ($hrt & HRT_BASE) { $ret .= "BASE "; } if ($hrt == HRT_UNKNOWN) { $ret = "unknown"; } if ($hrt == HRT_PARAMS) { $ret = "parameter"; } $ret =~ s/\s+$//; if (length($ret) == 0) { my $err = "***FIX ME*** uncased type [$hrt]!"; if (!defined $done_warning{$err}) { $done_warning{$err} = 1; prtw("WARNING: $err\n"); } } return $ret; } ############################################## sub get_hrefs_from_string($) { my ($ln) = shift; my ($i, $j, $line, $ch, $ch2, $len, $tag, $disc, $hcnt); my ($bgn, $fhr, $hr, $tail, $max, $hrt, $ft, $ctyp); my ($sp,$tag2,$gottag); my @hrf = (); $ln =~ s/\n/ /g; $ln = trim_all($ln); # sub write2file { my ($txt,$fil) = @_; # write2file($fulln,'tempfl.txt'); $len = length($ln); $disc = ''; $hcnt = 0; $base_href = ''; # assume NO # process single long string, char by char for ($i = 0; $i < $len; $i++) { $ch = substr($ln,$i,1); if ($ch eq '<') { $tag = $ch; # start a tag $i++; $ch = substr($ln,$i,1); # could check for things like ') { last; } if (!$gottag) { if ($ch =~ /\w/) { $tag2 .= $ch; } else { $gottag = 1; } } } if ($tag =~ /(.*\s+)href(\s*)=/i) { $bgn = $1; $sp = length($2); $hcnt++; $fhr = substr($tag,length($bgn)+5+$sp); $fhr = substr($fhr,1) while ($fhr =~ /^\s/); # remove all LEADING space $ch = substr($fhr,0,1); prt("$tag [$tag2] [$fhr]\n") if ($dbg5); if (($ch eq '"')||($ch eq "'")) { $max = length($fhr); $hr = ''; $tail = ''; # collect actual HREF= for ($j = 1; $j < $max; $j++) { $ch2 = substr($fhr,$j,1); if ($ch eq $ch2) { $tail = substr($fhr,$j); last; } $hr .= $ch2; } if ($tag2 =~ /^BASE$/i) { $hrt = HRT_BASE; #prt("Got [$tag2] [$fhr] [$hr]\n"); my ($d,$n,$p) = url_parse($hr); prt("Got BASE [$hr] = [$d]+[$n]+[$p]\n"); $base_href = $hr; } else { get_full_base_href(\$hr,$base_href); $hrt = get_href_type_const($hr); } $ctyp = ''; $ft = FT_UNKNOWN; if ($hrt & HRT_FILE) { $ft = get_file_type_const($hr); $ctyp = "ext[".file_type_const_to_string($ft)."] "; } $ctyp = 'type['.href_type_to_string($hrt)."] $ctyp"; #prt("tag [$tag2] [$hr] $ctyp\n"); prt( "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]\n" ) if ($dbg2); # href HRT-type FT-file # 0 1 2 push(@hrf, [$hr, $hrt, $ft]); } else { prt( "$hcnt:HREF: fhr[$fhr] pre[$disc] tag[$tag] bgn[$bgn] CHECK ME\n" ); } } else { prt( "DISCARDED: pre[$disc] tag[$tag] ...\n" ) if ($dbg1); } $disc = ''; } else { $disc .= $ch; } } return @hrf; } sub trim_href($) { my $fil = shift; my $nfil = ''; my $len = length($fil); my ($i,$ch); for ($i = 0; $i < $len; $i++) { $ch = substr($fil,$i,1); if (($i == 0)&&($ch eq '/')) { next; } if (($ch eq '#')||($ch eq '?')) { last; } $nfil .= $ch; } return $nfil; } sub parse_file($$) { my ($bdir,$bfil) = @_; #if ($bdir = /^\.(\\|\/)$/) { # $bdir = ''; #} my $fil = $bdir.$bfil; prt( "Processing file '$fil' ...\n" ); my ($lncnt, $full, $hrcnt, $i, $hfcnt, $typ, $filcnt,$linkcnt,$ra,$hr,$ci2); my @hrf = (); if ( ! open INF, "<$fil") { prt( "WARNING: Can NOT open file [$fil]...\n" ); return @hrf; } my @lines = ; close INF; $lncnt = scalar @lines; prt( "Processing $lncnt lines from [$fil] ...\n" ); $full = join('',@lines); # sub write2file { my ($txt,$fil) = @_; #my $scrp = return_tag($full,'script'); ##my $scrp = get_all_tag_text($full,'script'); ##write2file($scrp,'tempscript.txt'); ##prt( "Got script text [$scrp]\n" ); @hrf = get_hrefs_from_string($full); $hrcnt = scalar @hrf; prt( "Got $hrcnt HREF entries, from $fil...\n" ); $filcnt = 0; # href HRT-type FT-file # 0 1 2 # push(@hrf, [$hr, $hrt, $ft]); my %hr_dupes = (); my $url = $usr_url; # get any user url my @list = (); if (length($url)) { $url .= '/' if (!($url =~ /\/$/)); # ensure ends with '/' } for ($i = 0; $i < $hrcnt; $i++) { $ra = $hrf[$i]; $hr = ${$ra}[0]; $typ = ${$ra}[1]; $fil = $bdir.$hr; $fil = trim_href($fil); if (defined $hr_dupes{$hr}) { $hr_dupes{$hr}++; next; } else { $ci2 = sprintf("%3d", ($i + 1)); push(@list,"$url$hr"); prt("$ci2: $url$hr\n") if (VERB1()); } next if (-d $fil); if ( ($typ & HRT_FILE) && !($typ & HRT_LINK) ) { $filcnt++; if (! -f $fil) { if (defined $not_found{$fil}) { $not_found{$fil}++; } else { prt( "WARNING: File [$fil] NOT found ...\n" ) if (VERB2()); $not_found{$fil} = 1; } } } } my $cnt = scalar @list; prt( "Got $cnt diff HREF entries, from $bfil... $filcnt appear file refs...\n" ); if ($cnt && length($out_file)) { $fil = join("\n",@list)."\n"; write2file($fil,$out_file); prt("List written to '$out_file'...\n"); } $linkcnt = 0; my %counted = (); my %by_extent = (); my %by_fn = (); my @dupes = (); my $msg = ''; my ($nm,$dir,$ext,$ind,$ff); for ($i = 0; $i < $hrcnt; $i++) { $fil = $hrf[$i][0]; $typ = $hrf[$i][1]; if ($typ & HRT_LINK) { $ind = index($fil,'#'); $fil = substr($fil,0,$ind) if ($ind > 0); if (defined $counted{$fil}) { $counted{$fil}++; } else { $counted{$fil} = 1; $linkcnt++; ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); $by_extent{$ext} = [] if (!defined $by_extent{$ext}); push( @{$by_extent{$ext}}, $fil ); $ff = $nm.$ext; if ($ext =~ /^\.java/) { if (defined $by_fn{$ff}) { push(@dupes,$ff); $by_fn{$ff}++; } else { $by_fn{$ff} = 1; } } } } } if ($linkcnt) { %counted = (); prt("Listing $linkcnt links...\n"); if ($show_full_list) { for ($i = 0; $i < $hrcnt; $i++) { $fil = $hrf[$i][0]; $typ = $hrf[$i][1]; if ($typ & HRT_LINK) { $ind = index($fil,'#'); $fil = substr($fil,0,$ind) if ($ind > 0); if (defined $counted{$fil}) { $counted{$fil}++; } else { $counted{$fil} = 1; prt("$fil\n"); $msg .= "$fil\n"; } } } } foreach $ext (keys %by_extent) { my $list = $by_extent{$ext}; foreach $fil (@{$list}) { #prt("$fil\n"); $msg .= "$fil\n"; } } write2file($msg,$outtemp); prt("Written list to $outtemp...\n"); if (@dupes) { prt("Note: ".scalar @dupes." duplicated file names...\n"); prt( join(" ",@dupes)."\n"); } else { prt("Appears NO duplicated names...\n"); } #} else { # prt("No link count in $fil...\n"); } return @hrf; } #################################### #################################################################### # process_folder(folder) # Main DIRECTORY processing function # # Open the FOLDER given, and collect ALL files found, # iterate into sub-directories, if $recurse is non-zero, # and it is NOT a special FRONTPAGE (hidden) FOLDER. # # Files are collected into multidemensional arrays #################################################################### sub process_folder { my ($inf) = shift; my ($ft,$ff,$nm,$dir,$ext,$val,$fil,$idir); my $fcnt = 0; prt( "Processing $inf folder ...\n" ) if ($dbg1 || VERB5()); if ( opendir( DIR, $inf ) ) { my @files = readdir(DIR); closedir DIR; $idir = $inf; $idir .= "\\" if (!($idir =~ /(\\|\/)$/)); foreach $fil (@files) { next if (($fil eq ".")||($fil eq "..")); $ff = $idir.$fil; if ( -d $ff ) { if ($recurse) { if ($ignfpd && is_fp_folder($fil)) { # ignore FRONTPAGE folders next; } if (@splexcludes && in_spl_excludes($fil)) { next; } process_folder( $ff ); } } else { $ft = get_file_type_const($fil); # NOTE: multidimensional arrays pushed - offsets into arrays if ( !in_excludes($fil) ) { # NOT in @excludes ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); $val = 0; $val = $ext_hash{$ext} if ( defined $ext_hash{$ext} ); $val++; $ext_hash{$ext} = $val; push(@all_files, [$ff, '', '', 0, 0, '', '', $ft] ); $fcnt++; } } } prt( "Processed $inf folder finding $fcnt files ...\n" ) if ($dbg6 || VERB2()); } else { prt( "ERROR: Failed to open folder $inf ...\n" ); } } ################################################ # my $ignfpd = 1; # ignore FRONTPAGE folders ################################################ sub is_fp_folder { my ($inf) = shift; foreach my $fil (@fpfolders) { if (lc($inf) eq lc($fil)) { return 1; } } return 0; } #################################### # Check if FILE is in EXCLUDE list #################################### sub in_excludes { my ($fil) = shift; my $lcf = lc($fil); foreach my $f (@excludes) { if (lc($f) eq $lcf) { return 1; } } return 0; } sub in_spl_excludes { my ($fldr) = shift; my $lfldr = lc($fldr); foreach my $f (@splexcludes) { if (lc($f) eq $lfldr) { return 1; } } return 0; } sub set_status_case { my ( $ch, $pch, $inccm, $inlnc, $inqot, $qot ) = @_; my $ldbg2 = 0; if ($$inccm) { if (($ch eq '/')&&($pch eq '*')) { $$inccm = 0; prt( "status: End C comment /* */ ...\n" ) if ($ldbg2); } } elsif ($$inlnc ) { if ($ch eq "\n") { $$inlnc = 0; prt( "status: End line comment // ...\n" ) if ($ldbg2); } } elsif ($$inqot ) { if ($ch eq $$qot) { prt( "status: End quote $$qot ...\n" ) if ($ldbg2); $$inqot = 0; $$qot = ''; } } else { if ($ch eq '/') { if ($pch eq '/') { $$inlnc = 1; prt( "status: Entered line comment // ...\n" ) if ($ldbg2); } } elsif ($ch eq '*') { if ($pch eq '/') { $$inccm = 1; prt( "status: Entered C comment /* */ ...\n" ) if ($ldbg2); } } elsif (($ch eq '"')||($ch eq "'")) { $$qot = $ch; $$inqot = 1; prt( "status: Entered quote $$qot ...\n" ) if ($ldbg2); } } } sub get_all_tag_text { my ($txt, $tag) = @_; my $len = length($txt); my $ldbg1 = 0; my $ntxt = ''; my $ch = ''; my $pch = ''; my $ftag = ''; my $nline = ''; my $i = 0; my $intag = 0; my $incomment = 0; my $inqot = 0; # in quotes ' or " my $qot = ''; my $inlnc = 0; # in line comment my $inccm = 0; # in C comment my ($part, $shlen); ###prt("Processing $len chars for $tag ...\n"); for ($i = 0; $i < $len; $i++) { $pch = $ch; $ch = substr($txt, $i, 1); set_status_case( $ch, $pch, \$inccm, \$inlnc, \$inqot, \$qot ); if ($incomment) { $ntxt .= $ch; if ($ch eq '>') { $shlen = -15; if (length($ntxt) < 15) { $shlen = 0 - length($ntxt); } prt( "Potential close [".substr($ntxt,$shlen)."] ...($i)" ) if ($ldbg1); if (substr($ntxt,-3) eq '-->') { if (!$inqot && !$inlnc && !$inccm) { prt( " Yes\n" ) if ($ldbg1); $incomment = 0; # no longer IN comment prt("End comment ...\n") if ($ldbg1); } else { if ($inqot) { prt( " NO DUE TO IN QUOTE\n" ) if ($ldbg1); } elsif ($inlnc) { prt( " NO DUE TO IN LINE COMMENT\n" ) if ($ldbg1); } elsif ($inccm) { prt( " NO DUE TO IN C COMMENT\n" ) if ($ldbg1); } else { prt( " NO DUE TO SOME REASON!!! **** CHECK ME!!! ****\n" ) if ($ldbg1); } } } else { prt( " NO!\n" ) if ($ldbg1); } } } elsif ($intag) { if ($ch eq "<") { ###prt("Got begin < ...\n"); $part = substr($txt,$i,4); if ($part eq '