Generated: Tue Feb 2 17:54:43 2010 from iso639.pl 2006/09/25 7.1 KB.
#!/Perl # iso639.pl # AIM to download the table from the site, and build a 'validation' table ... use strict; use warnings; use LWP::Simple; require "logfile.pl" or die "Missing logfile.pl ...\n"; # my simple log file and some other utility subs require "htmltools.pl" or die "Missing htmltools.pl ...\n"; # log file stuff my ($LF); my $outfile = 'temp'.$0.'.txt'; my $dodownload = 0; my $URL = 'http://www.loc.gov/standards/iso639-2/englangn.html'; my $out_file = 'iso639.htm'; open_log($outfile); prt( "$0 ... Hello, World...\n" ); prt( "Moment ... loading $URL ...\n" ); my $tbltext = ''; my @lines = (); my @tlines = (); my $cnt = 0; my $line = ''; my $tln = ''; my $col = 0; my $col1 = ''; my $col2 = ''; my $col3 = ''; my $col4 = ''; my $i = 0; my @colist = (); my %iso3 = (); my %iso2 = (); my $tblcnt = 0; my $lncnt = 0; my $maxcnt = 15; # maximum search for </td> my $enttbl = 0; my $bgntbl = 0; my $thcnt = 0; if ($dodownload) { $tbltext = get($URL); $cnt = length($tbltext); prt( "Downloaded $cnt characters ... writing $out_file ...\n" ); write2file( $tbltext, $out_file ); } else { prt( "Loading $out_file ... moment ...\n" ); open INF, "<$out_file" or mydie( "ERROR: Failed to open [$out_file] ... $! \n" ); @lines = <INF>; # slurp it into lines close INF; $tbltext = join("",@lines); $tbltext = tag2newline($tbltext, 'table'); $tbltext = tag2newline($tbltext, 'tr'); $tbltext = tag2newline($tbltext, 'th'); $tbltext = tag2newline($tbltext, 'td'); $tbltext = trimblanklines($tbltext); write2file($tbltext, 'temptbl.htm'); $cnt = length($tbltext); prt( "Loaded $cnt characters ... from $out_file ...\n" ); } @lines = split("\n", $tbltext); $cnt = scalar @lines; prt( "Got $cnt lines to process ... \n" ); foreach $line (@lines) { $tln = trimall( html_subs($line) ); push(@tlines, $tln) if length($tln); } $cnt = scalar @tlines; prt( "Got $cnt trimmed lines to process ... \n" ); ##foreach $line (@tlines) { for ($i = 0; $i < $cnt; $i++) { $lncnt++; $line = $tlines[$i]; if ($line =~ /<table.*>/i) { $tblcnt++; $bgntbl = 1; $enttbl++; # count another entry to a table prt( "$lncnt - Enter table $tblcnt ...[$line]\n" ); $thcnt = 0; } elsif ($line =~ /<\/table>/i) { if ($tblcnt) { prt( "$lncnt - Exit table $tblcnt ...[$line]\n" ); $tblcnt--; } else { prt( "$lncnt - Exit table NO COUNT! ...[$line]\n" ); } } elsif ($line =~ /<tr.*?>/i) { if ($col == 4) { ###prt( "[$col1] [$col2] [$col3] [$col4] \n" ); # Language Name English French 639-2 639-1 push(@colist, [ $col1, $col2, $col3, $col4 ]); } else { if ($bgntbl || ($enttbl < 4) || ($thcnt > 0)) { # $col should be zero if just entered, and not yet in second } else { prt( "$lncnt - $tblcnt CHECK col=$col ...[$col1] [$col2] [$col3] [$col4]\n" ); } } $bgntbl = 0; $thcnt = 0; $col = 0; } elsif ($line =~ /<td.*?>/i) { my $lc = 0; while ( !($line =~ /<\/td>/i) ) { $i++; $lncnt++; if ($i < $cnt) { $line .= $tlines[$i]; } else { last; } $lc++; if ($lc > $maxcnt) { last; # exit anyway ... } } $tln = removetag($line, 'td'); $tln = removetag($tln, 'span'); $tln = removetag($tln, 'p'); $tln = removetag($tln, 'br'); $tln = removetag($tln, 'font'); $tln = striptag($tln, 'a'); $col++; if ($col == 1) { $col1 = $tln; } elsif ($col == 2) { $col2 = $tln; } elsif ($col == 3) { $col3 = $tln; } elsif ($col == 4) { $col4 = $tln; } } elsif ($line =~ /<th.*?>/i) { my $lc = 0; while ( !($line =~ /<\/th>/i) ) { $i++; $lncnt++; if ($i < $cnt) { $line .= $tlines[$i]; } else { last; } $lc++; if ($lc > $maxcnt) { last; # exit anyway ... } } $thcnt++; } } $cnt = scalar @colist; prt( "Got $cnt in country list ... \n" ); # Language Name English French 639-2 639-1 for ($i = 0; $i < $cnt; $i++) { $col1 = trimall($colist[$i][0]); $col2 = trimall($colist[$i][1]); $col3 = trimall($colist[$i][2]); $col4 = trimall($colist[$i][3]); if ($col3 =~ /\//) { my @arr = split("/",$col3); foreach my $i3 (@arr) { if (defined $iso3{$i3}) { if ( !is_the_same( $iso3{$i3}, $col1) ) { prt( "Exists! [$i3] = [" . $iso3{$i3} . "] adding [$col1] ...\n" ); $iso3{$i3} .= ' ' . $col1; } } else { $iso3{$i3} = $col1; } } } else { if (defined $iso3{$col3}) { if ( !is_the_same( $iso3{$col3}, $col1) ) { prt( "Exists! [$col3] = [" . $iso3{$col3} . "] adding [$col1] ...\n" ); $iso3{$col3} .= ' ' . $col1; } } else { $iso3{$col3} = $col1; } } if (length($col4)) { if (defined $iso2{$col4}) { if ( !is_the_same( $iso2{$col4}, $col1) ) { prt( "Exists! [$col4] = [" . $iso2{$col4} . "] adding [$col1] ...\n" ); $iso2{$col4} .= ' ' . $col1; } } else { $iso2{$col4} = $col1; } } ###prt( "[$col1] [$col2] [$col3] [$col4] \n" ); } my @keys3 = keys %iso3; my @keys2 = keys %iso2; my $kc3 = scalar @keys3; my $kc2 = scalar @keys2; prt( "Got $kc3 ISO 639-2 and $kc2 ISO 639-1 ... writting file ...\n" ); my $msg = ''; my $maxln = 75; $line = ''; $cnt = 0; $msg .= 'tmbstr ISO639_2 = {' . "\n"; foreach $tln (sort @keys3) { $cnt++; $line .= '"'.$tln.'"'; ##if ($cnt < $kc3) { $line .= ', '; ##} if (length($line) > $maxln) { $msg .= $line . "\n"; $line = ''; } } if (length($line)) { $msg .= $line . "\n"; $line = ''; } $msg .= " 0 };\n\n"; $msg .= 'tmbstr ISO639_1[] = {' . "\n"; $cnt = 0; foreach $tln (sort @keys2) { $cnt++; $line .= '"'.$tln.'"'; ##if ($cnt < $kc2) { $line .= ', '; ##} if (length($line) > $maxln) { $msg .= $line . "\n"; $line = ''; } } if (length($line)) { $msg .= $line . "\n"; $line = ''; } $msg .= " 0 };\n"; write2file( $msg, "tempiso.txt" ); prt( "All written to tempiso.txt ...\n" ); if ($dodownload) { system( $out_file ); } close_log($outfile,1); exit(0); sub is_the_same { my ($t1, $t2) = @_; my (@a1, @a2); my $ct = 0; my $ct1 = 0; my $ct2 = 0; @a1 = split(/[;,]/,$t1); @a2 = split(/[;,]/,$t2); foreach my $b1 (@a1) { $ct1++; $b1 = trimall($b1); foreach my $b2 (@a2) { $b2 = trimall($b2); if ($b1 eq $b2) { $ct2++; last; } } } if ($ct1 == $ct2) { return 1; } return 0; } sub write2file { my ($txt,$fil) = @_; open WOF, ">$fil" or mydie( "ERROR: Unable to open $fil! - $! \n" ); print WOF $txt; close WOF; } sub trimall { my ($ln) = shift; chomp $ln; $ln =~ s/\r$//; $ln =~ s/\t/ /g; while ($ln =~ /\s\s/) { $ln =~ s/\s\s/ /g; } while ($ln =~ /^\s/) { $ln = substr($ln,1); } while ($ln =~ /\s$/) { $ln = substr($ln,0, length($ln) - 1); } return $ln; } sub html_subs { my ($htm) = shift; $htm = substitutions($htm); $htm =~ s/é/e/gm; $htm =~ s/è/e/gm; $htm =~ s/ç/c/gm; $htm =~ s/å/a/gm; $htm =~ s/ç/c/gm; $htm =~ s/à/a/gm; $htm =~ s/´/'/gm; $htm =~ s/ï/i/gm; $htm =~ s/â/a/gm; $htm =~ s/Î/I/gm; $htm =~ s/ü/u/gm; return $htm; } # eof - iso639.pl