#!/usr/bin/perl -w
# extract fields from the CIA factbook
# usage:
#        cd factbook/print
#        ciafbxf *.html

use strict;
use Getopt::Std;

# Extraction table: a flat list of (label, regex) pairs, kept as an array
# so the labels have a stable order when the column header is printed.
# Each regex is applied to the lines of the table cell that follows the
# label; its first capture group becomes the value that is printed.
my $field = [
    'Geographic coordinates' => '(\S.*\S)',                   # whole line, trimmed of outer blanks
    'Map references'         => '(\S.*\S)',                   # whole line, trimmed of outer blanks
    'Area'                   => 'total:<\/i>\s*(.*) sq km',   # text between "total:</i>" and " sq km"
    'Land boundaries'        => '(\d\S*) km',                 # token starting with a digit, before " km"
    'Coastline'              => '(\d\S*) km',                 # token starting with a digit, before " km"
    'Population'             => '(\S+)',                      # first run of non-blank characters
    'Unemployment rate'      => '(\S+%)',                     # non-blank run ending in "%"
];

# Scratch variables shared with the main loop: the label currently being
# processed and the value captured for it.
my ($key, $val);

# Banner: comment lines describing where the data comes from.
print <<"BANNER";
# arbitrary info extracted from the CIA fact book:
#     http://www.cia.gov/cia/download.html
# generated by ciafbxf: 
#     http://www.cyut.edu.tw/~ckhung/b/ma/ciafbxf
BANNER

# Column header: "#code" followed by every label, colon-separated.
# Labels sit at the even indices of the (label, regex) pair list.
my $header = '#code';
for my $slot (grep { $_ % 2 == 0 } 0 .. $#$field - 1) {
    $header .= ":$field->[$slot]";
}
print "$header\n";

# From here on the table is only used for lookups by label, so turn the
# pair list into a hash (label => regex).
$field = { @$field };

# Main extraction loop: reads every file named on the command line and
# prints one colon-separated row per file:
#
#     <code>:<heading text>:<value>:<value>...
#
# Values are printed in the order their labels occur in the input, not in
# the order of the column header.
# NOTE(review): a label that is absent from a file prints nothing, so the
# remaining columns of that row shift left -- confirm whether consumers
# rely on positional columns.
#
# Every nested read is guarded by `eof`, so a truncated file can never
# pull lines from the next file (which would skip the end-of-row handling
# in the `continue` block) or, after the last file, make `<>` fall back to
# reading STDIN.
while (<>) {
    # First line of a file: start the row with the code taken from the
    # file name; fall back to the raw name if it does not look like
    # "<code>.htm...".
    if ($. == 1) {
        my $code = $ARGV =~ /(\w+)\.htm/ ? $1 : $ARGV;
        print $code;
    }

    # The line after the "Introduction" marker carries the text printed as
    # the second column. $_ is left untouched so the checks below still
    # see the marker line itself.
    if (/>Introduction</) {
        my $heading = '';
        $heading = <> unless eof;
        my $text = $heading =~ />(.*?)</ ? $1 : 'undef';
        print ":$text";
    }

    next unless m#<div#;

    # Find which label (if any) this line announces. $key is reset first
    # so a label matched on an earlier line can never leak into this one.
    undef $key;
    for my $label (keys %$field) {
        if (/>\s*\Q$label\E\s*:\s*</) { $key = $label; last; }
    }
    next unless defined $key;

    undef $val;

    # Skip forward to the line that opens the value cell ...
    until (eof) {
        $_ = <>;
        last if m#<td#;
    }

    # ... then scan the cell with this label's own regex until it matches
    # or the cell (or the file) ends.
    until (eof) {
        $_ = <>;
        last if m#</td#;
        if (/$field->{$key}/) { $val = $1; last; }
    }

    $val = "undef" unless defined $val;

    # Areas given as "<n> million" are expanded to a plain number;
    # thousands separators are removed first so the multiply is numeric.
    if ($key eq "Area" and $val =~ /(\S+)\s*million/i) {
        (my $millions = $1) =~ tr/,//d;
        $val = $millions * 1e6;
    }

    # Drop thousands separators from values that are purely numeric.
    $val =~ s/,//g if $val =~ /^\s*[\d,]*(\.\d*)?\s*$/;
    print ":$val";
} continue {
    # End of the current file: terminate the row and close ARGV so that
    # $. restarts at 1 for the next file.
    if (eof) {
        print "\n";
        close ARGV;
    }
}
# Final newline after the last row, retained from the original output format.
print "\n";

