Genealogy-Wills

 view release on metacpan or  search on metacpan

bin/create_db.PL  view on Meta::CPAN


BEGIN {
        $SIG{__WARN__} = sub {
                my $warning = shift;
                if(($warning =~ /^Use of uninitialized value/) ||
                   ($warning =~ /isn't numeric in numeric eq /i)) {
                        die $warning;
                }
                warn $warning;
        }
}

my $force_flag;
my $dir = 'lib/Genealogy/Wills/data';

if(defined($ARGV[0]) && ($ARGV[0] eq '-f')) {
	$force_flag++;
} elsif($ENV{'AUTOMATED_TESTING'}) {
	if(!-d $dir) {
		mkdir $dir, 0755;
	}
	exit;
}

if(!-d $dir) {
	mkdir $dir, 0755;
}

my $filename = File::Spec->catdir($dir, 'wills.sql');
my %normalised;

if(!-d $dir) {
	mkdir $dir, 0755;
}

if(-r $filename) {
	# Don't bother downloading if the file is less than a day old
	if(((-s $filename) > 0) && (-M $filename < 1) && !$force_flag) {
		exit;
	}
	unlink $filename;
}

my $cache_dir = $ENV{'CACHE_DIR'} || $ENV{'CACHEDIR'};
if($cache_dir) {
	mkdir $cache_dir, 0700 if(!-d $cache_dir);
	$cache_dir = File::Spec->catfile($cache_dir, 'http-cache-transparent');
} else {
	$cache_dir = File::Spec->catfile(File::HomeDir->my_home(), '.cache', 'http-cache-transparent');
}

HTTP::Cache::Transparent::init({
	BasePath => $cache_dir,
	Verbose => 0,
	NoUpdate => 60 * 60 * 24 * 7 * 31,	# The archive never changes
	MaxAge => 30 * 24
}) || die "$0: $cache_dir: $!";

# print "This will take some time. It'd be best to go and make yourself a cup of tea.\n";

my $ua = LWP::UserAgent::WithCache->new(timeout => 10, keep_alive => 1);
$ua->env_proxy(1);
$ua->agent('Mozilla/5.0');
$ua->conn_cache()->total_capacity(undef);
$Lingua::EN::NameCase::POSTNOMINAL = 0;

# print '"last","first","maiden","age","place","newspapername","newspaperdate","tag"', "\n";

my $dbh = DBI->connect("dbi:SQLite:dbname=$filename", undef, undef, { RaiseError => 1, AutoCommit => 0, synchronous => 0, locking_mode => 'EXCLUSIVE' });
die "$filename: $!" if(!defined($dbh));

$dbh->do('PRAGMA cache_size = -65536');	# 64MB
$dbh->do('PRAGMA journal_mode = OFF');
$dbh->do('CREATE TABLE wills(first VARCHAR NOT NULL, middle VARCHAR, last VARCHAR NOT NULL, town VARCHAR, year INTEGER, url VARCHAR)');

my @queue;
foreach my $page ('ab', 'c', 'dg', 'hj', 'km', 'nr', 'sv', 'wy') {
	mrawson($ua, $page);
	flush($dbh) if(scalar(@queue) > 200_000);
};

print ' ' x 78, "\r";

flush($dbh);

$dbh->commit();
$dbh->prepare('CREATE INDEX name_index ON wills(first, last)')->execute();
$dbh->prepare('CREATE INDEX name_index_year ON wills(first, last, year)')->execute();
$dbh->do('pragma optimize');
$dbh->disconnect();

print "\n";

sub mrawson($$) {
	my $ua = shift;
	my $page = shift;
	my @lines;

	my $url = "https://freepages.rootsweb.com/~mrawson/genealogy/will_$page.html";

	$| = 1;
	printf "%-70s\r", $url;
	$| = 0;

	my $response = $ua->get($url);

	my $data;
	if($response->is_success) {
		$data = $response->decoded_content();
	} else {
		die "\n$url: ", $response->status_line();
	}

	$data =~ s/\r//g;
	@lines = split(/$/ms, $data);

	while(my $line = shift(@lines)) {
		# LAST NAME, First Name (MAIDEN); Age; Place of Death; Newspaper Name; Newspaper date; tagname
		return if($line =~ /Created by/);

		next unless($line =~ /^\s*<li><a href="(.+)">(.+),\s+(.+)\s+<\/a> of (.+) (\d{4})/);



( run in 2.039 seconds using v1.01-cache-2.11-cpan-0bb4e1dffa6 )