Genealogy-Wills
view release on metacpan or search on metacpan
bin/create_db.PL view on Meta::CPAN
BEGIN {
$SIG{__WARN__} = sub {
my $warning = shift;
if(($warning =~ /^Use of uninitialized value/) ||
($warning =~ /isn't numeric in numeric eq /i)) {
die $warning;
}
warn $warning;
}
}
my $force_flag;
my $dir = 'lib/Genealogy/Wills/data';
if(defined($ARGV[0]) && ($ARGV[0] eq '-f')) {
$force_flag++;
} elsif($ENV{'AUTOMATED_TESTING'}) {
if(!-d $dir) {
mkdir $dir, 0755;
}
exit;
}
if(!-d $dir) {
mkdir $dir, 0755;
}
my $filename = File::Spec->catdir($dir, 'wills.sql');
my %normalised;
if(!-d $dir) {
mkdir $dir, 0755;
}
if(-r $filename) {
# Don't bother downloading if the file is less than a day old
if(((-s $filename) > 0) && (-M $filename < 1) && !$force_flag) {
exit;
}
unlink $filename;
}
my $cache_dir = $ENV{'CACHE_DIR'} || $ENV{'CACHEDIR'};
if($cache_dir) {
mkdir $cache_dir, 0700 if(!-d $cache_dir);
$cache_dir = File::Spec->catfile($cache_dir, 'http-cache-transparent');
} else {
$cache_dir = File::Spec->catfile(File::HomeDir->my_home(), '.cache', 'http-cache-transparent');
}
HTTP::Cache::Transparent::init({
BasePath => $cache_dir,
Verbose => 0,
NoUpdate => 60 * 60 * 24 * 7 * 31, # The archive never changes
MaxAge => 30 * 24
}) || die "$0: $cache_dir: $!";
# print "This will take some time. It'd be best to go and make yourself a cup of tea.\n";
my $ua = LWP::UserAgent::WithCache->new(timeout => 10, keep_alive => 1);
$ua->env_proxy(1);
$ua->agent('Mozilla/5.0');
$ua->conn_cache()->total_capacity(undef);
$Lingua::EN::NameCase::POSTNOMINAL = 0;
# print '"last","first","maiden","age","place","newspapername","newspaperdate","tag"', "\n";
my $dbh = DBI->connect("dbi:SQLite:dbname=$filename", undef, undef, { RaiseError => 1, AutoCommit => 0, synchronous => 0, locking_mode => 'EXCLUSIVE' });
die "$filename: $!" if(!defined($dbh));
$dbh->do('PRAGMA cache_size = -65536'); # 64MB
$dbh->do('PRAGMA journal_mode = OFF');
$dbh->do('CREATE TABLE wills(first VARCHAR NOT NULL, middle VARCHAR, last VARCHAR NOT NULL, town VARCHAR, year INTEGER, url VARCHAR)');
my @queue;
foreach my $page ('ab', 'c', 'dg', 'hj', 'km', 'nr', 'sv', 'wy') {
mrawson($ua, $page);
flush($dbh) if(scalar(@queue) > 200_000);
};
print ' ' x 78, "\r";
flush($dbh);
$dbh->commit();
$dbh->prepare('CREATE INDEX name_index ON wills(first, last)')->execute();
$dbh->prepare('CREATE INDEX name_index_year ON wills(first, last, year)')->execute();
$dbh->do('pragma optimize');
$dbh->disconnect();
print "\n";
sub mrawson($$) {
my $ua = shift;
my $page = shift;
my @lines;
my $url = "https://freepages.rootsweb.com/~mrawson/genealogy/will_$page.html";
$| = 1;
printf "%-70s\r", $url;
$| = 0;
my $response = $ua->get($url);
my $data;
if($response->is_success) {
$data = $response->decoded_content();
} else {
die "\n$url: ", $response->status_line();
}
$data =~ s/\r//g;
@lines = split(/$/ms, $data);
while(my $line = shift(@lines)) {
# LAST NAME, First Name (MAIDEN); Age; Place of Death; Newspaper Name; Newspaper date; tagname
return if($line =~ /Created by/);
next unless($line =~ /^\s*<li><a href="(.+)">(.+),\s+(.+)\s+<\/a> of (.+) (\d{4})/);
( run in 2.039 seconds using v1.01-cache-2.11-cpan-0bb4e1dffa6 )