Genealogy-ObituaryDailyTimes

 view release on metacpan or  search on metacpan

bin/create_db.PL  view on Meta::CPAN

use autodie qw(:all);

sub freelists($$);
sub mlarchives($$);
sub queue($$$$$$$$$);
sub flush($);
sub normalise_name($);

BEGIN {
	$SIG{__WARN__} = sub {
		my $warning = shift;
		if(($warning =~ /^Use of uninitialized value/) ||
		   ($warning =~ /isn't numeric in numeric eq /i)) {
			die $warning;
		}
		warn $warning;
	}
}

my %normalised;
my $force_flag;
my $dir = 'lib/Genealogy/ObituaryDailyTimes/data';

if(defined($ARGV[0]) && ($ARGV[0] eq '-f')) {
	$force_flag++;
} elsif($ENV{'AUTOMATED_TESTING'}) {
	exit(0);
}

if(!-d $dir) {
	mkdir $dir, 0755;
}

my $filename = File::Spec->catdir($dir, 'obituaries.sql');

if(-r $filename) {
	# Don't bother building if the current file is less than a day old
	if(((-s $filename) > 0) && (-M $filename < 1) && !$force_flag) {
		exit;
	}
	unlink $filename;
}

my $cache_dir = $ENV{'CACHE_DIR'} || $ENV{'CACHEDIR'};
if($cache_dir) {
	mkdir $cache_dir, 0700 if(!-d $cache_dir);
	$cache_dir = File::Spec->catfile($cache_dir, 'http-cache-transparent');
} else {
	$cache_dir = File::Spec->catfile(File::HomeDir->my_home(), '.cache', 'http-cache-transparent');
}

HTTP::Cache::Transparent::init({
	BasePath => $cache_dir,
	Verbose => 0,
	NoUpdate => 60 * 60 * 24 * 7 * 31,	# The archive never changes
	MaxAge => 30 * 24
}) || die "$0: $cache_dir: $!";

print "This will take some time. It'd be best to go and make yourself a cup of tea.\n";

my $ua = LWP::UserAgent::WithCache->new(timeout => 60, keep_alive => 1);
$ua->env_proxy(1);
$ua->agent('Mozilla/5.0');
$ua->conn_cache()->total_capacity(undef);
$Lingua::EN::NameCase::POSTNOMINAL = 0;

# print '"last","first","maiden","age","place","newspapername","newspaperdate","tag"', "\n";

my $dbh = DBI->connect("dbi:SQLite:dbname=$filename", undef, undef, { RaiseError => 1, AutoCommit => 0, synchronous => 0, locking_mode => 'EXCLUSIVE' });
die "$filename: $!" if(!defined($dbh));

$dbh->do('PRAGMA cache_size = -65536');	# 64MB
$dbh->do('PRAGMA journal_mode = OFF');
$dbh->do('CREATE TABLE obituaries(first VARCHAR NOT NULL, middle VARCHAR, last VARCHAR NOT NULL, maiden VARCHAR, age INTEGER, place VARCHAR, newspaper VARCHAR NOT NULL, date DATE NOT NULL, source CHAR NOT NULL, page VARCHAR NOT NULL)');

my @queue;

# Entries on funeral-notices.co.uk

my %notices;
my $url = 'https://www.nigelhorne.com/downloads/obt.txt';
my $response = $ua->get($url);
my $data;
if($response->is_success()) {
	$data = $response->decoded_content();
} else {
	die "$url: ", $response->status_line();
}

# Parse the downloaded file
$data =~ s/\r//g;
foreach my $line (split/^/ms, $data) {
	chomp $line;
	if($line =~ /^'(.+)' => (\d+),$/) {
		$notices{$1} = $2;
	} else {
		die "$url: can't parse $line";
	}
}

while(my($last, $page) = each %notices) {
	funeral_notices($last, $page);
}

# use Data::Dumper;
# die Data::Dumper->new([\@queue])->Dump();

data();
flush($dbh);

if($ENV{'MLARCHIVE_DIR'} || $ENV{'MLARCHIVEDIR'}) {
	if(mlarchives($ua, 'other')) {
		flush($dbh)
	}
}

print ' ' x 78, "\r";

my $page = 1;
while(mlarchives($ua, $page)) {
	flush($dbh) if(scalar(@queue) > 10_000);



( run in 1.826 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )