ACME-QuoteDB
view release on metacpan or search on metacpan
t/01-load_quotes.t view on Meta::CPAN
my ($self, $file) = @_;
my $p = HTML::TokeParser->new($file) || croak $!;
while (my $token = $p->get_tag("p")) {
my $idn = $token->[1]{class} || q{};
my $id = $token->[1]{id} || q{}; # if a quotation is continued (id
#is not set)
next unless $idn and ( $idn eq 'quotation' || $idn eq 'source');
#my $data = $p->get_trimmed_text("/p");
my $data = $p->get_text('p', 'cite');
#warn Dumper $data;
# XXX see $self->set_record in ACME::QuoteDB::LoadDB for fields
# to populate
if ($idn eq 'quotation' and $id) {
$self->set_record(quote => $data);
}
elsif ($idn eq 'quotation' and not $id) {
my $d = $self->get_record('quote') || q{};
$self->set_record(quote => qq{$d $data});
}
elsif ($idn eq 'source'){
my ($name, $source) = split /,/, $data;
if ($name) {
chomp $name;
$name =~ s/\A\s+//xms;
$name =~ s/\s+\z//xms;
}
$self->set_record(name => $name);
$self->set_record(source => $source);
# TODO
#$self->set_record({
# name => $name,
# source => $source
#});
}
if ($self->get_record('quote') and $self->get_record('name')) {
# we provided a category and rating, otherwise would have to
# parse from data too
$self->set_record(catg => $self->{category});
$self->set_record(rating => $self->{rating});
# TODO
#$self->set_record({
# catg => $self->{category},
# rating => $self->{rating}
#});
#$self->debug_record;
$self->write_record;
}
}
}
package main;
use File::Basename qw/dirname/;
use File::Spec;
# Build the path pattern for the bundled python-quotes HTML fixtures, which
# live under the 'data' directory next to this test file. The pattern is
# passed to the loader as 'dir' (a simple glob pattern is accepted).
my $py_quot = File::Spec->catfile(
    dirname(__FILE__), 'data', 'www.amk.ca', 'quotations',
    'python-quotes', '*.html'
);

# Construct the HTML loader; it is expected to be an ACME::QuoteDB::LoadDB.
my $load_db = LoadQuoteDBFromHtml->new({
    dir         => $py_quot,
    file_format => 'html',
    create_db   => 1, # first run, create the db
    # provide a category for all (if not in data)
    category    => 'Python',
    # provide a rating for all (if not in data)
    # and desired
    rating      => 5,
});
isa_ok $load_db, 'ACME::QuoteDB::LoadDB';

# Parse the fixture files and write the resulting records to the database.
$load_db->data_to_db;

# Each assertion carries a description so that a failure can be identified
# in the TAP output instead of showing up as an anonymous numbered test.
ok $load_db->success, 'loading the python-quotes html data reports success';
is $load_db->success, 1, 'success flag is exactly 1 after loading';

# Accessor object used to query the freshly loaded quotes.
my $sq = ACME::QuoteDB->new;
isa_ok $sq, 'ACME::QuoteDB';
# expected attribution list from our data (ok, so the data has some
# 'inconsistencies', ...)
#grep "'source'" *.html|sed -e 's/,.*$//g' -e 's/<\/p>//g' -e s'/^.*>//g'| sort -u
#seems more accurate: grep "'source'" *.html|sed -e "s/^.*source'>//g" -e 's/,.*$//g' | sort -u
my @expected_attribution_list = (
'Aaron Watters',
'Alex Martelli',
'Allan Bailey',
'A.M. Kuchling',
'Andrew Mullhaupt',
'Anthony Baxter',
'An unknown poster and Fredrik Lundh',
'Brett Cannon',
'Christian Tismer',
'Donald E. Knuth',
'Donn Cave uses sarcasm with devastating effect',
'Fred Drake on the Documentation SIG',
'Fredrik Lundh',
'From Kim "Howard" Johnson\'s',
'Gareth McCaughan',
'Gordon McMillan',
'Guido van Rossum',
'GvR',
'Jack Jansen',
'Jeremy Hylton',
'Jim Ahlstrom',
'Jim Fulton and Paul Everitt on the Bobo list',
'Jim Fulton and Tim Peters',
'John Eikenberry on the Bobo list',
'John Holmgren',
'John J. Lehmann',
'John Redford',
'Joseph Strout',
"Kristj\x{E1}n J\x{F3}nsson",
'Larry Wall',
'Mark Jackson',
( run in 0.696 second using v1.01-cache-2.11-cpan-df04353d9ac )