ACME-QuoteDB
view release on metacpan or search on metacpan
lib/ACME/QuoteDB/LoadDB.pm view on Meta::CPAN
#$Id: LoadDB.pm,v 1.30 2009/09/30 07:37:09 dinosau2 Exp $
# /* vim:et: set ts=4 sw=4 sts=4 tw=78: */
package ACME::QuoteDB::LoadDB;
use 5.008005; # require perl 5.8.5, re: DBD::SQLite Unicode
use warnings;
use strict;
#use criticism 'brutal'; # use critic with a ~/.perlcriticrc
use version; our $VERSION = qv('0.1.1');
# Intent: make Text::CSV use only its 'perl csv loader' by setting
# PERL_TEXT_CSV to 0 at compile time.
# 'one time' db load performance not a concern
# NOTE(review): `local` restores $ENV{PERL_TEXT_CSV} as soon as this BEGIN
# block exits, i.e. before the `use Text::CSV` further down is compiled, so
# this presumably has no effect on which csv backend gets loaded - confirm,
# and drop the `local` if the pure-perl backend is really required.
BEGIN {local $ENV{PERL_TEXT_CSV} = 0}
use aliased 'ACME::QuoteDB::DB::Attribution' => 'Attr';
use aliased 'ACME::QuoteDB::DB::QuoteCatg' => 'QuoteCatg';
use aliased 'ACME::QuoteDB::DB::Category' => 'Catg';
use aliased 'ACME::QuoteDB::DB::Quote' => 'Quote';
use aliased 'ACME::QuoteDB::DB::DBI' => 'QDBI';
use File::Basename qw/dirname basename/;
use File::Glob qw(:globally :nocase);
use Encode qw/is_utf8 decode/;
use Data::Dumper qw/Dumper/;
use Carp qw/carp croak/;
use Text::CSV;
use Readonly;
use DBI;
# Encoding handed to decode() by _to_utf8 for record values that do not carry
# perl's utf8 flag: if not in utf8 latin1 is assumed, unless new() is given a
# file_encoding argument.
# This is a file-scoped variable, so a file_encoding passed to any new() call
# replaces the value for every loader object created in this process.
my $FILE_ENCODING = 'iso-8859-1';
# Names of the record fields, in the order used as the csv column names.
Readonly my @QUOTE_FIELDS => qw/quote name source catg rating/;
# XXX refactor
# Constructor: build a loader object from a single hash ref of arguments.
#
# Argument keys read here: file, dir, data, file_format, file_encoding,
# delimiter, verbose, category, rating, attr_source, dry_run and create_db.
#
# Besides copying the arguments onto the object this
#   - overrides the file-scoped $FILE_ENCODING when file_encoding is true,
#   - seeds $self->{record} (its keys map to database fields) with any
#     user supplied rating, attribution source and category,
#   - copies the db connection settings from %ENV when ACME_QUOTEDB_DB is set,
#   - sets write_db unless dry_run was requested,
#   - calls create_db_tables when create_db is true.
#
# Returns the new object.
sub new {
    my ($class, $args) = @_;

    # TODO proper encapsulation
    my $self = bless {}, $class;

    # arguments stored under the same name they were passed with
    for my $key (qw/file dir data file_format verbose category rating attr_source/) {
        $self->{$key} = $args->{$key};
    }
    $self->{delim}     = $args->{delimiter};
    $self->{orig_args} = $args;
    $self->{success}   = undef;

    if ($args->{file_encoding}) {
        $FILE_ENCODING = $args->{file_encoding};
    }

    # the record being extracted starts out with whatever the user supplied;
    # note that both name and source are seeded from attr_source
    $self->{record} = {
        quote  => q{},
        rating => $self->{rating},
        name   => $self->{attr_source},
        source => $self->{attr_source},
        catg   => q{},
    };

    if (ref $self->{category} eq 'ARRAY') {
        # private copy of the list; an empty list leaves the field undefined
        my @categories = @{ $self->{category} };
        $self->{record}->{catg} = @categories ? [@categories] : undef;
    }
    else {
        $self->{record}->{catg} = $self->{category};
    }

    # db connection info
    if ($ENV{ACME_QUOTEDB_DB}) {
        @{$self}{qw/db host user pass/} = @ENV{
            qw/ACME_QUOTEDB_DB ACME_QUOTEDB_HOST ACME_QUOTEDB_USER ACME_QUOTEDB_PASS/
        };
    }

    $self->{write_db} = 1 unless $args->{dry_run};

    $self->create_db_tables if $args->{create_db};

    return $self;
}
# Store a value in one field of the record that is currently being built.
#
#   $self->set_record($field => $value);
#
# $field is used as the key into $self->{record} (new() seeds the keys quote,
# rating, name, source and catg).
#
# An undefined or empty $value is ignored, so the field keeps whatever it
# already holds. Every other value is stored - including a plain 0, which a
# simple truth test would silently throw away (think of a rating of 0).
#
# Returns $self.
sub set_record {
    my ($self, $field, $value) = @_;

    # TODO support multi-field simultaneous loading
    if (defined $value and $value ne q{}) {
        $self->{record}->{$field} = $value;
    }

    return $self;
}
# Debugging aid: print a Data::Dumper dump of the record built so far to the
# currently selected output handle. Returns nothing.
sub debug_record {
    my ($self) = @_;

    my $dump = Dumper($self->{record});
    print $dump;

    return;
}
sub get_record {
my ($self, $field) = @_;
lib/ACME/QuoteDB/LoadDB.pm view on Meta::CPAN
if (($self->{file_format} eq 'csv') || ($self->{file_format} eq 'tsv')){
$self->dbload_from_csv($file);
}
elsif (($self->{file_format} eq 'html') || ($self->{file_format} eq 'custom')){
# not supported, too many possibilities
# supply your own
$self->dbload($file);
}
else {
croak 'unsupported file format requested, format must be csv or tsv';
}
return;
}
# Dispatch in-memory $data to the loader that matches $self->{file_format}.
#
#   'html' / 'custom' - handed to $self->dbload($data), which a subclass is
#                       expected to provide
#   'csv'  / 'tsv'    - not yet supported for in-memory data: croaks
#   anything else     - croaks
#
# The format is compared for equality, so a value that merely contains
# 'csv' or 'tsv' is reported as unsupported instead of as 'not yet supported'.
#
# Croaks when $data is false (undef, empty string or 0).
# Carps a progress note when the verbose flag is set.
# Returns $self.
sub _parse_data {
    my ($self, $data) = @_;

    if (!$data) {
        # do not interpolate undef: that would only add an 'uninitialized'
        # warning on top of the error
        croak 'data empty ' . (defined $data ? $data : q{});
    }

    if ($self->{verbose}) {
        carp 'processing data:';
    }

    # a missing file_format is handled like any other unsupported one
    my $format = defined $self->{file_format} ? $self->{file_format} : q{};

    if ($format eq 'csv' or $format eq 'tsv') {
        croak 'TODO: not yet supported';
        #$self->dbload_from_csv($data);
    }
    elsif ($format eq 'html' or $format eq 'custom') {
        # not supported, too many possibilities
        # supply your own
        $self->dbload($data);
    }
    else {
        croak 'unsupported file format requested, '
            . 'format must be csv, tsv. html, custom also possible';
    }

    return $self;
}
# Validate the header row of a csv/tsv file.
#
# $hr is the first row of the file as a hash ref keyed by record field name
# (quote, name, source, catg, rating). Because the columns are mapped to those
# keys by position, comparing each value with the expected header text checks
# the header names and their order in one go.
#
# Returns 1 when every header matches.
# Croaks with 'incorrect headers or header order' when a header is missing or
# different, or when no header row was read at all.
#
# Written as an explicit test followed by croak: `return EXPR or croak ...`
# would return before the croak could ever be evaluated.
sub _confirm_header_order {
    my ($hr) = @_;

    my %expected_header_for = (
        quote  => 'Quote',
        name   => 'Attribution Name',
        source => 'Attribution Source',
        catg   => 'Category',
        rating => 'Rating',
    );

    # an empty file yields no header row at all
    my $headers_ok = ref($hr) eq 'HASH';

    if ($headers_ok) {
        my @mismatched = grep {
            !defined $hr->{$_} or $hr->{$_} ne $expected_header_for{$_}
        } keys %expected_header_for;
        $headers_ok = !@mismatched;
    }

    if (!$headers_ok) {
        croak 'incorrect headers or header order';
    }

    return 1;
}
# Load quotes from a csv/tsv file.
#
# $file is the path of the file to read. It is opened through the
# ':encoding(utf8)' layer and split on $self->{delim} (a comma when no
# delimiter was configured). The first row is passed to
# _confirm_header_order; the columns of every following row are mapped, by
# position, to the @QUOTE_FIELDS names.
#
# Rows without a quote or an attribution name are skipped. For source,
# category and rating a value configured on the object (attr_source,
# category, rating) takes precedence over the value found in the row. Each
# remaining row is copied into the record with set_record and handed to
# write_record.
#
# With the verbose flag set, every accepted row is printed before it is
# stored.
#
# Croaks when the file cannot be opened, carps when closing it fails.
# Returns $self.
sub dbload_from_csv {
    my ($self, $file) = @_;

    my $delim = $self->{delim} || ',';
    my $csv   = Text::CSV->new({
        sep_char => $delim,
        binary   => 1
    });
    $csv->column_names(@QUOTE_FIELDS);

    # low precedence `or` is required here: with `||` the croak would be
    # attached to $file and a failed open would go unnoticed
    open my $source, '<:encoding(utf8)', $file
        or croak $!;

    _confirm_header_order($csv->getline_hr($source));

    # the loop ends with the first row getline_hr cannot deliver
    while (my $hr = $csv->getline_hr($source)) {
        next unless $hr->{quote} and $hr->{name};

        if ($self->{verbose}) {
            print "\n",
                'Quote:   ', $hr->{quote},  "\n",
                'Name:    ', $hr->{name},   "\n",
                'Source:  ', $hr->{source}, "\n",
                'Category:', $hr->{catg},   "\n",
                'Rating:  ', $hr->{rating}, "\n\n";
        }

        $self->set_record(quote  => $hr->{quote});
        $self->set_record(name   => $hr->{name});
        $self->set_record(source => ($self->{attr_source} || $hr->{source}));
        # take user defined first
        # TODO support multi categories
        $self->set_record(catg   => ($self->{category} || $hr->{catg}));
        $self->set_record(rating => ($self->{rating} || $hr->{rating}));

        $self->write_record;
    }

    close $source or carp $!;

    return $self;
}
# Loader hook for the 'html' and 'custom' formats. There are too many possible
# layouts to support here, so a sub class has to provide this method itself;
# this default implementation only croaks.
# see tests: t/01-load_quotes.t for examples
sub dbload {
    my $complaint
        = 'Override this. Provide this method in a sub class (child) of this object';
    croak $complaint;
}
# Decode every field of the current record that is not yet flagged as utf8.
#
# Walks the @QUOTE_FIELDS keys of $self->{record}. A value without perl's
# utf8 flag is run through decode() using $FILE_ENCODING. A field holding an
# array ref (e.g. several categories) is replaced by a new array of the same
# length and order in which each unflagged element has been decoded; the
# array that was stored in the field is left untouched.
#
# Building a new list matters: pushing the decoded values onto the array
# while looping over it would keep the undecoded originals, add duplicates
# and grow the very list being iterated.
#
# Returns $self.
sub _to_utf8 {
    my ($self) = @_;

    foreach my $field (@QUOTE_FIELDS) {
        my $val = $self->{record}->{$field};

        if (ref $val eq 'ARRAY') {
            $self->{record}->{$field} = [
                map { is_utf8($_) ? $_ : decode($FILE_ENCODING, $_) } @{$val}
            ];
        }
        elsif (!is_utf8($val)) {
            $self->{record}->{$field} = decode($FILE_ENCODING, $val);
        }
    }

    return $self;
}
# XXX refactor (the following 3 methods)
lib/ACME/QuoteDB/LoadDB.pm view on Meta::CPAN
This is an Object Oriented module. There is no procedural interface.
=head2 new
Instantiate an ACME::QuoteDB::LoadDB object.
Argument is a hash ref. Params below
=head4 Data Related Parameters
=over 4
=item file or directory - one or the other required (not both)
if file, must be in our defined format, full path is needed.
if directory, full path is needed, can supply a basic glob type filter.
example:
{ file => '/home/me/data/simpsons_quotes.csv' }
{ dir => '/home/me/data/*.csv' }
=item file_format - required
can be one of: 'csv', 'tsv', 'custom', or 'html'
if 'html' or 'custom' you must supply the method for parsing.
(see tests for examples)
example:
{ file_format => 'csv' }
=item delimiter - optional, default is a comma for csv
csv/tsv options tested: comma(,) and tab(\t)
'html' - not applicable
example:
{ delimiter => "\t" }
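=item category - optional. If specified, it is used as the category of the quotes being
loaded (for csv/tsv input it takes precedence over the Category column); otherwise the
category is extracted from the data, if it exists there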
TODO one quote to multiple categories
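=item attr_source - if specified, it is used as the attribution source of the quotes being
loaded (for csv/tsv input it takes precedence over the Attribution Source column);
otherwise the source is extracted from the data, if it exists there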
example:
{attr_source => 'The Simpsons'}
=item file_encoding - optional
Files being loaded are assumed to be utf8 encoded. If the utf8 flag is not detected,
the loader falls back to latin1 (iso-8859-1). If neither of these is correct, set this
option to the encoding your file is in.
=back
=head4 Operation Related Parameters
=over 4
=item dry_run - optional
do not write to the database. Use with verbose flag to see what would have been
written.
This can be helpful for testing the outcome of Loading results.
i.e. like to confirm that the parsing of your data is correct
example:
{
dry_run => 1,
verbose => 1
}
=item verbose - optional
display to STDOUT what is being done
This can be helpful for testing quotes extraction from file parsing
example:
{verbose => 1}
=item create_db - optional (boolean)
L<ACME::QuoteDB::LoadDB>'s default behaviour is to always assume there is a
database and append new data to that. (This parameter is usually only needed the
first time one loads data.)
setting this parameter to a true value will create a new database.
(so while this is an optional param, it is required at least once ;)
B<NOTE: it is not intelligent, if you hand it a populated database,
it will happily overwrite all data>
B<AGAIN: setting this param will destroy the current database, creating a new
empty one>
example:
{create_db => 1}
=back
=head2 data_to_db
Takes the data input provided to new, processes it and writes it to the database.
Should blow up appropriately if not successful.
( run in 0.765 second using v1.01-cache-2.11-cpan-39bf76dae61 )