ACME-QuoteDB
view release on metacpan or search on metacpan
lib/ACME/QuoteDB/LoadDB.pm view on Meta::CPAN
$self->{record}->{$field} = $value;
}
return $self;
}
sub debug_record {
my ($self) = @_;
print Dumper $self->{record};
return;
}
sub get_record {
my ($self, $field) = @_;
if (not $field){return $self}
return $self->{record}->{$field};
}
sub data_to_db {
my ($self) = @_;
if ($self->{file} and $self->{data} and $self->{dir}){
croak 'only file, data or dir as arg but not both'
}
elsif (! ($self->{file} or $self->{data} or $self->{dir})) {
croak 'file, data or dir needed as arg'
}
if ($self->{file}) {
$self->_parse_file($self->{file});
}
elsif ($self->{data}) {
$self->_parse_data($self->{data});
}
elsif ($self->{dir}) {
my $dir = $self->{dir};
my $e = q{};
foreach my $f (<$dir*>) {
#if (! (-e $f) || -z $f) # no worky - need path info
$self->_parse_file($f);
$e++;
}
if (! $e){croak 'no files to parse in: ', Dumper $dir;};
}
else {
croak 'no file source in args!', Dumper $self;
}
return;
}
sub _parse_file {
my ($self, $file) = @_;
if (!-f $file) { croak "file not found: $file" }
if ($self->{verbose}){warn "processing file: $file\n"};
if (($self->{file_format} eq 'csv') || ($self->{file_format} eq 'tsv')){
$self->dbload_from_csv($file);
}
elsif (($self->{file_format} eq 'html') || ($self->{file_format} eq 'custom')){
# not supported, too many possibilities
# supply your own
$self->dbload($file);
}
else {
croak 'unsupported file format requested, format must be csv or tsv';
}
return;
}
sub _parse_data {
my ($self, $data) = @_;
if (!$data) {croak "data empty $data"}
if ($self->{verbose}){carp 'processing data:'};
if ($self->{file_format} =~ /(?:csv|tsv)/sm) {
croak 'TODO: not yet supported';
#$self->dbload_from_csv($data);
}
elsif (($self->{file_format} eq 'html') || ($self->{file_format} eq 'custom')){
# not supported, too many possibilities
# supply your own
$self->dbload($data);
}
else {
croak 'unsupported file format requested, '
.'format must be csv, tsv. html, custom also possible';
}
return $self;
}
sub _confirm_header_order {
my ($hr) = @_;
return ($hr->{quote} eq 'Quote'
and $hr->{name} eq 'Attribution Name',
and $hr->{source} eq 'Attribution Source',
and $hr->{catg} eq 'Category',
and $hr->{rating} eq 'Rating')
or croak 'incorrect headers or header order';
}
sub dbload_from_csv {
my ($self, $file) = @_;
my $delim = $self->{delim} || ',';
my $csv = Text::CSV->new({
sep_char => $delim,
binary => 1
});
$csv->column_names (@QUOTE_FIELDS);
open my $source, '<:encoding(utf8)', $file || croak $!;
_confirm_header_order($csv->getline_hr($source));
while (my $hr = $csv->getline_hr($source)) {
next unless $hr->{quote} and $hr->{name};
if ($self->{verbose}){
print "\n",
'Quote: ', $hr->{quote},"\n",
'Name: ', $hr->{name},"\n",
'Source: ', $hr->{source},"\n",
'Category:', $hr->{catg},"\n",
'Rating: ', $hr->{rating},"\n\n";
};
$self->set_record(quote => $hr->{quote});
$self->set_record(name => $hr->{name});
$self->set_record(source => ($self->{attr_source} || $hr->{source}));
# take user defined first
# TODO support multi categories
lib/ACME/QuoteDB/LoadDB.pm view on Meta::CPAN
Files being loaded are assumed to be utf8 encoded. if utf8 flag is not detected,
falls back to latin1 (iso-8859-1). If neither of these is correct, set this
option to the encoding your file is in.
=back
=head4 Operation Related Parameters
=over 4
=item dry_run - optional
do not write to the database. Use with verbose flag to see what would have beed
written.
This can be helpful for testing the outcome of Loading results.
i.e. like to confirm that the parsing of your data is correct
example:
{
dry_run => 1,
verbose => 1
}
=item verbose - optional
display to STDOUT what is being done
This can be helpful for testing quotes extraction from file parsing
example:
{verbose => 1}
=item create_db - optional (boolean)
L<ACME::QuoteDB::LoadDB> default behaviour is to always assume there is a
database and append new data to that. (It is usually only needed the first
time one load's data)
setting this parameter to a true value will create a new database.
(so while this is an optional param, it is required at least once ;)
B<NOTE: it is not intelligent, if you hand it a populated database,
it will happily overwrite all data>
B<AGAIN: setting this param will destroy the current database, creating a new
empty one>
example:
{create_db => 1}
=back
=head2 data_to_db
takes the data input provided to new, process' it and writes to the database.
should appropriatly blow up if not successful
=head2 dbload_from_csv
takes a csv file (in our defined format) as an argument, parses it and writes
the data to the database. (uses L<Text::CSV> with pure perl parser)
utf-8 safe. (opens file as utf8)
will croak with message if not successful
=head2 dbload
if your file format is set to 'html' or 'custom' you must
define this method to do your parsing in a sub class.
Load from html is not supported because there are too many
ways to represt the data. (same with 'custom')
(see tests for examples - there is a test for loading a 'fortune' file format)
One can subclass ACME::QuoteDB::LoadDB and override dbload,
to do our html parsing
=head2 debug_record
dump record (show what is set on the internal data structure)
e.g. Data::Dumper
=head2 set_record
only needed it one plans to sub-class this module.
otherwise, is transparent in usage.
if you are sub-classing this module, you would have to populate
this record. (L</write_record> knows about/uses this data structure)
possible fields consist of:
$self->set_record(quote => q{});
$self->set_record(rating => q{});
$self->set_record(name => q{});
$self->set_record(source => q{});
$self->set_record(catg => q{});
currently can only set one attribute at a time.
ie. you cant do this:
$self->set_record(
name => $name,
source => $source
);
# or this even
$self->set_record({
name => $name,
source => $source
});
( run in 0.905 second using v1.01-cache-2.11-cpan-39bf76dae61 )