encoding results from the CPAN

encoding

ACME-QuoteDB

view release on metacpan or search on metacpan

lib/ACME/QuoteDB/LoadDB.pm view on Meta::CPAN

#$Id: LoadDB.pm,v 1.30 2009/09/30 07:37:09 dinosau2 Exp $
# /* vim:et: set ts=4 sw=4 sts=4 tw=78: */

package ACME::QuoteDB::LoadDB;

use 5.008005;        # require perl 5.8.5, re: DBD::SQLite Unicode
use warnings;
use strict;

#use criticism 'brutal'; # use critic with a ~/.perlcriticrc

use version; our $VERSION = qv('0.1.1');

# with Text::CSV only use 'perl csv loader'
# 'one time' db load performance not a concern
BEGIN {local $ENV{PERL_TEXT_CSV} = 0}

use aliased 'ACME::QuoteDB::DB::Attribution' => 'Attr';
use aliased 'ACME::QuoteDB::DB::QuoteCatg'  => 'QuoteCatg';
use aliased 'ACME::QuoteDB::DB::Category'  => 'Catg';
use aliased 'ACME::QuoteDB::DB::Quote'    => 'Quote';
use aliased 'ACME::QuoteDB::DB::DBI'     => 'QDBI';
use File::Basename qw/dirname basename/;
use File::Glob qw(:globally :nocase);
use Encode qw/is_utf8 decode/;
use Data::Dumper qw/Dumper/;
use Carp qw/carp croak/;
use Text::CSV;
use Readonly;
use DBI;

# if not in utf8 latin1 is assumed
my $FILE_ENCODING = 'iso-8859-1';

Readonly my @QUOTE_FIELDS => qw/quote name source catg rating/;

# XXX refactor
sub new {
    my ($class, $args) = @_;

    # TODO encapsulation
    my $self = bless {}, $class;

    # store each record we extract - keys map to database fields
    # TODO proper encapsulation
    $self->{record} = {};
    $self->{record}->{quote}  = q{};
    $self->{record}->{rating} = q{};
    $self->{record}->{name}   = q{};
    $self->{record}->{source} = q{};
    $self->{record}->{catg}   = q{};

    $self->{file}        = $args->{file};
    $self->{dir}         = $args->{dir};
    $self->{data}        = $args->{data};
    $self->{file_format} = $args->{file_format};
    $FILE_ENCODING       = $args->{file_encoding} || $FILE_ENCODING;
    $self->{delim}       = $args->{delimiter};
    $self->{verbose}     = $args->{verbose};
    $self->{category}    = $args->{category};
    $self->{rating}      = $args->{rating};
    $self->{attr_source} = $args->{attr_source};
    $self->{orig_args}   = $args;
    $self->{success}     = undef;

    # start with if set
    $self->{record}->{rating} = $self->{rating};
    $self->{record}->{name}   = $self->{attr_source};
    $self->{record}->{source} = $self->{attr_source};
    if (ref $self->{category} eq 'ARRAY') {
       $self->{record}->{catg} = ();
       foreach my $c (@{$self->{category}}){
         push @{$self->{record}->{catg}}, $c;
       }
    }
    else {
       $self->{record}->{catg} = $self->{category};
    }

    # db connection info
    if ($ENV{ACME_QUOTEDB_DB}) {
        $self->{db}   = $ENV{ACME_QUOTEDB_DB};
        $self->{host} = $ENV{ACME_QUOTEDB_HOST};
        $self->{user} = $ENV{ACME_QUOTEDB_USER};
        $self->{pass} = $ENV{ACME_QUOTEDB_PASS};
    }

    if (!$args->{dry_run}){$self->{write_db} = 1};
    #if ($args->{create_db}) {$self->create_db};
    if ($args->{create_db}) {$self->create_db_tables};

    return $self;
}

sub set_record {
  my ($self, $field, $value) = @_;

  # TODO support mult-field simultanous loading

  if ($value) {
      $self->{record}->{$field} = $value;
  }

  return $self;
}

sub debug_record {
  my ($self) = @_;

  print Dumper $self->{record};

  return;
}

sub get_record {
  my ($self, $field) = @_;

lib/ACME/QuoteDB/LoadDB.pm view on Meta::CPAN

  if (($self->{file_format} eq 'csv') || ($self->{file_format} eq 'tsv')){
      $self->dbload_from_csv($file);
  }
  elsif (($self->{file_format} eq 'html') || ($self->{file_format} eq 'custom')){
      # not supported, too many possibilities
      # supply your own
      $self->dbload($file);
  }
  else {
      croak 'unsupported file format requested, format must be csv or tsv';
  }

  return;
}

sub _parse_data {
  my ($self, $data) = @_;

  if (!$data) {croak "data empty $data"}

  if ($self->{verbose}){carp 'processing data:'};

  if ($self->{file_format} =~ /(?:csv|tsv)/sm) {
      croak 'TODO: not yet supported';
      #$self->dbload_from_csv($data);
  }
  elsif (($self->{file_format} eq 'html') || ($self->{file_format} eq 'custom')){
      # not supported, too many possibilities
      # supply your own
      $self->dbload($data);
  }
  else {
      croak 'unsupported file format requested, '
             .'format must be csv, tsv. html, custom also possible';
  }

  return $self;
}

sub _confirm_header_order {
  my ($hr) = @_;

  return ($hr->{quote}  eq 'Quote'
      and $hr->{name}   eq 'Attribution Name',
      and $hr->{source} eq 'Attribution Source',
      and $hr->{catg}   eq 'Category',
      and $hr->{rating} eq 'Rating')
      or croak 'incorrect headers or header order';
}

sub dbload_from_csv {
  my ($self, $file) = @_;

  my $delim = $self->{delim} || ',';
  my $csv = Text::CSV->new({
     sep_char => $delim,
     binary => 1
  });
  $csv->column_names (@QUOTE_FIELDS);

  open my $source, '<:encoding(utf8)', $file || croak $!;

  _confirm_header_order($csv->getline_hr($source));

  while (my $hr = $csv->getline_hr($source)) {
      next unless $hr->{quote} and $hr->{name};

      if ($self->{verbose}){
          print "\n",
                'Quote:   ', $hr->{quote},"\n",
                'Name:    ', $hr->{name},"\n",
                'Source:  ', $hr->{source},"\n",
                'Category:', $hr->{catg},"\n",
                'Rating:  ', $hr->{rating},"\n\n";
      };

      $self->set_record(quote  => $hr->{quote});
      $self->set_record(name   => $hr->{name});
      $self->set_record(source => ($self->{attr_source} || $hr->{source}));
      # take user defined first
      # TODO support multi categories
      $self->set_record(catg   => ($self->{category} || $hr->{catg}));
      $self->set_record(rating => ($self->{rating} || $hr->{rating}));
      $self->write_record;
  }
  close $source or carp $!;

  return $self;
}

# sub class this - i.e. provide this method in your code (see test
# 01-load_quotes.t)
sub dbload {
  croak 'Override this. Provide this method in a sub class (child) of this object';
  # see tests: t/01-load_quotes.t for examples
}

sub _to_utf8 {
    my ($self) = @_;

    RECORD:
    foreach my $r (@QUOTE_FIELDS){
        my $val = $self->{record}->{$r};
        if (ref $val eq 'ARRAY'){
         foreach my $v (@{$val}){
           if (!is_utf8($v)){
             push @{$self->{record}->{$r}}, decode($FILE_ENCODING, $v);
           }
         }
        }
        else {
          if (!is_utf8($val)){
            $self->{record}->{$r} = decode($FILE_ENCODING, $val);
          }
        }
    }

    return $self;
}

# XXX refactor (the following 3 methods)

lib/ACME/QuoteDB/LoadDB.pm view on Meta::CPAN

This is an Object Oriented module. There is no proceedural interface.

=head2 new

  Instantiate a ACME::QuoteDB::LoadDB object.

  Argument is a hash ref. Params below 


=head4 Data Related Parameters

=over 4

=item  file or directory - one or the other required (not both)

if file, must be in our defined format, full path is needed.

if directory, full path is needed, can supply a basic glob type filter.

example:

{ file  => '/home/me/data/simpsons_quotes.csv' }

{ dir  => '/home/me/data/*.csv' }
 

=item  file_format - required

can be one of: 'csv', 'tsv', 'custom', or 'html'

if 'html' or 'custom' you must supply the method for parsing. 
(see tests for examples)

example:

{ file_format => 'csv' }


=item  delimiter - optional, default is a comma for csv

csv/tsv options tested: comma(,) and tab(\t)

'html' - not applicable

example:

{ delimiter => "\t" }

=item  category - optional, extracted from data if exists, otherwise will use what you
specify

TODO one quote to multiple categories

=item  attr_source - extracted from data if exists, otherwise will use what you
specify

example:

{attr_source => 'The Simpsons'}

=item  file_encoding - optional

Files being loaded are assumed to be utf8 encoded. if utf8 flag is not detected,
falls back to latin1 (iso-8859-1). If neither of these is correct, set this
option to the encoding your file is in.

=back

=head4 Operation Related Parameters

=over 4

=item  dry_run - optional

do not write to the database. Use with verbose flag to see what would have beed
written.

This can be helpful for testing the outcome of Loading results. 

i.e. like to confirm that the parsing of your data is correct

example:

{
 dry_run => 1,
 verbose => 1
}

=item  verbose  - optional

display to STDOUT what is being done

This can be helpful for testing quotes extraction from file parsing

example:

{verbose => 1}

=item  create_db  - optional (boolean)

L<ACME::QuoteDB::LoadDB> default behaviour is to always assume there is a
database and append new data to that. (It is usually only needed the first 
time one load's data)

setting this parameter to a true value will create a new database.
(so while this is an optional param, it is required at least once ;)

B<NOTE: it is not intelligent, if you hand it a populated database,
it will happily overwrite all data>

B<AGAIN: setting this param will destroy the current database, creating a new
empty one>

example:

{create_db => 1}


=back 

=head2 data_to_db

takes the data input provided to new, process' it and writes to the database.
should appropriatly blow up if not successful

( run in 0.765 second using v1.01-cache-2.11-cpan-39bf76dae61 )