ACME-QuoteDB

 view release on metacpan or  search on metacpan

lib/ACME/QuoteDB/LoadDB.pm  view on Meta::CPAN

    $self->{record}->{source} = q{};
    $self->{record}->{catg}   = q{};

    $self->{file}        = $args->{file};
    $self->{dir}         = $args->{dir};
    $self->{data}        = $args->{data};
    $self->{file_format} = $args->{file_format};
    $FILE_ENCODING       = $args->{file_encoding} || $FILE_ENCODING;
    $self->{delim}       = $args->{delimiter};
    $self->{verbose}     = $args->{verbose};
    $self->{category}    = $args->{category};
    $self->{rating}      = $args->{rating};
    $self->{attr_source} = $args->{attr_source};
    $self->{orig_args}   = $args;
    $self->{success}     = undef;

    # start with if set
    $self->{record}->{rating} = $self->{rating};
    $self->{record}->{name}   = $self->{attr_source};
    $self->{record}->{source} = $self->{attr_source};
    if (ref $self->{category} eq 'ARRAY') {
       $self->{record}->{catg} = ();
       foreach my $c (@{$self->{category}}){
         push @{$self->{record}->{catg}}, $c;
       }
    }
    else {
       $self->{record}->{catg} = $self->{category};
    }

    # db connection info
    if ($ENV{ACME_QUOTEDB_DB}) {
        $self->{db}   = $ENV{ACME_QUOTEDB_DB};
        $self->{host} = $ENV{ACME_QUOTEDB_HOST};
        $self->{user} = $ENV{ACME_QUOTEDB_USER};
        $self->{pass} = $ENV{ACME_QUOTEDB_PASS};
    }

    if (!$args->{dry_run}){$self->{write_db} = 1};
    #if ($args->{create_db}) {$self->create_db};
    if ($args->{create_db}) {$self->create_db_tables};

    return $self;
}

sub set_record {
  my ($self, $field, $value) = @_;

  # TODO support mult-field simultanous loading

  if ($value) {
      $self->{record}->{$field} = $value;
  }

  return $self;
}

sub debug_record {
  my ($self) = @_;

  print Dumper $self->{record};

  return;
}

sub get_record {
  my ($self, $field) = @_;

  if (not $field){return $self}

  return $self->{record}->{$field};
}

sub data_to_db {
    my ($self) = @_;

    if ($self->{file} and $self->{data} and $self->{dir}){
        croak 'only file, data or dir as arg but not both'
    }
    elsif (! ($self->{file} or $self->{data} or $self->{dir})) {
        croak 'file, data or dir needed as arg'
    }

    if ($self->{file}) {
        $self->_parse_file($self->{file});
    }
    elsif ($self->{data}) {
        $self->_parse_data($self->{data});
    }
    elsif ($self->{dir}) {
        my $dir = $self->{dir};
        my $e = q{};
        foreach my $f (<$dir*>) {
           #if (! (-e $f) || -z $f) # no worky - need path info
           $self->_parse_file($f);
           $e++;
        }
        if (! $e){croak 'no files to parse in: ', Dumper $dir;};
    }
    else {
      croak 'no file source in args!', Dumper $self;
    }

    return;
}

sub _parse_file {
  my ($self, $file) = @_;

  if (!-f $file) { croak "file not found: $file" }

  if ($self->{verbose}){warn "processing file: $file\n"};

  if (($self->{file_format} eq 'csv') || ($self->{file_format} eq 'tsv')){
      $self->dbload_from_csv($file);
  }
  elsif (($self->{file_format} eq 'html') || ($self->{file_format} eq 'custom')){
      # not supported, too many possibilities
      # supply your own
      $self->dbload($file);
  }
  else {
      croak 'unsupported file format requested, format must be csv or tsv';
  }

  return;
}

sub _parse_data {
  my ($self, $data) = @_;

  if (!$data) {croak "data empty $data"}

  if ($self->{verbose}){carp 'processing data:'};

  if ($self->{file_format} =~ /(?:csv|tsv)/sm) {
      croak 'TODO: not yet supported';
      #$self->dbload_from_csv($data);
  }
  elsif (($self->{file_format} eq 'html') || ($self->{file_format} eq 'custom')){
      # not supported, too many possibilities
      # supply your own
      $self->dbload($data);
  }
  else {
      croak 'unsupported file format requested, '
             .'format must be csv, tsv. html, custom also possible';
  }

  return $self;
}

sub _confirm_header_order {
  my ($hr) = @_;

  return ($hr->{quote}  eq 'Quote'
      and $hr->{name}   eq 'Attribution Name',
      and $hr->{source} eq 'Attribution Source',
      and $hr->{catg}   eq 'Category',
      and $hr->{rating} eq 'Rating')
      or croak 'incorrect headers or header order';
}

sub dbload_from_csv {
  my ($self, $file) = @_;

  my $delim = $self->{delim} || ',';
  my $csv = Text::CSV->new({
     sep_char => $delim,
     binary => 1
  });
  $csv->column_names (@QUOTE_FIELDS);

  open my $source, '<:encoding(utf8)', $file || croak $!;

  _confirm_header_order($csv->getline_hr($source));

  while (my $hr = $csv->getline_hr($source)) {
      next unless $hr->{quote} and $hr->{name};

      if ($self->{verbose}){
          print "\n",
                'Quote:   ', $hr->{quote},"\n",
                'Name:    ', $hr->{name},"\n",
                'Source:  ', $hr->{source},"\n",
                'Category:', $hr->{catg},"\n",
                'Rating:  ', $hr->{rating},"\n\n";
      };

      $self->set_record(quote  => $hr->{quote});
      $self->set_record(name   => $hr->{name});
      $self->set_record(source => ($self->{attr_source} || $hr->{source}));
      # take user defined first
      # TODO support multi categories
      $self->set_record(catg   => ($self->{category} || $hr->{catg}));
      $self->set_record(rating => ($self->{rating} || $hr->{rating}));
      $self->write_record;
  }
  close $source or carp $!;

  return $self;
}

# sub class this - i.e. provide this method in your code (see test
# 01-load_quotes.t)
sub dbload {
  croak 'Override this. Provide this method in a sub class (child) of this object';
  # see tests: t/01-load_quotes.t for examples
}

sub _to_utf8 {
    my ($self) = @_;

    RECORD:
    foreach my $r (@QUOTE_FIELDS){
        my $val = $self->{record}->{$r};
        if (ref $val eq 'ARRAY'){
         foreach my $v (@{$val}){
           if (!is_utf8($v)){
             push @{$self->{record}->{$r}}, decode($FILE_ENCODING, $v);
           }
         }
        }
        else {
          if (!is_utf8($val)){
            $self->{record}->{$r} = decode($FILE_ENCODING, $val);
          }
        }
    }

    return $self;
}

# XXX refactor (the following 3 methods)

# one person can have many quotes, is this person in our attribution table
# already?
sub _get_id_if_attr_name_exist {
    my ($self) = @_;

    my $attr_id = q{};

    RECS:
    foreach my $c_obj (Attr->retrieve_all){
        next RECS if not $c_obj->name;
        if ($c_obj->name eq $self->get_record('name')){
          # use attribution id if already exists
          $attr_id = $c_obj->attr_id;
        }
    }
    return $attr_id;
}

sub _get_id_if_catg_exist {
    my ($self, $ctg) = @_;

    my $catg_id = q{};
    # get category id
    RECS:
    foreach my $c_obj (Catg->retrieve_all){
        next RECS if not $c_obj->catg;
        if ($c_obj->catg eq $ctg){
          # use cat_id if already exists
          $catg_id = $c_obj->catg_id;
        }
    }
    return $catg_id;
}

#TODO : refactor
sub write_record {
    my ($self) = @_;

    $self->_to_utf8;

    if ($self->{verbose} and $self->get_record('name')){
        print 'Attribution Name: ',$self->get_record('name'),"\n";
    };

    my $attr_id = $self->_get_id_if_attr_name_exist;
    # nope, ok, add them
    if (not $attr_id) { # attribution record does not already exist, 
                        # create new entry
        if ($self->{write_db}) {
            $attr_id = Attr->insert({
                          name   => $self->get_record('name'),
                       });
        }
    }

    my $catg_ids = ();
    if ($self->{write_db}) {
      my ($catg) = $self->get_record('catg');
      if (! ref $catg){ # 'single' value
        my $catg_id = $self->_get_id_if_catg_exist($catg);
        if (!$catg_id) {
          # category does not already exist, 
          # create new entry
          $catg_id = Catg->insert({catg => $catg});
        }
        push @{$catg_ids}, $catg_id;
      } # support multi catg
      elsif (ref $catg eq 'ARRAY'){
          foreach my $c (@{$catg}){
            my $catg_id = $self->_get_id_if_catg_exist($c);
            if (!$catg_id) { # category does not already exist, 
               # create new entry
               $catg_id = Catg->insert({catg => $c});
            }
            push @{$catg_ids}, $catg_id;
          }
      }
    }

    $self->_display_vals_if_verbose;

    if ($self->{write_db}) {
       my $qid = Quote->insert({
               attr_id  => $attr_id,
               quote    => $self->get_record('quote'),
               source   => $self->get_record('source'),
               rating   => $self->get_record('rating')
       }) or croak $!;

       if ($qid) {
         my $id;
         foreach my $cid (@{$catg_ids}){
           $id =   QuoteCatg->insert({
                 quot_id  => $qid,
                 catg_id  => $cid,
            }) or croak $!;
         }
       }
    }
    # confirmation?
    # TODO add a test for failure
    if ($self->{write_db} and not $attr_id) {croak 'db write not successful'}

    #$self->set_record(undef);
    $self->{record} = {};
    $self->_reset_orig_args;

    if ($self->{write_db}) {
        $self->success(1);
    }

    return $self->success;
}

sub _reset_orig_args {
  my ($self) = @_;

  $self->{record}->{rating} = $self->{orig_args}->{rating};
  $self->{record}->{name}   = $self->{orig_args}->{attr_source};
  $self->{record}->{source} = $self->{orig_args}->{attr_source};
  if (ref $self->{orig_args}->{category} eq 'ARRAY') {
     foreach my $c (@{$self->{orig_args}->{category}}){
       push @{$self->{record}->{catg}}, $c;
     }
  } 
  else {
    $self->{record}->{catg} = $self->{orig_args}->{category};
  }

}

sub success {
  my ($self, $flag) = @_;

  $self->{success} ||= $flag;

  return $self->{success};
};

sub _display_vals_if_verbose {
    my ($self) = @_;

    if ($self->{verbose}){
        #print 'Quote: ',   $self->get_record('quote'),"\n";
        #print 'Source: ',  $self->get_record('source'),"\n";
        #print 'Category: ',$self->get_record('catg'),"\n";
        #print 'Rating: ',  $self->get_record('rating'),"\n";
        print Dumper $self->{record};
    }

    return $self;
}

#sub create_db {
#    my ($self) = @_;
#
#    if ($self->{db} and $self->{host}) {
#      $self->create_db_mysql();
#    }
#}

sub create_db_tables {
    my ($self) = @_;

    if ($self->{db} and $self->{host}) {
      #$self->create_db_mysql();
      $self->create_db_tables_mysql();
    }
    else {
      create_db_tables_sqlite();
    }

    return $self;

}

# XXX  we want the user to supply a pre created database.
# created as such 'CREATE DATABASE $dbn CHARACTER SET utf8 COLLATE utf8_general_ci'
# this get's into too many isseuwith privs and database creation
#Sat Aug 22 13:42:37 PDT 2009
# did this:
#mysql> CREATE DATABASE acme_quotedb CHARACTER SET utf8 COLLATE utf8_general_ci;
#mysql> grant usage on *.* to acme_user@localhost identified by 'acme';
#mysql> grant all privileges on acme_quotedb.* to acme_user@localhost ;

#sub create_db_mysql {
#    my ($self) = @_;
#
#     # hmmmm, what about priv's access, etc
#     # maybe user need to supply a db, they have 
#     # access to, already created (just the db though)
#     ## create our db
#     #my $dbhc = DBI->connect('DBI:mysql:database=mysql;host='
#     #                           .$self->{host}, $self->{user}, $self->{pass})
#     #      || croak "db cannot be accessed $! $DBI::errstr";
#
#     #my $dbn = $self->{db};
#     #my $db = qq(CREATE DATABASE $dbn CHARACTER SET utf8 COLLATE utf8_general_ci);
#     # eval {
#     #     $dbhc->do($db) or croak $dbhc->errstr;
#     # };
#     # $@ and croak 'Cannot create database!';
#     # $dbhc->disconnect; $dbhc = undef;
#
#     my $drh = DBI->install_driver('mysql');
#     my $rc = $drh->func("dropdb", $self->{db}, 
#                    [$self->{host}, $self->{user}, $self->{password}],
#                    'admin'

lib/ACME/QuoteDB/LoadDB.pm  view on Meta::CPAN

            source         TEXT,
            rating         REAL
            );')
            or croak $dbh->errstr;

        $dbh->do('DROP TABLE IF EXISTS attribution;') or croak $dbh->errstr;

        $dbh->do('CREATE TABLE IF NOT EXISTS attribution (
            attr_id  INTEGER PRIMARY KEY,
            name     TEXT
            );') or croak $dbh->errstr;

        $dbh->do('DROP TABLE IF EXISTS category;') or croak $dbh->errstr;

        $dbh->do('CREATE TABLE IF NOT EXISTS category (
            catg_id    INTEGER PRIMARY KEY, 
            catg       TEXT
            );') or croak $dbh->errstr;

        $dbh->do('DROP TABLE IF EXISTS quote_catg;') or croak $dbh->errstr;

        $dbh->do('CREATE TABLE IF NOT EXISTS quote_catg (
            id         INTEGER PRIMARY KEY,
            catg_id    INTEGER,
            quot_id    INTEGER
            );') or croak $dbh->errstr;

        $dbh->disconnect or carp $dbh->errstr;

        $dbh = undef;
    };

    return $@ and croak 'Cannot create database tables!';

}

q(My cat's breath smells like cat food. --Ralph Wiggum);


__END__

=head1 NAME

ACME::QuoteDB::LoadDB - Database loader for ACME::QuoteDB

=head1 VERSION

Version 0.1.1

=head1 SYNOPSIS

load a csv file to quotes database 

  my $load_db = ACME::QuoteDB::LoadDB->new({
                              file => '/home/me/data/simpsons_quotes.csv',
                              file_format => 'csv',
                          });
  
  $load_db->data_to_db;

  print $load_db->success; # bool

header columns of the csv file as follows:

"Quote", "Attribution Name", "Attribution Source", "Category", "Rating"


=head1 DESCRIPTION

This module is part of L<ACME::QuoteDB>. This is a Database loader, it
takes (quotes) data and loads into a database 
(currently L<sqlite3 or mysql|/'CONFIGURATION AND ENVIRONMENT'>),
which is then accessed by L<ACME::QuoteDB>.


There are several ways to get quote data into the db via this loader:
(There are more aimed towards 'batch' operations, i.e load a bunch of 
records quickly)

=over 4

=item 1

* csv file (pre determined format)

     pros: quick and easy to load.
     cons: getting the quotes data into the correct format need by this module

=item 2

* any source.

    One can take quote data from any source, override
    L<ACME::QuoteDB::LoadDB/dbload> loader methods to populate a record
    and write it to the db.
     pros: can get any quote data into the db.
     cons: you supply the method. depending on the complexity of the data
           source and munging required this will take longer then the other 
           methods.

=back

=head3 load from csv file

The pre defined csv file format is:

format of file is as follows: (headers)
"Quote", "Attribution Name", "Attribution Source", "Category", "Rating"
  
   for example:
   "Quote", "Attribution Name", "Attribution Source", "Category", "Rating"
   "I hope this has taught you kids a lesson: kids never learn.","Chief Wiggum","The Simpsons","Humor",9
   "Sideshow Bob has no decency. He called me Chief Piggum. (laughs) Oh wait, I get it, he's all right.","Chief Wiggum","The Simpsons","Humor",8


   my $load_db = ACME::QuoteDB::LoadDB->new({
                               file => dirname(__FILE__).'/data/simpsons_quotes.csv',
                               file_format => 'csv',
                           });
   
   $load_db->data_to_db;

   if (!$load_db->success){print 'failed'}


=head3 load from any source

If those dont catch your interest, ACME::QuoteDB::LoadDB is sub-classable, 
so one can extract data anyway they like and populate the db themselves. 
(there is a test that illustrates overriding the stub method, 'dbload')

you need to populate a record data structure:

    $self->set_record(quote  => q{}); # mandatory
    $self->set_record(name   => q{}); # mandatory
    $self->set_record(source => q{}); # optional but useful
    $self->set_record(catg   => q{}); # optional but useful
    $self->set_record(rating => q{}); # optional but useful

    # then to write the record you call
    $self->write_record;

NOTE: this is a one record at a time operation, so one would perform 
this within a loop. there is no bulk write operation currently.


=head1 OVERVIEW

You have a collection of quotes (adages/sayings/quips/epigrams, etc) for
whatever reason, you use these quotes for whatever reason, you want to 
access these quotes in a variety of ways,...

This module is part of L<ACME::QuoteDB>. 

This is a Database loader, it takes data (quotes) and loads into a database, 
which is then accessed by L<ACME::QuoteDB>.

See L<ACME::QuoteDB>.


=head1 USAGE

General usage, csv/tsv file in the expected format loaded to the database

  my $load_db = ACME::QuoteDB::LoadDB->new({
                              file => '/home/me/data/sorta_funny_quotes.tsv',
                              file_format => 'tsv',
                              delimiter => "\t",
                              # provide a attr_source for all (if not in data)
                              # data is used first, if not defined use below
                              attr_source => 'Things Randomly Overheard',
                              # provide a category for all (if not in data)
                              category => 'Humor',
                              # provide a rating for all
                              rating   => 5, # scale 1-10
                          });
  $load_db->data_to_db;

  if (!$load_db->success){print 'failed'}

Also see t/01-load_quotes.t included with the distribution.

(available from the CPAN if not included on your system)


=head1 SUBROUTINES/METHODS 

This is an Object Oriented module. There is no proceedural interface.

=head2 new

  Instantiate a ACME::QuoteDB::LoadDB object.

  Argument is a hash ref. Params below 


=head4 Data Related Parameters

=over 4

=item  file or directory - one or the other required (not both)

if file, must be in our defined format, full path is needed.

if directory, full path is needed, can supply a basic glob type filter.

example:

{ file  => '/home/me/data/simpsons_quotes.csv' }

{ dir  => '/home/me/data/*.csv' }
 

=item  file_format - required

can be one of: 'csv', 'tsv', 'custom', or 'html'

if 'html' or 'custom' you must supply the method for parsing. 
(see tests for examples)

example:

{ file_format => 'csv' }


=item  delimiter - optional, default is a comma for csv

csv/tsv options tested: comma(,) and tab(\t)

'html' - not applicable

example:

{ delimiter => "\t" }

=item  category - optional, extracted from data if exists, otherwise will use what you
specify

TODO one quote to multiple categories



( run in 2.727 seconds using v1.01-cache-2.11-cpan-d8267643d1d )