AI-Categorizer

 view release on metacpan or  search on metacpan

lib/AI/Categorizer/Collection.pm  view on Meta::CPAN

package AI::Categorizer::Collection;
use strict;

use Params::Validate qw(:types);
use Class::Container;
use base qw(Class::Container);
# Constructor parameters accepted by this class.  They are declared through
# the valid_params() class method inherited from Class::Container, using the
# type constants imported from Params::Validate.
__PACKAGE__->valid_params
  (
   # Verbosity setting; a plain scalar that defaults to 0.
   verbose => {type => SCALAR, default => 0},
   # Optional path to a file holding one stopword per line; new() reads it
   # and hands the resulting set to the contained 'document' objects.
   stopword_file => { type => SCALAR, optional => 1 },
   # Hash ref; new() stores "document name => array ref of category names"
   # entries in it when category_file is given.
   # NOTE(review): the {} default is built once, when this statement runs -
   # confirm whether Params::Validate hands the same ref to every instance.
   category_hash => { type => HASHREF, default => {} },
   # Optional path to a file whose lines each hold a document name followed
   # by its whitespace-separated category names; loaded by new().
   category_file => { type => SCALAR, optional => 1 },
  );

# Objects this container knows how to build.  'document' is declared as a
# delayed object: new() below only registers parameters for it through
# delayed_object_params() (see Class::Container for how and when delayed
# objects are actually created).
__PACKAGE__->contained_objects
  (
   document => { class => 'AI::Categorizer::Document::Text',
		 delayed => 1 },
  );

# Constructor.
#
# Takes a flat list of key/value pairs: the parameters declared with
# valid_params() above, plus whatever the parent constructor accepts
# (for instance a "stopwords" entry).
#
#   stopwords      an array ref of stopwords is converted to a hash ref
#                  (word => 1) before being passed on to the parent
#                  constructor
#   category_file  if set, every non-blank line "DOC CAT1 CAT2 ..." is stored
#                  as category_hash->{DOC} = [CAT1, CAT2, ...]
#   stopword_file  if present, each line becomes one stopword and the
#                  resulting set is registered for the delayed 'document'
#                  objects
#
# Returns the new object.  Dies if either file cannot be opened.
sub new {
  my ($class, %args) = @_;

  # Optimize so every document doesn't have to convert the stopword list to a hash
  if ($args{stopwords} and UNIVERSAL::isa($args{stopwords}, 'ARRAY')) {
    $args{stopwords} = { map {+$_ => 1} @{ $args{stopwords} } };
  }

  my $self = $class->SUPER::new(%args);

  if ($self->{category_file}) {
    # Fill a private copy: the hash ref we were given is either the caller's
    # own data or the single default {} from valid_params(), which may be
    # shared between instances, so writing into it directly could leak
    # categories from one collection into another.
    $self->{category_hash} = { %{ $self->{category_hash} || {} } };

    # Three-argument open with a lexical handle: the path is never parsed
    # for mode characters, and no global filehandle is touched.
    open my $cat_fh, '<', $self->{category_file}
      or die "Can't open $self->{category_file}: $!";
    # A lexical line variable keeps the caller's $_ intact.
    while (my $line = <$cat_fh>) {
      my ($doc, @cats) = split ' ', $line;
      next unless defined $doc;   # blank line: nothing to record
      $self->{category_hash}{$doc} = \@cats;
    }
    close $cat_fh;
  }

  if (exists $self->{stopword_file}) {
    my %stopwords;
    open my $stop_fh, '<', $self->{stopword_file}
      or die "$self->{stopword_file}: $!";
    while (my $word = <$stop_fh>) {
      chomp $word;
      $stopwords{$word} = 1;
    }
    close $stop_fh;

    # Documents are created later, so the stopword set is registered as a
    # parameter for them rather than being applied here.
    $self->delayed_object_params('document', stopwords => \%stopwords);
  }

  return $self;
}

# Returns the number of documents in the collection.
#
# The first call walks the whole collection - rewind, call next() until it
# yields a false value, rewind again - and remembers the total in
# $self->{document_count}; later calls just return the remembered value.
# Subclasses will usually want to override this with something faster that
# doesn't have to create actual documents on every step.
sub count_documents {
  my ($self) = @_;

  unless (exists $self->{document_count}) {
    my $total = 0;

    $self->rewind;
    while ($self->next) {
      $total++;
    }
    $self->rewind;

    $self->{document_count} = $total;
  }

  return $self->{document_count};
}

# Abstract methods
#
# These are forward declarations only; no bodies are supplied for them here,
# so a concrete collection class has to provide both.  count_documents()
# calls rewind() before and after iterating, and keeps calling next() until
# it returns a false value, counting one document per true return.
sub next;
sub rewind;

1;
__END__

=head1 NAME

AI::Categorizer::Collection - Access stored documents

=head1 SYNOPSIS

  my $c = AI::Categorizer::Collection::Files->new(
    path          => '/tmp/docs/training',
    category_file => '/tmp/docs/cats.txt',
  );
  print "Total number of docs: ", $c->count_documents, "\n";
  while (my $document = $c->next) {
    ...
  }



( run in 0.881 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )