AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/Collection.pm view on Meta::CPAN
package AI::Categorizer::Collection;
use strict;
use Params::Validate qw(:types);
use Class::Container;
use base qw(Class::Container);
# Constructor parameters accepted by this class, declared through the
# valid_params() class method inherited from Class::Container.
__PACKAGE__->valid_params
(
# verbose: plain scalar, defaults to 0.  It is not read anywhere in this
# file; presumably a verbosity level used by subclasses -- confirm.
verbose => {type => SCALAR, default => 0},
# stopword_file: optional path.  When present, new() reads it line by line
# and hands the resulting word set to the 'document' object parameters.
stopword_file => { type => SCALAR, optional => 1 },
# category_hash: hash reference that new() fills with
# document name => [category names] entries read from category_file.
# NOTE(review): this {} literal is evaluated once, when the class is
# compiled.  If the validation layer hands the same reference to every
# object built without an explicit category_hash, those objects share one
# hash -- confirm against Params::Validate's handling of defaults.
category_hash => { type => HASHREF, default => {} },
# category_file: optional path to a whitespace-separated file; new() treats
# the first field of each line as a document name and the rest as its
# categories.
category_file => { type => SCALAR, optional => 1 },
);
# Objects this class creates on behalf of its users.  'document' defaults to
# AI::Categorizer::Document::Text.  new() configures it through
# delayed_object_params(); presumably 'delayed' means instances are created
# on demand rather than at construction time -- see Class::Container.
__PACKAGE__->contained_objects
(
document => { class => 'AI::Categorizer::Document::Text',
delayed => 1 },
);
# Builds a collection object.
#
# Arguments are a flat key/value list that is forwarded to the parent
# constructor.  The ones this method handles itself:
#   stopwords     - if given as an array reference, it is converted to a
#                   hash reference (word => 1) before being forwarded.
#   category_hash - hash reference; when omitted, a fresh empty hash is
#                   supplied for this object.
#   category_file - path to a whitespace-separated file with one document
#                   per line: the document name followed by its categories.
#                   Each entry is stored in category_hash as
#                   name => [categories].  Blank lines are ignored.
#   stopword_file - path to a file holding one stopword per line; the words
#                   are registered as the 'stopwords' parameter of the
#                   delayed 'document' object.
#
# Returns the new object.  Dies if either file cannot be opened.
sub new {
    my ($class, %args) = @_;

    # Optimize so every document doesn't have to convert the stopword list to a hash
    if ($args{stopwords} and UNIVERSAL::isa($args{stopwords}, 'ARRAY')) {
        $args{stopwords} = { map {+$_ => 1} @{ $args{stopwords} } };
    }

    # The declared default for category_hash is a single {} created when the
    # class is loaded.  This method writes into category_hash below, so give
    # every object its own hash instead of letting objects built without an
    # explicit category_hash write into that one shared default.
    $args{category_hash} = {} unless exists $args{category_hash};

    my $self = $class->SUPER::new(%args);

    if ($self->{category_file}) {
        # Three-argument open: the file name is never parsed for mode
        # characters, and the lexical handle is closed with its scope.
        open my $cat_fh, '<', $self->{category_file}
            or die "Can't open $self->{category_file}: $!";
        while (my $line = <$cat_fh>) {
            my ($doc, @cats) = split ' ', $line;
            # A blank line yields no fields; skip it rather than recording
            # an entry under the empty-string key.
            next unless defined $doc;
            $self->{category_hash}{$doc} = \@cats;
        }
        close $cat_fh;
    }

    if (exists $self->{stopword_file}) {
        my %stopwords;
        open my $stop_fh, '<', $self->{stopword_file}
            or die "$self->{stopword_file}: $!";
        while (my $word = <$stop_fh>) {
            chomp $word;
            $stopwords{$word} = 1;
        }
        close $stop_fh;

        # Every document created later receives this set as 'stopwords'.
        $self->delayed_object_params('document', stopwords => \%stopwords);
    }

    return $self;
}
# Returns the number of documents in the collection.
#
# The total is obtained by rewinding, calling next() until it returns a
# false value, and rewinding again; the result is cached in
# $self->{document_count} and reused on later calls.  Subclasses will
# usually want to override this with something faster that does not have to
# create a real document for every step.
sub count_documents {
    my ($collection) = @_;

    # An earlier call already did the work.
    if (exists $collection->{document_count}) {
        return $collection->{document_count};
    }

    $collection->rewind;
    my $total = 0;
    while ($collection->next) {
        $total += 1;
    }
    # Leave the collection positioned at the start for the caller.
    $collection->rewind;

    $collection->{document_count} = $total;
    return $total;
}
# Abstract methods
# These are forward declarations only: no body is given in this file, so
# concrete collection classes are expected to provide both.
#
# next: count_documents() calls it repeatedly and stops at the first false
# return value, so an implementation must return a true value per document
# and a false value once the documents are exhausted.
sub next;
# rewind: count_documents() calls it before and after iterating; presumably
# it resets iteration to the first document -- confirm against subclasses.
sub rewind;
1;
__END__
=head1 NAME

AI::Categorizer::Collection - Access stored documents

=head1 SYNOPSIS

  my $c = AI::Categorizer::Collection::Files->new
    (path => '/tmp/docs/training',
     category_file => '/tmp/docs/cats.txt');
  print "Total number of docs: ", $c->count_documents, "\n";
  while (my $document = $c->next) {
    ...
  }
( run in 0.881 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )