AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/Collection.pm view on Meta::CPAN
package AI::Categorizer::Collection;
use strict;
use Params::Validate qw(:types);
use Class::Container;
use base qw(Class::Container);
# Constructor parameters recognised by this class.  Each entry is a
# Params::Validate-style specification (the type constants come from the
# ':types' import of Params::Validate).
__PACKAGE__->valid_params(
  # Scalar verbosity setting; 0 when not supplied.
  verbose => {
    type    => SCALAR,
    default => 0,
  },

  # Optional path of a file holding one stopword per line (read in new()).
  stopword_file => {
    type     => SCALAR,
    optional => 1,
  },

  # Hash reference keyed by document name; new() stores array references
  # of category names in it when a category_file is given.
  category_hash => {
    type    => HASHREF,
    default => {},
  },

  # Optional path of a file that new() reads to fill category_hash.
  category_file => {
    type     => SCALAR,
    optional => 1,
  },
);
# Register the contained "document" object with Class::Container.  It is
# flagged as delayed; new() supplies parameters for it later through
# delayed_object_params() when a stopword_file is given.
__PACKAGE__->contained_objects(
  document => {
    class   => 'AI::Categorizer::Document::Text',
    delayed => 1,
  },
);
# Constructor.
#
# Accepts name => value pairs.  Those handled here:
#   stopwords      - if given as an array reference, it is converted to a
#                    hash reference ({ word => 1, ... }) before being passed
#                    on to SUPER::new.  (Not declared in this class's
#                    valid_params; presumably consumed by the contained
#                    document class -- verify against Class::Container.)
#   category_hash  - hash reference mapping document names to array
#                    references of category names.  A fresh empty hash is
#                    used when the caller does not supply one.
#   category_file  - path of a file whose lines each hold a document name
#                    followed by its category names, separated by
#                    whitespace.  Its entries are stored in category_hash.
#   stopword_file  - path of a file with one stopword per line.  The words
#                    are handed to the delayed 'document' object as its
#                    'stopwords' parameter.
#
# Returns the new object.  Dies if category_file or stopword_file cannot be
# opened.
sub new {
  my ($class, %args) = @_;

  # Optimize so every document doesn't have to convert the stopword list to a hash
  if ($args{stopwords} and UNIVERSAL::isa($args{stopwords}, 'ARRAY')) {
    $args{stopwords} = { map {+$_ => 1} @{ $args{stopwords} } };
  }

  # The declared default for category_hash is a single hash reference created
  # once when the parameter spec is set up.  Params::Validate hands that same
  # reference to every object, so entries loaded from one object's
  # category_file would leak into all others.  Give each object its own hash.
  $args{category_hash} = {} unless exists $args{category_hash};

  my $self = $class->SUPER::new(%args);

  if ($self->{category_file}) {
    # Three-argument open with a lexical handle: the filename is taken
    # literally (no mode or pipe characters are interpreted) and the handle
    # is not a package global.
    open my $cat_fh, '<', $self->{category_file}
      or die "Can't open $self->{category_file}: $!";
    while (my $line = <$cat_fh>) {
      my ($doc, @cats) = split ' ', $line;
      next unless defined $doc;    # blank or whitespace-only line
      $self->{category_hash}{$doc} = \@cats;
    }
    close $cat_fh;
  }

  if (exists $self->{stopword_file}) {
    my %stopwords;
    open my $stop_fh, '<', $self->{stopword_file}
      or die "$self->{stopword_file}: $!";
    while (my $word = <$stop_fh>) {
      chomp $word;
      $stopwords{$word} = 1;
    }
    close $stop_fh;

    # Document objects are created later, so register the stopwords as a
    # parameter for the delayed 'document' object.
    $self->delayed_object_params('document', stopwords => \%stopwords);
  }

  return $self;
}
# Returns the number of documents in the collection.
#
# The first call rewinds the collection, walks it with next() until a false
# value comes back, rewinds again, and caches the total in
# $self->{document_count}; later calls return the cached value without
# iterating.
#
# Subclasses should normally override this with something faster that does
# not have to create an actual document for every step.
sub count_documents {
  my ($self) = @_;

  unless (exists $self->{document_count}) {
    $self->rewind;

    my $total = 0;
    while ($self->next) {
      $total++;
    }

    $self->rewind;
    $self->{document_count} = $total;
  }

  return $self->{document_count};
}
# Abstract methods: declared here without bodies; concrete subclasses are
# expected to supply the implementations.
#   next   - returns the next document, or a false value once the collection
#            is exhausted (count_documents() loops until it returns false).
#   rewind - resets the iteration; count_documents() calls it both before
#            and after counting.
sub next;
sub rewind;
1;
__END__
=head1 NAME
AI::Categorizer::Collection - Access stored documents
=head1 SYNOPSIS
my $c = AI::Categorizer::Collection::Files->new
(path => '/tmp/docs/training',
category_file => '/tmp/docs/cats.txt');
print "Total number of docs: ", $c->count_documents, "\n";
while (my $document = $c->next) {
...
}
$c->rewind; # For further operations
=head1 DESCRIPTION
This abstract class implements an iterator for accessing documents in
their natively stored format. You cannot directly create an instance
of the Collection class, because it is abstract - see the
documentation for the C<Files>, C<SingleFile>, or C<InMemory>
subclasses for a concrete interface.
=head1 METHODS
=over 4
=item new()
Creates a new Collection object and returns it. Accepts the following
parameters:
=over 4
=item category_hash
A reference to a hash which maps document names to category names.
The keys of the hash are the document names; each value should be a
reference to an array containing the names of the categories to which
that document belongs.
=item category_file
Specifies a file which is read in order to populate the
C<category_hash>. Each line of the file should list a document's
name, followed by the names of its categories, all separated by
whitespace.
=item stopword_file
Specifies a file containing a list of "stopwords", which are words
that should automatically be disregarded when scanning/reading
documents. The file should contain one word per line. The file will
be parsed and then fed as the C<stopwords> parameter to the
Document C<new()> method.
( run in 0.866 second using v1.01-cache-2.11-cpan-39bf76dae61 )