AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/Learner.pm view on Meta::CPAN
package AI::Categorizer::Learner;
use strict;
use Class::Container;
use AI::Categorizer::Storable;
use base qw(Class::Container AI::Categorizer::Storable);
use Params::Validate qw(:types);
use AI::Categorizer::ObjectSet;
# Constructor parameters accepted by this class (declared through the
# valid_params() class method; specs use the Params::Validate type
# constants imported above):
#   knowledge_set - optional; when given it must be an
#                   AI::Categorizer::KnowledgeSet (isa check).
#   verbose       - plain scalar, defaults to 0.  Higher values enable
#                   progressively more diagnostic output in categorize()
#                   and categorize_collection().
__PACKAGE__->valid_params
(
knowledge_set => { isa => 'AI::Categorizer::KnowledgeSet', optional => 1 },
verbose => {type => SCALAR, default => 0},
);
# Helper object types this class creates through its container machinery.
# Both are marked "delayed": they are not built with the learner itself but
# on demand via create_delayed_object() -- a 'hypothesis' per document in
# categorize(), an 'experiment' per run in categorize_collection().
# NOTE(review): exact semantics of 'delayed' are defined by Class::Container;
# confirm against its documentation.
__PACKAGE__->contained_objects
(
hypothesis => {
class => 'AI::Categorizer::Hypothesis',
delayed => 1,
},
experiment => {
class => 'AI::Categorizer::Experiment',
delayed => 1,
},
);
# Subclasses must override these virtual methods (they are only
# forward-declared here and have no body in this class):
#   get_scores($doc) - called by categorize(); expected to return a list
#                      ($scores, $threshold) where $scores is a hash
#                      reference keyed by category name.
#   create_model()   - called by train() after the knowledge set has been
#                      finished; creates $self->{model}.
sub get_scores;
sub create_model;
# Optional virtual method for on-line learning (not called anywhere in
# this class):
sub add_knowledge;
# Combined getter/setter for the verbosity level.
#
# Called with an argument, stores that value (even if it is undef) in
# $self->{verbose}.  Always returns the value currently stored.
sub verbose {
    my ($self, @new_level) = @_;

    $self->{verbose} = $new_level[0] if @new_level;

    return $self->{verbose};
}
# Combined getter/setter for the knowledge set attached to this learner.
#
# Called with an argument, replaces $self->{knowledge_set} with it.
# Always returns the knowledge set currently stored (possibly undef).
sub knowledge_set {
    my ($self, @replacement) = @_;

    if (scalar @replacement) {
        my ($knowledge) = @replacement;
        $self->{knowledge_set} = $knowledge;
    }

    return $self->{knowledge_set};
}
# Returns whatever the attached knowledge set's categories() method
# returns.  Goes through the knowledge_set() accessor, so it fails with a
# method-call-on-undef error when no knowledge set is attached.
sub categories {
    my ($self) = @_;

    my $knowledge = $self->knowledge_set;
    return $knowledge->categories;
}
# Trains the learner.
#
# Arguments (named):
#   knowledge_set - optional; when true it replaces any knowledge set
#                   already stored on the object.
#
# Dies with "No knowledge_set provided" when neither the argument nor the
# object supplies one.  Otherwise finishes the knowledge set, asks the
# subclass to build its model, and records the names of all categories as
# the 'all_categories' parameter for hypotheses created later.
# Returns whatever delayed_object_params() returns.
sub train {
    my $self = shift;
    my %args = @_;

    if ($args{knowledge_set}) {
        $self->{knowledge_set} = $args{knowledge_set};
    }

    my $knowledge = $self->{knowledge_set}
        or die "No knowledge_set provided";
    $knowledge->finish;

    # Subclass hook; creates $self->{model}.
    $self->create_model;

    my @category_names = map { $_->name } $self->categories;
    return $self->delayed_object_params(
        'hypothesis',
        all_categories => \@category_names,
    );
}
# Builds a progress-reporting callback for a run over $count items.
#
# Time::Progress is an optional dependency: when it cannot be loaded the
# returned callback simply prints a '.' to STDERR on every call.
# Otherwise the callback counts its invocations and, on every 25th call,
# writes a progress line to STDERR and returns the running count (it
# returns nothing on the other calls).  If the callback is given an
# argument it is treated as an object offering macro_F1 and micro_F1,
# whose values are appended to the progress line.
sub prog_bar {
    my ($self, $count) = @_;

    my $have_time_progress = eval "use Time::Progress; 1";
    if (!$have_time_progress) {
        return sub { print STDERR '.' };
    }

    my $progress = 'Time::Progress'->new;
    $progress->attr(max => $count);

    my $seen = 0;
    my $reporter = sub {
        ++$seen;

        # Only redraw on every 25th item to keep the output cheap.
        return if $seen % 25;

        my $suffix = '';
        if (@_) {
            my $experiment = shift;
            $suffix = sprintf " (maF1=%.03f, miF1=%.03f)", $experiment->macro_F1, $experiment->micro_F1;
        }

        print STDERR $progress->report("%50b %p ($seen/$count)$suffix\r", $seen);
        return $seen;
    };

    return $reporter;
}
# Categorizes every document of a collection and gathers the outcome in an
# experiment object.
#
# Arguments (named):
#   collection - required; an object offering next() and count_documents().
#                Dies with "No collection provided" when missing.
#
# For each document the hypothesis produced by categorize() is added to the
# experiment together with the names of the document's own categories.
# Diagnostics, all on STDERR:
#   verbose >= 1 - progress indicator, plus a final newline;
#   verbose >  1 - one line per document with assigned and correct
#                  categories.
# Returns the experiment object.
sub categorize_collection {
    my $self = shift;
    my %args = @_;

    my $collection = $args{collection};
    die "No collection provided" unless $collection;

    my @all_category_names = map { $_->name } $self->categories;
    my $experiment = $self->create_delayed_object(
        'experiment',
        categories => \@all_category_names,
    );

    my $tick;
    if ($self->verbose) {
        $tick = $self->prog_bar($collection->count_documents);
    }
    else {
        $tick = sub {};
    }

    while (my $doc = $collection->next) {
        my $hypothesis = $self->categorize($doc);

        my @correct_names = map { $_->name } $doc->categories;
        $experiment->add_hypothesis($hypothesis, \@correct_names);
        $tick->($experiment);

        next unless $self->verbose > 1;

        my $doc_name = $doc->name;
        my $assigned = join(', ', $hypothesis->categories);
        my $correct  = join(', ', map { $_->name } $doc->categories);
        printf STDERR ("%s: assigned=(%s) correct=(%s)\n", $doc_name, $assigned, $correct);
    }

    if ($self->verbose) {
        print STDERR "\n";
    }

    return $experiment;
}
# Categorizes a single document.
#
# Arguments:
#   $doc - document object; must offer name() and be acceptable to the
#          subclass's get_scores().
#
# Asks the subclass for ($scores, $threshold) via get_scores() and wraps
# them, together with the document name, in a new 'hypothesis' object,
# which is returned.
#
# Diagnostics, all on STDERR so they never mix with the program's real
# output on STDOUT:
#   verbose > 2 - one "category: score" line per category, highest score
#                 first (ties ordered by category name);
#   verbose > 3 - additionally a raw dump of the scores hash via warn.
sub categorize {
    my ($self, $doc) = @_;

    my ($scores, $threshold) = $self->get_scores($doc);

    if ($self->verbose > 2) {
        warn "scores: @{[ %$scores ]}" if $self->verbose > 3;

        # Descending by score; the name tiebreak keeps the output stable
        # when several categories share a score.
        my @by_score = sort { $scores->{$b} <=> $scores->{$a} or $a cmp $b } keys %$scores;
        for my $key (@by_score) {
            print STDERR "$key: $scores->{$key}\n";
        }
    }

    return $self->create_delayed_object(
        'hypothesis',
        scores        => $scores,
        threshold     => $threshold,
        document_name => $doc->name,
    );
}
1;
__END__
=head1 NAME
AI::Categorizer::Learner - Abstract Machine Learner Class
=head1 SYNOPSIS
use AI::Categorizer::Learner::NaiveBayes; # Or other subclass
# Here $k is an AI::Categorizer::KnowledgeSet object
my $nb = AI::Categorizer::Learner::NaiveBayes->new(...parameters...);
$nb->train(knowledge_set => $k);
$nb->save_state('filename');
... time passes ...
$nb = AI::Categorizer::Learner::NaiveBayes->restore_state('filename');
my $c = AI::Categorizer::Collection::Files->new( path => ... );
while (my $document = $c->next) {
my $hypothesis = $nb->categorize($document);
print "Best assigned category: ", $hypothesis->best_category, "\n";
print "All assigned categories: ", join(', ', $hypothesis->categories), "\n";
}
=head1 DESCRIPTION
The C<AI::Categorizer::Learner> class is an abstract class that will
never actually be directly used in your code. Instead, you will use a
subclass like C<AI::Categorizer::Learner::NaiveBayes> which implements
an actual machine learning algorithm.
The general Learner interface, shared by all subclasses, is documented here.
=head1 METHODS
=over 4
=item new()
( run in 0.872 second using v1.01-cache-2.11-cpan-39bf76dae61 )