AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/Learner/KNN.pm view on Meta::CPAN
$self->knowledge_set->features; # Initialize
}
# Combined getter/setter for the learner's threshold.
#
# Called with no argument it simply reports the stored value; called with
# an argument (even an undefined one) it stores that value first.
# Returns the value held in $self->{threshold} after any update.
sub threshold {
    my ($self, @new_value) = @_;

    if (@new_value) {
        $self->{threshold} = $new_value[0];
    }

    return $self->{threshold};
}
# Categorize a collection of documents.
#
# Before delegating to the parent class, look up the class the knowledge
# set reports for 'features'.  If that class has an all_features() method,
# call it with an array reference holding the names of the knowledge set's
# features.  All remaining arguments are forwarded untouched to the
# inherited categorize_collection(), whose result is returned.
sub categorize_collection {
    my $self = shift;

    my $vector_class = $self->knowledge_set->contained_class('features');
    $vector_class->all_features([ $self->knowledge_set->features->names ])
        if $vector_class->can('all_features');

    return $self->SUPER::categorize_collection(@_);
}
# Score a document against every category using its nearest neighbours
# in the knowledge set.
#
# Arguments:
#   $newdoc - the document to classify; its features() are compared with
#             those of the documents in the knowledge set.
#
# Returns a two-element list:
#   1. a hash reference mapping every category name to its score (the
#      accumulated neighbour votes divided by k_value), and
#   2. the value of $self->{threshold}.
sub get_scores {
    my ($self, $newdoc) = @_;

    # Keep only the features shared with the knowledge set, then normalize.
    my $features = $newdoc->features->intersection($self->knowledge_set->features)->normalize;
    my $q = AI::Categorizer::Learner::KNN::Queue->new(size => $self->{k_value});

    # Fetch the corpus once, in list context, so the document count does not
    # depend on what documents() returns in scalar context.
    my @docset = $self->knowledge_set->documents;
    if ($self->{max_instances} and @docset) {
        # Use (approximately) max_instances documents, chosen randomly from
        # the corpus.  The @docset check above avoids dividing by zero when
        # the corpus is empty.
        my $probability = $self->{max_instances} / @docset;
        @docset = grep { rand() < $probability } @docset;
    }

    my $debug = $self->verbose > 1;
    foreach my $doc (@docset) {
        my $score = $doc->features->dot($features);
        if ($debug) {
            # A document may have no categories at all; do not call name()
            # on an undefined value just to print a diagnostic.
            my ($first_cat) = $doc->categories;
            my $cat_name = defined $first_cat ? $first_cat->name : '(none)';
            warn "Score for ", $doc->name, " ($cat_name): $score";
        }
        $q->add($doc, $score);
    }

    # Every known category starts at zero so that it appears in the result
    # even when none of the retained neighbours belongs to it.
    my %scores = map { +$_->name, 0 } $self->categories;

    # With 'score' weighting each neighbour votes with its similarity score;
    # otherwise each neighbour counts as one vote.
    my $weight_by_score = defined $self->{knn_weighting}
                          && $self->{knn_weighting} eq 'score';
    foreach my $e (@{ $q->entries }) {
        my $vote = $weight_by_score ? $e->{score} : 1;
        foreach my $cat ($e->{thing}->categories) {
            $scores{ $cat->name } += $vote;
        }
    }

    $_ /= $self->{k_value} foreach values %scores;

    return (\%scores, $self->{threshold});
}
###################################################################
package AI::Categorizer::Learner::KNN::Queue;

# A small bounded queue that retains only the highest-scoring items.
# Entries are hash references of the form { thing => ..., score => ... }
# kept in ascending score order, so the weakest entry is always first.

# Constructor.
#   size => maximum number of entries the queue may hold.
sub new {
    my ($pkg, %args) = @_;
    return bless {
        size    => $args{size},
        entries => [],
    }, $pkg;
}

# Offer an item to the queue.
#   $thing - the item to store
#   $score - its numeric score
# The item is kept if the queue still has room, or if its score is strictly
# greater than the lowest score currently held (which is then dropped).
# Returns nothing.
sub add {
    my ($self, $thing, $score) = @_;
    my $entries  = $self->{entries};
    my $capacity = $self->{size};

    # A queue without capacity can never hold anything.  Bail out before
    # touching $entries->[0]: reading {score} through a missing element
    # would autovivify a bogus empty entry.
    return unless $capacity and $capacity > 0;

    # scores may be (0.2, 0.4, 0.4, 0.8) - ascending
    if (@$entries >= $capacity) {
        # Queue is full: only a strictly better score displaces the minimum.
        return unless $score > $entries->[0]{score};
    }

    # Find the insertion point: the first entry with a strictly greater
    # score, so that equal scores are placed after existing ones.
    my $i;
    if (!@$entries) {
        $i = 0;
    }
    elsif ($score > $entries->[-1]{score}) {
        # Fast path: new maximum goes on the end.
        $i = @$entries;
    }
    else {
        $i = 0;
        $i++ while $i < @$entries and $score >= $entries->[$i]{score};
    }

    splice @$entries, $i, 0, { thing => $thing, score => $score };

    # Drop the weakest entry if the insertion overfilled the queue.
    shift @$entries if @$entries > $capacity;
    return;
}

# Returns the array reference of entries, in ascending score order.
sub entries {
    my ($self) = @_;
    return $self->{entries};
}
1;
__END__
=head1 NAME
AI::Categorizer::Learner::KNN - K Nearest Neighbour Algorithm For AI::Categorizer
=head1 SYNOPSIS
use AI::Categorizer::Learner::KNN;
# Here $k is an AI::Categorizer::KnowledgeSet object
my $knn = AI::Categorizer::Learner::KNN->new(...parameters...);
$knn->train(knowledge_set => $k);
$knn->save_state('filename');
... time passes ...
my $l = AI::Categorizer::Learner->restore_state('filename');
my $c = AI::Categorizer::Collection::Files->new( path => ... );
while (my $document = $c->next) {
my $hypothesis = $l->categorize($document);
print "Best assigned category: ", $hypothesis->best_category, "\n";
print "All assigned categories: ", join(', ', $hypothesis->categories), "\n";
}
=head1 DESCRIPTION
( run in 1.448 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )