AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/Learner/KNN.pm view on Meta::CPAN
package AI::Categorizer::Learner::KNN;
use strict;
use AI::Categorizer::Learner;
use base qw(AI::Categorizer::Learner);
use Params::Validate qw(:types);
# Register this learner's parameter specifications (type and default value)
# through valid_params.
__PACKAGE__->valid_params
(
# Returned unchanged as the second value of get_scores()
threshold => {type => SCALAR, default => 0.4},
# Size of the neighbour queue; get_scores() also divides every score by it
k_value => {type => SCALAR, default => 20},
# 'score' makes each neighbour add its score; any other value adds 1
knn_weighting => {type => SCALAR, default => 'score'},
# When true, get_scores() samples roughly this many documents at random; 0 uses all
max_instances => {type => SCALAR, default => 0},
);
# Prepares the knowledge set for classification.
#
# Normalizes the feature vector of each document in the knowledge set, then
# requests the knowledge set's own features so that they are initialized.
# The value of that last call is what the method returns.
sub create_model {
  my ($self) = @_;

  my @documents = $self->knowledge_set->documents;
  for my $document (@documents) {
    $document->features->normalize;
  }

  return $self->knowledge_set->features; # Initialize
}
# Combined getter/setter for the threshold.
#
# Called with an argument (even a false or undefined one), stores it as the
# new threshold. Always returns the threshold currently stored.
sub threshold {
  my ($self, @new_value) = @_;

  if (@new_value) {
    $self->{threshold} = $new_value[0];
  }

  return $self->{threshold};
}
# Categorizes a collection, first sharing the known feature names.
#
# Looks up the class the knowledge set uses for 'features'. If that class
# provides all_features(), it is handed an array reference of the knowledge
# set's feature names. The remaining arguments are then passed on to the
# parent class's categorize_collection(), whose result is returned.
sub categorize_collection {
  my $self = shift;

  my $feature_class = $self->knowledge_set->contained_class('features');
  $feature_class->all_features([$self->knowledge_set->features->names])
    if $feature_class->can('all_features');

  return $self->SUPER::categorize_collection(@_);
}
# Scores a document against the knowledge set by k-nearest-neighbour voting.
#
# Arguments:
#   $newdoc - document to classify; only its features() are consulted.
#
# Returns a two-element list: a hash reference mapping category name to
# score, and the configured threshold.
sub get_scores {
  my ($self, $newdoc) = @_;

  # Keep only the query features the knowledge set also has, then normalize.
  my $features = $newdoc->features->intersection($self->knowledge_set->features)->normalize;
  my $q = AI::Categorizer::Learner::KNN::Queue->new(size => $self->{k_value});

  my @docset;
  if ($self->{max_instances}) {
    # Use (approximately) max_instances documents, chosen randomly from corpus.
    # documents() is called in scalar context here; presumably that yields
    # the corpus size (TODO confirm against the knowledge set class).
    my $doc_count = $self->knowledge_set->documents;
    # An empty corpus leaves nothing to sample, so skip the division instead
    # of dying with "Illegal division by zero".
    my $probability = $doc_count ? $self->{max_instances} / $doc_count : 0;
    @docset = grep {rand() < $probability} $self->knowledge_set->documents;
  } else {
    # Use the whole corpus
    @docset = $self->knowledge_set->documents;
  }

  foreach my $doc (@docset) {
    my $score = $doc->features->dot( $features );
    if ($self->verbose > 1) {
      # A document may carry no categories; label it instead of calling
      # name() on an undefined value.
      my ($first_cat) = $doc->categories;
      my $cat_label = defined $first_cat ? $first_cat->name : 'no category';
      warn "Score for ", $doc->name, " (", $cat_label, "): $score";
    }
    $q->add($doc, $score);
  }

  # Every known category starts at zero so it appears in the result even
  # when no retained neighbour belongs to it.
  my %scores = map {+$_->name, 0} $self->categories;
  my $weight_by_score = $self->{knn_weighting} eq 'score';
  foreach my $e (@{$q->entries}) {
    # Each neighbour votes with its score, or with 1 under other weightings
    my $vote = $weight_by_score ? $e->{score} : 1;
    foreach my $cat ($e->{thing}->categories) {
      $scores{$cat->name} += $vote;
    }
  }

  # Scale by k_value, not by the number of neighbours actually retained.
  $_ /= $self->{k_value} foreach values %scores;
  return (\%scores, $self->{threshold});
}
###################################################################
package AI::Categorizer::Learner::KNN::Queue;
# Constructs an empty queue.
#
# Arguments are name/value pairs; only 'size' is read. The new object holds
# that size and an empty list of entries.
sub new {
  my $pkg = shift;
  my %args = @_;

  my $self = {
    entries => [],
    size => $args{size},
  };
  bless $self, $pkg;

  return $self;
}
sub add {
my ($self, $thing, $score) = @_;
# scores may be (0.2, 0.4, 0.4, 0.8) - ascending
return unless (@{$self->{entries}} < $self->{size} # Queue not filled
or $score > $self->{entries}[0]{score}); # Found a better entry
my $i;
if (!@{$self->{entries}}) {
$i = 0;
} elsif ($score > $self->{entries}[-1]{score}) {
$i = @{$self->{entries}};
} else {
for ($i = 0; $i < @{$self->{entries}}; $i++) {
last if $score < $self->{entries}[$i]{score};
}
}
splice @{$self->{entries}}, $i, 0, { thing => $thing, score => $score};
( run in 1.565 second using v1.01-cache-2.11-cpan-39bf76dae61 )