AI-NaiveBayes
lib/AI/NaiveBayes.pm
sub classify {
    my ($self, $newattrs) = @_;
    $newattrs or die "Missing parameter for classify()";

    my $m = $self->model;

    # Note that we're using the log(prob) here.  That's why we add instead of multiply.

    my %scores = %{$m->{prior_probs}};
    my %features;
    while (my ($feature, $value) = each %$newattrs) {
        next unless exists $m->{attributes}{$feature};  # Ignore totally unseen features

        while (my ($label, $attributes) = each %{$m->{probs}}) {
            my $score = ($attributes->{$feature} || $m->{smoother}{$label}) * $value;   # P($feature|$label) ** $value
            $scores{$label} += $score;
            $features{$feature}{$label} = $score;
        }
    }

    rescale(\%scores);

    return AI::NaiveBayes::Classification->new( label_sums => \%scores, features => \%features );
}
sub rescale {
    my ($scores) = @_;

    # Scale everything back to a reasonable area in logspace (near zero), un-loggify, and normalize
    my $total = 0;
    my $max = max(values %$scores);
    foreach (values %$scores) {
        $_ = exp($_ - $max);
        $total += $_ ** 2;
    }
    $total = sqrt($total);
    foreach (values %$scores) {
        $_ /= $total;
    }
}
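The score that C<classify()> accumulates for each label above is a log-space
quantity: the log prior plus, for every known feature, its weight multiplied by
the (smoothed) log conditional probability. A minimal stand-alone sketch of that
sum for a single label, using made-up numbers and hypothetical variable names
(C<$prior_log>, C<%log_prob>, C<$smoother_log>, C<%counts>):

    use strict;
    use warnings;

    # Hypothetical per-label model values (natural logs), standing in for
    # $m->{prior_probs}{$label}, $m->{probs}{$label} and $m->{smoother}{$label}:
    my $prior_log    = log(0.5);
    my %log_prob     = ( vampires => log(0.20), mirrors => log(0.10) );
    my $smoother_log = log(0.01);   # fallback for features this label has never seen
    my %counts       = ( vampires => 2, mirrors => 1 );   # feature vector being classified

    # score = log P(label) + sum_f count(f) * log P(f|label)
    #       = log( P(label) * prod_f P(f|label) ** count(f) )
    my $score = $prior_log;
    $score += ( $log_prob{$_} // $smoother_log ) * $counts{$_} for keys %counts;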
=head1 SYNOPSIS

    my $classifier = AI::NaiveBayes->train(
        {
            attributes => {
                sheep => 1, very => 1, valuable => 1, farming => 1
            },
            labels => ['farming']
        },
        {
            attributes => {
                vampires => 1, cannot => 1, see => 1, their => 1,
                images => 1, mirrors => 1
            },
            labels => ['vampire']
        },
    );

    # Classify a feature vector
    my $result = $classifier->classify({bar => 3, blurp => 2});
    # $result is now an AI::NaiveBayes::Classification object

    my $best_category = $result->best_category;
=head1 DESCRIPTION
This module implements the classic "Naive Bayes" machine learning
algorithm. It is a low-level class that accepts only pre-computed
feature-vectors as input; see L<AI::Classifier::Text> for a text
classifier that uses this class.

Creation of an C<AI::NaiveBayes> classifier object from training
data is done by L<AI::NaiveBayes::Learner>. For a quick start you
can use the limited C<train> class method, which trains the
classifier in a default way.
The classifier object is immutable.
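For illustration, here is a minimal sketch of the two routes to a trained
classifier described above; the single training example is made up:

    use AI::NaiveBayes;
    use AI::NaiveBayes::Learner;

    # Explicit route: configure a learner, feed it examples, build the classifier.
    my $learner = AI::NaiveBayes::Learner->new;
    $learner->add_example(
        attributes => { sheep => 1, valuable => 1 },
        labels     => ['farming'],
    );
    my $classifier = $learner->classifier;

    # Quick-start route: train() builds a classifier with default Learner settings.
    my $quick_classifier = AI::NaiveBayes->train(
        { attributes => { sheep => 1, valuable => 1 }, labels => ['farming'] },
    );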
=head1 METHODS

=over 4

=item train( LIST of HASHREFS )
Shortcut for creating a trained classifier using L<AI::NaiveBayes::Learner> default
settings.
Arguments are passed to the C<add_example> method of the L<AI::NaiveBayes::Learner>
object one by one.
=item classify( HASHREF )
Classifies a feature-vector of the form:
    { feature1 => weight1, feature2 => weight2, ... }
The result is an C<AI::NaiveBayes::Classification> object.
=item rescale
Internal
=back
=head1 ATTRIBUTES
lib/AI/NaiveBayes/Classification.pm
package AI::NaiveBayes::Classification;
$AI::NaiveBayes::Classification::VERSION = '0.04';
use strict;
use warnings;
use 5.010;
use Moose;

has features      => (is => 'ro', isa => 'HashRef[HashRef]', required => 1);
has label_sums    => (is => 'ro', isa => 'HashRef', required => 1);
has best_category => (is => 'ro', isa => 'Str', lazy_build => 1);

sub _build_best_category {
    my $self = shift;
    my $sc = $self->label_sums;

    my ($best_cat, $best_score) = each %$sc;
    while (my ($key, $val) = each %$sc) {
        ($best_cat, $best_score) = ($key, $val) if $val > $best_score;
    }
    return $best_cat;
}

sub find_predictors {
    my $self = shift;

    my $best_cat = $self->best_category;
    my $features = $self->features;

    my @predictors;
    for my $feature ( keys %$features ) {
        for my $cat ( keys %{ $features->{$feature} } ) {
            next if $cat eq $best_cat;
            push @predictors, [ $feature, $features->{$feature}{$best_cat} - $features->{$feature}{$cat} ];
        }
    }
    @predictors = sort { abs( $b->[1] ) <=> abs( $a->[1] ) } @predictors;
    return $best_cat, @predictors;
}

__PACKAGE__->meta->make_immutable;
1;
lib/AI/NaiveBayes/Classification.pm
=item C<best_category()>
Returns the label that best matches the given document.
=item C<find_predictors()>

This method returns the C<best_category()>, together with a list of all the
predictors and their influence on the category that was selected. The second
value returned is a list of array references, each holding a feature name and
a number describing that feature's influence on the result, so the second
part of the result may look like this:

    (
        [ 'activities', 1.2511540632952 ],
        [ 'over', -1.0269523272981 ],
        [ 'provide', 0.8280157033269 ],
        [ 'natural', 0.7361042359385 ],
        [ 'against', -0.6923354975173 ],
    )
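Assuming C<$result> is the L<AI::NaiveBayes::Classification> object returned by
C<classify()>, one possible way to consume this return value looks like the
following sketch (the formatting loop is illustrative, not part of the module):

    my ( $best_category, @predictors ) = $result->find_predictors;

    print "best category: $best_category\n";
    for my $predictor (@predictors) {
        my ( $feature, $influence ) = @$predictor;
        printf "  %-12s %+.4f\n", $feature, $influence;
    }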
lib/AI/NaiveBayes/Learner.pm
use 5.010;
use List::Util qw( min sum );
use Moose;
use AI::NaiveBayes;
has attributes => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_attrs');
has labels => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_labels');
has examples => (is => 'ro', isa => 'Int', default => 0, clearer => '_clear_examples');
has features_kept => (is => 'ro', predicate => 'limit_features');
has classifier_class => ( is => 'ro', isa => 'Str', default => 'AI::NaiveBayes' );
sub add_example {
    my ($self, %params) = @_;
    for ('attributes', 'labels') {
        die "Missing required '$_' parameter" unless exists $params{$_};
    }

    $self->{examples}++;
lib/AI/NaiveBayes/Learner.pm
        # P(attr|label) = $count/$label_tokens                        (simple)
        # P(attr|label) = ($count + 1)/($label_tokens + $vocab_size)  (with smoothing)
        # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
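        # For example, with $count = 3, $label_tokens = 100 and $vocab_size = 50:
        #   log P(attr|label) = log(3 + 1) - log(100 + 50) ≈ 1.386 - 5.011 ≈ -3.624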
        my $denominator = log($label_tokens + $vocab_size);

        while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
            $model->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
        }

        if ($self->limit_features) {
            my %old = %{$model->{probs}{$label}};
            my @features = sort { abs($old{$a}) <=> abs($old{$b}) } keys(%old);
            my $limit = min($self->features_kept, 0+@features);
            if ($limit < 1) {
                $limit = int($limit * keys(%old));
            }
            my @top = @features[0..$limit-1];
            my %kept = map { $_ => $old{$_} } @top;
            $model->{probs}{$label} = \%kept;
        }
    }

    my $classifier_class = $self->classifier_class;
    return $classifier_class->new( model => $model );
}
sub add_hash {
    my ($first, $second) = @_;
lib/AI/NaiveBayes/Learner.pm
=head1 NAME
AI::NaiveBayes::Learner - Build AI::NaiveBayes classifier from a set of training examples.
=head1 VERSION
version 0.04
=head1 SYNOPSIS

    my $learner = AI::NaiveBayes::Learner->new(features_kept => 0.5);
    $learner->add_example(
        attributes => { sheep => 1, very => 1, valuable => 1, farming => 1 },
        labels     => ['farming']
    );

    my $classifier = $learner->classifier;
=head1 DESCRIPTION

This is a trainer of AI::NaiveBayes classifiers. It stores the training data
passed in via the C<add_example> method in internal structures, and then
constructs a classifier when the C<classifier> method is called.
=head1 ATTRIBUTES
=over 4
=item C<features_kept>
Indicates how many features should remain after calculating probabilities. By
default all of them are kept. For C<features_kept> values of 1 or more, that
many of the best features are preserved; for values lower than 1, the given
fraction of features is kept (e.g. the top 20% of features for
C<features_kept> = 0.2). See the sketch after this list.
The rest of the attributes are for the class's internal use and are therefore
not documented.
=item C<classifier_class>
The class of the classifier to be created. By default it is
C<AI::NaiveBayes>.
=back
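The cut-off itself can be pictured with the following sketch (sample values
are made up; the C<min>/C<int> logic mirrors the feature-limiting code in the
C<classifier()> excerpt above):

    use List::Util qw( min );

    my $features_kept = 0.2;          # constructor argument
    my @features      = ( 1 .. 50 );  # suppose a label ended up with 50 features

    my $limit = min( $features_kept, 0 + @features );
    $limit = int( $limit * @features ) if $limit < 1;
    # $limit is now 10 (the top 20%); with features_kept => 5 it would simply be 5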
=head1 METHODS
=over 4
=item C<add_example( attributes => HASHREF, labels => LIST )>
Saves the information from a training example into internal data structures.
C<attributes> should be a hash reference of the form:

    { feature1 => weight1, feature2 => weight2, ... }
C<labels> should be a list of strings denoting one or more classes to which the example belongs.
=item C<classifier()>
Creates an AI::NaiveBayes classifier based on the data accumulated before.
=back
=head1 UTILITY SUBS
t/01-learner.t
$learner->add_example( attributes => _hash(qw(vampires drink blood vampires may staked)),
                       labels => ['vampire'] );
is $learner->{labels}{vampire}{count}, 1;

$learner->add_example( attributes => _hash(qw(vampires cannot see their images mirrors)),
                       labels => ['vampire'] );
is $learner->{labels}{vampire}{count}, 2;
is keys %{$learner->{labels}}, 2;

# features_kept > 1
$learner = AI::NaiveBayes::Learner->new(features_kept => 5);
$learner->add_example( attributes => _hash(qw(one two three four)),
                       labels => ['farming'] );
$learner->add_example( attributes => _hash(qw(five six seven eight)),
                       labels => ['farming'] );
$learner->add_example( attributes => _hash(qw(one two three four five)),
                       labels => ['farming'] );
my $model = $learner->classifier->model;
is keys %{$model->{probs}{farming}}, 5, '5 features kept';
is join(" ", sort { $a cmp $b } keys %{$model->{probs}{farming}}), 'five four one three two';

# features_kept < 1
$learner = AI::NaiveBayes::Learner->new(features_kept => 0.5);
$learner->add_example( attributes => _hash(qw(one two three four)),
                       labels => ['farming'] );
$learner->add_example( attributes => _hash(qw(five six seven eight)),
                       labels => ['farming'] );
$learner->add_example( attributes => _hash(qw(one two three four)),
                       labels => ['farming'] );
$model = $learner->classifier->model;
is keys %{$model->{probs}{farming}}, 4, 'half features kept';
is join(" ", sort { $a cmp $b } keys %{$model->{probs}{farming}}), 'four one three two';

sub _hash { +{ map {$_,1} @_ } }