AI-NaiveBayes


lib/AI/NaiveBayes.pm


sub classify {
    my ($self, $newattrs) = @_;
    $newattrs or die "Missing parameter for classify()";

    my $m = $self->model;

    # Note that we're using the log(prob) here.  That's why we add instead of multiply.

    my %scores = %{$m->{prior_probs}};
    my %features;
    while (my ($feature, $value) = each %$newattrs) {
        next unless exists $m->{attributes}{$feature};  # Ignore totally unseen features
        while (my ($label, $attributes) = each %{$m->{probs}}) {
            my $score = ($attributes->{$feature} || $m->{smoother}{$label})*$value;  # P($feature|$label)**$value
            $scores{$label} += $score;
            $features{$feature}{$label} = $score;
        }
    }

    rescale(\%scores);

    return AI::NaiveBayes::Classification->new( label_sums => \%scores, features => \%features );
}
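
# The loop above accumulates, for each label,
#
#     score(label) = log P(label) + sum over features of value * log P(feature|label)
#
# which is the logarithm of P(label) * product of P(feature|label)**value,
# i.e. the unnormalized naive Bayes posterior. rescale() below maps these
# log-scores back into ordinary, normalized scores.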

sub rescale {
    my ($scores) = @_;

    # Scale everything back to a reasonable area in logspace (near zero), un-loggify, and normalize
    my $total = 0;
    my $max = max(values %$scores);
    foreach (values %$scores) {
        $_ = exp($_ - $max);
        $total += $_**2;
    }
    # Normalize so that the score vector has unit Euclidean length
    $total = sqrt($total);
    foreach (values %$scores) {
        $_ /= $total;
    }
}

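# Illustrative numbers (a sketch, not part of the module): for
#   my %scores = ( vampire => -10, farming => -12 );
# rescale(\%scores) subtracts the max (-10) and exponentiates, giving
# vampire => 1.000 and farming => exp(-2) ~ 0.135; dividing both by
# sqrt(1 + 0.135**2) ~ 1.009 yields vampire ~ 0.991, farming ~ 0.134.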
lib/AI/NaiveBayes.pm

    my $classifier = AI::NaiveBayes->train(
        {
            attributes => {
                sheep => 1, very => 1, valuable => 1, farming => 1
            },
            labels => ['farming']
        },
        {
            attributes => {
                vampires => 1, cannot => 1, see => 1, their => 1,
                images => 1, mirrors => 1
            },
            labels => ['vampire']
        },
    );

    # Classify a feature vector
    my $result = $classifier->classify({bar => 3, blurp => 2});
    
    # $result is now an AI::NaiveBayes::Classification object
    
    my $best_category = $result->best_category;

=head1 DESCRIPTION

This module implements the classic "Naive Bayes" machine learning
algorithm. It is a low-level class that accepts only pre-computed
feature-vectors as input; see L<AI::Classifier::Text> for a text
classifier that uses this class.

An C<AI::NaiveBayes> classifier object is built from training data by
L<AI::NaiveBayes::Learner>. For a quick start you can use the limited
C<train> class method, which trains the classifier with default settings.

The classifier object is immutable.

lib/AI/NaiveBayes.pm


=item train( LIST of HASHREFS )

Shortcut for creating a trained classifier using the default settings of
L<AI::NaiveBayes::Learner>. The arguments are passed one by one to the
C<add_example> method of the L<AI::NaiveBayes::Learner> object.

=item classify( HASHREF )

Classifies a feature-vector of the form:

    { feature1 => weight1, feature2 => weight2, ... }

The result is an C<AI::NaiveBayes::Classification> object.
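
For example, reusing the classifier from the SYNOPSIS (the exact scores
depend on the training data):

    my $result = $classifier->classify({ vampires => 1, mirrors => 1 });
    my $best   = $result->best_category;   # e.g. 'vampire'
    my $scores = $result->label_sums;      # { label => score, ... }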

=item rescale

Internal function that normalizes the raw class scores; not part of the
public API.

=back

=head1 ATTRIBUTES 

lib/AI/NaiveBayes/Classification.pm

package AI::NaiveBayes::Classification;
$AI::NaiveBayes::Classification::VERSION = '0.04';
use strict;
use warnings;
use 5.010;
use Moose;

has features => (is => 'ro', isa => 'HashRef[HashRef]', required => 1);
has label_sums => (is => 'ro', isa => 'HashRef', required => 1);
has best_category => (is => 'ro', isa => 'Str', lazy_build => 1);

sub _build_best_category {
    my $self = shift;
    my $sc = $self->label_sums;

    # Seed with an arbitrary starting pair, then scan for the highest score
    my ($best_cat, $best_score) = each %$sc;
    while (my ($key, $val) = each %$sc) {
        ($best_cat, $best_score) = ($key, $val) if $val > $best_score;
    }
    return $best_cat;
}

sub find_predictors {
    my $self = shift;

    my $best_cat = $self->best_category;
    my $features = $self->features;
    my @predictors;
    # For every feature, record how strongly it favours the winning
    # category over each competing one (difference of per-label scores)
    for my $feature ( keys %$features ) {
        for my $cat ( keys %{ $features->{$feature} } ) {
            next if $cat eq $best_cat;
            push @predictors, [ $feature, $features->{$feature}{$best_cat} - $features->{$feature}{$cat} ];
        }
    }
    # Most influential predictors first
    @predictors = sort { abs( $b->[1] ) <=> abs( $a->[1] ) } @predictors;
    return $best_cat, @predictors;
}


__PACKAGE__->meta->make_immutable;

1;

lib/AI/NaiveBayes/Classification.pm


=item C<best_category()>

Returns the label that best matches the given document.

=item C<find_predictors()>

This method returns the C<best_category()>, along with the list of all
predictors and their influence on the selected category. The second value
returned is a list of array references, each holding a feature name and a
number describing that feature's influence on the result. It may look like
this:

    (
        [ 'activities',  1.2511540632952 ],
        [ 'over',       -1.0269523272981 ],
        [ 'provide',     0.8280157033269 ],
        [ 'natural',     0.7361042359385 ],
        [ 'against',    -0.6923354975173 ],
    )
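
A sketch of how that return value might be consumed, assuming C<$result>
is an C<AI::NaiveBayes::Classification> instance:

    my ($best_cat, @predictors) = $result->find_predictors;
    for my $p (@predictors) {
        printf "%-12s %+.5f\n", $p->[0], $p->[1];
    }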

lib/AI/NaiveBayes/Learner.pm

use 5.010;

use List::Util qw( min sum );
use Moose;
use AI::NaiveBayes;

has attributes => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_attrs');
has labels     => (is => 'ro', isa => 'HashRef', default => sub { {} }, clearer => '_clear_labels');
has examples   => (is => 'ro', isa => 'Int',     default => 0, clearer => '_clear_examples');

has features_kept => (is => 'ro', predicate => 'limit_features');

has classifier_class => ( is => 'ro', isa => 'Str', default => 'AI::NaiveBayes' );

sub add_example {
    my ($self, %params) = @_;
    for ('attributes', 'labels') {
        die "Missing required '$_' parameter" unless exists $params{$_};
    }

    $self->{examples}++;

lib/AI/NaiveBayes/Learner.pm

        # P(attr|label) = $count/$label_tokens                         (simple)
        # P(attr|label) = ($count + 1)/($label_tokens + $vocab_size)   (with smoothing)
        # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
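        #
        # Worked example (illustrative numbers only): with $count = 3,
        # $label_tokens = 10 and $vocab_size = 6 the smoothed estimate is
        #     P(attr|label) = (3 + 1) / (10 + 6) = 0.25
        #     log P(attr|label) = log(4) - log(16) ~= -1.386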

        my $denominator = log($label_tokens + $vocab_size);

        while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
            $model->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
        }

        if ($self->limit_features) {
            my %old  = %{$model->{probs}{$label}};
            # A smaller |log P| means a probability closer to 1, so this
            # sorts the strongest features first
            my @features = sort { abs($old{$a}) <=> abs($old{$b}) } keys(%old);
            my $limit = min($self->features_kept, 0+@features);
            if ($limit < 1) {
                # A fractional features_kept keeps that fraction of the features
                $limit = int($limit * keys(%old));
            }
            my @top = @features[0..$limit-1];
            my %kept = map { $_ => $old{$_} } @top;
            $model->{probs}{$label} = \%kept;
        }
    }
    my $classifier_class = $self->classifier_class;
    return $classifier_class->new( model => $model );
}

sub add_hash {
    my ($first, $second) = @_;

lib/AI/NaiveBayes/Learner.pm

=head1 NAME

AI::NaiveBayes::Learner - Build an AI::NaiveBayes classifier from a set of training examples.

=head1 VERSION

version 0.04

=head1 SYNOPSIS

    my $learner = AI::NaiveBayes::Learner->new(features_kept => 0.5);
    $learner->add_example(
        attributes => { sheep => 1, very => 1, valuable => 1, farming => 1 },
        labels => ['farming'] 
    );

    my $classifier = $learner->classifier;

=head1 DESCRIPTION

This is a trainer of AI::NaiveBayes classifiers. It accumulates the
training data passed in through the C<add_example> method in internal
structures, and constructs a classifier when the C<classifier> method
is called.

=head1 ATTRIBUTES

=over 4

=item C<features_kept>

Indicates how many features should remain after calculating probabilities.
By default all of them are kept. For values greater than or equal to 1,
that many of the strongest features are preserved. For values lower than 1,
the specified fraction of features is kept (e.g. the top 20% of features
for C<features_kept> = 0.2).
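
For example:

    # keep the 100 strongest features per label
    my $learner  = AI::NaiveBayes::Learner->new( features_kept => 100 );

    # keep the strongest 20% of the features per label
    my $learner2 = AI::NaiveBayes::Learner->new( features_kept => 0.2 );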

The remaining attributes are for the class's internal use, and thus not
documented.

=item C<classifier_class>

The class of the classifier to be created. By default it is
C<AI::NaiveBayes>.

=back

=head1 METHODS

=over 4

=item C<add_example( attributes => HASHREF, labels => LIST )>

Saves the information from a training example in the internal data structures.
C<attributes> should be of the form

    { feature1 => weight1, feature2 => weight2, ... }

C<labels> should be a list of strings denoting one or more classes to which
the example belongs.
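
An illustrative call (hypothetical feature names) assigning an example to
two classes at once:

    $learner->add_example(
        attributes => { sheep => 2, wool => 1 },
        labels     => [ 'farming', 'textiles' ],
    );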

=item C<classifier()>

Creates an AI::NaiveBayes classifier based on the data accumulated so far.

=back

=head1 UTILITY SUBS

t/01-learner.t


$learner->add_example( attributes => _hash(qw(vampires drink blood vampires may staked)),
		   labels => ['vampire'] );
is $learner->{labels}{vampire}{count}, 1;

$learner->add_example( attributes => _hash(qw(vampires cannot see their images mirrors)),
		   labels => ['vampire'] );
is $learner->{labels}{vampire}{count}, 2;
is keys %{$learner->{labels}}, 2;

# features_kept > 1
$learner = AI::NaiveBayes::Learner->new(features_kept => 5);
$learner->add_example( attributes => _hash(qw(one two three four)),
		   labels => ['farming'] );
$learner->add_example( attributes => _hash(qw(five six seven eight)),
		   labels => ['farming'] );
$learner->add_example( attributes => _hash(qw(one two three four five)),
		   labels => ['farming'] );
my $model = $learner->classifier->model;
is keys %{$model->{probs}{farming}}, 5, '5 features kept';
is join(" ", sort { $a cmp $b } keys %{$model->{probs}{farming}}), 'five four one three two';

# features_kept < 1
$learner = AI::NaiveBayes::Learner->new(features_kept => 0.5);
$learner->add_example( attributes => _hash(qw(one two three four)),
		   labels => ['farming'] );
$learner->add_example( attributes => _hash(qw(five six seven eight)),
		   labels => ['farming'] );
$learner->add_example( attributes => _hash(qw(one two three four)),
		   labels => ['farming'] );
$model = $learner->classifier->model;
is keys %{$model->{probs}{farming}}, 4, 'half features kept';
is join(" ", sort { $a cmp $b } keys %{$model->{probs}{farming}}), 'four one three two';

sub _hash { +{ map {$_,1} @_ } }
