AI-NaiveBayes


lib/AI/NaiveBayes.pm


package AI::NaiveBayes;

use strict;
use warnings;

use List::Util qw(max);
use Moose;

use AI::NaiveBayes::Classification;
use AI::NaiveBayes::Learner;

has model   => (is => 'ro', isa => 'HashRef[HashRef]', required => 1);
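
# As classify() below uses it, the trained model hash (built by
# AI::NaiveBayes::Learner) contains at least these keys:
#   prior_probs => { label => log P(label) }
#   probs       => { label => { feature => log P(feature|label) } }
#   attributes  => every feature seen anywhere in the training data
#   smoother    => { label => fallback log-probability for a known
#                    feature never seen with that label }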

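# Class-method shortcut used in the SYNOPSIS below: builds a default
# AI::NaiveBayes::Learner, feeds it each training example, and returns
# the resulting classifier.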
sub train {
    my $self = shift;
    my $learner = AI::NaiveBayes::Learner->new();
    for my $example ( @_ ){
        $learner->add_example( %$example );
    }
    return $learner->classifier;
}


sub classify {
    my ($self, $newattrs) = @_;
    $newattrs or die "Missing parameter for classify()";

    my $m = $self->model;

    # Note that we're using the log(prob) here.  That's why we add instead
    # of multiply; see the worked example after this sub.

    my %scores = %{$m->{prior_probs}};
    my %features;
    while (my ($feature, $value) = each %$newattrs) {
        next unless exists $m->{attributes}{$feature};  # Ignore totally unseen features
        while (my ($label, $attributes) = each %{$m->{probs}}) {
            my $score = ($attributes->{$feature} || $m->{smoother}{$label})*$value;  # $value * log P($feature|$label), i.e. log( P($feature|$label)**$value )
            $scores{$label} += $score;
            $features{$feature}{$label} = $score;
        }
    }

    rescale(\%scores);

    return AI::NaiveBayes::Classification->new( label_sums => \%scores, features => \%features );
}
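
# Worked example of the log-space scoring in classify() above (all
# numbers are hypothetical).  In probability space Naive Bayes computes,
# per label,
#
#     P(label) * product over features of P(feature|label) ** value
#
# which after taking logs becomes
#
#     log P(label) + sum over features of value * log P(feature|label)
#
# So with log P(farming) = -0.7, log P(sheep|farming) = -2.3, and an
# input of { sheep => 2 }:
#
#     score(farming) = -0.7 + 2 * -2.3 = -5.3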

sub rescale {
    my ($scores) = @_;

    # Scale everything back to a reasonable area in logspace (near zero),
    # un-loggify, and normalize; see the worked example below.
    my $total = 0;
    my $max = max(values %$scores);
    foreach (values %$scores) {
        $_ = exp($_ - $max);
        $total += $_**2;
    }
    $total = sqrt($total);
    foreach (values %$scores) {
        $_ /= $total;
    }
}
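
# Worked example of rescale() with hypothetical log scores:
#
#     my %scores = ( farming => -12.3, vampire => -15.9 );
#     rescale(\%scores);
#
# The maximum is -12.3, so after exp($_ - $max) the values are 1 and
# exp(-3.6) ~= 0.0273.  Dividing by the L2 norm sqrt(1 + 0.0273**2)
# leaves farming ~= 0.9996 and vampire ~= 0.0273.  Note the result is a
# unit-length vector, not a probability distribution summing to 1.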


__PACKAGE__->meta->make_immutable;

1;

=pod

=encoding UTF-8

=head1 NAME

AI::NaiveBayes - A Bayesian classifier

=head1 VERSION

version 0.04

=head1 SYNOPSIS

    # AI::NaiveBayes objects are created by AI::NaiveBayes::Learner,
    # but for a quick start you can use the 'train' class method,
    # a shortcut that uses default AI::NaiveBayes::Learner settings

    my $classifier = AI::NaiveBayes->train( 
        {
            attributes => {
                sheep => 1, very => 1,  valuable => 1, farming => 1
            },
            labels => ['farming']
        },
        {
            attributes => {
                vampires => 1, cannot => 1, see => 1, their => 1,
                images => 1, mirrors => 1
            },
            labels => ['vampire']
        },
    );

    # Classify a feature vector
    my $result = $classifier->classify({sheep => 1, valuable => 1});
    
    # $result is now an AI::NaiveBayes::Classification object
    
    my $best_category = $result->best_category;

=head1 DESCRIPTION

This module implements the classic "Naive Bayes" machine learning
algorithm.  It is a low-level class that accepts only pre-computed
feature vectors as input; see L<AI::Classifier::Text> for a text
classifier that uses this class.

An C<AI::NaiveBayes> classifier object is created from training data
by L<AI::NaiveBayes::Learner>. For a quick start you can use the
limited C<train> class method, which trains the classifier with
default settings.
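
For illustration, the explicit path through the learner looks roughly
like this (the example data here is made up):

    use AI::NaiveBayes::Learner;

    my $learner = AI::NaiveBayes::Learner->new();
    $learner->add_example(
        attributes => { sheep => 1, valuable => 1, farming => 1 },
        labels     => ['farming'],
    );
    my $classifier = $learner->classifier;   # an AI::NaiveBayes object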

The classifier object is immutable.

Naive Bayes is a well-studied probabilistic algorithm often used in
automatic text categorization.  Compared to other algorithms (kNN,
SVM, decision trees), it is fast and reasonably competitive in the
quality of its results.

A paper by Fabrizio Sebastiani provides a really good introduction to
text categorization:
L<http://faure.iei.pi.cnr.it/~fabrizio/Publications/ACMCS02.pdf>
