AI-NaiveBayes

 view release on metacpan or  search on metacpan

README.pod  view on Meta::CPAN

    return $learner->classifier;
}


sub classify {
    my ($self, $newattrs) = @_;
    $newattrs or die "Missing parameter for classify()";

    my $m = $self->model;

    # Note that we're using the log(prob) here.  That's why we add instead of multiply.

    my %scores = %{$m->{prior_probs}};
    my %features;
    while (my ($feature, $value) = each %$newattrs) {
        next unless exists $m->{attributes}{$feature};  # Ignore totally unseen features
        while (my ($label, $attributes) = each %{$m->{probs}}) {
            my $score = ($attributes->{$feature} || $m->{smoother}{$label})*$value;  # P($feature|$label)**$value
            $scores{$label} += $score;
            $features{$feature}{$label} = $score;
        }
    }

    rescale(\%scores);

    return AI::NaiveBayes::Classification->new( label_sums => \%scores, features => \%features );
}

sub rescale {
    my ($scores) = @_;

    # Scale everything back to a reasonable area in logspace (near zero), un-loggify, and normalize
    my $total = 0;
    my $max = max(values %$scores);
    foreach (values %$scores) {
        $_ = exp($_ - $max);
        $total += $_**2;
    }
    $total = sqrt($total);
    foreach (values %$scores) {
        $_ /= $total;
    }

lib/AI/NaiveBayes.pm  view on Meta::CPAN

    return $learner->classifier;
}


sub classify {
    my ($self, $newattrs) = @_;
    $newattrs or die "Missing parameter for classify()";

    my $m = $self->model;

    # Note that we're using the log(prob) here.  That's why we add instead of multiply.

    my %scores = %{$m->{prior_probs}};
    my %features;
    while (my ($feature, $value) = each %$newattrs) {
        next unless exists $m->{attributes}{$feature};  # Ignore totally unseen features
        while (my ($label, $attributes) = each %{$m->{probs}}) {
            my $score = ($attributes->{$feature} || $m->{smoother}{$label})*$value;  # P($feature|$label)**$value
            $scores{$label} += $score;
            $features{$feature}{$label} = $score;
        }
    }

    rescale(\%scores);

    return AI::NaiveBayes::Classification->new( label_sums => \%scores, features => \%features );
}

sub rescale {
    my ($scores) = @_;

    # Scale everything back to a reasonable area in logspace (near zero), un-loggify, and normalize
    my $total = 0;
    my $max = max(values %$scores);
    foreach (values %$scores) {
        $_ = exp($_ - $max);
        $total += $_**2;
    }
    $total = sqrt($total);
    foreach (values %$scores) {
        $_ /= $total;
    }

lib/AI/NaiveBayes/Learner.pm  view on Meta::CPAN

sub classifier {
    my $self = shift;

    my $examples    = $self->examples;
    my $labels       = $self->labels;
    my $vocab_size   = keys %{ $self->attributes };
    my $model;
    $model->{attributes} = $self->attributes;


    # Calculate the log-probabilities for each category
    foreach my $label (keys %$labels) {
        $model->{prior_probs}{$label} = log($labels->{$label}{count} / $examples);

        # Count the number of tokens in this cat
        my $label_tokens = sum( values %{ $labels->{$label}{attributes} } );

        # Compute a smoothing term so P(word|cat)==0 can be avoided
        $model->{smoother}{$label} = -log($label_tokens + $vocab_size);

        # P(attr|label) = $count/$label_tokens                         (simple)
        # P(attr|label) = ($count + 1)/($label_tokens + $vocab_size)   (with smoothing)
        # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)

        my $denominator = log($label_tokens + $vocab_size);

        while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
            $model->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
        }

        if ($self->limit_features) {
            my %old  = %{$model->{probs}{$label}};
            my @features = sort { abs($old{$a}) <=> abs($old{$b}) } keys(%old);
            my $limit = min($self->features_kept, 0+@features);
            if ($limit < 1) {
                $limit = int($limit * keys(%old));
            }
            my @top = @features[0..$limit-1];



( run in 1.455 second using v1.01-cache-2.11-cpan-49f99fa48dc )