Mail-SpamAssassin
view release on metacpan or search on metacpan
lib/Mail/SpamAssassin/Plugin/Bayes.pm view on Meta::CPAN
# Log the spam/ham corpus counters ($ns, $nn) that the probability
# computations below are based on.
dbg("bayes: corpus size: nspam = $ns, nham = $nn");
# Step 1: tokenize the message.  The bare block limits the lifetime of
# $timer to this step (compare the explicit "undef $timer_compute_prob"
# further down, which ends a timing section).
my $msgtokens;
{ my $timer = $self->{main}->time_method('b_tokenize');
my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
$msgtokens = $self->tokenize($msg, $msgdata);
}
# Step 2: ask the store for its data on every token of the message,
# again inside its own timed block.
my $tokensdata;
{ my $timer = $self->{main}->time_method('b_tok_get_all');
$tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
}
# Step 3: compute per-token probabilities.  This timing section stays
# open until $timer_compute_prob is explicitly undef'ed after combine().
my $timer_compute_prob = $self->{main}->time_method('b_comp_prob');
my $probabilities_ref =
$self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn);
# Build %pw: token => { prob, spam_count, ham_count, atime }.
# @$probabilities_ref is consumed in lock-step with @$tokensdata (one
# shift per entry), so the i-th probability is paired with the i-th
# token record.  Tokens whose probability is undefined are left out.
my %pw;
foreach my $tokendata (@{$tokensdata}) {
my $prob = shift(@$probabilities_ref);
next unless defined $prob;
my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
$pw{$token} = {
prob => $prob,
spam_count => $tok_spam,
ham_count => $tok_ham,
atime => $atime
};
}
my @pw_keys = keys %pw;
# If none of the tokens were found in the DB, we're going to skip
# this message...
# (the "skip" label is outside this excerpt)
if (!@pw_keys) {
dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
goto skip;
}
# $tcount_total: number of keys in %$msgtokens ("keys" in scalar context);
# $tcount_learned: number of tokens that received a probability above.
# Neither is used in the visible part of this routine.
my $tcount_total = keys %{$msgtokens};
my $tcount_learned = scalar @pw_keys;
# Figure out the message receive time (used as atime below)
# If the message atime comes back as being in the future, something's
# messed up and we should revert to current time as a safety measure.
#
# NOTE(review): no definedness check before the numeric comparison;
# confirm that receive_date() never returns undef here.
my $msgatime = $msg->receive_date();
my $now = time;
$msgatime = $now if ( $msgatime > $now );
# @touch_tokens collects the tokens passed to tok_touch_all() below.
# The two token-info lists are stored on $permsgstatus as well as kept
# in local references, so pushes below fill the per-message state.
my @touch_tokens;
my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
# A token's strength is the distance of its probability from 0.5.
my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys);
# Per-token debug lines are emitted only when would_log() reports a
# value greater than 1 for the 'bayes' facility.
my $log_each_token = (would_log('dbg', 'bayes') > 1);
# now take the most significant tokens and calculate probs using
# Robinson's formula.
# Sort by descending strength and keep at most N_SIGNIFICANT_TOKENS.
@pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys;
if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }
# @sorted receives the probabilities of the tokens that are actually
# used, in the strength order established above.
my @sorted;
my $score;
foreach my $tok (@pw_keys) {
# Ignore tokens whose strength is below the combiner's minimum.
next if $tok_strength{$tok} <
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
my $pw_tok = $pw{$tok};
my $pw_prob = $pw_tok->{prob};
# What's more expensive, scanning headers for HAMMYTOKENS and
# SPAMMYTOKENS tags that aren't there or collecting data that
# won't be used? Just collecting the data is certainly simpler.
#
# Value recorded in %$msgtokens for this token; "(unknown)" is used
# when that value is missing or false.
my $raw_token = $msgtokens->{$tok} || "(unknown)";
my $s = $pw_tok->{spam_count};
my $n = $pw_tok->{ham_count};
# This lexical $a is confined to the loop body, so it does not affect
# the sort block above, which runs outside this scope.
my $a = $pw_tok->{atime};
# Probability below 0.5 goes to the hammy list, anything else to the
# spammy list.
push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy },
[$raw_token, $pw_prob, $s, $n, $a] );
push(@sorted, $pw_prob);
# update the atime on this token, it proved useful
push(@touch_tokens, $tok);
if ($log_each_token) {
dbg("bayes: token '$raw_token' => $pw_prob");
}
}
# Give up when no token was usable, or, if the constant
# REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE is positive, when the last index
# of @sorted does not exceed it.
# NOTE(review): $#sorted is the last index (element count minus one),
# not the count; confirm the intended threshold.
if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&
$#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
{
dbg("bayes: cannot use bayes on this message; not enough usable tokens found");
goto skip;
}
# Combine the selected token probabilities into a single score.
$score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
undef $timer_compute_prob; # end a timing section
# Couldn't come up with a probability?
goto skip unless defined $score;
dbg("bayes: score = $score");
# no need to call tok_touch_all unless there were significant
# tokens and a score was returned
# we don't really care about the return value here
{ my $timer = $self->{main}->time_method('b_tok_touch_all');
$self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
}
# Start the 'b_finish' timing section; the code it covers lies beyond
# this excerpt.
my $timer_finish = $self->{main}->time_method('b_finish');
( run in 3.118 seconds using v1.01-cache-2.11-cpan-437f7b0c052 )