Mail-SpamAssassin

 view release on metacpan or  search on metacpan

lib/Mail/SpamAssassin/Plugin/Bayes.pm  view on Meta::CPAN


  # Report the corpus sizes ($ns = spam count, $nn = ham count, as labelled
  # in the message itself) before doing any per-message work.
  dbg("bayes: corpus size: nspam = $ns, nham = $nn");

  # Step 1: tokenize the message.  The bare block confines $timer to the
  # tokenizing step; judging by the "undef ... # end a timing section" idiom
  # used further down, releasing the timer object is what ends the timed
  # section (NOTE(review): confirm against time_method's implementation).
  my $msgtokens;
  { my $timer = $self->{main}->time_method('b_tokenize');
    my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
    $msgtokens = $self->tokenize($msg, $msgdata);
  }

  # Step 2: fetch the stored data for every token key of this message in a
  # single call to the store.
  my $tokensdata;
  { my $timer = $self->{main}->time_method('b_tok_get_all');
    $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
  }

  # This timer is deliberately NOT block-scoped: it spans everything up to
  # the explicit "undef $timer_compute_prob" after combine() below.
  my $timer_compute_prob = $self->{main}->time_method('b_comp_prob');

  my $probabilities_ref =
    $self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn);

  # Build %pw: token => { prob, spam_count, ham_count, atime }.
  # @$probabilities_ref is consumed in lock-step with @$tokensdata (one
  # shift per entry), so the shift must happen before the "next" to keep
  # the two lists aligned.  Entries with an undefined probability are
  # left out of %pw.
  my %pw;
  foreach my $tokendata (@{$tokensdata}) {
    my $prob = shift(@$probabilities_ref);
    next unless defined $prob;
    # each $tokendata entry is unpacked as (token, spam count, ham count, atime)
    my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
    $pw{$token} = {
      prob => $prob,
      spam_count => $tok_spam,
      ham_count => $tok_ham,
      atime => $atime
    };
  }

  my @pw_keys = keys %pw;

  # If none of the tokens were found in the DB, we're going to skip
  # this message...
  # (the "skip" label is outside this excerpt)
  if (!@pw_keys) {
    dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
    goto skip;
  }

  # Token counts: all tokens in the message vs. tokens that got a
  # probability.  Neither variable is read again in the visible part of
  # this routine -- presumably they are used after the excerpt ends.
  my $tcount_total = keys %{$msgtokens};
  my $tcount_learned = scalar @pw_keys;

  # Figure out the message receive time (used as atime below)
  # If the message atime comes back as being in the future, something's
  # messed up and we should revert to current time as a safety measure.
  #
  my $msgatime = $msg->receive_date();
  my $now = time;
  $msgatime = $now if ( $msgatime > $now );

  # @touch_tokens collects the tokens whose atime gets refreshed at the end.
  # The two token-info lists are stored on $permsgstatus and aliased locally
  # so the loop below can push onto them directly.
  my @touch_tokens;
  my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
  my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];

  # A token's "strength" is the distance of its probability from the
  # neutral value 0.5.
  my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys);
  # Per-token debug lines are only emitted when would_log() reports a
  # level above 1 for the 'bayes' debug facility.
  my $log_each_token = (would_log('dbg', 'bayes') > 1);

  # now take the most significant tokens and calculate probs using
  # Robinson's formula.

  # strongest tokens first
  @pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys;

  # keep at most N_SIGNIFICANT_TOKENS of them
  if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }

  # @sorted receives the probabilities of the tokens actually used, in
  # descending-strength order; $score is filled in by combine() below.
  my @sorted;
  my $score;
  foreach my $tok (@pw_keys) {
    # Ignore tokens that are too close to neutral.  Because @pw_keys is in
    # descending-strength order, every token after the first weak one is
    # weak as well; "next" just skips each of them in turn.
    next if $tok_strength{$tok} <
                $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;

    my $pw_tok = $pw{$tok};
    my $pw_prob = $pw_tok->{prob};

    # What's more expensive, scanning headers for HAMMYTOKENS and
    # SPAMMYTOKENS tags that aren't there or collecting data that
    # won't be used?  Just collecting the data is certainly simpler.
    #
    # NOTE(review): "||" substitutes "(unknown)" for any false value, so a
    # raw token of "0" or "" would also be reported as "(unknown)".
    my $raw_token = $msgtokens->{$tok} || "(unknown)";
    my $s = $pw_tok->{spam_count};
    my $n = $pw_tok->{ham_count};
    # NOTE(review): this lexical $a shadows the sort variable $a for the
    # rest of the loop body; harmless here since no sort runs in this scope.
    my $a = $pw_tok->{atime};

    # probabilities below 0.5 are filed as hammy, everything else as spammy
    push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy },
          [$raw_token, $pw_prob, $s, $n, $a] );

    push(@sorted, $pw_prob);

    # update the atime on this token, it proved useful
    push(@touch_tokens, $tok);

    if ($log_each_token) {
      dbg("bayes: token '$raw_token' => $pw_prob");
    }
  }

  # Bail out when no token was strong enough, or -- if the
  # REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE constant is positive -- when too few
  # were.
  # NOTE(review): $#sorted is the LAST INDEX (count - 1), and the comparison
  # is "<=", so with a positive threshold T this skips unless at least T + 2
  # tokens were used.  Confirm whether that is the intended meaning of the
  # constant or an off-by-one.
  if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 && 
	$#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
  {
    dbg("bayes: cannot use bayes on this message; not enough usable tokens found");
    goto skip;
  }

  # Combine the selected token probabilities into one score for the message.
  $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
  undef $timer_compute_prob;  # end a timing section

  # Couldn't come up with a probability?
  goto skip unless defined $score;

  dbg("bayes: score = $score");

  # no need to call tok_touch_all unless there were significant
  # tokens and a score was returned
  # we don't really care about the return value here

  # The tokens that contributed to the score are touched with the (clamped)
  # message receive time computed above.
  { my $timer = $self->{main}->time_method('b_tok_touch_all');
    $self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
  }

  # Starts the 'b_finish' timing section; the code it covers lies beyond
  # this excerpt.
  my $timer_finish = $self->{main}->time_method('b_finish');



( run in 3.118 seconds using v1.01-cache-2.11-cpan-437f7b0c052 )