Mail-SpamAssassin
view release on metacpan or search on metacpan
sa-learn.raw view on Meta::CPAN
my @bin = File::Spec->splitpath($0);
my $bin = ($bin[0] ? File::Spec->catpath(@bin[0..1], '') : $bin[1])
|| File::Spec->curdir;
if (-e $bin.'/lib/Mail/SpamAssassin.pm'
|| !-e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm' )
{
my $searchrelative;
$searchrelative = 1; # disabled during "make install": REMOVEFORINST
if ($searchrelative && $bin eq '../' && -e '../blib/lib/Mail/SpamAssassin.pm')
{
unshift ( @INC, '../blib/lib' );
} else {
foreach ( qw(lib ../lib/site_perl
../lib/spamassassin ../share/spamassassin/lib))
{
my $dir = File::Spec->catdir( $bin, split ( '/', $_ ) );
if ( -f File::Spec->catfile( $dir, "Mail", "SpamAssassin.pm" ) )
{ unshift ( @INC, $dir ); last; }
}
}
}
}
use Mail::SpamAssassin;
use Mail::SpamAssassin::ArchiveIterator;
use Mail::SpamAssassin::Message;
use Mail::SpamAssassin::PerMsgLearner;
use Mail::SpamAssassin::Util::Progress;
use Mail::SpamAssassin::Logger;
###########################################################################
# Ignore SIGPIPE.
$SIG{'PIPE'} = 'IGNORE';
# used to be CmdLearn::cmd_run() ...
# Option defaults: the listed switches start out as 0 and 'cf' starts out
# as an empty array reference (the --cf option stores its values there).
%opt = (
  ( map { ( $_ => 0 ) } qw(force-expire use-ignores nosync quiet) ),
  'cf' => [],
);
# Configure the command-line parser: single-character options may be bundled,
# options and non-option arguments may be mixed (permute), option names may
# not be abbreviated, and option names are case-sensitive.
Getopt::Long::Configure(
qw(bundling no_getopt_compat
permute no_auto_abbrev no_ignore_case)
);
# Parse the command line. Most options store into %opt; a few set the
# file-level variables $forget, $isspam, $synconly and $bayes_override_path
# (declared outside this view).
GetOptions(
# --- learning mode selection ---
'forget' => \$forget,
'ham|nonspam' => sub { $isspam = 0; },
'spam' => sub { $isspam = 1; },
'sync' => \$synconly,
# deprecated alias of --sync; still honoured, but warns
'rebuild' => sub { $synconly = 1; warn "The --rebuild option has been deprecated. Please use --sync instead.\n" },
# --- general / configuration options ---
'q|quiet' => \$opt{'quiet'},
'username|u=s' => \$opt{'username'},
'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
'prefspath|prefs-file|p=s' => \$opt{'prefspath'},
'siteconfigpath=s' => \$opt{'siteconfigpath'},
# may be given several times; each value is appended to the 'cf' array
'cf=s' => \@{$opt{'cf'}},
'folders|f=s' => \$opt{'folders'},
'force-expire|expire' => \$opt{'force-expire'},
'local|L' => \$opt{'local'},
'no-sync|nosync' => \$opt{'nosync'},
'showdots' => \$opt{'showdots'},
'progress' => \$opt{'progress'},
'use-ignores' => \$opt{'use-ignores'},
# deprecated alias of --no-sync; still honoured, but warns
'no-rebuild|norebuild' => sub { $opt{'nosync'} = 1; warn "The --no-rebuild option has been deprecated. Please use --no-sync instead.\n" },
'learnprob=f' => \$opt{'learnprob'},
'randseed=i' => \$opt{'randseed'},
'stopafter=i' => \$opt{'stopafter'},
'max-size=i' => \$opt{'max-size'},
# ':s' means the string argument is optional
'debug|debug-level|D:s' => \$opt{'debug'},
'help|h|?' => \$opt{'help'},
'version|V' => \$opt{'version'},
# --- database maintenance commands ---
'dump:s' => \$opt{'dump'},
'import' => \$opt{'import'},
'backup' => \$opt{'backup'},
'clear' => \$opt{'clear'},
'restore=s' => \$opt{'restore'},
# --- input format selection: --dir/--file/--single record 'old_format',
# --mbox/--mbx record 'format' ---
'dir' => sub { $opt{'old_format'} = 'dir'; },
'file' => sub { $opt{'old_format'} = 'file'; },
'mbox' => sub { $opt{'format'} = 'mbox'; },
'mbx' => sub { $opt{'format'} = 'mbx'; },
'single' => sub { $opt{'old_format'} = 'single'; },
'db|dbpath=s' => \$bayes_override_path,
're|regexp=s' => \$opt{'regexp'},
# every non-option argument is handed to target(), defined outside this view
'<>' => \&target,
)
# GetOptions returns false when the command line could not be parsed
or usage( 0, "Unknown option!" );
# --help: print the usage text with a pointer to the manual page.
usage( 0, "For more information read the manual page" )
  if defined $opt{'help'};
# --version: print the SpamAssassin version and exit successfully.
if ( defined $opt{'version'} ) {
  my $version = Mail::SpamAssassin::Version();
  print "SpamAssassin version $version\n";
  exit 0;
}
# set debug areas, if any specified (only useful for command-line tools)
# A debug option given without a (true) value falls back to 'all'.
$opt{'debug'} = 'all'
  if defined $opt{'debug'} && !$opt{'debug'};
# --force-expire also switches on sync-only mode.
$synconly = 1 if $opt{'force-expire'};
if ($opt{'showdots'} && $opt{'progress'}) {
print "--showdots and --progress may not be used together, please select just one\n";
sa-learn.raw view on Meta::CPAN
}
# At least one operating mode must have been requested on the command line;
# if none was, print the usage text listing the choices.
if (
  !(
       defined($isspam)
    || defined($synconly)
    || defined($forget)
    || defined( $opt{'dump'} )
    || defined( $opt{'import'} )
    || defined( $opt{'clear'} )
    || defined( $opt{'backup'} )
    || defined( $opt{'restore'} )
    || defined( $opt{'folders'} )
  )
  )
{
  usage( 0,
    "Please select either --spam, --ham, --folders, --forget, --sync, --import,\n--dump, --clear, --backup or --restore"
  );
}
# We need to make sure the journal syncs pre-forget...
# so a --no-sync request is overridden (with a warning) when --forget is used.
if ( $opt{'nosync'} && defined($forget) ) {
  $opt{'nosync'} = 0;
  warn "sa-learn warning: --forget requires read/write access to the database, and is incompatible with --no-sync\n";
}
# Format specified in the 2.5x form of --dir, --file, --mbox, --mbx or --single.
# Convert it to the new behavior: only --single needs any action, which is
# to append '-' to the argument list.
# NOTE(review): '-' presumably denotes standard input -- confirm against the
# target handling code, which is outside this view.
if ( defined( $opt{'old_format'} ) && $opt{'old_format'} eq 'single' ) {
  push @ARGV, '-';
}
# Extra configuration text, handed to Mail::SpamAssassin as post_config_text.
my $post_config = '';
# kluge to support old check_bayes_db operation
# bug 3799: init() will go r/o with the configured DB, and then dbpath needs
# to override. Just access the dbpath version via post_config_text.
if ( defined $bayes_override_path ) {
  # Add a default prefix if the path is a directory
  $bayes_override_path = File::Spec->catfile( $bayes_override_path, 'bayes' )
    if -d $bayes_override_path;
  $post_config .= "bayes_path $bayes_override_path\n";
}
# These options require bayes_scanner, which requires "use_bayes 1", but
# that's not necessary for these commands.
if ( grep { defined $opt{$_} } qw(dump import clear backup restore) ) {
  $post_config .= "use_bayes 1\n";
}
# Finally append every --cf line, newline-separated and newline-terminated.
$post_config .= join( "\n", @{ $opt{'cf'} } ) . "\n";
# create the tester factory
# The constructor arguments come from the parsed command-line options in
# %opt, plus $post_config (the extra configuration text built earlier in this
# script). $PREFIX, $DEF_RULES_DIR and $LOCAL_RULES_DIR are defined outside
# this view.
$spamtest = Mail::SpamAssassin->new(
{
# configuration locations (undef unless given on the command line)
rules_filename => $opt{'configpath'},
site_rules_filename => $opt{'siteconfigpath'},
userprefs_filename => $opt{'prefspath'},
username => $opt{'username'},
debug => $opt{'debug'},
local_tests_only => $opt{'local'},
dont_copy_prefs => 1,
PREFIX => $PREFIX,
DEF_RULES_DIR => $DEF_RULES_DIR,
LOCAL_RULES_DIR => $LOCAL_RULES_DIR,
post_config_text => $post_config,
}
);
# NOTE(review): the meaning of the argument 1 is not visible here -- see the
# Mail::SpamAssassin init() documentation.
$spamtest->init(1);
dbg("sa-learn: spamtest initialized");
# Bug 6228 hack: bridge the transition gap of moving Bayes.pm into a plugin;
# To be resolved more cleanly!!!
# When a bayes_scanner exists, give it the "store" object of every loaded
# plugin that is a Mail::SpamAssassin::Plugin::Bayes.
if ( my $scanner = $spamtest->{bayes_scanner} ) {
  for my $loaded ( @{ $spamtest->{plugins}->{plugins} } ) {
    next unless $loaded->isa('Mail::SpamAssassin::Plugin::Bayes');
    # copy plugin's "store" object ref one level up!
    $scanner->{store} = $loaded->{store};
  }
}
# On Windows, switch both standard streams to binary mode; failure is fatal.
if ( Mail::SpamAssassin::Util::am_running_on_windows() ) {
  # bug 4363
  unless ( binmode(STDIN) ) {
    die "cannot set binmode on STDIN: $!";
  }
  unless ( binmode(STDOUT) ) {
    die "cannot set binmode on STDOUT: $!";
  }
}
# --dump: print the Bayes database contents, then exit.
if ( defined $opt{'dump'} ) {
  # Accepted --dump arguments, each mapped to
  # [ show magic tokens, show data tokens ].
  my %selection_for = (
    ''      => [ 1, 1 ],    # no argument: show us all tokens!
    'all'   => [ 1, 1 ],    # show us all tokens!
    'magic' => [ 1, 0 ],    # show us magic tokens only
    'data'  => [ 0, 1 ],    # show us data tokens only
  );
  my $selection = $selection_for{ $opt{'dump'} };
  if ( !defined $selection ) {    # unknown option
    warn "Unknown dump option '" . $opt{'dump'} . "'\n";
    $spamtest->finish_learner();
    exit 1;
  }
  my ( $magic, $toks ) = @{$selection};
  # The learner is finished right after the dump whether or not it succeeded;
  # a failed dump is then reported as a fatal error.
  my $dump_ok = $spamtest->dump_bayes_db( $magic, $toks, $opt{'regexp'} );
  $spamtest->finish_learner();
  unless ($dump_ok) {
    die "ERROR: Bayes dump returned an error, please re-run with -D for more information\n";
  }
  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN or die "error closing STDIN: $!";
  exit 0;
}
if ( defined $opt{'import'} ) {
my $ret = $spamtest->{bayes_scanner}->{store}->perform_upgrade();
$spamtest->finish_learner();
# make sure we notice any write errors while flushing output buffer
sa-learn.raw view on Meta::CPAN
}
###########################################################################
# usage($verbose, $message)
#
# Prints the SpamAssassin version banner on STDOUT, then calls pod2usage()
# with the given verbosity level and message, requesting exit status 64.
sub usage {
  my ( $verbose, $message ) = @_;
  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
  my @pod_args = (
    -verbose => $verbose,
    -message => $message,
    -exitval => 64,
  );
  pod2usage(@pod_args);
}
# ---------------------------------------------------------------------------
=head1 NAME
sa-learn - train SpamAssassin's Bayesian classifier
=head1 SYNOPSIS
B<sa-learn> [options] [file]...
B<sa-learn> [options] --dump [ all | data | magic ]
Options:
--ham Learn the following messages as ham (non-spam)
--spam Learn the following messages as spam
--forget Forget the following messages
--use-ignores Use bayes_ignore_from and bayes_ignore_to
--sync Synchronize the database and the journal if needed
--force-expire Force a database sync and expiry run
--dbpath <path> Allows commandline override (in bayes_path form)
for where to read the Bayes DB from
--dump [all|data|magic] Display the contents of the Bayes database
Takes optional argument for what to display
--regexp <re> For dump only, specifies which tokens to
dump based on a regular expression.
-f file, --folders=file Read list of files/directories from file
--dir Ignored; historical compatibility
--file Ignored; historical compatibility
--mbox Input sources are in mbox format
--mbx Input sources are in mbx format
--max-size <b> Skip messages larger than b bytes;
defaults to 500 KB, 0 implies no limit
--showdots Show progress using dots
--progress Show progress using progress bar
--no-sync Skip synchronizing the database and journal
after learning
-L, --local Operate locally, no network accesses. Use
of this is recommended, see documentation.
--import Migrate data from older version/non DB_File
based databases
--clear Wipe out existing database
--backup Backup, to STDOUT, existing database
--restore <filename> Restore a database from filename
-u username, --username=username
Override username taken from the runtime
environment, used with SQL
-C path, --configpath=path, --config-file=path
Path to standard configuration dir
-p prefs, --prefspath=file, --prefs-file=file
Set user preferences file
--siteconfigpath=path Path for site configs
(default: @@PREFIX@@/etc/mail/spamassassin)
--cf='config line' Additional line of configuration
-D, --debug [area,...] Print debugging messages
-V, --version Print version
-h, --help Print usage message
=head1 DESCRIPTION
Given a typical selection of your incoming mail classified as spam or ham
(non-spam), this tool will feed each mail to SpamAssassin, allowing it
to 'learn' what signs are likely to mean spam, and which are likely to
mean ham.
Simply run this command once for each of your mail folders, and it will
'learn' from the mail therein.
Note that csh-style I<globbing> in the mail folder names is supported;
in other words, listing a folder name as C<*> will scan every folder
that matches. See C<Mail::SpamAssassin::ArchiveIterator> for more details.
If you are using mailboxes in a format other than maildir, you should use
the B<--mbox> or B<--mbx> parameters.
Files compressed with gzip/bzip2/xz/lz4/lzip/lzo are uncompressed
automatically. See C<Mail::SpamAssassin::ArchiveIterator> for more details.
SpamAssassin remembers which mail messages it has learnt already, and will not
re-learn those messages again, unless you use the B<--forget> option. Messages
learnt as spam will have SpamAssassin markup removed, on the fly.
If you make a mistake and scan a mail as ham when it is spam, or vice
versa, simply rerun this command with the correct classification, and the
mistake will be corrected. SpamAssassin will automatically 'forget' the
previous indications.
Users of C<spamd> who wish to perform training remotely, over a network,
should investigate the C<spamc -L> switch.
=head1 OPTIONS
=over 4
=item B<--ham>
Learn the input message(s) in the files following the option as ham.
If you have previously learnt any of the messages as spam, SpamAssassin will
forget them first, then re-learn them as ham. Alternatively, if you have
previously learnt them as ham, it'll skip them this time around.
If the messages have already been filtered through SpamAssassin, the learner
will ignore any modifications SpamAssassin may have made.
=item B<--spam>
Learn the input message(s) in the files following the option as spam.
If you have previously learnt any of the messages as ham, SpamAssassin will
forget them first, then re-learn them as spam. Alternatively, if you have
previously learnt them as spam, it'll skip them this time around.
If the messages have already been filtered through SpamAssassin, the learner
sa-learn.raw view on Meta::CPAN
=item B<--dump> I<option>
Display the contents of the Bayes database. Without an option or with
the I<all> option, all magic tokens and data tokens will be displayed.
I<magic> will only display magic tokens, and I<data> will only display
the data tokens.
Can also use the B<--regexp> I<RE> option to specify which tokens to
display based on a regular expression.
=item B<--clear>
Clear an existing Bayes database by removing all traces of the database.
WARNING: This is destructive and should be used with care.
=item B<--backup>
Performs a dump of the Bayes database in machine/human readable format.
The dump will include token and seen data. It is suitable for input back
into the --restore command.
=item B<--restore>=I<filename>
Performs a restore of the Bayes database defined by I<filename>.
WARNING: This is a destructive operation, previous Bayes data will be wiped out.
=item B<-h>, B<--help>
Print help message and exit.
=item B<-u> I<username>, B<--username>=I<username>
If specified this username will override the username taken from the runtime
environment. You can use this option to specify users in a virtual user
configuration when using SQL as the Bayes backend.
NOTE: This option will not change to the given I<username>; it will only attempt
to act on behalf of that user. Because of this you will need to have proper
permissions to be able to change files owned by I<username>. In the case of SQL
this generally is not a problem.
=item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path>
Use the specified path for locating the distributed configuration files.
Ignore the default directories (usually C</usr/share/spamassassin> or similar).
=item B<--siteconfigpath>=I<path>
Use the specified path for locating site-specific configuration files. Ignore
the default directories (usually C</etc/mail/spamassassin> or similar).
=item B<--cf='config line'>
Add additional lines of configuration directly from the command-line, parsed
after the configuration files are read. Multiple B<--cf> arguments can be
used, and each will be considered a separate line of configuration.
=item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs>
Read user score preferences from I<prefs> (usually C<$HOME/.spamassassin/user_prefs>).
=item B<--progress>
Prints a progress bar (to STDERR) showing the current progress. In the case
where no valid terminal is found this option will behave very much like the
--showdots option.
=item B<-D> [I<area,...>], B<--debug> [I<area,...>]
Produce debugging output. If no areas are listed, all debugging information is
printed. Diagnostic output can also be enabled for each area individually;
I<area> is the area of the code to instrument. For example, to produce
diagnostic output on bayes, learn, and dns, use:
spamassassin -D bayes,learn,dns
Use an empty string (-D '') to indicate no areas when the next item on the
command line is a path, to prevent the path from being parsed as an area.
For more information about which areas (also known as channels) are available,
please see the documentation at:
C<https://wiki.apache.org/spamassassin/DebugChannels>
Higher priority informational messages that are suitable for logging in normal
circumstances are available with an area of "info".
=item B<--no-sync>
Skip the slow synchronization step which normally takes place after
changing database entries. If you plan to learn from many folders in
a batch, or to learn many individual messages one-by-one, it is faster
to use this switch and run C<sa-learn --sync> once all the folders have
been scanned.
Clarification: The state of I<--no-sync> overrides the
I<bayes_learn_to_journal> configuration option. If not specified,
sa-learn will learn to the database directly. If specified, sa-learn
will learn to the journal file.
Note: I<--sync> and I<--no-sync> can be specified on the same commandline,
which is slightly confusing. In this case, the I<--no-sync> option is
ignored since there is no learn operation.
=item B<-L>, B<--local>
Do not perform any network accesses while learning details about the mail
messages. This should normally be used, as there really isn't anything
Bayes can learn from network lookup results. Official SpamAssassin plugins
do not currently do any network lookups when learning, but it's possible
that third party ones might.
=item B<--import>
If you previously used SpamAssassin's Bayesian learner without the C<DB_File>
module installed, it will have created files in other formats, such as
C<GDBM_File>, C<NDBM_File>, or C<SDBM_File>. This switch allows you to migrate
that old data into the C<DB_File> format. It will overwrite any data currently
in the C<DB_File>.
( run in 0.669 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )