AI-Categorizer

 view release on metacpan or  search on metacpan

eg/demo.pl  view on Meta::CPAN

#!/usr/bin/perl

# This script is a fairly simple demonstration of how AI::Categorizer
# can be used.  There are lots of other less-simple demonstrations
# (actually, they're doing much simpler things, but are probably
# harder to follow) in the tests in the t/ subdirectory.  The
# eg/categorizer script can also be a good example if you're willing
# to figure out a bit how it works.
#
# This script reads a training corpus from a directory of plain-text
# documents, trains a Naive Bayes categorizer on it, then tests the
# categorizer on a set of test documents.

use strict;
use AI::Categorizer;
use AI::Categorizer::Collection::Files;
use AI::Categorizer::Learner::NaiveBayes;
use File::Spec;

die("Usage: $0 <corpus>\n".
    "  A sample corpus (data set) can be downloaded from\n".
    "     http://www.cpan.org/authors/Ken_Williams/data/reuters-21578.tar.gz\n".
    "  or http://www.limnus.com/~ken/reuters-21578.tar.gz\n")
  unless @ARGV == 1;

my $corpus = shift;

my $training  = File::Spec->catfile( $corpus, 'training' );
my $test      = File::Spec->catfile( $corpus, 'test' );
my $cats      = File::Spec->catfile( $corpus, 'cats.txt' );
my $stopwords = File::Spec->catfile( $corpus, 'stopwords' );

my %params;
if (-e $stopwords) {
  $params{stopword_file} = $stopwords;
} else {
  warn "$stopwords not found - no stopwords will be used.\n";
}

eg/demo.pl  view on Meta::CPAN

  die "$cats not found - can't proceed without category information.\n";
}


# In a real-world application these Collection objects could be of any
# type (any Collection subclass).  Or you could create each Document
# object manually.  Or you could let the KnowledgeSet create the
# Collection objects for you.

$training = AI::Categorizer::Collection::Files->new( path => $training, %params );
$test     = AI::Categorizer::Collection::Files->new( path => $test, %params );

# We turn on verbose mode so you can watch the progress of loading &
# training.  This looks nicer if you have Time::Progress installed!

print "Loading training set\n";
my $k = AI::Categorizer::KnowledgeSet->new( verbose => 1 );
$k->load( collection => $training );

print "Training categorizer\n";
my $l = AI::Categorizer::Learner::NaiveBayes->new( verbose => 1 );
$l->train( knowledge_set => $k );

print "Categorizing test set\n";
my $experiment = $l->categorize_collection( collection => $test );

print $experiment->stats_table;


# If you want to get at the specific assigned categories for a
# specific document, you can do it like this:

my $doc = AI::Categorizer::Document->new
  ( content => "Hello, I am a pretty generic document with not much to say." );

my $h = $l->categorize( $doc );

print ("For test document:\n",
       "  Best category = ", $h->best_category, "\n",
       "  All categories = ", join(', ', $h->categories), "\n");

eg/easy_guesser.pl  view on Meta::CPAN

#!/usr/bin/perl

# This script can be helpful for getting a set of baseline scores for
# a categorization task.  It simulates using the "Guesser" learner,
# but is much faster.  Because it doesn't leverage using the whole
# framework, though, it expects everything to be in a very strict
# format.  <cats-file> is in the same format as the 'category_file'
# parameter to the Collection class.  <training-dir> and <test-dir>
# give paths to directories of documents, named as in <cats-file>.

use strict;
use Statistics::Contingency;

die "Usage: $0 <cats-file> <training-dir> <test-dir>\n" unless @ARGV == 3;
my ($cats, $training, $test) = @ARGV;

die "$cats isn't a plain file\n" unless -f $cats;
die "$training isn't a directory\n" unless -d $training;
die "$test isn't a directory\n" unless -d $test;

my %cats;
print "Reading category file\n";
open my($fh), $cats or die "Can't read $cats: $!";
while (<$fh>) {
    my ($doc, @cats) = split;
    $cats{$doc} = \@cats;
}

my (%freq, $docs);

eg/easy_guesser.pl  view on Meta::CPAN

    }
    $docs++;
    $freq{$_}++ foreach @{$cats{$file}};
}
closedir $dh;

print "Calculating probabilities (@{[ %freq ]})\n";
$_ /= $docs foreach values %freq;
my @cats = keys %freq;

print "Scoring test documents\n";
my $c = Statistics::Contingency->new(categories => \@cats);
opendir $dh, $test or die "Can't opendir $test: $!";
while (defined(my $file = readdir $dh)) {
    next if $file eq '.' or $file eq '..';
    unless ($cats{$file}) {
	warn "No category information for '$file'";
	next;
    }
    my @assigned;
    foreach (@cats) {
	push @assigned, $_ if rand() < $freq{$_};
    }

lib/AI/Categorizer.pm  view on Meta::CPAN

use AI::Categorizer::KnowledgeSet;


__PACKAGE__->valid_params
  (
   progress_file => { type => SCALAR, default => 'save' },
   knowledge_set => { isa => 'AI::Categorizer::KnowledgeSet' },
   learner       => { isa => 'AI::Categorizer::Learner' },
   verbose       => { type => BOOLEAN, default => 0 },
   training_set  => { type => SCALAR, optional => 1 },
   test_set      => { type => SCALAR, optional => 1 },
   data_root     => { type => SCALAR, optional => 1 },
  );

__PACKAGE__->contained_objects
  (
   knowledge_set => { class => 'AI::Categorizer::KnowledgeSet' },
   learner       => { class => 'AI::Categorizer::Learner::NaiveBayes' },
   experiment    => { class => 'AI::Categorizer::Experiment',
		      delayed => 1 },
   collection    => { class => 'AI::Categorizer::Collection::Files',
		      delayed => 1 },
  );

sub new {
  my $package = shift;
  my %args = @_;
  my %defaults;
  if (exists $args{data_root}) {
    $defaults{training_set} = File::Spec->catfile($args{data_root}, 'training');
    $defaults{test_set} = File::Spec->catfile($args{data_root}, 'test');
    $defaults{category_file} = File::Spec->catfile($args{data_root}, 'cats.txt');
    delete $args{data_root};
  }

  return $package->SUPER::new(%defaults, %args);
}

#sub dump_parameters {
#  my $p = shift()->SUPER::dump_parameters;
#  delete $p->{stopwords} if $p->{stopword_file};

lib/AI/Categorizer.pm  view on Meta::CPAN


sub knowledge_set { shift->{knowledge_set} }
sub learner       { shift->{learner} }

# Combines several methods in one sub
sub run_experiment {
  my $self = shift;
  $self->scan_features;
  $self->read_training_set;
  $self->train;
  $self->evaluate_test_set;
  print $self->stats_table;
}

sub scan_features {
  my $self = shift;
  return unless $self->knowledge_set->scan_first;
  $self->knowledge_set->scan_features( path => $self->{training_set} );
  $self->knowledge_set->save_features( "$self->{progress_file}-01-features" );
}

lib/AI/Categorizer.pm  view on Meta::CPAN

}

sub train {
  my $self = shift;
  $self->_load_progress( '02', 'knowledge_set' );
  $self->learner->train( knowledge_set => $self->{knowledge_set} );
  $self->_save_progress( '03', 'learner' );
  return $self->learner;
}

sub evaluate_test_set {
  my $self = shift;
  $self->_load_progress( '03', 'learner' );
  my $c = $self->create_delayed_object('collection', path => $self->{test_set} );
  $self->{experiment} = $self->learner->categorize_collection( collection => $c );
  $self->_save_progress( '04', 'experiment' );
  return $self->{experiment};
}

sub stats_table {
  my $self = shift;
  $self->_load_progress( '04', 'experiment' );
  return $self->{experiment}->stats_table;
}

lib/AI/Categorizer.pm  view on Meta::CPAN


=head1 NAME

AI::Categorizer - Automatic Text Categorization

=head1 SYNOPSIS

 use AI::Categorizer;
 my $c = new AI::Categorizer(...parameters...);
 
 # Run a complete experiment - training on a corpus, testing on a test
 # set, printing a summary of results to STDOUT
 $c->run_experiment;
 
 # Or, run the parts of $c->run_experiment separately
 $c->scan_features;
 $c->read_training_set;
 $c->train;
 $c->evaluate_test_set;
 print $c->stats_table;
 
 # After training, use the Learner for categorization
 my $l = $c->learner;
 while (...) {
   my $d = ...create a document...
   my $hypothesis = $l->categorize($d);  # An AI::Categorizer::Hypothesis object
   print "Assigned categories: ", join ', ', $hypothesis->categories, "\n";
   print "Best category: ", $hypothesis->best_category, "\n";
 }

lib/AI/Categorizer.pm  view on Meta::CPAN

The basic process of using this module will typically involve
obtaining a collection of B<pre-categorized> documents, creating a
"knowledge set" representation of those documents, training a
categorizer on that knowledge set, and saving the trained categorizer
for later use.  There are several ways to carry out this process.  The
top-level C<AI::Categorizer> module provides an umbrella class for
high-level operations, or you may use the interfaces of the individual
classes in the framework.

A simple sample script that reads a training corpus, trains a
categorizer, and tests the categorizer on a test corpus, is
distributed as eg/demo.pl .

Disclaimer: the results of any of the machine learning algorithms are
far from infallible (close to fallible?).  Categorization of documents
is often a difficult task even for humans well-trained in the
particular domain of knowledge, and there are many things a human
would consider that none of these algorithms consider.  These are only
statistical tests - at best they are neat tricks or helpful
assistants, and at worst they are totally unreliable.  If you plan to
use this module for anything really important, human supervision is
essential, both of the categorization process and the final results.

For the usage details, please see the documentation of each individual
module.

=head1 FRAMEWORK COMPONENTS

This section explains the major pieces of the C<AI::Categorizer>

lib/AI/Categorizer.pm  view on Meta::CPAN

set is called "feature selection".  It is managed by the
C<AI::Categorizer::KnowledgeSet> class, and you will find the details
of feature selection processes in that class's documentation.

=head2 Collections

Because documents may be stored in lots of different formats, a
"collection" class has been created as an abstraction of a stored set
of documents, together with a way to iterate through the set and
return Document objects.  A knowledge set contains a single collection
object.  A C<Categorizer> doing a complete test run generally contains
two collections, one for training and one for testing.  A C<Learner>
can mass-categorize a collection.

The C<AI::Categorizer::Collection> class and its subclasses
instantiate the idea of a collection in this sense.

=head2 Documents

Each document is represented by an C<AI::Categorizer::Document>
object, or an object of one of its subclasses.  Each document class
contains methods for turning a bunch of data into a Feature Vector.

lib/AI/Categorizer.pm  view on Meta::CPAN

=item verbose

If true, a few status messages will be printed during execution.

=item training_set

Specifies the C<path> parameter that will be fed to the KnowledgeSet's
C<scan_features()> and C<read()> methods during our C<scan_features()>
and C<read_training_set()> methods.

=item test_set

Specifies the C<path> parameter that will be used when creating a
Collection during the C<evaluate_test_set()> method.

=item data_root

A shortcut for setting the C<training_set>, C<test_set>, and
C<category_file> parameters separately.  Sets C<training_set> to
C<$data_root/training>, C<test_set> to C<$data_root/test>, and
C<category_file> (used by some of the Collection classes) to
C<$data_root/cats.txt>.

=back

=item learner()

Returns the Learner object associated with this Categorizer.  Before
C<train()>, the Learner will of course not be trained yet.

=item knowledge_set()

Returns the KnowledgeSet object associated with this Categorizer.  If
C<read_training_set()> has not yet been called, the KnowledgeSet will
not yet be populated with any training data.

=item run_experiment()

Runs a complete experiment on the training and testing data, reporting
the results on C<STDOUT>.  Internally, this is just a shortcut for
calling the C<scan_features()>, C<read_training_set()>, C<train()>,
and C<evaluate_test_set()> methods, then printing the value of the
C<stats_table()> method.

=item scan_features()

Scans the Collection specified in the C<test_set> parameter to
determine the set of features (words) that will be considered when
training the Learner.  Internally, this calls the C<scan_features()>
method of the KnowledgeSet, then saves a list of the KnowledgeSet's
features for later use.

This step is not strictly necessary, but it can dramatically reduce
memory requirements if you scan for features before reading the entire
corpus into memory.

=item read_training_set()

Populates the KnowledgeSet with the data specified in the C<test_set>
parameter.  Internally, this calls the C<read()> method of the
KnowledgeSet.  Returns the KnowledgeSet.  Also saves the KnowledgeSet
object for later use.

=item train()

Calls the Learner's C<train()> method, passing it the KnowledgeSet
created during C<read_training_set()>.  Returns the Learner object.
Also saves the Learner object for later use.

=item evaluate_test_set()

Creates a Collection based on the value of the C<test_set> parameter,
and calls the Learner's C<categorize_collection()> method using this
Collection.  Returns the resultant Experiment object.  Also saves the
Experiment object for later use in the C<stats_table()> method.

=item stats_table()

Returns the value of the Experiment's (as created by
C<evaluate_test_set()>) C<stats_table()> method.  This is a string
that shows various statistics about the
accuracy/precision/recall/F1/etc. of the assignments made during
testing.

=back

=head1 HISTORY

This module is a revised and redesigned version of the previous
C<AI::Categorize> module by the same author.  Note the added 'r' in
the new name.  The older module has a different interface, and no
attempt at backward compatibility has been made - that's why I changed
the name.

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

documents are plain text, but subclasses of the Document class may
handle any kind of data.

=head1 METHODS

=over 4

=item new(%parameters)

Creates a new Document object.  Document objects are used during
training (for the training documents), testing (for the test
documents), and when categorizing new unseen documents in an
application (for the unseen documents).  However, you'll typically
only call C<new()> in the latter case, since the KnowledgeSet or
Collection classes will create Document objects for you in the former
cases.

The C<new()> method accepts the following parameters:

=over 4

lib/AI/Categorizer/Hypothesis.pm  view on Meta::CPAN

=head1 METHODS

=over 4

=item new(%parameters)

Returns a new Hypothesis object.  Generally a user of
C<AI::Categorize> doesn't create a Hypothesis object directly - they
are returned by the Learner's C<categorize()> method.  However, if you
wish to create a Hypothesis directly (maybe passing it some fake data
for testing purposes) you may do so using the C<new()> method.

The following parameters are accepted when creating a new Hypothesis:

=over 4

=item all_categories

A required parameter which gives the set of all categories that could
possibly be assigned to.  The categories should be specified as a
reference to an array of category names (as strings).

lib/AI/Categorizer/Learner/Guesser.pm  view on Meta::CPAN

    my $hypothesis = $l->categorize($document);
    print "Best assigned category: ", $hypothesis->best_category, "\n";
    print "All assigned categories: ", join(', ', $hypothesis->categories), "\n";
  }

=head1 DESCRIPTION

This implements a simple category guesser that makes assignments based
solely on the prior probabilities of categories.  For instance, if 5%
of the training documents belong to a certain category, then the
probability of any test document being assigned to that category is
0.05.  This can be useful for providing baseline scores to compare
with other more sophisticated algorithms.

See L<AI::Categorizer> for a complete description of the interface.

=head1 METHODS

This class inherits from the C<AI::Categorizer::Learner> class, so all
of its methods are available.

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

}

# java -classpath /Applications/Science/weka-3-2-3/weka.jar weka.classifiers.NaiveBayes -t /tmp/train_file.arff -d /tmp/weka-machine

sub create_model {
  my ($self) = shift;
  my $m = $self->{model} ||= {};
  $m->{all_features} = [ $self->knowledge_set->features->names ];
  $m->{_in_dir} = File::Temp::tempdir( DIR => $self->{tmpdir} );

  # Create a dummy test file $dummy_file in ARFF format (a kludgey WEKA requirement)
  my $dummy_features = $self->create_delayed_object('features');
  $m->{dummy_file} = $self->create_arff_file("dummy", [[$dummy_features, 0]]);

  $self->SUPER::create_model(@_);
}

sub create_boolean_model {
  my ($self, $pos, $neg, $cat) = @_;

  my @docs = (map([$_->features, 1], @$pos),

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

	      '-d', $outfile,
	      '-v',
	      '-p', '0',
	     );
  $self->do_cmd(@args);
  unlink $train_file or warn "Couldn't remove $train_file: $!";

  return \%info;
}

# java -classpath /Applications/Science/weka-3-2-3/weka.jar weka.classifiers.NaiveBayes -l out -T test.arff -p 0

sub get_boolean_score {
  my ($self, $doc, $info) = @_;
  
  # Create document file
  my $doc_file = $self->create_arff_file('doc', [[$doc->features, 0]], $self->{tmpdir});
  my $machine_file = File::Spec->catfile($self->{model}{_in_dir}, $info->{machine_file});

  my @args = ($self->{java_path},
	      @{$self->{java_args}},

t/common.pl  view on Meta::CPAN

use AI::Categorizer::KnowledgeSet;
use AI::Categorizer::Collection::InMemory;

sub have_module {
  my $module = shift;
  return eval "use $module; 1";
}

sub need_module {
  my $module = shift;
  skip_test("$module not installed") unless have_module($module);
}

sub skip_test {
  my $msg = @_ ? shift() : '';
  print "1..0 # Skipped: $msg\n";
  exit;
}

sub training_docs {
  return (
	  doc1 => {categories => ['farming'],
		   content => 'Sheep are very valuable in farming.' },
	  doc2 => {categories => ['farming'],
		   content => 'Farming requires many kinds of animals.' },
	  doc3 => {categories => ['vampire'],
		   content => 'Vampires drink blood and vampires may be staked.' },
	  doc4 => {categories => ['vampire'],
		   content => 'Vampires cannot see their images in mirrors.'},
	 );
}

sub run_test_docs {
  my $l = shift;

  my $doc = new AI::Categorizer::Document
    ( name => 'test1',
      content => 'I would like to begin farming sheep.' );
  my $r = $l->categorize($doc);
  
  print "Categories: ", join(', ', $r->categories), "\n";
  ok($r->best_category, 'farming', "Best category is 'farming'");
  ok $r->in_category('farming'),  1, sprintf("threshold = %s, score = %s", $r->threshold, $r->scores('farming'));
  ok $r->in_category('vampire'), '', sprintf("threshold = %s, score = %s", $r->threshold, $r->scores('vampire'));
  
  ok $r->all_categories, 2, "Should be 2 categories in total";
  
  $doc = new AI::Categorizer::Document
    ( name => 'test2',
      content => "I see that many vampires may have eaten my beautiful daughter's blood." );
  $r = $l->categorize($doc);
  
  print "Categories: ", join(', ', $r->categories), "\n";
  ok($r->best_category, 'vampire', "Best category is 'vampire'");
  ok $r->in_category('farming'), '', sprintf("threshold = %s, score = %s", $r->threshold, $r->scores('farming'));
  ok $r->in_category('vampire'),  1, sprintf("threshold = %s, score = %s", $r->threshold, $r->scores('vampire'));
}

sub set_up_tests {
  my %params = @_;
  my $c = new AI::Categorizer(
			      knowledge_set => AI::Categorizer::KnowledgeSet->new
			      (
			       name => 'Vampires/Farmers',
			       stopwords => [qw(are be in of and)],
			      ),
			      verbose => $ENV{TEST_VERBOSE} ? 1 : 0,
			      %params,
			     );

t/common.pl  view on Meta::CPAN

  while (my ($name, $data) = each %docs) {
    $c->knowledge_set->make_document(name => $name, %$data);
  }

  my $l = $c->learner;
  ok $l;
  
  if ($params{learner_class}) {
    ok ref($l), $params{learner_class}, "Make sure the correct Learner class is instantiated";
  } else {
    ok 1, 1, "Dummy test";
  }

  $l->train;
  return ($l, \%docs);
}

sub perform_standard_tests {
  my ($l, $docs) = set_up_tests(@_);
  
  run_test_docs($l);

  # Make sure we can save state & restore state
  $l->save_state('t/state');
  $l = $l->restore_state('t/state');
  ok $l;

  run_test_docs($l);

  my $train_collection = AI::Categorizer::Collection::InMemory->new(data => $docs);
  ok $train_collection;
  
  my $h = $l->categorize_collection(collection => $train_collection);
  ok $h->micro_precision > 0.5;
}

sub num_setup_tests    () { 3 }
sub num_standard_tests () { num_setup_tests + 17 }

1;



( run in 2.569 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )