AI-Categorizer


Makefile.PL  view on Meta::CPAN

# Note: this file was auto-generated by Module::Build::Compat version 0.03
    
    unless (eval "use Module::Build::Compat 0.02; 1" ) {
      print "This module requires Module::Build to install itself.\n";
      
      require ExtUtils::MakeMaker;
      my $yn = ExtUtils::MakeMaker::prompt
	('  Install Module::Build now from CPAN?', 'y');
      
      unless ($yn =~ /^y/i) {
	die " *** Cannot install without Module::Build.  Exiting ...\n";
      }
      
      require Cwd;

README  view on Meta::CPAN

NAME
    AI::Categorizer - Automatic Text Categorization

SYNOPSIS
     use AI::Categorizer;
     my $c = new AI::Categorizer(...parameters...);
 
     # Run a complete experiment - training on a corpus, testing on a test
     # set, printing a summary of results to STDOUT
     $c->run_experiment;
 
     # Or, run the parts of $c->run_experiment separately
     $c->scan_features;
     $c->read_training_set;
     $c->train;
     $c->evaluate_test_set;
     print $c->stats_table;
 
     # After training, use the Learner for categorization
     my $l = $c->learner;
     while (...) {
       my $d = ...create a document...
       my $hypothesis = $l->categorize($d);  # An AI::Categorizer::Hypothesis object
       print "Assigned categories: ", join ', ', $hypothesis->categories, "\n";
       print "Best category: ", $hypothesis->best_category, "\n";
     }
 
DESCRIPTION
    "AI::Categorizer" is a framework for automatic text categorization. It
    consists of a collection of Perl modules that implement common
    categorization tasks, and a set of defined relationships among those
    modules. The various details are flexible - for example, you can choose what
    categorization algorithm to use, what features (words or otherwise) of the
    documents should be used (or how to automatically choose these features),
    what format the documents are in, and so on.

README  view on Meta::CPAN


        progress_file
            A string that indicates a place where objects will be saved during
            several of the methods of this class. The default value is the
            string "save", which means files like "save-01-knowledge_set" will
            get created. The exact names of these files may change in future
            releases, since they're just used internally to resume where we last
            left off.

        verbose
            If true, a few status messages will be printed during execution.

        training_set
            Specifies the "path" parameter that will be fed to the
            KnowledgeSet's "scan_features()" and "read()" methods during our
            "scan_features()" and "read_training_set()" methods.

        test_set
            Specifies the "path" parameter that will be used when creating a
            Collection during the "evaluate_test_set()" method.
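
    For example, a Categorizer might be constructed with these parameters
    like so (the directory paths here are hypothetical):

        use AI::Categorizer;
        my $c = AI::Categorizer->new
          (training_set  => '/tmp/docs/training',
           test_set      => '/tmp/docs/test',
           progress_file => 'save',   # produces files like save-01-knowledge_set
           verbose       => 1);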

README  view on Meta::CPAN


    knowledge_set()
        Returns the KnowledgeSet object associated with this Categorizer. If
        "read_training_set()" has not yet been called, the KnowledgeSet will not
        yet be populated with any training data.

    run_experiment()
        Runs a complete experiment on the training and testing data, reporting
        the results on "STDOUT". Internally, this is just a shortcut for calling
        the "scan_features()", "read_training_set()", "train()", and
        "evaluate_test_set()" methods, then printing the value of the
        "stats_table()" method.

    scan_features()
        Scans the Collection specified in the "training_set" parameter to determine
        the set of features (words) that will be considered when training the
        Learner. Internally, this calls the "scan_features()" method of the
        KnowledgeSet, then saves a list of the KnowledgeSet's features for later
        use.

        This step is not strictly necessary, but it can dramatically reduce

eg/categorizer  view on Meta::CPAN

}
die $@ if $@;

%$do_stage = map {$_, 1} 1..5 unless keys %$do_stage;

my $out_fh;
if ($outfile) {
  open $out_fh, ">> $outfile" or die "Can't append to $outfile: $!";
  select((select($out_fh), $|=1)[0]);  # turn on autoflush for $out_fh
  if (keys(%$do_stage) > 1) {
    print $out_fh "~~~~~~~~~~~~~~~~", scalar(localtime), "~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
    if ($HAVE_YAML) {
      print {$out_fh} YAML::Dump($c->dump_parameters);
    } else {
      warn "More detailed parameter dumping is available if you install the YAML module from CPAN.\n";
    }
  }
}
  

run_section('scan_features',     1, $do_stage);
run_section('read_training_set', 2, $do_stage);
run_section('train',             3, $do_stage);
run_section('evaluate_test_set', 4, $do_stage);
if ($do_stage->{5}) {
  my $result = $c->stats_table;
  print $result if $c->verbose;
  print $out_fh $result if $out_fh;
}

sub run_section {
  my ($section, $stage, $do_stage) = @_;
  return unless $do_stage->{$stage};
  if (keys %$do_stage > 1) {
    # Run each requested stage in its own child process; intermediate
    # results are saved via the progress files, so each later stage can
    # pick up where the previous one left off.
    print " % $0 @ARGV -$stage\n" if $c->verbose;
    die "$0 is not executable, please change its execution permissions"
      unless -x $0;
    system($0, @ARGV, "-$stage") == 0
      or die "$0 returned nonzero status, \$?=$?";
    return;
  }
  my $start = new Benchmark;
  $c->$section();
  my $end = new Benchmark;
  my $summary = timestr(timediff($end, $start));
  my ($rss, $vsz) = memory_usage();
  print "$summary (memory: rss=$rss, vsz=$vsz)\n" if $c->verbose;
  print $out_fh "Stage $stage: $summary (memory: rss=$rss, vsz=$vsz)\n" if $out_fh;
}

sub parse_command_line {
  my (%opt, %do_stage);

  while (@_) {
    if ($_[0] =~ /^-(\d+)$/) {
      shift;
      $do_stage{$1} = 1;
      

eg/demo.pl  view on Meta::CPAN

# type (any Collection subclass).  Or you could create each Document
# object manually.  Or you could let the KnowledgeSet create the
# Collection objects for you.

$training = AI::Categorizer::Collection::Files->new( path => $training, %params );
$test     = AI::Categorizer::Collection::Files->new( path => $test, %params );

# We turn on verbose mode so you can watch the progress of loading &
# training.  This looks nicer if you have Time::Progress installed!

print "Loading training set\n";
my $k = AI::Categorizer::KnowledgeSet->new( verbose => 1 );
$k->load( collection => $training );

print "Training categorizer\n";
my $l = AI::Categorizer::Learner::NaiveBayes->new( verbose => 1 );
$l->train( knowledge_set => $k );

print "Categorizing test set\n";
my $experiment = $l->categorize_collection( collection => $test );

print $experiment->stats_table;


# If you want to get at the specific assigned categories for a
# specific document, you can do it like this:

my $doc = AI::Categorizer::Document->new
  ( content => "Hello, I am a pretty generic document with not much to say." );

my $h = $l->categorize( $doc );

print ("For test document:\n",
       "  Best category = ", $h->best_category, "\n",
       "  All categories = ", join(', ', $h->categories), "\n");

eg/easy_guesser.pl  view on Meta::CPAN

use Statistics::Contingency;

die "Usage: $0 <cats-file> <training-dir> <test-dir>\n" unless @ARGV == 3;
my ($cats, $training, $test) = @ARGV;

die "$cats isn't a plain file\n" unless -f $cats;
die "$training isn't a directory\n" unless -d $training;
die "$test isn't a directory\n" unless -d $test;

my %cats;
print "Reading category file\n";
open my($fh), $cats or die "Can't read $cats: $!";
while (<$fh>) {
    my ($doc, @cats) = split;
    $cats{$doc} = \@cats;
}

my (%freq, $docs);
print "Scanning training set\n";
opendir my($dh), $training or die "Can't opendir $training: $!";
while (defined(my $file = readdir $dh)) {
    next if $file eq '.' or $file eq '..';
    unless ($cats{$file}) {
	warn "No category information for '$file'";
	next;
    }
    $docs++;
    $freq{$_}++ foreach @{$cats{$file}};
}
closedir $dh;

print "Calculating probabilities (@{[ %freq ]})\n";
$_ /= $docs foreach values %freq;
my @cats = keys %freq;

print "Scoring test documents\n";
my $c = Statistics::Contingency->new(categories => \@cats);
opendir $dh, $test or die "Can't opendir $test: $!";
while (defined(my $file = readdir $dh)) {
    next if $file eq '.' or $file eq '..';
    unless ($cats{$file}) {
	warn "No category information for '$file'";
	next;
    }
    my @assigned;
    foreach (@cats) {
	push @assigned, $_ if rand() < $freq{$_};
    }
    $c->add_result(\@assigned, $cats{$file});
}

print $c->stats_table(4);

lib/AI/Categorizer.pm  view on Meta::CPAN

sub knowledge_set { shift->{knowledge_set} }
sub learner       { shift->{learner} }

# Combines several methods in one sub
sub run_experiment {
  my $self = shift;
  $self->scan_features;
  $self->read_training_set;
  $self->train;
  $self->evaluate_test_set;
  print $self->stats_table;
}

sub scan_features {
  my $self = shift;
  return unless $self->knowledge_set->scan_first;
  $self->knowledge_set->scan_features( path => $self->{training_set} );
  $self->knowledge_set->save_features( "$self->{progress_file}-01-features" );
}

sub read_training_set {

lib/AI/Categorizer.pm  view on Meta::CPAN

=head1 NAME

AI::Categorizer - Automatic Text Categorization

=head1 SYNOPSIS

 use AI::Categorizer;
 my $c = new AI::Categorizer(...parameters...);
 
 # Run a complete experiment - training on a corpus, testing on a test
 # set, printing a summary of results to STDOUT
 $c->run_experiment;
 
 # Or, run the parts of $c->run_experiment separately
 $c->scan_features;
 $c->read_training_set;
 $c->train;
 $c->evaluate_test_set;
 print $c->stats_table;
 
 # After training, use the Learner for categorization
 my $l = $c->learner;
 while (...) {
   my $d = ...create a document...
   my $hypothesis = $l->categorize($d);  # An AI::Categorizer::Hypothesis object
   print "Assigned categories: ", join ', ', $hypothesis->categories, "\n";
   print "Best category: ", $hypothesis->best_category, "\n";
 }
 
=head1 DESCRIPTION

C<AI::Categorizer> is a framework for automatic text categorization.
It consists of a collection of Perl modules that implement common
categorization tasks, and a set of defined relationships among those
modules.  The various details are flexible - for example, you can
choose what categorization algorithm to use, what features (words or
otherwise) of the documents should be used (or how to automatically

lib/AI/Categorizer.pm  view on Meta::CPAN


A string that indicates a place where objects will be saved during
several of the methods of this class.  The default value is the string
C<save>, which means files like C<save-01-knowledge_set> will get
created.  The exact names of these files may change in future
releases, since they're just used internally to resume where we last
left off.

=item verbose

If true, a few status messages will be printed during execution.

=item training_set

Specifies the C<path> parameter that will be fed to the KnowledgeSet's
C<scan_features()> and C<read()> methods during our C<scan_features()>
and C<read_training_set()> methods.

=item test_set

Specifies the C<path> parameter that will be used when creating a

lib/AI/Categorizer.pm  view on Meta::CPAN


Returns the KnowledgeSet object associated with this Categorizer.  If
C<read_training_set()> has not yet been called, the KnowledgeSet will
not yet be populated with any training data.

=item run_experiment()

Runs a complete experiment on the training and testing data, reporting
the results on C<STDOUT>.  Internally, this is just a shortcut for
calling the C<scan_features()>, C<read_training_set()>, C<train()>,
and C<evaluate_test_set()> methods, then printing the value of the
C<stats_table()> method.

=item scan_features()

Scans the Collection specified in the C<training_set> parameter to
determine the set of features (words) that will be considered when
training the Learner.  Internally, this calls the C<scan_features()>
method of the KnowledgeSet, then saves a list of the KnowledgeSet's
features for later use.

lib/AI/Categorizer/Collection.pm  view on Meta::CPAN


=head1 NAME

AI::Categorizer::Collection - Access stored documents

=head1 SYNOPSIS

  my $c = new AI::Categorizer::Collection::Files
    (path => '/tmp/docs/training',
     category_file => '/tmp/docs/cats.txt');
  print "Total number of docs: ", $c->count_documents, "\n";
  while (my $document = $c->next) {
    ...
  }
  $c->rewind; # For further operations
  
=head1 DESCRIPTION

This abstract class implements an iterator for accessing documents in
their natively stored format.  You cannot directly create an instance
of the Collection class, because it is abstract - see the

lib/AI/Categorizer/Collection.pm  view on Meta::CPAN

=item stopword_file

Specifies a file containing a list of "stopwords", which are words
that should automatically be disregarded when scanning/reading
documents.  The file should contain one word per line.  The file will
be parsed and then fed as the C<stopwords> parameter to the
Document C<new()> method.

=item verbose

If true, some status/debugging information will be printed to
C<STDOUT> during operation.

=item document_class

The class indicating what type of Document object should be created.
This generally specifies the format that the documents are stored in.
The default is C<AI::Categorizer::Document::Text>.

=back
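
For example, a concrete Collection subclass might be created with these
parameters like so (the file paths shown are hypothetical):

  my $c = AI::Categorizer::Collection::Files->new
    (path           => '/tmp/docs/training',
     category_file  => '/tmp/docs/cats.txt',
     stopword_file  => '/tmp/docs/stopwords.txt',
     document_class => 'AI::Categorizer::Document::Text',
     verbose        => 1);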

lib/AI/Categorizer/Collection/Files.pm  view on Meta::CPAN


=head1 NAME

AI::Categorizer::Collection::Files - One document per file

=head1 SYNOPSIS

  my $c = new AI::Categorizer::Collection::Files
    (path => '/tmp/docs/training',
     category_file => '/tmp/docs/cats.txt');
  print "Total number of docs: ", $c->count_documents, "\n";
  while (my $document = $c->next) {
    ...
  }
  $c->rewind; # For further operations
  
=head1 DESCRIPTION

This implements a Collection class in which each document exists as a
single file on a filesystem.  The documents can exist in a single
directory, or in several directories.

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

  $self->finish;
  return $self;
}

sub dump_features {
  my ($self, %args) = @_;
  my $path = $args{path} or die "No 'path' argument given to dump_features()";
  open my($fh), "> $path" or die "Can't create $path: $!";
  my $f = $self->features->as_hash;
  while (my ($k, $v) = each %$f) {
    print $fh "$k\t$v\n";
  }
}

1;

__END__

=head1 NAME

AI::Categorizer::Document - Embodies a document

lib/AI/Categorizer/Experiment.pm  view on Meta::CPAN


 use AI::Categorizer::Experiment;
 my $e = new AI::Categorizer::Experiment(categories => \%categories);
 my $l = AI::Categorizer::Learner->restore_state(...path...);
 
 while (my $d = ... get document ...) {
   my $h = $l->categorize($d); # A Hypothesis
   $e->add_hypothesis($h, [map $_->name, $d->categories]);
 }
 
 print "Micro F1: ", $e->micro_F1, "\n"; # Access a single statistic
 print $e->stats_table; # Show several stats in table form

=head1 DESCRIPTION

The C<AI::Categorizer::Experiment> class helps you organize the
results of categorization experiments.  As you get lots of
categorization results (Hypotheses) back from the Learner, you can
feed these results to the Experiment class, along with the correct
answers.  When all results have been collected, you can get a report
on accuracy, precision, recall, F1, and so on, with both
macro-averaging and micro-averaging over categories.
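
As a rough illustration of the difference (using made-up counts that
are not part of this class's API): macro-averaging averages a statistic
computed separately for each category, while micro-averaging pools the
per-category counts before computing the statistic once.

 # tp = true positives, fp = false positives assigned to each category
 my %counts = (sports   => {tp => 8, fp => 2},
               politics => {tp => 1, fp => 3});
 
 # Macro-averaged precision: average the per-category precisions
 my @p = map { $counts{$_}{tp} / ($counts{$_}{tp} + $counts{$_}{fp}) } keys %counts;
 my $macro_p = 0;
 $macro_p += $_ / @p for @p;          # (0.8 + 0.25) / 2 = 0.525
 
 # Micro-averaged precision: pool the counts first, then compute once
 my ($tp, $fp) = (0, 0);
 foreach my $cat (keys %counts) {
   $tp += $counts{$cat}{tp};
   $fp += $counts{$cat}{fp};
 }
 my $micro_p = $tp / ($tp + $fp);     # 9/14, about 0.64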

lib/AI/Categorizer/Experiment.pm  view on Meta::CPAN

=item new( categories => \%categories )

=item new( categories => \@categories, verbose => 1, sig_figs => 2 )

Returns a new Experiment object.  A required C<categories> parameter
specifies the names of all categories in the data set.  The category
names may be specified either as the keys of a hash reference, or as
the entries of an array reference.

The C<new()> method also accepts a C<verbose> parameter; when it is set
to a true value, some status/debugging information will be printed to
C<STDOUT>.

A C<sig_figs> parameter indicates the number of significant figures
that should be used when showing the results in the C<stats_table()>
method.  It does not affect other methods like C<micro_precision()>.
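
For example (the category names here are purely illustrative):

 my $e = AI::Categorizer::Experiment->new
   (categories => [qw(sports politics)],  # or a hashref whose keys are the names
    verbose    => 1,
    sig_figs   => 3);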

=item add_result($assigned, $correct, $name)

Adds a new result to the experiment.  Please see the
C<Statistics::Contingency> documentation for a description of this

lib/AI/Categorizer/FeatureSelector.pm  view on Meta::CPAN

  # features to use.

  my ($self, $f, %args) = @_;
  my $kept = defined $args{features_kept} ? $args{features_kept} : $self->{features_kept};
  return $f unless $kept;

  # A features_kept value below 1 is treated as a fraction of the total
  # feature count; a value of 1 or more is an absolute number of features.
  my $num_kept = ($kept < 1 ?
                  $f->length * $kept :
                  $kept);

  print "Trimming features - # features = " . $f->length . "\n" if $self->verbose;
  
  # This is algorithmic overkill, but the sort seems fast enough.  Will revisit later.
  my $features = $f->as_hash;
  my @new_features = (sort {$features->{$b} <=> $features->{$a}} keys %$features)
                      [0 .. $num_kept-1];

  my $result = $f->intersection( \@new_features );
  print "Finished trimming features - # features = " . $result->length . "\n" if $self->verbose;
  return $result;
}

# Abstract methods
sub rank_features;
sub scan_features;

sub select_features {
  my ($self, %args) = @_;
  

lib/AI/Categorizer/FeatureSelector/CategorySelector.pm  view on Meta::CPAN

    my $docfeatures = $doc->features->as_hash;
    foreach my $cat ($doc->categories) {
      my $catname = $cat->name;
      if(!(exists $cat_features{$catname})) {
        $cat_features{$catname} = $self->create_delayed_object('features');
      }
      $cat_features{$catname}->add($docfeatures);
    }
    $coll_features->add( $docfeatures );
  }
  print STDERR "\n* Computing Chi-Square values\n" if $self->verbose;

  my $r_features = $self->create_delayed_object('features');
  my @terms = $coll_features->names;
  my $progressBar = $self->prog_bar(scalar @terms);
  my $allFeaturesSum = $coll_features->sum;
  my %cat_features_sum;
  while( my($catname,$features) = each %cat_features ) {
    $cat_features_sum{$catname} = $features->sum;
  }

  foreach my $term (@terms) {
    $progressBar->();
    $r_features->{features}{$term} = $self->reduction_function($term,
      $nbDocuments,$allFeaturesSum,$coll_features,
      \%cat_features,\%cat_features_sum);
  }
  print STDERR "\n" if $self->verbose;
  my $new_features = $self->reduce_features($r_features);
  return $coll_features->intersection( $new_features );
}


# calculate feature set after reading collection (scan_first=0)

sub rank_features {
  die "CategorySelector->rank_features is not implemented yet!";
#  my ($self, %args) = @_;

lib/AI/Categorizer/FeatureSelector/CategorySelector.pm  view on Meta::CPAN

#  return $self->create_delayed_object('features', features => \%freq_counts);
}


# copied from KnowledgeSet->prog_bar by Ken Williams

sub prog_bar {
  my ($self, $count) = @_;

  return sub {} unless $self->verbose;
  return sub { print STDERR '.' } unless eval "use Time::Progress; 1";

  my $pb = 'Time::Progress'->new;
  $pb->attr(max => $count);
  my $i = 0;
  return sub {
    $i++;
    return if $i % 25;
    print STDERR $pb->report("%50b %p ($i/$count)\r", $i);
  };
}


__END__

=head1 NAME

AI::Categorizer::CategorySelector - Abstract Category Selection class

lib/AI/Categorizer/FeatureSelector/DocFrequency.pm  view on Meta::CPAN

sub scan_features {
  my ($self, %args) = @_;
  my $c = $args{collection} or die "No 'collection' parameter provided to scan_features()";

  my $doc_freq = $self->create_delayed_object('features');
  
  while (my $doc = $c->next) {
    $args{prog_bar}->() if $args{prog_bar};
    $doc_freq->add( $doc->features->as_boolean_hash );
  }
  print "\n" if $self->verbose;
  
  return $self->reduce_features($doc_freq);
}

1;

__END__

=head1 NAME

lib/AI/Categorizer/Hypothesis.pm  view on Meta::CPAN

AI::Categorizer::Hypothesis - Embodies a set of category assignments

=head1 SYNOPSIS

 use AI::Categorizer::Hypothesis;
 
 # Hypotheses are usually created by the Learner's categorize() method.
 # (assume here that $learner and $document have been created elsewhere)
 my $h = $learner->categorize($document);
 
 print "Assigned categories: ", join ', ', $h->categories, "\n";
 print "Best category: ", $h->best_category, "\n";
 print "Assigned scores: ", join ', ', $h->scores( $h->categories ), "\n";
 print "Chosen from: ", join ', ', $h->all_categories, "\n";
 print +($h->in_category('geometry') ? '' : 'not '), "assigned to geometry\n";

=head1 DESCRIPTION

A Hypothesis embodies a set of category assignments that a categorizer
makes about a single document.  Because one may be interested in
knowing different kinds of things about the assignments (for instance,
what categories were assigned, which category had the highest score,
whether a particular category was assigned), we provide a simple class
to facilitate answering these questions.

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

  foreach my $doc ($self->documents) {
    $doc->features( $doc->features->intersection($self->features) );
  }
}


sub prog_bar {
  my ($self, $collection) = @_;

  return sub {} unless $self->verbose;
  return sub { print STDERR '.' } unless eval "use Time::Progress; 1";

  my $count = $collection->can('count_documents') ? $collection->count_documents : 0;
  
  my $pb = 'Time::Progress'->new;
  $pb->attr(max => $count);
  my $i = 0;
  return sub {
    $i++;
    return if $i % 25;
    print STDERR $pb->report("%50b %p ($i/$count)\r", $i);
  };
}

# A little utility method for several other methods like scan_stats(),
# load(), read(), etc.
sub _make_collection {
  my ($self, $args) = @_;
  return $args->{collection} || $self->create_delayed_object('collection', %$args);
}

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

    $stats{token_count} += $sum;
    $stats{type_count}  += $length;
    
    foreach my $cat ($doc->categories) {
#warn $doc->name, ": ", $cat->name, "\n";
      $stats{categories}{$cat->name}{document_count}++;
      $stats{categories}{$cat->name}{token_count} += $sum;
      $stats{categories}{$cat->name}{type_count}  += $length;
    }
  }
  print "\n" if $self->verbose;

  my @cats = keys %{ $stats{categories} };

  $stats{category_count}          = @cats;
  $stats{categories_per_document} = $stats{category_count_with_duplicates} / $stats{document_count};
  $stats{tokens_per_document}     = $stats{token_count} / $stats{document_count};
  $stats{types_per_document}      = $stats{type_count}  / $stats{document_count};

  foreach my $thing ('type', 'token', 'document') {
    $stats{"${thing}s_per_category"} = AI::Categorizer::Util::average

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN


sub read {
  my ($self, %args) = @_;
  my $collection = $self->_make_collection(\%args);
  my $pb = $self->prog_bar($collection);
  
  while (my $doc = $collection->next) {
    $pb->();
    $self->add_document($doc);
  }
  print "\n" if $self->verbose;
}

sub finish {
  my $self = shift;
  return if $self->{finished}++;
  $self->weigh_features;
}

sub weigh_features {
  # This could be made more efficient by figuring out an execution

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

}

sub save_features {
  my ($self, $file) = @_;
  
  my $f = ($self->{features} || { $self->delayed_object_params('document') }->{use_features})
    or croak "No features to save";
  
  open my($fh), "> $file" or croak "Can't create $file: $!";
  my $h = $f->as_hash;
  print $fh "# Total: ", $f->length, "\n";
  
  foreach my $k (sort {$h->{$b} <=> $h->{$a}} keys %$h) {
    print $fh "$k\t$h->{$k}\n";
  }
  close $fh;
}

sub restore_features {
  my ($self, $file, $n) = @_;
  
  open my($fh), "< $file" or croak "Can't open $file: $!";

  my %hash;

lib/AI/Categorizer/Learner.pm  view on Meta::CPAN

  $self->{knowledge_set}->finish;
  $self->create_model;    # Creates $self->{model}
  $self->delayed_object_params('hypothesis',
			       all_categories => [map $_->name, $self->categories],
			      );
}

sub prog_bar {
  my ($self, $count) = @_;
  
  return sub { print STDERR '.' } unless eval "use Time::Progress; 1";
  
  my $pb = 'Time::Progress'->new;
  $pb->attr(max => $count);
  my $i = 0;
  return sub {
    $i++;
    return if $i % 25;
    my $string = '';
    if (@_) {
      my $e = shift;
      $string = sprintf " (maF1=%.03f, miF1=%.03f)", $e->macro_F1, $e->micro_F1;
    }
    print STDERR $pb->report("%50b %p ($i/$count)$string\r", $i);
    return $i;
  };
}

sub categorize_collection {
  my ($self, %args) = @_;
  my $c = $args{collection} or die "No collection provided";

  my @all_cats = map $_->name, $self->categories;
  my $experiment = $self->create_delayed_object('experiment', categories => \@all_cats);
  my $pb = $self->verbose ? $self->prog_bar($c->count_documents) : sub {};
  while (my $d = $c->next) {
    my $h = $self->categorize($d);
    $experiment->add_hypothesis($h, [map $_->name, $d->categories]);
    $pb->($experiment);
    if ($self->verbose > 1) {
      printf STDERR ("%s: assigned=(%s) correct=(%s)\n",
		     $d->name,
		     join(', ', $h->categories),
		     join(', ', map $_->name, $d->categories));
    }
  }
  print STDERR "\n" if $self->verbose;

  return $experiment;
}

sub categorize {
  my ($self, $doc) = @_;
  
  my ($scores, $threshold) = $self->get_scores($doc);
  
  if ($self->verbose > 2) {
    warn "scores: @{[ %$scores ]}" if $self->verbose > 3;
    
    foreach my $key (sort {$scores->{$b} <=> $scores->{$a}} keys %$scores) {
      print "$key: $scores->{$key}\n";
    }
  }
  
  return $self->create_delayed_object('hypothesis',
                                      scores => $scores,
                                      threshold => $threshold,
                                      document_name => $doc->name,
                                     );
}
1;

lib/AI/Categorizer/Learner.pm  view on Meta::CPAN

 my $nb = new AI::Categorizer::Learner::NaiveBayes(...parameters...);
 $nb->train(knowledge_set => $k);
 $nb->save_state('filename');
 
 ... time passes ...
 
 $nb = AI::Categorizer::Learner::NaiveBayes->restore_state('filename');
 my $c = new AI::Categorizer::Collection::Files( path => ... );
 while (my $document = $c->next) {
   my $hypothesis = $nb->categorize($document);
   print "Best assigned category: ", $hypothesis->best_category, "\n";
   print "All assigned categories: ", join(', ', $hypothesis->categories), "\n";
 }

=head1 DESCRIPTION

The C<AI::Categorizer::Learner> class is an abstract class that will
never actually be directly used in your code.  Instead, you will use a
subclass like C<AI::Categorizer::Learner::NaiveBayes> which implements
an actual machine learning algorithm.

The general description of the Learner interface is documented here.

lib/AI/Categorizer/Learner/DecisionTree.pm  view on Meta::CPAN

	$t->add_instance( attributes => $doc->features->as_boolean_hash,
			  result => $results{$doc->name},
			  name => $doc->name,
			);
      }
    }
    $t->purge(0);
    $self->{model}{first_tree} = $t;
  }

  print STDERR "\nBuilding tree for category '", $cat->name, "'" if $self->verbose;
  $t->train;
  return $t;
}

sub get_scores {
  my ($self, $doc) = @_;
  local $self->{current_doc} = $doc->features->as_boolean_hash;
  return $self->SUPER::get_scores($doc);
}

lib/AI/Categorizer/Learner/DecisionTree.pm  view on Meta::CPAN

  
  my $l = new AI::Categorizer::Learner::DecisionTree(...parameters...);
  $l->train(knowledge_set => $k);
  $l->save_state('filename');
  
  ... time passes ...
  
  $l = AI::Categorizer::Learner->restore_state('filename');
  while (my $document = ... ) {  # An AI::Categorizer::Document object
    my $hypothesis = $l->categorize($document);
    print "Best assigned category: ", $hypothesis->best_category, "\n";
  }

=head1 DESCRIPTION

This class implements a Decision Tree machine learner, using
C<AI::DecisionTree> to do the internal work.

=head1 METHODS

This class inherits from the C<AI::Categorizer::Learner> class, so all

lib/AI/Categorizer/Learner/Guesser.pm  view on Meta::CPAN

  my $l = new AI::Categorizer::Learner::Guesser;
  $l->train(knowledge_set => $k);
  $l->save_state('filename');
  
  ... time passes ...
  
  $l = AI::Categorizer::Learner->restore_state('filename');
  my $c = new AI::Categorizer::Collection::Files( path => ... );
  while (my $document = $c->next) {
    my $hypothesis = $l->categorize($document);
    print "Best assigned category: ", $hypothesis->best_category, "\n";
    print "All assigned categories: ", join(', ', $hypothesis->categories), "\n";
  }

=head1 DESCRIPTION

This implements a simple category guesser that makes assignments based
solely on the prior probabilities of categories.  For instance, if 5%
of the training documents belong to a certain category, then the
probability of any test document being assigned to that category is
0.05.  This can be useful for providing baseline scores to compare
with other more sophisticated algorithms.
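
For instance, the core of the guessing strategy can be sketched as
follows, where C<%prior> is a hypothetical hash mapping each category
name to its frequency among the training documents:

  my @assigned;
  foreach my $cat (keys %prior) {
    # Assign the category with probability equal to its training frequency
    push @assigned, $cat if rand() < $prior{$cat};
  }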

lib/AI/Categorizer/Learner/KNN.pm  view on Meta::CPAN

  my $f_class = $self->knowledge_set->contained_class('features');
  if ($f_class->can('all_features')) {
    $f_class->all_features([$self->knowledge_set->features->names]);
  }
  $self->SUPER::categorize_collection(@_);
}

sub get_scores {
  my ($self, $newdoc) = @_;
  my $currentDocName = $newdoc->name;
  #print "classifying $currentDocName\n";

  my $features = $newdoc->features->intersection($self->knowledge_set->features)->normalize;
  my $q = AI::Categorizer::Learner::KNN::Queue->new(size => $self->{k_value});

  my @docset;
  if ($self->{max_instances}) {
    # Use (approximately) max_instances documents, chosen randomly from corpus
    my $probability = $self->{max_instances} / $self->knowledge_set->documents;
    @docset = grep {rand() < $probability} $self->knowledge_set->documents;
  } else {

lib/AI/Categorizer/Learner/KNN.pm  view on Meta::CPAN

  my $l = new AI::Categorizer::Learner::KNN(...parameters...);
  $l->train(knowledge_set => $k);
  $l->save_state('filename');
  
  ... time passes ...
  
  $l = AI::Categorizer::Learner->restore_state('filename');
  my $c = new AI::Categorizer::Collection::Files( path => ... );
  while (my $document = $c->next) {
    my $hypothesis = $l->categorize($document);
    print "Best assigned category: ", $hypothesis->best_category, "\n";
    print "All assigned categories: ", join(', ', $hypothesis->categories), "\n";
  }

=head1 DESCRIPTION

This is an implementation of the k-Nearest-Neighbor decision-making
algorithm, applied to the task of document categorization (as defined
by the AI::Categorizer module).  See L<AI::Categorizer> for a complete
description of the interface.

=head1 METHODS

lib/AI/Categorizer/Learner/NaiveBayes.pm  view on Meta::CPAN

  my $nb = new AI::Categorizer::Learner::NaiveBayes(...parameters...);
  $nb->train(knowledge_set => $k);
  $nb->save_state('filename');
  
  ... time passes ...
  
  $nb = AI::Categorizer::Learner::NaiveBayes->restore_state('filename');
  my $c = new AI::Categorizer::Collection::Files( path => ... );
  while (my $document = $c->next) {
    my $hypothesis = $nb->categorize($document);
    print "Best assigned category: ", $hypothesis->best_category, "\n";
    print "All assigned categories: ", join(', ', $hypothesis->categories), "\n";
  }

=head1 DESCRIPTION

This is an implementation of the Naive Bayes decision-making
algorithm, applied to the task of document categorization (as defined
by the AI::Categorizer module).  See L<AI::Categorizer> for a complete
description of the interface.

This module is now a wrapper around the stand-alone

lib/AI/Categorizer/Learner/SVM.pm  view on Meta::CPAN

  
  my $l = new AI::Categorizer::Learner::SVM(...parameters...);
  $l->train(knowledge_set => $k);
  $l->save_state('filename');
  
  ... time passes ...
  
  $l = AI::Categorizer::Learner->restore_state('filename');
  while (my $document = ... ) {  # An AI::Categorizer::Document object
    my $hypothesis = $l->categorize($document);
    print "Best assigned category: ", $hypothesis->best_category, "\n";
  }

=head1 DESCRIPTION

This class implements a Support Vector Machine learner, using
Cory Spencer's C<Algorithm::SVM> module.  In much of the recent
academic literature, SVMs perform very well on text categorization
tasks.

=head1 METHODS

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

    foreach my $line (@output) {
      next unless $line =~ /\S/;
      
      # 0 large.elem 0.4515551620220952 numberth.high
      unless ( $line =~ /^([\d.]+)\s+(\S+)\s+([\d.]+)\s+(\S+)/ ) {
	warn "Can't parse line $line";
	next;
      }
      my ($index, $predicted, $score) = ($1, $2, $3);
      $assigned[$index]{$cat} = $score if $predicted;  # Not sure what weka's scores represent
      print STDERR "$index: assigned=($predicted) correct=(", $alldocs[$index]->is_in_category($cat) ? 1 : 0, ")\n"
	if $self->verbose;
    }
  }

  my $experiment = $self->create_delayed_object('experiment', categories => [map $_->name, $self->categories]);
  foreach my $i (0..$#alldocs) {
    $experiment->add_result([keys %{$assigned[$i]}], [map $_->name, $alldocs[$i]->categories], $alldocs[$i]->name);
  }

  return $experiment;
}


sub do_cmd {
  my ($self, @cmd) = @_;
  print STDERR " % @cmd\n" if $self->verbose;
  
  my @output;
  local *KID_TO_READ;
  my $pid = open(KID_TO_READ, "-|");
  
  if ($pid) {   # parent
    @output = <KID_TO_READ>;
    close(KID_TO_READ) or warn "@cmd exited $?";
    
  } else {      # child

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN


sub create_arff_file {
  my ($self, $name, $docs, $dir) = @_;
  $dir = $self->{model}{_in_dir} unless defined $dir;

  my ($fh, $filename) = File::Temp::tempfile(
					     $name . "_XXXX",  # Template
					     DIR    => $dir,
					     SUFFIX => '.arff',
					    );
  print $fh "\@RELATION foo\n\n";
  
  my $feature_names = $self->{model}{all_features};
  foreach my $name (@$feature_names) {
    print $fh "\@ATTRIBUTE feature-$name REAL\n";
  }
  print $fh "\@ATTRIBUTE category {1, 0}\n\n";
  
  my %feature_indices = map {$feature_names->[$_], $_} 0..$#{$feature_names};
  my $last_index = keys %feature_indices;
  
  # We use the 'sparse' format, see http://www.cs.waikato.ac.nz/~ml/weka/arff.html
  # Each data row looks like {<feature index> <value>, ..., <last index> '<category>'}
  
  print $fh "\@DATA\n";
  foreach my $doc (@$docs) {
    my ($features, $cat) = @$doc;
    my $f = $features->as_hash;
    my @ordered_keys = (sort {$feature_indices{$a} <=> $feature_indices{$b}} 
			grep {exists $feature_indices{$_}}
			keys %$f);

    print $fh ("{",
	       join(', ', map("$feature_indices{$_} $f->{$_}", @ordered_keys), "$last_index '$cat'"),
	       "}\n"
	      );
  }
  
  return $filename;
}

sub save_state {
  my ($self, $path) = @_;

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

  my $nb = new AI::Categorizer::Learner::Weka(...parameters...);
  $nb->train(knowledge_set => $k);
  $nb->save_state('filename');
  
  ... time passes ...
  
  $nb = AI::Categorizer::Learner->restore_state('filename');
  my $c = new AI::Categorizer::Collection::Files( path => ... );
  while (my $document = $c->next) {
    my $hypothesis = $nb->categorize($document);
    print "Best assigned category: ", $hypothesis->best_category, "\n";
  }

=head1 DESCRIPTION

This class doesn't implement any machine learners of its own; it
merely passes the data through to the Weka machine learning system
(http://www.cs.waikato.ac.nz/~ml/weka/).  This can give you access to
a collection of machine learning algorithms not otherwise implemented
in C<AI::Categorizer>.

lib/AI/Categorizer/Util.pm  view on Meta::CPAN

package AI::Categorizer::Util;

use Exporter;
use base qw(Exporter);
@EXPORT_OK = qw(intersection average max min random_elements binary_search);

use strict;

# It's possible that this can be a class - something like 
# 
# $e = Evaluate->new(); $e->correct([...]); $e->assigned([...]); print $e->precision;

# A simple binary search
sub binary_search {
  my ($arr, $target) = @_;
  my ($low, $high) = (0, scalar @$arr);
  use integer;
  while ( $low < $high ) {
    my $cur = ($low + $high)/2;
    if ( $arr->[$cur] < $target ) {
      $low = $cur + 1;

t/13-document.t  view on Meta::CPAN

     stopwords => ['stemmed'],
     stemming => 'porter',
     content  => 'stopword processing should happen after stemming',
     # Becomes qw(stopword process    should happen after stem    )
    );
  ok $d->stopword_behavior, 'stem', "stopword_behavior() is 'stem'";
  
  ok $d->features->includes('stopword'), 1,  "Should include 'stopword'";
  ok $d->features->includes('stemming'), '', "Shouldn't include 'stemming'";
  ok $d->features->includes('stem'),     '', "Shouldn't include 'stem'";
  print "Features: @{[ $d->features->names ]}\n";
}

{
  my $d = $docclass->new
    (
     name => 'test',
     stopwords => ['stemmed'],
     stemming => 'porter',
     stopword_behavior => 'no_stem',
     content  => 'stopword processing should happen after stemming',
     # Becomes qw(stopword process    should happen after stem    )
    );
  ok $d->stopword_behavior, 'no_stem', "stopword_behavior() is 'no_stem'";
  
  ok $d->features->includes('stopword'), 1,  "Should include 'stopword'";
  ok $d->features->includes('stemming'), '', "Shouldn't include 'stemming'";
  ok $d->features->includes('stem'),     1,  "Should include 'stem'";
  print "Features: @{[ $d->features->names ]}\n";
}

{
  my $d = $docclass->new
    (
     name => 'test',
     stopwords => ['stem'],
     stemming => 'porter',
     stopword_behavior => 'pre_stemmed',
     content  => 'stopword processing should happen after stemming',
     # Becomes qw(stopword process    should happen after stem    )
    );
  ok $d->stopword_behavior, 'pre_stemmed', "stopword_behavior() is 'pre_stemmed'";
  
  ok $d->features->includes('stopword'), 1,  "Should include 'stopword'";
  ok $d->features->includes('stemming'), '', "Shouldn't include 'stemming'";
  ok $d->features->includes('stem'),     '', "Shouldn't include 'stem'";
  print "Features: @{[ $d->features->names ]}\n";
}

t/common.pl  view on Meta::CPAN

  return eval "use $module; 1";
}

sub need_module {
  my $module = shift;
  skip_test("$module not installed") unless have_module($module);
}

sub skip_test {
  my $msg = @_ ? shift() : '';
  print "1..0 # Skipped: $msg\n";
  exit;
}

sub training_docs {
  return (
	  doc1 => {categories => ['farming'],
		   content => 'Sheep are very valuable in farming.' },
	  doc2 => {categories => ['farming'],
		   content => 'Farming requires many kinds of animals.' },
	  doc3 => {categories => ['vampire'],

t/common.pl  view on Meta::CPAN

}

sub run_test_docs {
  my $l = shift;

  my $doc = new AI::Categorizer::Document
    ( name => 'test1',
      content => 'I would like to begin farming sheep.' );
  my $r = $l->categorize($doc);
  
  print "Categories: ", join(', ', $r->categories), "\n";
  ok($r->best_category, 'farming', "Best category is 'farming'");
  ok $r->in_category('farming'),  1, sprintf("threshold = %s, score = %s", $r->threshold, $r->scores('farming'));
  ok $r->in_category('vampire'), '', sprintf("threshold = %s, score = %s", $r->threshold, $r->scores('vampire'));
  
  ok $r->all_categories, 2, "Should be 2 categories in total";
  
  $doc = new AI::Categorizer::Document
    ( name => 'test2',
      content => "I see that many vampires may have eaten my beautiful daughter's blood." );
  $r = $l->categorize($doc);
  
  print "Categories: ", join(', ', $r->categories), "\n";
  ok($r->best_category, 'vampire', "Best category is 'vampire'");
  ok $r->in_category('farming'), '', sprintf("threshold = %s, score = %s", $r->threshold, $r->scores('farming'));
  ok $r->in_category('vampire'),  1, sprintf("threshold = %s, score = %s", $r->threshold, $r->scores('vampire'));
}

sub set_up_tests {
  my %params = @_;
  my $c = new AI::Categorizer(
			      knowledge_set => AI::Categorizer::KnowledgeSet->new
			      (
			       name => 'Vampires/Farmers',
			       stopwords => [qw(are be in of and)],
			      ),


