AI-Categorizer

 view release on metacpan or  search on metacpan

eg/categorizer  view on Meta::CPAN

run_section('scan_features',     1, $do_stage);
run_section('read_training_set', 2, $do_stage);
run_section('train',             3, $do_stage);
run_section('evaluate_test_set', 4, $do_stage);
if ($do_stage->{5}) {
  my $result = $c->stats_table;
  print $result if $c->verbose;
  print $out_fh $result if $out_fh;
}

sub run_section {
  my ($section, $stage, $do_stage) = @_;
  return unless $do_stage->{$stage};
  if (keys %$do_stage > 1) {
    print " % $0 @ARGV -$stage\n" if $c->verbose;
    die "$0 is not executable, please change its execution permissions"
      unless -x $0;
    system($0, @ARGV, "-$stage") == 0
      or die "$0 returned nonzero status, \$?=$?";
    return;
  }
  my $start = new Benchmark;
  $c->$section();
  my $end = new Benchmark;
  my $summary = timestr(timediff($end, $start));
  my ($rss, $vsz) = memory_usage();
  print "$summary (memory: rss=$rss, vsz=$vsz)\n" if $c->verbose;
  print $out_fh "Stage $stage: $summary (memory: rss=$rss, vsz=$vsz)\n" if $out_fh;
}

sub parse_command_line {
  my (%opt, %do_stage);

  while (@_) {
    if ($_[0] =~ /^-(\d+)$/) {
      shift;
      $do_stage{$1} = 1;
      
    } elsif ( $_[0] eq '--config_file' ) {
      die "--config_file requires the YAML module from CPAN to be installed.\n" unless $HAVE_YAML;
      shift;

eg/categorizer  view on Meta::CPAN

  }

  my $outfile;
  unless ($outfile = delete $opt{outfile}) {
    $outfile = $opt{progress_file} ? "$opt{progress_file}-results.txt" : "results.txt";
  }

  return (\%opt, \%do_stage, $outfile);
}

sub usage {
  return <<EOF;
 Usage:

  $0 --parameter_1 <value_1> --parameter_2 <value_2>
      # You may specify a YAML config file as follows:
  $0 --config_file <path> --parameter_3 <value_3>
      # Or, to run only step 3 (of 5)
  $0 --config_file <path> -3

 --parameter_1, --parameter_2, etc. are parameters accepted by
 AI::Categorizer objects' new() methods.

EOF
}

sub memory_usage {
  my ($rss, $vsz);
  if ($^O eq 'darwin' or $^O eq 'linux') {
    ($rss, $vsz) = `ps -eo rss,vsz -p $$` =~ /(\d+)\s+(\d+)/;
  } elsif ($^O eq 'solaris') {
    ($rss, $vsz) = `ps -o rss,vsz -p $$` =~ /(\d+)\s+(\d+)/;
  } else {
    warn "Unknown system, can't get memory usage";
  }
  return ($rss, $vsz);
}

lib/AI/Categorizer.pm  view on Meta::CPAN

__PACKAGE__->contained_objects
  (
   knowledge_set => { class => 'AI::Categorizer::KnowledgeSet' },
   learner       => { class => 'AI::Categorizer::Learner::NaiveBayes' },
   experiment    => { class => 'AI::Categorizer::Experiment',
		      delayed => 1 },
   collection    => { class => 'AI::Categorizer::Collection::Files',
		      delayed => 1 },
  );

sub new {
  my $package = shift;
  my %args = @_;
  my %defaults;
  if (exists $args{data_root}) {
    $defaults{training_set} = File::Spec->catfile($args{data_root}, 'training');
    $defaults{test_set} = File::Spec->catfile($args{data_root}, 'test');
    $defaults{category_file} = File::Spec->catfile($args{data_root}, 'cats.txt');
    delete $args{data_root};
  }

  return $package->SUPER::new(%defaults, %args);
}

#sub dump_parameters {
#  my $p = shift()->SUPER::dump_parameters;
#  delete $p->{stopwords} if $p->{stopword_file};
#  return $p;
#}

sub knowledge_set { shift->{knowledge_set} }
sub learner       { shift->{learner} }

# Combines several methods in one sub
sub run_experiment {
  my $self = shift;
  $self->scan_features;
  $self->read_training_set;
  $self->train;
  $self->evaluate_test_set;
  print $self->stats_table;
}

sub scan_features {
  my $self = shift;
  return unless $self->knowledge_set->scan_first;
  $self->knowledge_set->scan_features( path => $self->{training_set} );
  $self->knowledge_set->save_features( "$self->{progress_file}-01-features" );
}

sub read_training_set {
  my $self = shift;
  $self->knowledge_set->restore_features( "$self->{progress_file}-01-features" )
    if -e "$self->{progress_file}-01-features";
  $self->knowledge_set->read( path => $self->{training_set} );
  $self->_save_progress( '02', 'knowledge_set' );
  return $self->knowledge_set;
}

sub train {
  my $self = shift;
  $self->_load_progress( '02', 'knowledge_set' );
  $self->learner->train( knowledge_set => $self->{knowledge_set} );
  $self->_save_progress( '03', 'learner' );
  return $self->learner;
}

sub evaluate_test_set {
  my $self = shift;
  $self->_load_progress( '03', 'learner' );
  my $c = $self->create_delayed_object('collection', path => $self->{test_set} );
  $self->{experiment} = $self->learner->categorize_collection( collection => $c );
  $self->_save_progress( '04', 'experiment' );
  return $self->{experiment};
}

sub stats_table {
  my $self = shift;
  $self->_load_progress( '04', 'experiment' );
  return $self->{experiment}->stats_table;
}

sub progress_file {
  shift->{progress_file};
}

sub verbose {
  shift->{verbose};
}

sub _save_progress {
  my ($self, $stage, $node) = @_;
  return unless $self->{progress_file};
  my $file = "$self->{progress_file}-$stage-$node";
  warn "Saving to $file\n" if $self->{verbose};
  $self->{$node}->save_state($file);
}

sub _load_progress {
  my ($self, $stage, $node) = @_;
  return unless $self->{progress_file};
  my $file = "$self->{progress_file}-$stage-$node";
  warn "Loading $file\n" if $self->{verbose};
  $self->{$node} = $self->contained_class($node)->restore_state($file);
}

1;
__END__

lib/AI/Categorizer/Category.pm  view on Meta::CPAN

use Params::Validate qw(:types);
use AI::Categorizer::FeatureVector;

__PACKAGE__->valid_params
  (
   name => {type => SCALAR, public => 0},
   documents  => {
		  type => ARRAYREF,
		  default => [],
		  callbacks => { 'all are Document objects' => 
				 sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Document'), @_ },
			       },
		  public => 0,
		 },
  );

__PACKAGE__->contained_objects
  (
   features => {
		class => 'AI::Categorizer::FeatureVector',
		delayed => 1,
	       },
  );

my %REGISTRY = ();

sub new {
  my $self = shift()->SUPER::new(@_);
  $self->{documents} = new AI::Categorizer::ObjectSet( @{$self->{documents}} );
  $REGISTRY{$self->{name}} = $self;
  return $self;
}

sub by_name {
  my ($class, %args) = @_;
  return $REGISTRY{$args{name}} if exists $REGISTRY{$args{name}};
  return $class->new(%args);
}

sub name { $_[0]->{name} }

sub documents {
  my $d = $_[0]->{documents};
  return wantarray ? $d->members : $d->size;
}

sub contains_document {
  return $_[0]->{documents}->includes( $_[1] );
}

sub add_document {
  my $self = shift;
  $self->{documents}->insert( $_[0] );
  delete $self->{features};  # Could be more efficient?
}

sub features {
  my $self = shift;

  if (@_) {
    $self->{features} = shift;
  }
  return $self->{features} if $self->{features};

  my $v = $self->create_delayed_object('features');
  return $self->{features} = $v unless $self->documents;

lib/AI/Categorizer/Collection.pm  view on Meta::CPAN

   category_hash => { type => HASHREF, default => {} },
   category_file => { type => SCALAR, optional => 1 },
  );

__PACKAGE__->contained_objects
  (
   document => { class => 'AI::Categorizer::Document::Text',
		 delayed => 1 },
  );

sub new {
  my ($class, %args) = @_;
  
  # Optimize so every document doesn't have to convert the stopword list to a hash
  if ($args{stopwords} and UNIVERSAL::isa($args{stopwords}, 'ARRAY')) {
    $args{stopwords} = { map {+$_ => 1} @{ $args{stopwords} } };
  }
  
  my $self = $class->SUPER::new(%args);

  if ($self->{category_file}) {

lib/AI/Categorizer/Collection.pm  view on Meta::CPAN

    close FH;

    $self->delayed_object_params('document', stopwords => \%stopwords);
  }

  return $self;
}

# This should usually be replaced in subclasses with a faster version that doesn't
# need to create actual documents each time through
sub count_documents {
  my $self = shift;
  return $self->{document_count} if exists $self->{document_count};

  $self->rewind;
  my $count = 0;
  $count++ while $self->next;
  $self->rewind;

  return $self->{document_count} = $count;
}

# Abstract methods
sub next;
sub rewind;

1;
__END__

=head1 NAME

AI::Categorizer::Collection - Access stored documents

=head1 SYNOPSIS

lib/AI/Categorizer/Collection/DBI.pm  view on Meta::CPAN

   dbh => {isa => 'DBI::db', default => undef},
   select_statement => {type => SCALAR, default => "SELECT text FROM documents"},
  );

__PACKAGE__->contained_objects
  (
   document => { class => 'AI::Categorizer::Document',
		 delayed => 1 },
  );

sub new {
  my $class = shift;
  my $self = $class->SUPER::new(@_);
  
  die "Must provide 'dbh' or 'connection_string' arguments"
    unless $self->{dbh} or $self->{connection_string};
  
  unless ($self->{dbh}) {
    $self->{dbh} = DBI->connect($self->{connection_string}, '', '', {RaiseError => 1})
      or die DBI->errstr;
    delete $self->{connection_string};
  }
  
  $self->rewind;
  return $self;
}

sub dbh { shift()->{dbh} }

sub rewind {
  my $self = shift;
  
  if (!$self->{sth}) {
    $self->{sth} = $self->dbh->prepare($self->{select_statement});
  }

  if ($self->{sth}{Active}) {
    $self->{sth}->finish;
  }

  $self->{sth}->execute;
}

sub next {
  my $self = shift;

  my @result = $self->{sth}->fetchrow_array;
  return undef unless @result;
  
  return $self->create_delayed_object('document',
				      name => $result[0],
				      categories => [$result[1]],
				      content => $result[2],
				     );

lib/AI/Categorizer/Collection/Files.pm  view on Meta::CPAN


use Params::Validate qw(:types);
use File::Spec;

__PACKAGE__->valid_params
  (
   path => { type => SCALAR|ARRAYREF },
   recurse => { type => BOOLEAN, default => 0 },
  );

sub new {
  my $class = shift;
  my $self = $class->SUPER::new(@_);
  
  $self->{dir_fh} = do {local *FH; *FH};  # double *FH avoids a warning

  # Documents are contained in a directory, or list of directories
  $self->{path} = [$self->{path}] unless ref $self->{path};
  $self->{used} = [];

  $self->_next_path;
  return $self;
}

sub _next_path {
  my $self = shift;
  closedir $self->{dir_fh} if $self->{cur_dir};

  $self->{cur_dir} = shift @{$self->{path}};
  push @{$self->{used}}, $self->{cur_dir};
  opendir $self->{dir_fh}, $self->{cur_dir} or die "$self->{cur_dir}: $!";
}

sub next {
  my $self = shift;
  my $file = $self->_read_file;
  return unless defined $file;
  
  warn "No category information about '$file'" unless defined $self->{category_hash}{$file};
  my @cats = map AI::Categorizer::Category->by_name(name => $_), @{ $self->{category_hash}{$file} || [] };

  return $self->call_method('document', 'read', 
			    path => File::Spec->catfile($self->{cur_dir}, $file),
			    name => $file,
			    categories => \@cats,
			   );
}

sub _read_file {
  my ($self) = @_;
  
  my $file = readdir $self->{dir_fh};

  if (!defined $file) { # Directory has been exhausted
    return undef unless @{$self->{path}};
    $self->_next_path;
    return $self->_read_file;
  } elsif ($file eq '.' or $file eq '..') {
    return $self->_read_file;
  } elsif (-d (my $path = File::Spec->catdir($self->{cur_dir}, $file))) {
    push @{$self->{path}}, $path  # Add for later processing
      if $self->{recurse} and !grep {$_ eq $path} @{$self->{path}}, @{$self->{used}};
    return $self->_read_file;
  }
  return $file;
}

sub rewind {
  my $self = shift;
  push @{$self->{path}}, @{$self->{used}};
  @{$self->{used}} = ();
  $self->_next_path;
}

# This should share an iterator with next()
sub count_documents {
    my $self = shift;
    return $self->{document_count} if defined $self->{document_count};
    
    $self->rewind;
    
    my $count = 0;
    $count++ while defined $self->_read_file;

    $self->rewind;
    return $self->{document_count} = $count;

lib/AI/Categorizer/Collection/InMemory.pm  view on Meta::CPAN

use AI::Categorizer::Collection;
use base qw(AI::Categorizer::Collection);

use Params::Validate qw(:types);

__PACKAGE__->valid_params
  (
   data => { type => HASHREF },
  );

sub new {
  my $self = shift()->SUPER::new(@_);
  
  while (my ($name, $params) = each %{$self->{data}}) {
    foreach (@{$params->{categories}}) {
      next if ref $_;
      $_ = AI::Categorizer::Category->by_name(name => $_);
    }
  }

  return $self;
}

sub next {
  my $self = shift;
  my ($name, $params) = each %{$self->{data}} or return;
  return AI::Categorizer::Document->new(name => $name, %$params);
}

sub rewind {
  my $self = shift;
  scalar keys %{$self->{data}};
  return;
}

sub count_documents {
  my $self = shift;
  return scalar keys %{$self->{data}};
}

1;

lib/AI/Categorizer/Collection/SingleFile.pm  view on Meta::CPAN

   categories => { type => HASHREF|UNDEF, default => undef },
   delimiter => { type => SCALAR },
  );

__PACKAGE__->contained_objects
  (
   document => { class => 'AI::Categorizer::Document::Text',
		 delayed => 1 },
  );

sub new {
  my $class = shift;
  my $self = $class->SUPER::new(@_);
  
  $self->{fh} = do {local *FH; *FH};  # double *FH avoids a warning

  # Documents are contained in a file, or list of files
  $self->{path} = [$self->{path}] unless ref $self->{path};
  $self->{used} = [];

  $self->_next_path;
  return $self;
}

sub _next_path {
  my $self = shift;
  close $self->{fh} if $self->{cur_file};

  push @{$self->{used}}, shift @{$self->{path}};
  $self->{cur_file} = $self->{used}[-1];
  open $self->{fh}, "< $self->{cur_file}" or die "$self->{cur_file}: $!";
}

sub next {
  my $self = shift;

  my $fh = $self->{fh}; # Must put in a simple scalar
  my $content = do {local $/ = $self->{delimiter}; <$fh>};

  if (!defined $content) { # File has been exhausted
    unless (@{$self->{path}}) { # All files have been exhausted
      $self->{fh} = undef;
      return undef;
    }

lib/AI/Categorizer/Collection/SingleFile.pm  view on Meta::CPAN

    return $self->next;
  } elsif ($content =~ /^\s*$self->{delimiter}$/) { # Skip empty docs
    return $self->next;
  }
#warn "doc is $content";
#warn "creating document=>@{[ %{$self->{container}{delayed}{document}} ]}";

  return $self->create_delayed_object('document', content => $content);
}

sub count_documents {
  my ($self) = @_;
  return $self->{document_count} if defined $self->{document_count};
  
  $self->rewind;

  my $count = 0;
  local $/ = $self->{delimiter};
  my $fh = $self->{fh};
  while (1) {
    $count++ while <$fh>;
    last unless @{$self->{path}};
    $self->_next_path;
  }
  
  $self->rewind;

  return $self->{document_count} = $count;
}

sub rewind {
  my ($self) = @_;

  close $self->{fh} if $self->{cur_file};
  unshift @{$self->{path}}, @{$self->{used}};
  $self->{used} = [];
  $self->_next_path;
}

1;

lib/AI/Categorizer/Document.pm  view on Meta::CPAN


__PACKAGE__->valid_params
  (
   name       => {
		  type => SCALAR, 
		 },
   categories => {
		  type => ARRAYREF,
		  default => [],
		  callbacks => { 'all are Category objects' => 
				 sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'), @{$_[0]} },
			       },
		  public => 0,
		 },
   stopwords => {
		 type => ARRAYREF|HASHREF,
		 default => {},
		},
   content   => {
		 type => HASHREF|SCALAR,
		 default => undef,

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

__PACKAGE__->contained_objects
  (
   features => { delayed => 1,
		 class => 'AI::Categorizer::FeatureVector' },
  );

### Constructors

my $NAME = 'a';

sub new {
  my $pkg = shift;
  my $self = $pkg->SUPER::new(name => $NAME++,  # Use a default name
			      @_);

  # Get efficient internal data structures
  $self->{categories} = new AI::Categorizer::ObjectSet( @{$self->{categories}} );

  $self->_fix_stopwords;
  
  # A few different ways for the caller to initialize the content

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

    
  } elsif (defined $self->{content}) {
    # Allow a simple string as the content
    $self->{content} = { body => $self->{content} } unless ref $self->{content};
  }
  
  $self->finish if $self->{content};
  return $self;
}

sub _fix_stopwords {
  my $self = shift;
  
  # Convert to hash
  $self->{stopwords} = { map {($_ => 1)} @{ $self->{stopwords} } }
    if UNIVERSAL::isa($self->{stopwords}, 'ARRAY');
  
  my $s = $self->{stopwords};

  # May need to perform stemming on the stopwords
  return unless keys %$s; # No point in doing anything if there are no stopwords

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

  my @keys = keys %$s;
  %$s = ();
  $self->stem_words(\@keys);
  $s->{$_} = 1 foreach @keys;
  
  # This flag is attached to the stopword structure itself so that
  # other documents will notice it.
  $s->{___stemmed} = 1;
}

sub finish {
  my $self = shift;
  $self->create_feature_vector;
  
  # Now we're done with all the content stuff
  delete @{$self}{'content', 'content_weights', 'stopwords', 'use_features'};
}


# Parse a document format - a virtual method
sub parse;

sub parse_handle {
  my ($self, %args) = @_;
  my $fh = $args{handle} or die "No 'handle' argument given to parse_handle()";
  return $self->parse( content => join '', <$fh> );
}

### Accessors

sub name { $_[0]->{name} }
sub stopword_behavior { $_[0]->{stopword_behavior} }

sub features {
  my $self = shift;
  if (@_) {
    $self->{features} = shift;
  }
  return $self->{features};
}

sub categories {
  my $c = $_[0]->{categories};
  return wantarray ? $c->members : $c->size;
}


### Workers

sub create_feature_vector {
  my $self = shift;
  my $content = $self->{content};
  my $weights = $self->{content_weights};

  die "'stopword_behavior' must be one of 'stem', 'no_stem', or 'pre_stemmed'"
    unless $self->{stopword_behavior} =~ /^stem|no_stem|pre_stemmed$/;

  $self->{features} = $self->create_delayed_object('features');
  while (my ($name, $data) = each %$content) {
    my $t = $self->tokenize($data);
    $t = $self->_filter_tokens($t) if $self->{stopword_behavior} eq 'no_stem';
    $self->stem_words($t);
    $t = $self->_filter_tokens($t) if $self->{stopword_behavior} =~ /^stem|pre_stemmed$/;
    my $h = $self->vectorize(tokens => $t, weight => exists($weights->{$name}) ? $weights->{$name} : 1 );
    $self->{features}->add($h);
  }
}

sub is_in_category {
  return (ref $_[1]
	  ? $_[0]->{categories}->includes( $_[1] )
	  : $_[0]->{categories}->includes_name( $_[1] ));
    
}

sub tokenize {
  my $self = shift;
  my @tokens;
  while ($_[0] =~ /([-\w]+)/g) {
    my $word = lc $1;
    next unless $word =~ /[a-z]/;
    $word =~ s/^[^a-z]+//;  # Trim leading non-alpha characters (helps with ordinals)
    push @tokens, $word;
  }
  return \@tokens;
}

sub stem_words {
  my ($self, $tokens) = @_;
  return unless $self->{stemming};
  return if $self->{stemming} eq 'none';
  die "Unknown stemming option '$self->{stemming}' - options are 'porter' or 'none'"
    unless $self->{stemming} eq 'porter';
  
  eval {require Lingua::Stem; 1}
    or die "Porter stemming requires the Lingua::Stem module, available from CPAN.\n";

  @$tokens = @{ Lingua::Stem::stem(@$tokens) };
}

sub _filter_tokens {
  my ($self, $tokens_in) = @_;

  if ($self->{use_features}) {
    my $f = $self->{use_features}->as_hash;
    return [ grep  exists($f->{$_}), @$tokens_in ];
  } elsif ($self->{stopwords} and keys %{$self->{stopwords}}) {
    my $s = $self->{stopwords};
    return [ grep !exists($s->{$_}), @$tokens_in ];
  }
  return $tokens_in;
}

sub _weigh_tokens {
  my ($self, $tokens, $weight) = @_;

  my %counts;
  if (my $b = 0+$self->{front_bias}) {
    die "'front_bias' value must be between -1 and 1"
      unless -1 < $b and $b < 1;
    
    my $n = @$tokens;
    my $r = ($b-1)**2 / ($b+1);
    my $mult = $weight * log($r)/($r-1);

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

    
  } else {
    foreach my $feature (@$tokens) {
      $counts{$feature} += $weight;
    }
  }

  return \%counts;
}

sub vectorize {
  my ($self, %args) = @_;
  if ($self->{stem_stopwords}) {
    my $s = $self->stem_tokens([keys %{$self->{stopwords}}]);
    $self->{stopwords} = { map {+$_, 1} @$s };
    $args{tokens} = $self->_filter_tokens($args{tokens});
  }
  return $self->_weigh_tokens($args{tokens}, $args{weight});
}

sub read {
  my ($class, %args) = @_;
  my $path = delete $args{path} or die "Must specify 'path' argument to read()";
  
  my $self = $class->new(%args);
  
  open my($fh), "< $path" or die "$path: $!";
  $self->parse_handle(handle => $fh);
  close $fh;
  
  $self->finish;
  return $self;
}

sub dump_features {
  my ($self, %args) = @_;
  my $path = $args{path} or die "No 'path' argument given to dump_features()";
  open my($fh), "> $path" or die "Can't create $path: $!";
  my $f = $self->features->as_hash;
  while (my ($k, $v) = each %$f) {
    print $fh "$k\t$v\n";
  }
}

1;

lib/AI/Categorizer/Document/SMART.pm  view on Meta::CPAN

package AI::Categorizer::Document::SMART;

use strict;
use AI::Categorizer::Document;
use base qw(AI::Categorizer::Document);

sub parse {
  my ($self, %args) = @_;
  
  $args{content} =~ s{
		   ^(?:\.I)?\s+(\d+)\n  # ID number - becomes document name
		   \.C\n
		   ([^\n]+)\n     # Categories
		   \.T\n
		   (.+)\n+        # Title
		   \.W\n
		  }

lib/AI/Categorizer/Document/Text.pm  view on Meta::CPAN

use strict;
use AI::Categorizer::Document;
use base qw(AI::Categorizer::Document);

#use Params::Validate qw(:types);
#use AI::Categorizer::ObjectSet;
#use AI::Categorizer::FeatureVector;

### Constructors

sub parse {
  my ($self, %args) = @_;
  $self->{content} = { body => $args{content} };
}

1;

lib/AI/Categorizer/Document/XML.pm  view on Meta::CPAN

use base qw(AI::Categorizer::Document);
use XML::SAX;

__PACKAGE__->contained_objects
  (
   xml_handler => 'AI::Categorizer::Document::XML::Handler',
  );

### Constructors

sub parse {
  my ($self, %args) = @_;

  # it is a string which contains the content of XML
  my $body= $args{content};			

  # it is a hash which includes a pair of <elementName, weight>
  my $elementWeight= $args{elementWeight};	

  # construct Handler which receive event of element, data, comment, processing_instruction
  # And convert their values into a sequence  of string and save it into buffer

lib/AI/Categorizer/Document/XML.pm  view on Meta::CPAN

}

##########################################################################
package AI::Categorizer::Document::XML::Handler;
use strict;
use base qw(XML::SAX::Base);

# Input: a hash which is weights of elements
# Output: object of this class
# Description: this is constructor
sub new{
  my ($class, %args) = @_;

  # call super class such as XML::SAX::Base
  my $self = $class->SUPER::new;

  # save weights of elements which is a hash for pairs <elementName, weight>
  # weight is times duplication of corresponding element
  # It is provided by caller(one of parameters) at construction, and
  # we must save it in order to use doing duplication at end_element
  $self->{weightHash} = $args{weights};

lib/AI/Categorizer/Document/XML.pm  view on Meta::CPAN


  return $self;
}
	
# Input: None
# Output: None
# Description:
# 	it is called whenever the parser meets the document
# 	it will be called at once
#	Currently, confirm if the content buffer is an empty
sub start_document{
  my ($self, $doc)= @_;

  # The level(depth) of the last called element in XML tree
  # Calling of start_element is the preorder of the tree traversal.
  # The level is the level of current visiting element in tree.
  # the first element is 0-level
  $self->{levelPointer} = 0;

  # all data will be saved into here, initially, it is an empty
  $self->{content} = "";

  #$self->SUPER::start_document($doc);
}

# Input: None
# Output: None
# Description:
# 	it is called whenever the parser ends the document
# 	it will be called at once
#	Nothing to do
sub end_document{
  my ($self, $doc)= @_;

  #$self->SUPER::end_document($doc);
}

# Input
#	LocalName: 	$el->{LocalName}
#	NamespaceURI: 	$el->{NamespaceURI}
#	Name		$el->{Name}
#	Prefix		$el->{Prefix}
#	Attributes	$el->{Attributes}
#	for each attribute
#		LocalName: 	$el->{LocalName}
#		NamespaceURI: 	$el->{NamespaceURI}
#		Name		$el->{Name}
#		Prefix		$el->{Prefix}
#		Value		$el->{Value}
# Output: None
# Description:
# 	it is called whenever the parser meets the element
sub start_element{
  my ($self, $el)= @_;

  # find the last location of the content
  # its meaning is to append the new data at this location
  my $location= length $self->{content};

  # save the last location of the current content
  # so that at end_element the starting location of data of this element can be known
  $self->{locationArray}[$self->{levelPointer}] = $location;

  # for the next element, increase levelPointer
  $self->{levelPointer}++;

  #$self->SUPER::start_document($el);
}

# Input: None
# Output: None
# Description:
# 	it is called whenever the parser ends the element
sub end_element{
  my ($self, $el)= @_;

  $self->{levelPointer}--;
  my $location= $self->{locationArray}[$self->{levelPointer}];

  # find the name of element
  my $elementName= $el->{Name};

  # set the default weight
  my $weight= 1;

lib/AI/Categorizer/Document/XML.pm  view on Meta::CPAN

  }

  #$self->SUPER::end_document($el);
}

# Input: a hash which consists of pair <Data, Value>
# Output: None
# Description:
# 	it is called whenever the parser meets the text which comes from Text, CDataSection and etc
#	Value must be saved into content buffer.
sub characters{
  my ($self, $args)= @_;

  # save "data plus new line" into content
  $self->{content} .= "$args->{Data}\n";
}
	
# Input: a hash which consists of pair <Data, Value>
# Output: None
# Description:
# 	it is called whenever the parser meets the comment
#	Currently, it will be ignored
sub comment{
  my ($self, $args)= @_;
}

# Input: a hash which consists of pair <Data, Value> and <Target, Value>
# Output: None
# Description:
# 	it is called whenever the parser meets the processing_instructing
#	Currently, it will be ignored
sub processing_instruction{
  my ($self, $args)= @_;
}

# Input: None
# Output: the converted data, that is, content
# Description:
# 	return the content
sub getContent{
  my ($self)= @_;
  return $self->{content};
}

1;
__END__

lib/AI/Categorizer/Experiment.pm  view on Meta::CPAN


use base qw(Class::Container AI::Categorizer::Storable Statistics::Contingency);

use Params::Validate qw(:types);
__PACKAGE__->valid_params
  (
   categories => { type => ARRAYREF|HASHREF },
   sig_figs   => { type => SCALAR, default => 4 },
  );

sub new {
  my $package = shift;
  my $self = $package->Class::Container::new(@_);
  
  $self->{$_} = 0 foreach qw(a b c d);
  my $c = delete $self->{categories};
  $self->{categories} = { map {($_ => {a=>0, b=>0, c=>0, d=>0})} 
			  UNIVERSAL::isa($c, 'HASH') ? keys(%$c) : @$c
			};
  return $self;
}

sub add_hypothesis {
  my ($self, $h, $correct, $name) = @_;
  die "No hypothesis given to add_hypothesis()" unless $h;
  $name = $h->document_name unless defined $name;
  
  $self->add_result([$h->categories], $correct, $name);
}

sub stats_table {
  my $self = shift;
  $self->SUPER::stats_table($self->{sig_figs});
}

1;

__END__

=head1 NAME

lib/AI/Categorizer/FeatureSelector.pm  view on Meta::CPAN

   features_kept => {
		     type => SCALAR,
		     default => 0.2,
		    },
   verbose => {
	       type => SCALAR,
	       default => 0,
	      },
  );

sub verbose {
  my $self = shift;
  $self->{verbose} = shift if @_;
  return $self->{verbose};
}

sub reduce_features {
  # Takes a feature vector whose weights are "feature scores", and
  # chops to the highest n features.  n is specified by the
  # 'features_kept' parameter.  If it's zero, all features are kept.
  # If it's between 0 and 1, we multiply by the present number of
  # features.  If it's greater than 1, we treat it as the number of
  # features to use.

  my ($self, $f, %args) = @_;
  my $kept = defined $args{features_kept} ? $args{features_kept} : $self->{features_kept};
  return $f unless $kept;

lib/AI/Categorizer/FeatureSelector.pm  view on Meta::CPAN

  my $features = $f->as_hash;
  my @new_features = (sort {$features->{$b} <=> $features->{$a}} keys %$features)
                      [0 .. $num_kept-1];

  my $result = $f->intersection( \@new_features );
  print "Finished trimming features - # features = " . $result->length . "\n" if $self->verbose;
  return $result;
}

# Abstract methods
sub rank_features;
sub scan_features;

sub select_features {
  my ($self, %args) = @_;
  
  die "No knowledge_set parameter provided to select_features()"
    unless $args{knowledge_set};

  my $f = $self->rank_features( knowledge_set => $args{knowledge_set} );
  return $self->reduce_features( $f, features_kept => $args{features_kept} );
}


lib/AI/Categorizer/FeatureSelector/CategorySelector.pm  view on Meta::CPAN


__PACKAGE__->contained_objects
  (
   features => { class => 'AI::Categorizer::FeatureVector',
		 delayed => 1 },
  );

1;


sub reduction_function;

# figure out the feature set before reading collection (default)

sub scan_features {
  my ($self, %args) = @_;
  my $c = $args{collection} or 
    die "No 'collection' parameter provided to scan_features()";

  if(!($self->{features_kept})) {return;}

  my %cat_features;
  my $coll_features = $self->create_delayed_object('features');
  my $nbDocuments = 0;

lib/AI/Categorizer/FeatureSelector/CategorySelector.pm  view on Meta::CPAN

      \%cat_features,\%cat_features_sum);
  }
  print STDERR "\n" if $self->verbose;
  my $new_features = $self->reduce_features($r_features);
  return $coll_features->intersection( $new_features );
}


# calculate feature set after reading collection (scan_first=0)

sub rank_features {
  die "CategorySelector->rank_features is not implemented yet!";
#  my ($self, %args) = @_;
#  
#  my $k = $args{knowledge_set} 
#    or die "No knowledge_set parameter provided to rank_features()";
#
#  my %freq_counts;
#  foreach my $name ($k->features->names) {
#    $freq_counts{$name} = $k->document_frequency($name);
#  }
#  return $self->create_delayed_object('features', features => \%freq_counts);
}


# copied from KnowledgeSet->prog_bar by Ken Williams

sub prog_bar {
  my ($self, $count) = @_;

  return sub {} unless $self->verbose;
  return sub { print STDERR '.' } unless eval "use Time::Progress; 1";

  my $pb = 'Time::Progress'->new;
  $pb->attr(max => $count);
  my $i = 0;
  return sub {
    $i++;
    return if $i % 25;
    print STDERR $pb->report("%50b %p ($i/$count)\r", $i);
  };
}


__END__

=head1 NAME

lib/AI/Categorizer/FeatureSelector/ChiSquare.pm  view on Meta::CPAN


use strict;
use AI::Categorizer::FeatureSelector;
use base qw(AI::Categorizer::FeatureSelector::CategorySelector);

use Params::Validate qw(:types);

# Chi-Square function
# NB: this could probably be optimised a bit...

sub reduction_function {
  my ($self,$term,$N,$allFeaturesSum,
      $coll_features,$cat_features,$cat_features_sum) = @_;
  my $CHI2SUM = 0;
  my $nbcats = 0;
  foreach my $catname (keys %{$cat_features}) {
#  while ( my ($catname,$catfeatures) = each %{$cat_features}) {
    my ($A,$B,$C,$D); # A = number of times where t and c co-occur
                      # B =   "     "   "   t occurs without c
                      # C =   "     "   "   c occurs without t
                      # D =   "     "   "   neither c nor t occur

lib/AI/Categorizer/FeatureSelector/DocFrequency.pm  view on Meta::CPAN

use Params::Validate qw(:types);
use Carp qw(croak);

__PACKAGE__->contained_objects
  (
   features => { class => 'AI::Categorizer::FeatureVector',
		 delayed => 1 },
  );

# The KnowledgeSet keeps track of document frequency, so just use that.
sub rank_features {
  my ($self, %args) = @_;
  
  my $k = $args{knowledge_set} or die "No knowledge_set parameter provided to rank_features()";
  
  my %freq_counts;
  foreach my $name ($k->features->names) {
    $freq_counts{$name} = $k->document_frequency($name);
  }
  return $self->create_delayed_object('features', features => \%freq_counts);
}

sub scan_features {
  my ($self, %args) = @_;
  my $c = $args{collection} or die "No 'collection' parameter provided to scan_features()";

  my $doc_freq = $self->create_delayed_object('features');
  
  while (my $doc = $c->next) {
    $args{prog_bar}->() if $args{prog_bar};
    $doc_freq->add( $doc->features->as_boolean_hash );
  }
  print "\n" if $self->verbose;

lib/AI/Categorizer/FeatureVector.pm  view on Meta::CPAN

package AI::Categorizer::FeatureVector;

sub new {
  my ($package, %args) = @_;
  $args{features} ||= {};
  return bless {features => $args{features}}, $package;
}

sub names {
  my $self = shift;
  return keys %{$self->{features}};
}

sub set {
  my $self = shift;
  $self->{features} = (ref $_[0] ? $_[0] : {@_});
}

sub as_hash {
  my $self = shift;
  return $self->{features};
}

sub euclidean_length {
  my $self = shift;
  my $f = $self->{features};

  my $total = 0;
  foreach (values %$f) {
    $total += $_**2;
  }
  return sqrt($total);
}

sub normalize {
  my $self = shift;

  my $length = $self->euclidean_length;
  return $length ? $self->scale(1/$length) : $self;
}

sub scale {
  my ($self, $scalar) = @_;
  $_ *= $scalar foreach values %{$self->{features}};
  return $self;
}

sub as_boolean_hash {
  my $self = shift;
  return { map {($_ => 1)} keys %{$self->{features}} };
}

sub length {
  my $self = shift;
  return scalar keys %{$self->{features}};
}

sub clone {
  my $self = shift;
  return ref($self)->new( features => { %{$self->{features}} } );
}

sub intersection {
  my ($self, $other) = @_;
  $other = $other->as_hash if UNIVERSAL::isa($other, __PACKAGE__);

  my $common;
  if (UNIVERSAL::isa($other, 'ARRAY')) {
    $common = {map {exists $self->{features}{$_} ? ($_ => $self->{features}{$_}) : ()} @$other};
  } elsif (UNIVERSAL::isa($other, 'HASH')) {
    $common = {map {exists $self->{features}{$_} ? ($_ => $self->{features}{$_}) : ()} keys %$other};
  }
  return ref($self)->new( features => $common );
}

sub add {
  my ($self, $other) = @_;

  $other = $other->as_hash if UNIVERSAL::isa($other, __PACKAGE__);
  while (my ($k,$v) = each %$other) {
    $self->{features}{$k} += $v;
  }
}

sub dot {
  my ($self, $other) = @_;
  $other = $other->as_hash if UNIVERSAL::isa($other, __PACKAGE__);

  my $sum = 0;
  my $f = $self->{features};
  while (my ($k, $v) = each %$f) {
    $sum += $other->{$k} * $v if exists $other->{$k};
  }
  return $sum;
}

sub sum {
  my ($self) = @_;

  # Return total of values in this vector
  my $total = 0;
  $total += $_ foreach values %{ $self->{features} };
  return $total;
}

sub includes {
  return exists $_[0]->{features}{$_[1]};
}

sub value {
  return $_[0]->{features}{$_[1]};
}

sub values {
  my $self = shift;
  return @{ $self->{features} }{ @_ };
}

1;
__END__

=head1 NAME

AI::Categorizer::FeatureVector - Features vs. Values

lib/AI/Categorizer/Hypothesis.pm  view on Meta::CPAN

use Params::Validate qw(:types);

__PACKAGE__->valid_params
  (
   all_categories => {type => ARRAYREF},
   scores => {type => HASHREF},
   threshold => {type => SCALAR},
   document_name => {type => SCALAR, optional => 1},
  );

sub all_categories { @{$_[0]->{all_categories}} }
sub document_name  { $_[0]->{document_name} }
sub threshold      { $_[0]->{threshold} }

sub best_category {
  my ($self) = @_;
  my $sc = $self->{scores};
  return unless %$sc;

  my ($best_cat, $best_score) = each %$sc;
  while (my ($key, $val) = each %$sc) {
    ($best_cat, $best_score) = ($key, $val) if $val > $best_score;
  }
  return $best_cat;
}

sub in_category {
  my ($self, $cat) = @_;
  return '' unless exists $self->{scores}{$cat};
  return $self->{scores}{$cat} > $self->{threshold};
}

sub categories {
  my $self = shift;
  return @{$self->{cats}} if $self->{cats};
  $self->{cats} = [sort {$self->{scores}{$b} <=> $self->{scores}{$a}}
                   grep {$self->{scores}{$_} >= $self->{threshold}}
                   keys %{$self->{scores}}];
  return @{$self->{cats}};
}

sub scores {
  my $self = shift;
  return @{$self->{scores}}{@_};
}

1;

__END__

=head1 NAME

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

use AI::Categorizer::FeatureVector;
use AI::Categorizer::Util;
use Carp qw(croak);

__PACKAGE__->valid_params
  (
   categories => {
		  type => ARRAYREF,
		  default => [],
		  callbacks => { 'all are Category objects' => 
				 sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'),
					 @{$_[0]} },
			       },
		 },
   documents  => {
		  type => ARRAYREF,
		  default => [],
		  callbacks => { 'all are Document objects' => 
				 sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Document'),
					 @{$_[0]} },
			       },
		 },
   scan_first => {
		  type => BOOLEAN,
		  default => 1,
		 },
   feature_selector => {
			isa => 'AI::Categorizer::FeatureSelector',
		       },

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

		 class => 'AI::Categorizer::Document' },
   category => { delayed => 1,
		 class => 'AI::Categorizer::Category' },
   collection => { delayed => 1,
		   class => 'AI::Categorizer::Collection::Files' },
   features => { delayed => 1,
		 class => 'AI::Categorizer::FeatureVector' },
   feature_selector => 'AI::Categorizer::FeatureSelector::DocFrequency',
  );

sub new {
  my ($pkg, %args) = @_;
  
  # Shortcuts
  if ($args{tfidf_weighting}) {
    @args{'term_weighting', 'collection_weighting', 'normalize_weighting'} = split '', $args{tfidf_weighting};
    delete $args{tfidf_weighting};
  }

  my $self = $pkg->SUPER::new(%args);

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

  $self->{documents}  = new AI::Categorizer::ObjectSet( @{$self->{documents}}  );

  if ($self->{load}) {
    my $args = ref($self->{load}) ? $self->{load} : { path => $self->{load} };
    $self->load(%$args);
    delete $self->{load};
  }
  return $self;
}

sub features {
  my $self = shift;

  if (@_) {
    $self->{features} = shift;
    $self->trim_doc_features if $self->{features};
  }
  return $self->{features} if $self->{features};

  # Create a feature vector encompassing the whole set of documents
  my $v = $self->create_delayed_object('features');
  foreach my $document ($self->documents) {
    $v->add( $document->features );
  }
  return $self->{features} = $v;
}

sub categories {
  my $c = $_[0]->{categories};
  return wantarray ? $c->members : $c->size;
}

sub documents {
  my $d = $_[0]->{documents};
  return wantarray ? $d->members : $d->size;
}

sub document {
  my ($self, $name) = @_;
  return $self->{documents}->retrieve($name);
}

sub feature_selector { $_[0]->{feature_selector} }
sub scan_first       { $_[0]->{scan_first} }

sub verbose {
  my $self = shift;
  $self->{verbose} = shift if @_;
  return $self->{verbose};
}

sub trim_doc_features {
  my ($self) = @_;
  
  foreach my $doc ($self->documents) {
    $doc->features( $doc->features->intersection($self->features) );
  }
}


sub prog_bar {
  my ($self, $collection) = @_;

  return sub {} unless $self->verbose;
  return sub { print STDERR '.' } unless eval "use Time::Progress; 1";

  my $count = $collection->can('count_documents') ? $collection->count_documents : 0;
  
  my $pb = 'Time::Progress'->new;
  $pb->attr(max => $count);
  my $i = 0;
  return sub {
    $i++;
    return if $i % 25;
    print STDERR $pb->report("%50b %p ($i/$count)\r", $i);
  };
}

# A little utility method for several other methods like scan_stats(),
# load(), read(), etc.
sub _make_collection {
  my ($self, $args) = @_;
  return $args->{collection} || $self->create_delayed_object('collection', %$args);
}

sub scan_stats {
  # Should determine:
  #  - number of documents
  #  - number of categories
  #  - avg. number of categories per document (whole corpus)
  #  - avg. number of tokens per document (whole corpus)
  #  - avg. number of types per document (whole corpus)
  #  - number of documents, tokens, & types for each category
  #  - "category skew index" (% variance?) by num. documents, tokens, and types

  my ($self, %args) = @_;

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

    my $ssum;
    foreach my $cat (@cats) {
      $ssum += ($stats{categories}{$cat}{"${thing}_count"} - $stats{"${thing}s_per_category"}) ** 2;
    }
    $stats{"${thing}_skew_by_category"} = sqrt($ssum/@cats) / $stats{"${thing}s_per_category"};
  }

  return \%stats;
}

sub load {
  my ($self, %args) = @_;
  my $c = $self->_make_collection(\%args);

  if ($self->{features_kept}) {
    # Read the whole thing in, then reduce
    $self->read( collection => $c );
    $self->select_features;

  } elsif ($self->{scan_first}) {
    # Figure out the feature set first, then read data in
    $self->scan_features( collection => $c );
    $c->rewind;
    $self->read( collection => $c );

  } else {
    # Don't do any feature reduction, just read the data
    $self->read( collection => $c );
  }
}

sub read {
  my ($self, %args) = @_;
  my $collection = $self->_make_collection(\%args);
  my $pb = $self->prog_bar($collection);
  
  while (my $doc = $collection->next) {
    $pb->();
    $self->add_document($doc);
  }
  print "\n" if $self->verbose;
}

sub finish {
  my $self = shift;
  return if $self->{finished}++;
  $self->weigh_features;
}

sub weigh_features {
  # This could be made more efficient by figuring out an execution
  # plan in advance

  my $self = shift;
  
  if ( $self->{term_weighting} =~ /^(t|x)$/ ) {
    # Nothing to do
  } elsif ( $self->{term_weighting} eq 'l' ) {
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN


  if ( $self->{normalize_weighting} eq 'x' ) {
    # Nothing to do
  } elsif ( $self->{normalize_weighting} eq 'c' ) {
    $_->features->normalize foreach $self->documents;
  } else {
    die "normalize_weighting must be one of 'x' or 'c'";
  }
}

sub document_frequency {
  my ($self, $term) = @_;
  
  unless (exists $self->{doc_freq_vector}) {
    die "No corpus has been scanned for features" unless $self->documents;

    my $doc_freq = $self->create_delayed_object('features', features => {});
    foreach my $doc ($self->documents) {
      $doc_freq->add( $doc->features->as_boolean_hash );
    }
    $self->{doc_freq_vector} = $doc_freq->as_hash;
  }
  
  return exists $self->{doc_freq_vector}{$term} ? $self->{doc_freq_vector}{$term} : 0;
}

sub scan_features {
  my ($self, %args) = @_;
  my $c = $self->_make_collection(\%args);

  my $pb = $self->prog_bar($c);
  my $ranked_features = $self->{feature_selector}->scan_features( collection => $c, prog_bar => $pb );

  $self->delayed_object_params('document', use_features => $ranked_features);
  $self->delayed_object_params('collection', use_features => $ranked_features);
  return $ranked_features;
}

sub select_features {
  my $self = shift;
  
  my $f = $self->feature_selector->select_features(knowledge_set => $self);
  $self->features($f);
}

sub partition {
  my ($self, @sizes) = @_;
  my $num_docs = my @docs = $self->documents;
  my @groups;

  while (@sizes > 1) {
    my $size = int ($num_docs * shift @sizes);
    push @groups, [];
    for (0..$size) {
      push @{ $groups[-1] }, splice @docs, rand(@docs), 1;
    }
  }
  push @groups, \@docs;

  return map { ref($self)->new( documents => $_ ) } @groups;
}

sub make_document {
  my ($self, %args) = @_;
  my $cats = delete $args{categories};
  my @cats = map { $self->call_method('category', 'by_name', name => $_) } @$cats;
  my $d = $self->create_delayed_object('document', %args, categories => \@cats);
  $self->add_document($d);
}

sub add_document {
  my ($self, $doc) = @_;

  foreach ($doc->categories) {
    $_->add_document($doc);
  }
  $self->{documents}->insert($doc);
  $self->{categories}->insert($doc->categories);
}

sub save_features {
  my ($self, $file) = @_;
  
  my $f = ($self->{features} || { $self->delayed_object_params('document') }->{use_features})
    or croak "No features to save";
  
  open my($fh), "> $file" or croak "Can't create $file: $!";
  my $h = $f->as_hash;
  print $fh "# Total: ", $f->length, "\n";
  
  foreach my $k (sort {$h->{$b} <=> $h->{$a}} keys %$h) {
    print $fh "$k\t$h->{$k}\n";
  }
  close $fh;
}

sub restore_features {
  my ($self, $file, $n) = @_;
  
  open my($fh), "< $file" or croak "Can't open $file: $!";

  my %hash;
  while (<$fh>) {
    next if /^#/;
    /^(.*)\t([\d.]+)$/ or croak "Malformed line: $_";
    $hash{$1} = $2;
    last if defined $n and $. >= $n;

lib/AI/Categorizer/Learner.pm  view on Meta::CPAN

		  class => 'AI::Categorizer::Hypothesis',
		  delayed => 1,
		 },
   experiment => {
		  class => 'AI::Categorizer::Experiment',
		  delayed => 1,
		 },
  );

# Subclasses must override these virtual methods:
sub get_scores;
sub create_model;

# Optional virtual method for on-line learning:
sub add_knowledge;

sub verbose {
  my $self = shift;
  if (@_) {
    $self->{verbose} = shift;
  }
  return $self->{verbose};
}

sub knowledge_set {
  my $self = shift;
  if (@_) {
    $self->{knowledge_set} = shift;
  }
  return $self->{knowledge_set};
}

sub categories {
  my $self = shift;
  return $self->knowledge_set->categories;
}

sub train {
  my ($self, %args) = @_;
  $self->{knowledge_set} = $args{knowledge_set} if $args{knowledge_set};
  die "No knowledge_set provided" unless $self->{knowledge_set};

  $self->{knowledge_set}->finish;
  $self->create_model;    # Creates $self->{model}
  $self->delayed_object_params('hypothesis',
			       all_categories => [map $_->name, $self->categories],
			      );
}

sub prog_bar {
  my ($self, $count) = @_;
  
  return sub { print STDERR '.' } unless eval "use Time::Progress; 1";
  
  my $pb = 'Time::Progress'->new;
  $pb->attr(max => $count);
  my $i = 0;
  return sub {
    $i++;
    return if $i % 25;
    my $string = '';
    if (@_) {
      my $e = shift;
      $string = sprintf " (maF1=%.03f, miF1=%.03f)", $e->macro_F1, $e->micro_F1;
    }
    print STDERR $pb->report("%50b %p ($i/$count)$string\r", $i);
    return $i;
  };
}

sub categorize_collection {
  my ($self, %args) = @_;
  my $c = $args{collection} or die "No collection provided";

  my @all_cats = map $_->name, $self->categories;
  my $experiment = $self->create_delayed_object('experiment', categories => \@all_cats);
  my $pb = $self->verbose ? $self->prog_bar($c->count_documents) : sub {};
  while (my $d = $c->next) {
    my $h = $self->categorize($d);
    $experiment->add_hypothesis($h, [map $_->name, $d->categories]);
    $pb->($experiment);
    if ($self->verbose > 1) {
      printf STDERR ("%s: assigned=(%s) correct=(%s)\n",
		     $d->name,
		     join(', ', $h->categories),
		     join(', ', map $_->name, $d->categories));
    }
  }
  print STDERR "\n" if $self->verbose;

  return $experiment;
}

sub categorize {
  my ($self, $doc) = @_;
  
  my ($scores, $threshold) = $self->get_scores($doc);
  
  if ($self->verbose > 2) {
    warn "scores: @{[ %$scores ]}" if $self->verbose > 3;
    
    foreach my $key (sort {$scores->{$b} <=> $scores->{$a}} keys %$scores) {
      print "$key: $scores->{$key}\n";
    }

lib/AI/Categorizer/Learner/Boolean.pm  view on Meta::CPAN

use base qw(AI::Categorizer::Learner);
use Params::Validate qw(:types);
use AI::Categorizer::Util qw(random_elements);

__PACKAGE__->valid_params
  (
   max_instances => {type => SCALAR, default => 0},
   threshold => {type => SCALAR, default => 0.5},
  );

sub create_model {
  my $self = shift;
  my $m = $self->{model} ||= {};
  my $mi = $self->{max_instances};

  foreach my $cat ($self->knowledge_set->categories) {
    my (@p, @n);
    foreach my $doc ($self->knowledge_set->documents) {
      if ($doc->is_in_category($cat)) {
	push @p, $doc;
      } else {

lib/AI/Categorizer/Learner/Boolean.pm  view on Meta::CPAN

      @n = random_elements(\@n, @n * $ratio);
      
      warn "Limiting to ". @p ." positives and ". @n ." negatives\n" if $self->verbose;
    }

    warn "Creating model for ", $cat->name, "\n" if $self->verbose;
    $m->{learners}{ $cat->name } = $self->create_boolean_model(\@p, \@n, $cat);
  }
}

sub create_boolean_model;  # Abstract method

sub get_scores {
  my ($self, $doc) = @_;
  my $m = $self->{model};
  my %scores;
  foreach my $cat (keys %{$m->{learners}}) {
    $scores{$cat} = $self->get_boolean_score($doc, $m->{learners}{$cat});
  }
  return (\%scores, $self->{threshold});
}

sub get_boolean_score;  # Abstract method

sub threshold {
  my $self = shift;
  $self->{threshold} = shift if @_;
  return $self->{threshold};
}

sub categories {
  my $self = shift;
  return map AI::Categorizer::Category->by_name( name => $_ ), keys %{ $self->{model}{learners} };
}

1;
__END__

=head1 NAME

AI::Categorizer::Learner::Boolean - Abstract class for boolean categorizers

=head1 SYNOPSIS

 package AI::Categorizer::Learner::SomethingNew;
 use AI::Categorizer::Learner::Boolean;
 @ISA = qw(AI::Categorizer::Learner::Boolean);
 
 sub create_boolean_model {
   my ($self, $positives, $negatives, $category) = @_;
   ...
   return $something_helpful;
 }
 
 sub get_boolean_score {
   my ($self, $document, $something_helpful) = @_;
   ...
   return $score;
 }

=head1 DESCRIPTION

This is an abstract class which turns boolean categorizers
(categorizers based on algorithms that can just provide yes/no
categorization decisions for a single document and single category)

lib/AI/Categorizer/Learner/DecisionTree.pm  view on Meta::CPAN

package AI::Categorizer::Learner::DecisionTree;
$VERSION = '0.01';

use strict;
use AI::DecisionTree;
use AI::Categorizer::Learner::Boolean;
use base qw(AI::Categorizer::Learner::Boolean);

sub create_model {
  my $self = shift;
  $self->SUPER::create_model;
  $self->{model}{first_tree}->do_purge;
  delete $self->{model}{first_tree};
}

sub create_boolean_model {
  my ($self, $positives, $negatives, $cat) = @_;
  
  my $t = new AI::DecisionTree(noise_mode => 'pick_best', 
			       verbose => $self->verbose);

  my %results;
  for ($positives, $negatives) {
    foreach my $doc (@$_) {
      $results{$doc->name} = $_ eq $positives ? 1 : 0;
    }

lib/AI/Categorizer/Learner/DecisionTree.pm  view on Meta::CPAN

    }
    $t->purge(0);
    $self->{model}{first_tree} = $t;
  }

  print STDERR "\nBuilding tree for category '", $cat->name, "'" if $self->verbose;
  $t->train;
  return $t;
}

sub get_scores {
  my ($self, $doc) = @_;
  local $self->{current_doc} = $doc->features->as_boolean_hash;
  return $self->SUPER::get_scores($doc);
}

sub get_boolean_score {
  my ($self, $doc, $t) = @_;
  return $t->get_result( attributes => $self->{current_doc} ) || 0;
}

1;
__END__

=head1 NAME

AI::Categorizer::Learner::DecisionTree - Decision Tree Learner

lib/AI/Categorizer/Learner/Guesser.pm  view on Meta::CPAN

package AI::Categorizer::Learner::Guesser;

use strict;
use AI::Categorizer::Learner;
use base qw(AI::Categorizer::Learner);

sub create_model {
  my $self = shift;
  my $k = $self->knowledge_set;
  my $num_docs = $k->documents;
  
  foreach my $cat ($k->categories) {
    next unless $cat->documents;
    $self->{model}{$cat->name} = $cat->documents / $num_docs;
  }
}

sub get_scores {
  my ($self, $newdoc) = @_;
  
  my %scores;
  while (my ($cat, $prob) = each %{$self->{model}}) {
    $scores{$cat} = 0.5 + $prob - rand();
  }
  
  return (\%scores, 0.5);
}

lib/AI/Categorizer/Learner/KNN.pm  view on Meta::CPAN

use Params::Validate qw(:types);

__PACKAGE__->valid_params
  (
   threshold => {type => SCALAR, default => 0.4},
   k_value => {type => SCALAR, default => 20},
   knn_weighting => {type => SCALAR, default => 'score'},
   max_instances => {type => SCALAR, default => 0},
  );

sub create_model {
  my $self = shift;
  foreach my $doc ($self->knowledge_set->documents) {
    $doc->features->normalize;
  }
  $self->knowledge_set->features;  # Initialize
}

sub threshold {
  my $self = shift;
  $self->{threshold} = shift if @_;
  return $self->{threshold};
}

sub categorize_collection {
  my $self = shift;
  
  my $f_class = $self->knowledge_set->contained_class('features');
  if ($f_class->can('all_features')) {
    $f_class->all_features([$self->knowledge_set->features->names]);
  }
  $self->SUPER::categorize_collection(@_);
}

sub get_scores {
  my ($self, $newdoc) = @_;
  my $currentDocName = $newdoc->name;
  #print "classifying $currentDocName\n";

  my $features = $newdoc->features->intersection($self->knowledge_set->features)->normalize;
  my $q = AI::Categorizer::Learner::KNN::Queue->new(size => $self->{k_value});

  my @docset;
  if ($self->{max_instances}) {
    # Use (approximately) max_instances documents, chosen randomly from corpus

lib/AI/Categorizer/Learner/KNN.pm  view on Meta::CPAN

  }
  
  $_ /= $self->{k_value} foreach values %scores;
  
  return (\%scores, $self->{threshold});
}

###################################################################
package AI::Categorizer::Learner::KNN::Queue;

sub new {
  my ($pkg, %args) = @_;
  return bless {
		size => $args{size},
		entries => [],
	       }, $pkg;
}

sub add {
  my ($self, $thing, $score) = @_;

  # scores may be (0.2, 0.4, 0.4, 0.8) - ascending

  return unless (@{$self->{entries}} < $self->{size}       # Queue not filled
		 or $score > $self->{entries}[0]{score});  # Found a better entry
  
  my $i;
  if (!@{$self->{entries}}) {
    $i = 0;

lib/AI/Categorizer/Learner/KNN.pm  view on Meta::CPAN

    $i = @{$self->{entries}};
  } else {
    for ($i = 0; $i < @{$self->{entries}}; $i++) {
      last if $score < $self->{entries}[$i]{score};
    }
  }
  splice @{$self->{entries}}, $i, 0, { thing => $thing, score => $score};
  shift @{$self->{entries}} if @{$self->{entries}} > $self->{size};
}

sub entries {
  return shift->{entries};
}

1;

__END__

=head1 NAME

AI::Categorizer::Learner::KNN - K Nearest Neighbour Algorithm For AI::Categorizer

lib/AI/Categorizer/Learner/NaiveBayes.pm  view on Meta::CPAN

use AI::Categorizer::Learner;
use base qw(AI::Categorizer::Learner);
use Params::Validate qw(:types);
use Algorithm::NaiveBayes;

__PACKAGE__->valid_params
  (
   threshold => {type => SCALAR, default => 0.3},
  );

sub create_model {
  my $self = shift;
  my $m = $self->{model} = Algorithm::NaiveBayes->new;

  foreach my $d ($self->knowledge_set->documents) {
    $m->add_instance(attributes => $d->features->as_hash,
		     label      => [ map $_->name, $d->categories ]);
  }
  $m->train;
}

sub get_scores {
  my ($self, $newdoc) = @_;

  return ($self->{model}->predict( attributes => $newdoc->features->as_hash ),
	  $self->{threshold});
}

sub threshold {
  my $self = shift;
  $self->{threshold} = shift if @_;
  return $self->{threshold};
}

sub save_state {
  my $self = shift;
  local $self->{knowledge_set};  # Don't need the knowledge_set to categorize
  $self->SUPER::save_state(@_);
}

sub categories {
  my $self = shift;
  return map AI::Categorizer::Category->by_name( name => $_ ), $self->{model}->labels;
}

1;

__END__

=head1 NAME

lib/AI/Categorizer/Learner/Rocchio.pm  view on Meta::CPAN

use AI::Categorizer::Learner::Boolean;
use base qw(AI::Categorizer::Learner::Boolean);

__PACKAGE__->valid_params
  (
   positive_setting => {type => SCALAR, default => 16 },
   negative_setting => {type => SCALAR, default => 4  },
   threshold        => {type => SCALAR, default => 0.1},
  );

sub create_model {
  my $self = shift;
  foreach my $doc ($self->knowledge_set->documents) {
    $doc->features->normalize;
  }
  
  $self->{model}{all_features} = $self->knowledge_set->features(undef);
  $self->SUPER::create_model(@_);
  delete $self->{knowledge_set};
}

sub create_boolean_model {
  my ($self, $positives, $negatives, $cat) = @_;
  my $posdocnum = @$positives;
  my $negdocnum = @$negatives;
  
  my $beta = $self->{positive_setting};
  my $gamma = $self->{negative_setting};
  
  my $profile = $self->{model}{all_features}->clone->scale(-$gamma/$negdocnum);
  my $f = $cat->features(undef)->clone->scale( $beta/$posdocnum + $gamma/$negdocnum );
  $profile->add($f);

  return $profile->normalize;
}

sub get_boolean_score {
  my ($self, $newdoc, $profile) = @_;
  return $newdoc->features->normalize->dot($profile);
}

1;





lib/AI/Categorizer/Learner/SVM.pm  view on Meta::CPAN

use Algorithm::SVM;
use Algorithm::SVM::DataSet;
use Params::Validate qw(:types);
use File::Spec;

__PACKAGE__->valid_params
  (
   svm_kernel => {type => SCALAR, default => 'linear'},
  );

sub create_model {
  my $self = shift;
  my $f = $self->knowledge_set->features->as_hash;
  my $rmap = [ keys %$f ];
  $self->{model}{feature_map} = { map { $rmap->[$_], $_ } 0..$#$rmap };
  $self->{model}{feature_map_reverse} = $rmap;
  $self->SUPER::create_model(@_);
}

sub _doc_2_dataset {
  my ($self, $doc, $label, $fm) = @_;

  my $ds = new Algorithm::SVM::DataSet(Label => $label);
  my $f = $doc->features->as_hash;
  while (my ($k, $v) = each %$f) {
    next unless exists $fm->{$k};
    $ds->attribute( $fm->{$k}, $v );
  }
  return $ds;
}

sub create_boolean_model {
  my ($self, $positives, $negatives, $cat) = @_;
  my $svm = new Algorithm::SVM(Kernel => $self->{svm_kernel});
  
  my (@pos, @neg);
  foreach my $doc (@$positives) {
    push @pos, $self->_doc_2_dataset($doc, 1, $self->{model}{feature_map});
  }
  foreach my $doc (@$negatives) {
    push @neg, $self->_doc_2_dataset($doc, 0, $self->{model}{feature_map});
  }

  $svm->train(@pos, @neg);
  return $svm;
}

sub get_scores {
  my ($self, $doc) = @_;
  local $self->{current_doc} = $self->_doc_2_dataset($doc, -1, $self->{model}{feature_map});
  return $self->SUPER::get_scores($doc);
}

sub get_boolean_score {
  my ($self, $doc, $svm) = @_;
  return $svm->predict($self->{current_doc});
}

sub save_state {
  my ($self, $path) = @_;
  {
    local $self->{model}{learners};
    local $self->{knowledge_set};
    $self->SUPER::save_state($path);
  }
  return unless $self->{model};
  
  my $svm_dir = File::Spec->catdir($path, 'svms');
  mkdir($svm_dir, 0777) or die "Couldn't create $svm_dir: $!";
  while (my ($name, $learner) = each %{$self->{model}{learners}}) {
    my $path = File::Spec->catfile($svm_dir, $name);
    $learner->save($path);
  }
}

sub restore_state {
  my ($self, $path) = @_;
  $self = $self->SUPER::restore_state($path);
  
  my $svm_dir = File::Spec->catdir($path, 'svms');
  return $self unless -e $svm_dir;
  opendir my($dh), $svm_dir or die "Can't open directory $svm_dir: $!";
  while (defined (my $file = readdir $dh)) {
    my $full_file = File::Spec->catfile($svm_dir, $file);
    next if -d $full_file;
    $self->{model}{learners}{$file} = new Algorithm::SVM(Model => $full_file);

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

   weka_classifier => {type => SCALAR, default => 'weka.classifiers.NaiveBayes'},
   weka_args => {type => SCALAR|ARRAYREF, optional => 1},
   tmpdir => {type => SCALAR, default => File::Spec->tmpdir},
  );

__PACKAGE__->contained_objects
  (
   features => {class => 'AI::Categorizer::FeatureVector', delayed => 1},
  );

sub new {
  my $class = shift;
  my $self = $class->SUPER::new(@_);

  for ('java_args', 'weka_args') {
    $self->{$_} = [] unless defined $self->{$_};
    $self->{$_} = [$self->{$_}] unless UNIVERSAL::isa($self->{$_}, 'ARRAY');
  }
  
  if (defined $self->{weka_path}) {
    push @{$self->{java_args}}, '-classpath', $self->{weka_path};
    delete $self->{weka_path};
  }
  return $self;
}

# java -classpath /Applications/Science/weka-3-2-3/weka.jar weka.classifiers.NaiveBayes -t /tmp/train_file.arff -d /tmp/weka-machine

sub create_model {
  my ($self) = shift;
  my $m = $self->{model} ||= {};
  $m->{all_features} = [ $self->knowledge_set->features->names ];
  $m->{_in_dir} = File::Temp::tempdir( DIR => $self->{tmpdir} );

  # Create a dummy test file $dummy_file in ARFF format (a kludgey WEKA requirement)
  my $dummy_features = $self->create_delayed_object('features');
  $m->{dummy_file} = $self->create_arff_file("dummy", [[$dummy_features, 0]]);

  $self->SUPER::create_model(@_);
}

sub create_boolean_model {
  my ($self, $pos, $neg, $cat) = @_;

  my @docs = (map([$_->features, 1], @$pos),
	      map([$_->features, 0], @$neg));
  my $train_file = $self->create_arff_file($cat->name . '_train', \@docs);

  my %info = (machine_file => $cat->name . '_model');
  my $outfile = File::Spec->catfile($self->{model}{_in_dir}, $info{machine_file});

  my @args = ($self->{java_path},

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

	      '-p', '0',
	     );
  $self->do_cmd(@args);
  unlink $train_file or warn "Couldn't remove $train_file: $!";

  return \%info;
}

# java -classpath /Applications/Science/weka-3-2-3/weka.jar weka.classifiers.NaiveBayes -l out -T test.arff -p 0

sub get_boolean_score {
  my ($self, $doc, $info) = @_;
  
  # Create document file
  my $doc_file = $self->create_arff_file('doc', [[$doc->features, 0]], $self->{tmpdir});
  my $machine_file = File::Spec->catfile($self->{model}{_in_dir}, $info->{machine_file});

  my @args = ($self->{java_path},
	      @{$self->{java_args}},
	      $self->{weka_classifier},
	      '-l', $machine_file,

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

  foreach (@output) {
    # <doc> <category> <score> <real_category>
    # 0 large.elem 0.4515551620220952 numberth.high
    next unless my ($index, $predicted, $score) = /^([\d.]+)\s+(\S+)\s+([\d.]+)/;
    $scores{$predicted} = $score;
  }

  return $scores{1} || 0;  # Not sure what weka's scores represent...
}

sub categorize_collection {
  my ($self, %args) = @_;
  my $c = $args{collection} or die "No collection provided";
  
  my @alldocs;
  while (my $d = $c->next) {
    push @alldocs, $d;
  }
  my $doc_file = $self->create_arff_file("docs", [map [$_->features, 0], @alldocs]);

  my @assigned;

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN


  my $experiment = $self->create_delayed_object('experiment', categories => [map $_->name, $self->categories]);
  foreach my $i (0..$#alldocs) {
    $experiment->add_result([keys %{$assigned[$i]}], [map $_->name, $alldocs[$i]->categories], $alldocs[$i]->name);
  }

  return $experiment;
}


sub do_cmd {
  my ($self, @cmd) = @_;
  print STDERR " % @cmd\n" if $self->verbose;
  
  my @output;
  local *KID_TO_READ;
  my $pid = open(KID_TO_READ, "-|");
  
  if ($pid) {   # parent
    @output = <KID_TO_READ>;
    close(KID_TO_READ) or warn "@cmd exited $?";
    
  } else {      # child
    exec(@cmd) or die "Can't exec @cmd: $!";
  }
  
  return @output;
}

sub create_arff_file {
  my ($self, $name, $docs, $dir) = @_;
  $dir = $self->{model}{_in_dir} unless defined $dir;

  my ($fh, $filename) = File::Temp::tempfile(
					     $name . "_XXXX",  # Template
					     DIR    => $dir,
					     SUFFIX => '.arff',
					    );
  print $fh "\@RELATION foo\n\n";
  

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN


    print $fh ("{",
	       join(', ', map("$feature_indices{$_} $f->{$_}", @ordered_keys), "$last_index '$cat'"),
	       "}\n"
	      );
  }
  
  return $filename;
}

sub save_state {
  my ($self, $path) = @_;

  {
    local $self->{knowledge_set};
    $self->SUPER::save_state($path);
  }
  return unless $self->{model};

  my $model_dir = File::Spec->catdir($path, 'models');
  mkdir($model_dir, 0777) or die "Couldn't create $model_dir: $!";
  while (my ($name, $learner) = each %{$self->{model}{learners}}) {
    my $oldpath = File::Spec->catdir($self->{model}{_in_dir}, $learner->{machine_file});
    my $newpath = File::Spec->catfile($model_dir, "${name}_model");
    File::Copy::copy($oldpath, $newpath);
  }
  $self->{model}{_in_dir} = $model_dir;
}

sub restore_state {
  my ($pkg, $path) = @_;
  
  my $self = $pkg->SUPER::restore_state($path);

  my $model_dir = File::Spec->catdir($path, 'models');
  return $self unless -e $model_dir;
  $self->{model}{_in_dir} = $model_dir;
  
  return $self;
}

lib/AI/Categorizer/ObjectSet.pm  view on Meta::CPAN

package AI::Categorizer::ObjectSet;
use strict;

sub new {
  my $pkg = shift;
  my $self = bless {}, $pkg;
  $self->insert(@_) if @_;
  return $self;
}

sub members {
  return values %{$_[0]};
}

sub size {
  return scalar keys %{$_[0]};
}

sub insert {
  my $self = shift;
  foreach my $element (@_) {
    #warn "types are ", @_;
    $self->{ $element->name } = $element;
  }
}

sub retrieve { $_[0]->{$_[1]} }

sub includes { exists $_[0]->{ $_[1]->name } }
sub includes_name  { exists $_[0]->{ $_[1] } }

1;



( run in 0.516 second using v1.01-cache-2.11-cpan-a5abf4f5562 )