AI-Categorizer

 view release on metacpan or  search on metacpan

eg/categorizer  view on Meta::CPAN


my ($opt, $do_stage, $outfile) = parse_command_line(@ARGV);
@ARGV = grep !/^-\d$/, @ARGV;

my $c = eval {new AI::Categorizer(%$opt)};
if ($@ and $@ =~ /^The following parameter/) {
  die "$@\nPlease see the AI::Categorizer documentation for a description of parameters accepted.\n";
}
die $@ if $@;

%$do_stage = map {$_, 1} 1..5 unless keys %$do_stage;

my $out_fh;
if ($outfile) {
  open $out_fh, ">> $outfile" or die "Can't create $outfile: $!";
  select((select($out_fh), $|=1)[0]);
  if (keys(%$do_stage) > 1) {
    print $out_fh "~~~~~~~~~~~~~~~~", scalar(localtime), "~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
    if ($HAVE_YAML) {
      print {$out_fh} YAML::Dump($c->dump_parameters);
    } else {

lib/AI/Categorizer/Collection.pm  view on Meta::CPAN

  (
   document => { class => 'AI::Categorizer::Document::Text',
		 delayed => 1 },
  );

sub new {
  my ($class, %args) = @_;
  
  # Optimize so every document doesn't have to convert the stopword list to a hash
  if ($args{stopwords} and UNIVERSAL::isa($args{stopwords}, 'ARRAY')) {
    $args{stopwords} = { map {+$_ => 1} @{ $args{stopwords} } };
  }
  
  my $self = $class->SUPER::new(%args);

  if ($self->{category_file}) {
    local *FH;
    open FH, $self->{category_file} or die "Can't open $self->{category_file}: $!";
    while (<FH>) {
      my ($doc, @cats) = split;
      $self->{category_hash}{$doc} = \@cats;

lib/AI/Categorizer/Collection.pm  view on Meta::CPAN


=item new()

Creates a new Collection object and returns it.  Accepts the following
parameters:

=over 4

=item category_hash

Indicates a reference to a hash which maps document names to category
names.  The keys of the hash are the document names, each value should
be a reference to an array containing the names of the categories to
which each document belongs.

=item category_file

Indicates a file which should be read in order to create the
C<category_hash>.  Each line of the file should list a document's
name, followed by a list of category names, all separated by
whitespace.

lib/AI/Categorizer/Collection/Files.pm  view on Meta::CPAN

  push @{$self->{used}}, $self->{cur_dir};
  opendir $self->{dir_fh}, $self->{cur_dir} or die "$self->{cur_dir}: $!";
}

sub next {
  my $self = shift;
  my $file = $self->_read_file;
  return unless defined $file;
  
  warn "No category information about '$file'" unless defined $self->{category_hash}{$file};
  my @cats = map AI::Categorizer::Category->by_name(name => $_), @{ $self->{category_hash}{$file} || [] };

  return $self->call_method('document', 'read', 
			    path => File::Spec->catfile($self->{cur_dir}, $file),
			    name => $file,
			    categories => \@cats,
			   );
}

sub _read_file {
  my ($self) = @_;

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

  }
  
  $self->finish if $self->{content};
  return $self;
}

sub _fix_stopwords {
  my $self = shift;
  
  # Convert to hash
  $self->{stopwords} = { map {($_ => 1)} @{ $self->{stopwords} } }
    if UNIVERSAL::isa($self->{stopwords}, 'ARRAY');
  
  my $s = $self->{stopwords};

  # May need to perform stemming on the stopwords
  return unless keys %$s; # No point in doing anything if there are no stopwords
  return unless $self->{stopword_behavior} eq 'stem';
  return if !defined($self->{stemming}) or $self->{stemming} eq 'none';
  return if $s->{___stemmed};
  

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

    }
  }

  return \%counts;
}

sub vectorize {
  my ($self, %args) = @_;
  if ($self->{stem_stopwords}) {
    my $s = $self->stem_tokens([keys %{$self->{stopwords}}]);
    $self->{stopwords} = { map {+$_, 1} @$s };
    $args{tokens} = $self->_filter_tokens($args{tokens});
  }
  return $self->_weigh_tokens($args{tokens}, $args{weight});
}

sub read {
  my ($class, %args) = @_;
  my $path = delete $args{path} or die "Must specify 'path' argument to read()";
  
  my $self = $class->new(%args);

lib/AI/Categorizer/Document/SMART.pm  view on Meta::CPAN

     
     or die "Malformed record: $args{content}";
  
  my ($id, $categories, $title) = ($1, $2, $3);

  $self->{name} = $id;
  $self->{content} = { title => $title,
		       body  => $args{content} };

  my @categories = $categories =~ m/(.*?)\s+\d+[\s;]*/g;
  @categories = map AI::Categorizer::Category->by_name(name => $_), @categories;
  $self->{categories} = \@categories;
}

1;

lib/AI/Categorizer/Experiment.pm  view on Meta::CPAN

   categories => { type => ARRAYREF|HASHREF },
   sig_figs   => { type => SCALAR, default => 4 },
  );

sub new {
  my $package = shift;
  my $self = $package->Class::Container::new(@_);
  
  $self->{$_} = 0 foreach qw(a b c d);
  my $c = delete $self->{categories};
  $self->{categories} = { map {($_ => {a=>0, b=>0, c=>0, d=>0})} 
			  UNIVERSAL::isa($c, 'HASH') ? keys(%$c) : @$c
			};
  return $self;
}

sub add_hypothesis {
  my ($self, $h, $correct, $name) = @_;
  die "No hypothesis given to add_hypothesis()" unless $h;
  $name = $h->document_name unless defined $name;
  

lib/AI/Categorizer/Experiment.pm  view on Meta::CPAN

AI::Categorizer::Experiment - Coordinate experimental results

=head1 SYNOPSIS

 use AI::Categorizer::Experiment;
 my $e = new AI::Categorizer::Experiment(categories => \%categories);
 my $l = AI::Categorizer::Learner->restore_state(...path...);
 
 while (my $d = ... get document ...) {
   my $h = $l->categorize($d); # A Hypothesis
   $e->add_hypothesis($h, [map $_->name, $d->categories]);
 }
 
 print "Micro F1: ", $e->micro_F1, "\n"; # Access a single statistic
 print $e->stats_table; # Show several stats in table form

=head1 DESCRIPTION

The C<AI::Categorizer::Experiment> class helps you organize the
results of categorization experiments.  As you get lots of
categorization results (Hypotheses) back from the Learner, you can

lib/AI/Categorizer/FeatureSelector.pm  view on Meta::CPAN


AI::Categorizer::FeatureSelector - Abstract Feature Selection class

=head1 SYNOPSIS

 ...

=head1 DESCRIPTION

The KnowledgeSet class that provides an interface to a set of
documents, a set of categories, and a mapping between the two.  Many
parameters for controlling the processing of documents are managed by
the KnowledgeSet class.

=head1 METHODS

=over 4

=item new()

Creates a new KnowledgeSet and returns it.  Accepts the following

lib/AI/Categorizer/FeatureSelector/DocFrequency.pm  view on Meta::CPAN


AI::Categorizer::FeatureSelector - Abstract Feature Selection class

=head1 SYNOPSIS

 ...

=head1 DESCRIPTION

The KnowledgeSet class that provides an interface to a set of
documents, a set of categories, and a mapping between the two.  Many
parameters for controlling the processing of documents are managed by
the KnowledgeSet class.

=head1 METHODS

=over 4

=item new()

Creates a new KnowledgeSet and returns it.  Accepts the following

lib/AI/Categorizer/FeatureVector.pm  view on Meta::CPAN

}

sub scale {
  my ($self, $scalar) = @_;
  $_ *= $scalar foreach values %{$self->{features}};
  return $self;
}

sub as_boolean_hash {
  my $self = shift;
  return { map {($_ => 1)} keys %{$self->{features}} };
}

sub length {
  my $self = shift;
  return scalar keys %{$self->{features}};
}

sub clone {
  my $self = shift;
  return ref($self)->new( features => { %{$self->{features}} } );
}

sub intersection {
  my ($self, $other) = @_;
  $other = $other->as_hash if UNIVERSAL::isa($other, __PACKAGE__);

  my $common;
  if (UNIVERSAL::isa($other, 'ARRAY')) {
    $common = {map {exists $self->{features}{$_} ? ($_ => $self->{features}{$_}) : ()} @$other};
  } elsif (UNIVERSAL::isa($other, 'HASH')) {
    $common = {map {exists $self->{features}{$_} ? ($_ => $self->{features}{$_}) : ()} keys %$other};
  }
  return ref($self)->new( features => $common );
}

sub add {
  my ($self, $other) = @_;

  $other = $other->as_hash if UNIVERSAL::isa($other, __PACKAGE__);
  while (my ($k,$v) = each %$other) {
    $self->{features}{$k} += $v;

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN


  my @cats = keys %{ $stats{categories} };

  $stats{category_count}          = @cats;
  $stats{categories_per_document} = $stats{category_count_with_duplicates} / $stats{document_count};
  $stats{tokens_per_document}     = $stats{token_count} / $stats{document_count};
  $stats{types_per_document}      = $stats{type_count}  / $stats{document_count};

  foreach my $thing ('type', 'token', 'document') {
    $stats{"${thing}s_per_category"} = AI::Categorizer::Util::average
      ( map { $stats{categories}{$_}{"${thing}_count"} } @cats );

    next unless @cats;

    # Compute the skews
    my $ssum;
    foreach my $cat (@cats) {
      $ssum += ($stats{categories}{$cat}{"${thing}_count"} - $stats{"${thing}s_per_category"}) ** 2;
    }
    $stats{"${thing}_skew_by_category"} = sqrt($ssum/@cats) / $stats{"${thing}s_per_category"};
  }

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN


  while (@sizes > 1) {
    my $size = int ($num_docs * shift @sizes);
    push @groups, [];
    for (0..$size) {
      push @{ $groups[-1] }, splice @docs, rand(@docs), 1;
    }
  }
  push @groups, \@docs;

  return map { ref($self)->new( documents => $_ ) } @groups;
}

sub make_document {
  my ($self, %args) = @_;
  my $cats = delete $args{categories};
  my @cats = map { $self->call_method('category', 'by_name', name => $_) } @$cats;
  my $d = $self->create_delayed_object('document', %args, categories => \@cats);
  $self->add_document($d);
}

sub add_document {
  my ($self, $doc) = @_;

  foreach ($doc->categories) {
    $_->add_document($doc);
  }

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

=head1 SYNOPSIS

 use AI::Categorizer::KnowledgeSet;
 my $k = new AI::Categorizer::KnowledgeSet(...parameters...);
 my $nb = new AI::Categorizer::Learner::NaiveBayes(...parameters...);
 $nb->train(knowledge_set => $k);

=head1 DESCRIPTION

The KnowledgeSet class that provides an interface to a set of
documents, a set of categories, and a mapping between the two.  Many
parameters for controlling the processing of documents are managed by
the KnowledgeSet class.

=head1 METHODS

=over 4

=item new()

Creates a new KnowledgeSet and returns it.  Accepts the following

lib/AI/Categorizer/Learner.pm  view on Meta::CPAN

}

sub train {
  my ($self, %args) = @_;
  $self->{knowledge_set} = $args{knowledge_set} if $args{knowledge_set};
  die "No knowledge_set provided" unless $self->{knowledge_set};

  $self->{knowledge_set}->finish;
  $self->create_model;    # Creates $self->{model}
  $self->delayed_object_params('hypothesis',
			       all_categories => [map $_->name, $self->categories],
			      );
}

sub prog_bar {
  my ($self, $count) = @_;
  
  return sub { print STDERR '.' } unless eval "use Time::Progress; 1";
  
  my $pb = 'Time::Progress'->new;
  $pb->attr(max => $count);

lib/AI/Categorizer/Learner.pm  view on Meta::CPAN

    }
    print STDERR $pb->report("%50b %p ($i/$count)$string\r", $i);
    return $i;
  };
}

sub categorize_collection {
  my ($self, %args) = @_;
  my $c = $args{collection} or die "No collection provided";

  my @all_cats = map $_->name, $self->categories;
  my $experiment = $self->create_delayed_object('experiment', categories => \@all_cats);
  my $pb = $self->verbose ? $self->prog_bar($c->count_documents) : sub {};
  while (my $d = $c->next) {
    my $h = $self->categorize($d);
    $experiment->add_hypothesis($h, [map $_->name, $d->categories]);
    $pb->($experiment);
    if ($self->verbose > 1) {
      printf STDERR ("%s: assigned=(%s) correct=(%s)\n",
		     $d->name,
		     join(', ', $h->categories),
		     join(', ', map $_->name, $d->categories));
    }
  }
  print STDERR "\n" if $self->verbose;

  return $experiment;
}

sub categorize {
  my ($self, $doc) = @_;
  

lib/AI/Categorizer/Learner/Boolean.pm  view on Meta::CPAN

sub get_boolean_score;  # Abstract method

sub threshold {
  my $self = shift;
  $self->{threshold} = shift if @_;
  return $self->{threshold};
}

sub categories {
  my $self = shift;
  return map AI::Categorizer::Category->by_name( name => $_ ), keys %{ $self->{model}{learners} };
}

1;
__END__

=head1 NAME

AI::Categorizer::Learner::Boolean - Abstract class for boolean categorizers

=head1 SYNOPSIS

lib/AI/Categorizer/Learner/KNN.pm  view on Meta::CPAN

    # Use the whole corpus
    @docset = $self->knowledge_set->documents;
  }
  
  foreach my $doc (@docset) {
    my $score = $doc->features->dot( $features );
    warn "Score for ", $doc->name, " (", ($doc->categories)[0]->name, "): $score" if $self->verbose > 1;
    $q->add($doc, $score);
  }
  
  my %scores = map {+$_->name, 0} $self->categories;
  foreach my $e (@{$q->entries}) {
    foreach my $cat ($e->{thing}->categories) {
      $scores{$cat->name} += ($self->{knn_weighting} eq 'score' ? $e->{score} : 1); #increment cat score
    }
  }
  
  $_ /= $self->{k_value} foreach values %scores;
  
  return (\%scores, $self->{threshold});
}

lib/AI/Categorizer/Learner/NaiveBayes.pm  view on Meta::CPAN

  (
   threshold => {type => SCALAR, default => 0.3},
  );

sub create_model {
  my $self = shift;
  my $m = $self->{model} = Algorithm::NaiveBayes->new;

  foreach my $d ($self->knowledge_set->documents) {
    $m->add_instance(attributes => $d->features->as_hash,
		     label      => [ map $_->name, $d->categories ]);
  }
  $m->train;
}

sub get_scores {
  my ($self, $newdoc) = @_;

  return ($self->{model}->predict( attributes => $newdoc->features->as_hash ),
	  $self->{threshold});
}

lib/AI/Categorizer/Learner/NaiveBayes.pm  view on Meta::CPAN

}

sub save_state {
  my $self = shift;
  local $self->{knowledge_set};  # Don't need the knowledge_set to categorize
  $self->SUPER::save_state(@_);
}

sub categories {
  my $self = shift;
  return map AI::Categorizer::Category->by_name( name => $_ ), $self->{model}->labels;
}

1;

__END__

=head1 NAME

AI::Categorizer::Learner::NaiveBayes - Naive Bayes Algorithm For AI::Categorizer

lib/AI/Categorizer/Learner/SVM.pm  view on Meta::CPAN

use File::Spec;

__PACKAGE__->valid_params
  (
   svm_kernel => {type => SCALAR, default => 'linear'},
  );

sub create_model {
  my $self = shift;
  my $f = $self->knowledge_set->features->as_hash;
  my $rmap = [ keys %$f ];
  $self->{model}{feature_map} = { map { $rmap->[$_], $_ } 0..$#$rmap };
  $self->{model}{feature_map_reverse} = $rmap;
  $self->SUPER::create_model(@_);
}

sub _doc_2_dataset {
  my ($self, $doc, $label, $fm) = @_;

  my $ds = new Algorithm::SVM::DataSet(Label => $label);
  my $f = $doc->features->as_hash;
  while (my ($k, $v) = each %$f) {
    next unless exists $fm->{$k};

lib/AI/Categorizer/Learner/SVM.pm  view on Meta::CPAN

  }
  return $ds;
}

sub create_boolean_model {
  my ($self, $positives, $negatives, $cat) = @_;
  my $svm = new Algorithm::SVM(Kernel => $self->{svm_kernel});
  
  my (@pos, @neg);
  foreach my $doc (@$positives) {
    push @pos, $self->_doc_2_dataset($doc, 1, $self->{model}{feature_map});
  }
  foreach my $doc (@$negatives) {
    push @neg, $self->_doc_2_dataset($doc, 0, $self->{model}{feature_map});
  }

  $svm->train(@pos, @neg);
  return $svm;
}

sub get_scores {
  my ($self, $doc) = @_;
  local $self->{current_doc} = $self->_doc_2_dataset($doc, -1, $self->{model}{feature_map});
  return $self->SUPER::get_scores($doc);
}

sub get_boolean_score {
  my ($self, $doc, $svm) = @_;
  return $svm->predict($self->{current_doc});
}

sub save_state {
  my ($self, $path) = @_;

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

  # Create a dummy test file $dummy_file in ARFF format (a kludgey WEKA requirement)
  my $dummy_features = $self->create_delayed_object('features');
  $m->{dummy_file} = $self->create_arff_file("dummy", [[$dummy_features, 0]]);

  $self->SUPER::create_model(@_);
}

sub create_boolean_model {
  my ($self, $pos, $neg, $cat) = @_;

  my @docs = (map([$_->features, 1], @$pos),
	      map([$_->features, 0], @$neg));
  my $train_file = $self->create_arff_file($cat->name . '_train', \@docs);

  my %info = (machine_file => $cat->name . '_model');
  my $outfile = File::Spec->catfile($self->{model}{_in_dir}, $info{machine_file});

  my @args = ($self->{java_path},
	      @{$self->{java_args}},
	      $self->{weka_classifier}, 
	      @{$self->{weka_args}},
	      '-t', $train_file,

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

}

sub categorize_collection {
  my ($self, %args) = @_;
  my $c = $args{collection} or die "No collection provided";
  
  my @alldocs;
  while (my $d = $c->next) {
    push @alldocs, $d;
  }
  my $doc_file = $self->create_arff_file("docs", [map [$_->features, 0], @alldocs]);

  my @assigned;
  
  my $l = $self->{model}{learners};
  foreach my $cat (keys %$l) {
    my $machine_file = File::Spec->catfile($self->{model}{_in_dir}, "${cat}_model");
    my @args = ($self->{java_path},
		@{$self->{java_args}},
		$self->{weka_classifier},
		'-l', $machine_file,

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

	warn "Can't parse line $line";
	next;
      }
      my ($index, $predicted, $score) = ($1, $2, $3);
      $assigned[$index]{$cat} = $score if $predicted;  # Not sure what weka's scores represent
      print STDERR "$index: assigned=($predicted) correct=(", $alldocs[$index]->is_in_category($cat) ? 1 : 0, ")\n"
	if $self->verbose;
    }
  }

  my $experiment = $self->create_delayed_object('experiment', categories => [map $_->name, $self->categories]);
  foreach my $i (0..$#alldocs) {
    $experiment->add_result([keys %{$assigned[$i]}], [map $_->name, $alldocs[$i]->categories], $alldocs[$i]->name);
  }

  return $experiment;
}


sub do_cmd {
  my ($self, @cmd) = @_;
  print STDERR " % @cmd\n" if $self->verbose;
  

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

					     SUFFIX => '.arff',
					    );
  print $fh "\@RELATION foo\n\n";
  
  my $feature_names = $self->{model}{all_features};
  foreach my $name (@$feature_names) {
    print $fh "\@ATTRIBUTE feature-$name REAL\n";
  }
  print $fh "\@ATTRIBUTE category {1, 0}\n\n";
  
  my %feature_indices = map {$feature_names->[$_], $_} 0..$#{$feature_names};
  my $last_index = keys %feature_indices;
  
  # We use the 'sparse' format, see http://www.cs.waikato.ac.nz/~ml/weka/arff.html
  
  print $fh "\@DATA\n";
  foreach my $doc (@$docs) {
    my ($features, $cat) = @$doc;
    my $f = $features->as_hash;
    my @ordered_keys = (sort {$feature_indices{$a} <=> $feature_indices{$b}} 
			grep {exists $feature_indices{$_}}
			keys %$f);

    print $fh ("{",
	       join(', ', map("$feature_indices{$_} $f->{$_}", @ordered_keys), "$last_index '$cat'"),
	       "}\n"
	      );
  }
  
  return $filename;
}

sub save_state {
  my ($self, $path) = @_;

lib/AI/Categorizer/Util.pm  view on Meta::CPAN

  my ($one, $two) = @_;
  $two = _hashify($two);

  return UNIVERSAL::isa($one, 'HASH') ?	# Accept hash or array for $one
    grep {exists $two->{$_}} keys %$one :
    grep {exists $two->{$_}} @$one;
}

sub _hashify {
  return $_[0] if UNIVERSAL::isa($_[0], 'HASH');
  return {map {$_=>1} @{$_[0]}};
}

sub random_elements {
  my ($a_ref, $n) = @_;
  return @$a_ref if $n >= @$a_ref;
  
  my ($select, $mode) = ($n < @$a_ref/2) ? ($n, 'include') : (@$a_ref - $n, 'exclude');

  my %i;
  $i{int rand @$a_ref} = 1 while keys(%i) < $select;

  return @{$a_ref}[keys %i] if $mode eq 'include';
  return map {$i{$_} ? () : $a_ref->[$_]} 0..$#$a_ref;
}

1;



( run in 0.691 second using v1.01-cache-2.11-cpan-49f99fa48dc )