AI-Categorizer

 view release on metacpan or  search on metacpan

lib/AI/Categorizer/FeatureSelector.pm  view on Meta::CPAN

package AI::Categorizer::FeatureSelector;

use strict;
use Class::Container;
use base qw(Class::Container);

use Params::Validate qw(:types);
use AI::Categorizer::FeatureVector;
use AI::Categorizer::Util;
use Carp qw(croak);

__PACKAGE__->valid_params
  (
   features_kept => {
		     type => SCALAR,
		     default => 0.2,
		    },
   verbose => {
	       type => SCALAR,
	       default => 0,
	      },
  );

sub verbose {
  my $self = shift;
  $self->{verbose} = shift if @_;
  return $self->{verbose};
}

sub reduce_features {
  # Takes a feature vector whose weights are "feature scores", and
  # chops to the highest n features.  n is specified by the
  # 'features_kept' parameter.  If it's zero, all features are kept.
  # If it's between 0 and 1, we multiply by the present number of
  # features.  If it's greater than 1, we treat it as the number of
  # features to use.

  my ($self, $f, %args) = @_;
  my $kept = defined $args{features_kept} ? $args{features_kept} : $self->{features_kept};
  return $f unless $kept;

  my $num_kept = ($kept < 1 ? 
		  $f->length * $kept :
		  $kept);

  print "Trimming features - # features = " . $f->length . "\n" if $self->verbose;
  
  # This is algorithmic overkill, but the sort seems fast enough.  Will revisit later.
  my $features = $f->as_hash;
  my @new_features = (sort {$features->{$b} <=> $features->{$a}} keys %$features)
                      [0 .. $num_kept-1];

  my $result = $f->intersection( \@new_features );
  print "Finished trimming features - # features = " . $result->length . "\n" if $self->verbose;
  return $result;
}

# Abstract methods
sub rank_features;
sub scan_features;

sub select_features {
  my ($self, %args) = @_;
  
  die "No knowledge_set parameter provided to select_features()"
    unless $args{knowledge_set};

  my $f = $self->rank_features( knowledge_set => $args{knowledge_set} );
  return $self->reduce_features( $f, features_kept => $args{features_kept} );
}


1;

__END__

=head1 NAME

AI::Categorizer::FeatureSelector - Abstract Feature Selection class

=head1 SYNOPSIS

 ...

=head1 DESCRIPTION

The KnowledgeSet class that provides an interface to a set of
documents, a set of categories, and a mapping between the two.  Many
parameters for controlling the processing of documents are managed by
the KnowledgeSet class.

=head1 METHODS

=over 4

=item new()

Creates a new KnowledgeSet and returns it.  Accepts the following
parameters:

=over 4

=item load

If a C<load> parameter is present, the C<load()> method will be
invoked immediately.  If the C<load> parameter is a string, it will be
passed as the C<path> parameter to C<load()>.  If the C<load>
parameter is a hash reference, it will represent all the parameters to
pass to C<load()>.

=item categories

An optional reference to an array of Category objects representing the
complete set of categories in a KnowledgeSet.  If used, the
C<documents> parameter should also be specified.

=item documents

An optional reference to an array of Document objects representing the
complete set of documents in a KnowledgeSet.  If used, the
C<categories> parameter should also be specified.



( run in 0.892 second using v1.01-cache-2.11-cpan-39bf76dae61 )