AI-Categorizer

 view release on metacpan or  search on metacpan

lib/AI/Categorizer/FeatureSelector/CategorySelector.pm  view on Meta::CPAN

package AI::Categorizer::FeatureSelector::CategorySelector;

use strict;
use AI::Categorizer::FeatureSelector;
use base qw(AI::Categorizer::FeatureSelector);

use Params::Validate qw(:types);

# Register 'features' as a contained object of class
# AI::Categorizer::FeatureVector.  It is marked 'delayed', and
# scan_features() below builds instances of it on demand through
# $self->create_delayed_object('features').
__PACKAGE__->contained_objects
  (
   features => { class => 'AI::Categorizer::FeatureVector',
		 delayed => 1 },
  );

# True value for the module.
# NOTE(review): this sits before the sub definitions rather than at the
# very end of the code section.
1;


# Forward declaration only -- no body is given in this file.
# scan_features() calls $self->reduction_function(...) once per term, so
# presumably concrete subclasses supply the implementation (the POD
# mentions ChiSquare as an example) -- confirm against the subclasses.
sub reduction_function;

# figure out the feature set before reading collection (default)

# Read every document of a collection, score each term through
# reduction_function(), and return the features that survive
# reduce_features().
#
# Named arguments:
#   collection - required; read with ->next until it returns false.
#   prog_bar   - optional code ref, invoked once per document read.
#
# Returns nothing when the object's 'features_kept' entry is false.
# Otherwise returns the collection-wide feature vector intersected with
# the vector handed back by reduce_features().
#
# Dies when no 'collection' argument is supplied.
sub scan_features {
  my ($self, %args) = @_;

  my $collection = $args{collection};
  die "No 'collection' parameter provided to scan_features()"
    unless $collection;

  return unless $self->{features_kept};

  # Pass 1: accumulate feature totals, both overall and per category name.
  my %features_of;
  my $all_features = $self->create_delayed_object('features');
  my $doc_count = 0;

  while (my $doc = $collection->next) {
    ++$doc_count;
    $args{prog_bar}->() if $args{prog_bar};

    my $doc_hash = $doc->features->as_hash;
    for my $category ($doc->categories) {
      my $name = $category->name;
      $features_of{$name} = $self->create_delayed_object('features')
        unless exists $features_of{$name};
      $features_of{$name}->add($doc_hash);
    }
    $all_features->add($doc_hash);
  }
  print STDERR "\n* Computing Chi-Square values\n" if $self->verbose;

  # Pass 2: gather the totals that reduction_function() receives.
  my $scores = $self->create_delayed_object('features');
  my @term_names = $all_features->names;
  my $tick = $self->prog_bar(scalar @term_names);
  my $grand_total = $all_features->sum;

  my %total_of;
  for my $name (keys %features_of) {
    $total_of{$name} = $features_of{$name}->sum;
  }

  # Score every term; the score is stored straight into the vector's
  # internal 'features' hash.
  for my $term (@term_names) {
    $tick->();
    my $score = $self->reduction_function(
      $term, $doc_count, $grand_total, $all_features,
      \%features_of, \%total_of,
    );
    $scores->{features}{$term} = $score;
  }
  print STDERR "\n" if $self->verbose;

  my $kept = $self->reduce_features($scores);
  return $all_features->intersection($kept);
}


# calculate feature set after reading collection (scan_first=0)

# Placeholder for ranking features after the collection has been read.
# It is not implemented: every call dies with the message below.
sub rank_features {
  my $unimplemented = "CategorySelector->rank_features is not implemented yet!";
  die $unimplemented;

  # Unfinished sketch kept from the original author for reference:
#  my ($self, %args) = @_;
#  
#  my $k = $args{knowledge_set} 
#    or die "No knowledge_set parameter provided to rank_features()";
#
#  my %freq_counts;
#  foreach my $name ($k->features->names) {
#    $freq_counts{$name} = $k->document_frequency($name);
#  }
#  return $self->create_delayed_object('features', features => \%freq_counts);
}


# copied from KnowledgeSet->prog_bar by Ken Williams

# Build a progress callback for a loop of $count iterations.
#
# Parameters:
#   $count - total number of ticks the caller intends to make.
#
# Returns a code ref meant to be called once per iteration:
#   - a no-op when $self->verbose is false;
#   - a sub printing '.' to STDERR when Time::Progress cannot be loaded;
#   - otherwise a sub that redraws a Time::Progress report on STDERR on
#     every 25th tick and on the final tick.
sub prog_bar {
  my ($self, $count) = @_;

  return sub {} unless $self->verbose;

  # Time::Progress is an optional dependency.  A block eval around
  # require avoids compiling a string at run time.
  return sub { print STDERR '.' } unless eval { require Time::Progress; 1 };

  my $pb = 'Time::Progress'->new;
  $pb->attr(max => $count);
  my $i = 0;
  return sub {
    $i++;
    # Throttle redraws to every 25th tick, but always draw the last tick
    # so the bar reaches its end when $count is not a multiple of 25
    # (previously nothing at all was drawn for $count < 25).
    return if $i % 25 and $i != $count;
    print STDERR $pb->report("%50b %p ($i/$count)\r", $i);
  };
}


__END__

=head1 NAME

AI::Categorizer::FeatureSelector::CategorySelector - Abstract Category Selection class

=head1 SYNOPSIS

This class is abstract. For an example of instantiation, see
ChiSquare.

=head1 DESCRIPTION

A base class for FeatureSelectors that calculate their global features
from a set of features by categories.

=head1 METHODS

=head1 AUTHOR

Francois Paradis, paradifr@iro.umontreal.ca
with inspiration from Ken Williams' AI::Categorizer code

=cut



( run in 0.224 second using v1.01-cache-2.11-cpan-4d50c553e7e )