AI-Categorizer

 view release on metacpan or  search on metacpan

lib/AI/Categorizer/FeatureSelector/ChiSquare.pm  view on Meta::CPAN

package AI::Categorizer::FeatureSelector::ChiSquare;

use strict;
use AI::Categorizer::FeatureSelector;
use base qw(AI::Categorizer::FeatureSelector::CategorySelector);

use Params::Validate qw(:types);

# Chi-Square function
# NB: this could probably be optimised a bit...

# Compute the chi-square score of $term against each category in
# $cat_features and return the average of those scores.
#
# Positional arguments (after $self):
#   $term             - the feature being scored
#   $N                - multiplier applied to the chi-square numerator
#                       (the POD describes it as the collection size)
#   $allFeaturesSum   - grand total from which D is derived
#   $coll_features    - object whose value($term) yields the term's
#                       collection-wide count
#   $cat_features     - hashref: category name => object answering
#                       value($term) with the term's count in that category
#   $cat_features_sum - hashref: category name => total count for that
#                       category
#
# Returns the mean chi-square value over all categories, or 0 when there
# are no categories at all.
#
# A category whose contingency table has an empty margin (for example a
# term that never occurs anywhere, so A+B == 0) would make the chi-square
# denominator zero.  Such a category contributes 0 to the sum instead of
# dying with "Illegal division by zero"; it still counts in the average.
sub reduction_function {
  my ($self,$term,$N,$allFeaturesSum,
      $coll_features,$cat_features,$cat_features_sum) = @_;

  my @catnames = keys %{$cat_features};
  return 0 unless @catnames;   # nothing to average over

  # The collection-wide count does not depend on the category: fetch it once.
  my $term_total = $coll_features->value($term) || 0;

  my $CHI2SUM = 0;
  foreach my $catname (@catnames) {
    # A = number of times where t and c co-occur
    # B =   "     "   "   t occurs without c
    # C =   "     "   "   c occurs without t
    # D =   "     "   "   neither c nor t occur
    my $A = $cat_features->{$catname}->value($term) || 0;
    my $B = $term_total - $A;
    my $C = ($cat_features_sum->{$catname} || 0) - $A;
    my $D = $allFeaturesSum - ($A+$B+$C);

    my $denominator = ($A+$C)*($B+$D)*($A+$B)*($C+$D);
    next unless $denominator;  # empty margin: score is 0 for this category

    my $ADminCB = ($A*$D)-($C*$B);
    $CHI2SUM += $N*$ADminCB*$ADminCB / $denominator;
  }
  return $CHI2SUM / scalar(@catnames);
}

1;

__END__

=head1 NAME

AI::Categorizer::FeatureSelector::ChiSquare - ChiSquare Feature Selection class

=head1 SYNOPSIS

 # the recommended way to use this class is to let the KnowledgeSet
 # instantiate it

 use AI::Categorizer::KnowledgeSetSMART;
 my $ksetCHI = new AI::Categorizer::KnowledgeSetSMART(
   tfidf_notation =>'Categorizer',
   feature_selection=>'chi_square', ...other parameters...); 

 # however it is also possible to pass an instance to the KnowledgeSet

 use AI::Categorizer::KnowledgeSet;
 use AI::Categorizer::FeatureSelector::ChiSquare;
 my $ksetCHI = new AI::Categorizer::KnowledgeSet(
   feature_selector => AI::Categorizer::FeatureSelector::ChiSquare->new(features_kept=>2000,verbose=>1),
   ...other parameters...
   );

=head1 DESCRIPTION

Feature selection with the ChiSquare function.

  Chi-Square(t,ci) = (N.(AD-CB)^2)
                    -----------------------
                    (A+C).(B+D).(A+B).(C+D)

where t = term
      ci = category i
      N = number of documents in the collection
      A = number of times where t and c co-occur
      B =   "     "   "   t occurs without c
      C =   "     "   "   c occurs without t
      D =   "     "   "   neither c nor t occur

for more details, see :
Yiming Yang, Jan O. Pedersen, A Comparative Study on Feature Selection 
in Text Categorization, in Proceedings of ICML-97, 14th International 
Conference on Machine Learning, 1997.
(available on citeseer.nj.nec.com)

=head1 METHODS

=head1 AUTHOR

Francois Paradis, paradifr@iro.umontreal.ca
with inspiration from Ken Williams' AI::Categorizer code

=cut



( run in 0.461 second using v1.01-cache-2.11-cpan-4d50c553e7e )