AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/FeatureSelector/ChiSquare.pm view on Meta::CPAN
package AI::Categorizer::FeatureSelector::ChiSquare;
use strict;
use AI::Categorizer::FeatureSelector;
use base qw(AI::Categorizer::FeatureSelector::CategorySelector);
use Params::Validate qw(:types);
# Chi-Square function
# NB: this could probably be optimised a bit...
#
# Scores one term by computing its chi-square statistic against every
# category and averaging the results.
#
# Arguments (positional, after the invocant):
#   $term             - the feature (term) being scored
#   $N                - the N factor of the chi-square formula (the POD of
#                       this module describes it as the number of documents
#                       in the collection)
#   $allFeaturesSum   - grand total from which D is derived
#   $coll_features    - object whose value($term) yields the count of the
#                       term over the whole collection
#   $cat_features     - hash ref: category name => object with value($term)
#   $cat_features_sum - hash ref: category name => total for that category
#
# Returns the arithmetic mean of the per-category chi-square values.
# Returns 0 when there are no categories at all.  A category whose
# contingency table is degenerate (one of its marginals is zero, so the
# statistic is undefined) contributes 0 to the mean instead of triggering
# a fatal "Illegal division by zero".
sub reduction_function {
    my ($self, $term, $N, $allFeaturesSum,
        $coll_features, $cat_features, $cat_features_sum) = @_;

    my @catnames = keys %{$cat_features};

    # Nothing to average over: avoid dividing by a zero category count.
    return 0 unless @catnames;

    # Loop-invariant, so look it up once.  The "|| 0" is defensive, in case
    # value() yields undef for a term absent from the vector.
    my $coll_count = $coll_features->value($term) || 0;

    my $chi2_sum = 0;
    foreach my $catname (@catnames) {
        # A = number of times where t and c co-occur
        # B = number of times t occurs without c
        # C = number of times c occurs without t
        # D = number of times neither c nor t occur
        my $A = $cat_features->{$catname}->value($term) || 0;
        my $B = $coll_count - $A;
        my $C = $cat_features_sum->{$catname} - $A;
        my $D = $allFeaturesSum - ($A + $B + $C);

        # Product of the four marginals of the 2x2 table.  It is zero when,
        # for example, there is a single category covering the whole
        # collection (B = D = 0) or the category is empty (A = C = 0).
        my $denominator = ($A + $C) * ($B + $D) * ($A + $B) * ($C + $D);
        next unless $denominator;    # undefined statistic counts as 0

        my $ADminCB = ($A * $D) - ($C * $B);
        $chi2_sum += $N * $ADminCB * $ADminCB / $denominator;
    }

    # Degenerate categories still count in the divisor, so the result is
    # always a mean over every category.
    return $chi2_sum / scalar @catnames;
}
1;
__END__
=head1 NAME
AI::Categorizer::FeatureSelector::ChiSquare - ChiSquare Feature Selection class
=head1 SYNOPSIS
# the recommended way to use this class is to let the KnowledgeSet
# instantiate it
use AI::Categorizer::KnowledgeSetSMART;
my $ksetCHI = new AI::Categorizer::KnowledgeSetSMART(
tfidf_notation =>'Categorizer',
feature_selection=>'chi_square', ...other parameters...);
# however it is also possible to pass an instance to the KnowledgeSet
use AI::Categorizer::KnowledgeSet;
use AI::Categorizer::FeatureSelector::ChiSquare;
my $ksetCHI = new AI::Categorizer::KnowledgeSet(
feature_selector => AI::Categorizer::FeatureSelector::ChiSquare->new(features_kept=>2000,verbose=>1),
...other parameters...
);
=head1 DESCRIPTION
Feature selection with the ChiSquare function.
Chi-Square(t,ci) = N.(AD-CB)^2 / ( (A+C).(B+D).(A+B).(C+D) )
where t = term
ci = category i
N = number of documents in the collection
A = number of times where t and ci co-occur
B = number of times where t occurs without ci
C = number of times where ci occurs without t
D = number of times where neither ci nor t occur
for more details, see :
Yiming Yang, Jan O. Pedersen, A Comparative Study on Feature Selection
in Text Categorization, in Proceedings of ICML-97, 14th International
Conference on Machine Learning, 1997.
(available on citeseer.nj.nec.com)
=head1 METHODS
=head1 AUTHOR
Francois Paradis, paradifr@iro.umontreal.ca
with inspiration from Ken Williams' AI::Categorizer code
=cut
( run in 0.461 second using v1.01-cache-2.11-cpan-4d50c553e7e )