AI-Classifier
view release on metacpan or search on metacpan
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
my $num_docs = +@documents;
my %frequency;
for my $doc (@documents) {
for my $k (keys %{$doc->{attributes}}) {
$frequency{$k}++;
}
}
foreach my $doc (@documents) {
my $f = $doc->{attributes};
for (keys %$f) {
$f->{$_} *= log($num_docs / ($frequency{$_} // 0) - $subtrahend);
}
}
}
# Compute the Euclidean (L2) length of a sparse vector.
#
# Takes a hash reference whose values are the vector components and
# returns the square root of the sum of their squares. An empty hash
# yields 0.
sub euclidean_length {
    my ($vector) = @_;

    my $sum_of_squares = 0;
    $sum_of_squares += $_ ** 2 for values %{$vector};

    return sqrt $sum_of_squares;
}
# Multiply every component of a sparse vector by a scalar, in place.
#
# Arguments: a hash reference holding the vector and the numeric factor.
# The hash is modified directly (the loop variable aliases each stored
# value) and the same reference is returned.
sub scale {
    my ($vector, $factor) = @_;

    for my $component (values %{$vector}) {
        $component *= $factor;
    }

    return $vector;
}
# Rescale an attribute vector to unit Euclidean length.
#
# Takes a hash reference of attribute values. When its length is zero
# (or otherwise false) the reference is returned untouched, which avoids
# dividing by zero; otherwise the values are scaled in place by
# 1/length via scale() and the same reference is returned.
sub normalize {
    my ($attrs) = @_;

    my $norm = euclidean_length($attrs);
    return $attrs unless $norm;

    return scale($attrs, 1 / $norm);
}
1;
=pod
=head1 NAME
AI::Classifier::Text::FileLearner - Training data reader for AI::NaiveBayes
=head1 VERSION
version 0.03
=head1 SYNOPSIS
use AI::Classifier::Text::FileLearner;
my $learner = AI::Classifier::Text::FileLearner->new( training_dir => 't/data/training_set_ordered/' );
my $classifier = $learner->classifier;
=head1 DESCRIPTION
This is a trainer of text classifiers. It traverses a training data directory,
interprets the subdirectories in it as category names, reads all files in them and adds them
as examples for the classifier being trained.
=head1 METHODS
=over 4
=item next
Internal method for traversing the training data directory.
=item classifier
Returns a trained classifier.
=back
=head1 AUTHOR
Zbigniew Lukasiak <zlukasiak@opera.com>, Tadeusz Sośnierz <tsosnierz@opera.com>
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2012 by Opera Software ASA.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut
__END__
# ABSTRACT: Training data reader for AI::NaiveBayes
( run in 0.783 second using v1.01-cache-2.11-cpan-39bf76dae61 )