AI-Classifier
view release on metacpan or search on metacpan
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
my $learner = $self->learner;
while ( my $data = $self->next ) {
normalize( $data->{features} );
$self->weight_terms($data);
$learner->add_example(
attributes => $data->{features},
labels => $data->{categories}
);
}
}
# Train on the available examples and wrap the resulting model, together with
# this object's analyzer, in a new AI::Classifier::Text instance.
#
# Returns whatever AI::Classifier::Text->new returns.
sub classifier {
    my ($self) = @_;

    # Training has to run before the learner is asked for its classifier.
    $self->teach_it;

    my @constructor_args = (
        classifier => $self->learner->classifier,
        analyzer   => $self->analyzer,
    );
    return AI::Classifier::Text->new(@constructor_args);
}
# Re-weight the term frequencies in $doc->{features} in place, according to
# $self->term_weighting:
#
#   'n'   - each value becomes 0.5 + 0.5 * tf / max_tf
#   'b'   - each value becomes 1 if it is true, otherwise 0
#   undef - the features are left untouched
#
# Any other value croaks with 'Unknown weighting type: ...'.
#
# Written with plain if/elsif rather than the experimental given/when
# smartmatching construct. Returns nothing.
sub weight_terms {
    my ( $self, $doc ) = @_;
    my $f    = $doc->{features};
    my $type = $self->term_weighting;

    # No weighting configured: nothing to do.
    return if !defined $type;

    if ( $type eq 'n' ) {
        my $max_tf = max values %$f;

        # max() yields undef for an empty feature set, and an all-zero feature
        # set would otherwise divide by zero; leave such vectors unchanged.
        return if !$max_tf;

        $_ = 0.5 + 0.5 * $_ / $max_tf for values %$f;
    }
    elsif ( $type eq 'b' ) {
        $_ = $_ ? 1 : 0 for values %$f;
    }
    else {
        croak 'Unknown weighting type: ' . $type;
    }

    return;
}
# this doesn't quite fit the current model (it requires the entire collection
# of documents to be in memory at once), but it may be useful to someone, someday
# so let's just leave it here
# Scale every term weight in a collection of documents, in place, by
#
#     log( number_of_documents / document_frequency(term) - subtrahend )
#
# Arguments: a list of document hash references, each holding its term
# weights under the 'attributes' key, optionally followed by a plain
# (non-reference) number used as the subtrahend, which defaults to 0.
#
# NOTE(review): the rest of this file keeps term weights under 'features';
# confirm that 'attributes' is the key callers of this sub really use.
#
# Perl's log() dies when its argument is not positive, which happens here
# when the subtrahend is at least number_of_documents / document_frequency.
#
# Returns nothing; the documents are modified through their references.
sub collection_weighting {
    my @documents = @_;

    # "my (@documents, $subtrahend) = @_" let the array swallow every
    # argument, so the subtrahend could never be received. Documents are
    # always references, so a trailing non-reference is the subtrahend.
    my $subtrahend = 0;
    if ( @documents && !ref $documents[-1] ) {
        $subtrahend = pop(@documents) // 0;
    }

    my $num_docs = scalar @documents;

    # Document frequency: in how many documents each term occurs.
    my %frequency;
    for my $doc (@documents) {
        $frequency{$_}++ for keys %{ $doc->{attributes} };
    }

    for my $doc (@documents) {
        my $f = $doc->{attributes};
        for my $term ( keys %$f ) {

            # Every term seen here was counted above, so the divisor is >= 1.
            $f->{$term} *= log( $num_docs / $frequency{$term} - $subtrahend );
        }
    }

    return;
}
# Return the Euclidean (L2) norm of the values of the given hash reference:
# the square root of the sum of the squared values. An empty hash gives 0.
sub euclidean_length {
    my ($vector) = @_;

    my $sum_of_squares = 0;
    $sum_of_squares += $_ ** 2 for values %{$vector};

    return sqrt $sum_of_squares;
}
# Multiply every value of the given hash reference by $factor, in place.
# Returns the same hash reference that was passed in.
sub scale {
    my ( $vector, $factor ) = @_;

    # values() yields aliases, so assigning to the loop variable updates
    # the hash itself.
    for my $value ( values %{$vector} ) {
        $value *= $factor;
    }

    return $vector;
}
# Scale the given attribute hash, in place, so that its Euclidean length
# becomes 1. A vector whose length is zero is returned unchanged.
# Returns the hash reference that was passed in.
sub normalize {
    my ($attrs) = @_;

    my $length = euclidean_length($attrs);

    # Nothing to scale, and dividing by a zero length would be fatal.
    return $attrs if !$length;

    return scale( $attrs, 1 / $length );
}
1;
=pod
=head1 NAME
AI::Classifier::Text::FileLearner - Training data reader for AI::NaiveBayes
=head1 VERSION
version 0.03
=head1 SYNOPSIS
use AI::Classifier::Text::FileLearner;
my $learner = AI::Classifier::Text::FileLearner->new( training_dir => 't/data/training_set_ordered/' );
my $classifier = $learner->classifier;
=head1 DESCRIPTION
This is a trainer of text classifiers. It traverses a directory,
interprets the subdirectories in it as category names, reads all files in them and adds them
as examples for the classifier being trained.
=head1 METHODS
=over 4
( run in 2.169 seconds using v1.01-cache-2.11-cpan-cdf2f3d4e48 )