AI-Classifier
view release on metacpan or search on metacpan
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
my $file = $self->iterator->match;
return if !defined($file);
my $category = $self->get_category( $file );
open(my $fh, "<:encoding(UTF-8)", $file )
|| Carp::croak(
"Unable to read the specified training file: $file\n");
my $content = join('', <$fh>);
close $fh;
my $initial_features = {};
if( -f "$file.data" ){
my $data = do "$file.data";
$initial_features = $data->{initial_features}
}
my $features = $self->analyzer->analyze( $content, $initial_features );
return {
file => $file,
features => $features,
categories => [ $category ],
};
}
# Feed every remaining training example to the learner.
#
# Pulls examples from $self->next until it returns a false value. For each
# one the feature hash is passed to normalize(), then to weight_terms(), and
# the result is handed to the learner's add_example() together with the
# example's categories.
sub teach_it {
    my ($self) = @_;
    my $trainer = $self->learner;

    while ( my $example = $self->next ) {
        normalize( $example->{features} );
        $self->weight_terms($example);

        # Build the named arguments only after weighting has been applied.
        my @example_args = (
            attributes => $example->{features},
            labels     => $example->{categories},
        );
        $trainer->add_example(@example_args);
    }
}
# Train the learner on all available examples and wrap the result.
#
# Returns a new AI::Classifier::Text built from the learner's classifier and
# this object's analyzer.
sub classifier {
    my ($self) = @_;

    # Training must finish before the classifier is requested from the learner.
    $self->teach_it;

    my @constructor_args = (
        classifier => $self->learner->classifier,
        analyzer   => $self->analyzer,
    );
    return AI::Classifier::Text->new(@constructor_args);
}
# Re-weight the term frequencies of one document in place.
#
# Arguments:
#   $self - object providing term_weighting()
#   $doc  - hash ref whose {features} entry maps term => frequency
#
# The scheme is selected by $self->term_weighting:
#   undef - leave the features untouched
#   'n'   - augmented normalised frequency: 0.5 + 0.5 * tf / max_tf
#   'b'   - binary: 1 for any true frequency, 0 otherwise
# Any other value croaks with 'Unknown weighting type: ...'.
#
# Written as a plain if/elsif chain rather than given/when, because the
# switch feature and smartmatch are deprecated and removed in Perl 5.42.
sub weight_terms {
    my ( $self, $doc ) = @_;
    my $features = $doc->{features};
    my $scheme   = $self->term_weighting;

    # No weighting configured: nothing to do.
    return if !defined $scheme;

    if ( $scheme eq 'n' ) {
        my $max_tf = max( values %$features );

        # An empty or all-zero vector has no meaningful maximum; leave it
        # unchanged instead of dying with a division by zero.
        return if !$max_tf;

        # The loop variable aliases the hash values, so assignment updates
        # the feature hash directly.
        for my $tf ( values %$features ) {
            $tf = 0.5 + 0.5 * $tf / $max_tf;
        }
    }
    elsif ( $scheme eq 'b' ) {
        for my $tf ( values %$features ) {
            $tf = $tf ? 1 : 0;
        }
    }
    else {
        Carp::croak( 'Unknown weighting type: ' . $scheme );
    }

    return;
}
# This doesn't quite fit the current model (it requires the entire collection
# of documents to be in memory at once), but it may be useful to someone
# someday, so let's just leave it here.
# Apply an inverse-document-frequency style weighting across a collection.
#
# Usage:
#   collection_weighting(@documents);
#   collection_weighting(@documents, $subtrahend);
#
# Each document is a hash ref whose {attributes} entry maps term => weight.
# Every weight is multiplied in place by
#   log( num_docs / document_frequency(term) - $subtrahend )
# where $subtrahend defaults to 0.
#
# The optional subtrahend is recognised as a trailing argument that is not a
# reference. (Unpacking with "my (@documents, $subtrahend) = @_" cannot work:
# the array absorbs every argument, so the subtrahend would be treated as a
# document.)
#
# Perl's log() dies if the expression inside it is not positive, e.g. with a
# subtrahend of 1 and a term that occurs in every document.
#
# NOTE(review): this reads {attributes}, while teach_it in this file reads
# {features} from its examples - confirm which key callers are expected to use.
sub collection_weighting {
    my @documents = @_;

    my $subtrahend;
    if ( @documents && !ref $documents[-1] ) {
        $subtrahend = pop @documents;
    }
    $subtrahend //= 0;

    my $num_docs = scalar @documents;

    # Document frequency: the number of documents each term appears in.
    my %frequency;
    for my $doc (@documents) {
        for my $term ( keys %{ $doc->{attributes} } ) {
            $frequency{$term}++;
        }
    }

    # Every term visited below was counted above, so its frequency is >= 1.
    for my $doc (@documents) {
        my $weights = $doc->{attributes};
        for my $term ( keys %$weights ) {
            $weights->{$term}
                *= log( $num_docs / $frequency{$term} - $subtrahend );
        }
    }

    return;
}
# Return the Euclidean (L2) length of a feature vector.
#
# $vector is a hash ref; only its values are used. An empty hash yields 0.
sub euclidean_length {
    my ($vector) = @_;

    my $sum_of_squares = 0;
    $sum_of_squares += $_ ** 2 for values %$vector;

    return sqrt $sum_of_squares;
}
# Multiply every value of a feature vector by a scalar, in place.
#
# Returns the same hash ref that was passed in.
sub scale {
    my ( $f, $scalar ) = @_;

    for my $term ( keys %$f ) {
        $f->{$term} *= $scalar;
    }

    return $f;
}
# Rescale a feature vector in place to unit Euclidean length.
#
# A vector whose length is zero is returned unchanged, which avoids a
# division by zero. Returns the hash ref that was passed in.
sub normalize {
    my ($attrs) = @_;

    my $norm = euclidean_length($attrs);
    return $attrs unless $norm;

    return scale( $attrs, 1 / $norm );
}
1;
=pod
=head1 NAME
( run in 1.985 second using v1.01-cache-2.11-cpan-df04353d9ac )