AI-Classifier
view release on metacpan - search on metacpan
view release on metacpan or search on metacpan
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
# Learner that walks a directory of training files, turns each file into a
# feature set labelled with a category taken from its path, and feeds the
# examples to a learner in order to build an AI::Classifier::Text object.
package AI::Classifier::Text::FileLearner;
{
# Fully qualified package variable holding the module version.
$AI::Classifier::Text::FileLearner::VERSION = '0.03';
}
use strict;
use warnings;
use 5.010;
use Moose;
use File::Find::Rule;
use File::Spec;
use List::Util 'max';
use Carp 'croak';
use AI::NaiveBayes::Learner;
use AI::Classifier::Text;
use AI::Classifier::Text::Analyzer;
# Optional term weighting scheme consulted by weight_terms: 'n', 'b', or
# left unset for no re-weighting.
has term_weighting => (
    is  => 'ro',
    isa => 'Str',
);

# Object whose analyze() method converts file content into a feature set.
has analyzer => (
    is      => 'ro',
    default => sub { AI::Classifier::Text::Analyzer->new() },
);

# Object that receives the training examples (add_example) and later hands
# out the trained classifier.
has learner => (
    is      => 'ro',
    default => sub { AI::NaiveBayes::Learner->new() },
);

# Root directory of the training files; categories are derived from paths
# relative to it.
has training_dir => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# Source of training file names; constructed on first use by
# _build_iterator.
has iterator => (
    is         => 'ro',
    lazy_build => 1,
);
# Builder for the iterator attribute: a File::Find::Rule that matches every
# plain file under training_dir except the '*.data' companion files, already
# started so that match() can be called on it.
sub _build_iterator {
    my ($self) = @_;

    return File::Find::Rule->new
        ->file
        ->not_name('*.data')
        ->start( $self->training_dir );
}
# Return the category of a training file: the first path component of $file
# once it has been made relative to training_dir.
sub get_category {
    my ( $self, $file ) = @_;

    my $relative_path = File::Spec->abs2rel( $file, $self->training_dir );
    my ($category) = File::Spec->splitdir($relative_path);

    return $category;
}
# Produce the next training example from the iterator.
#
# Returns nothing once the iterator is exhausted. Otherwise returns a hash
# reference with the keys:
#   file       - the file name handed out by the iterator
#   features   - result of analyzer->analyze on the file content
#   categories - array reference holding the category from get_category
#
# When a companion "<file>.data" file exists it is evaluated as Perl code and
# its 'initial_features' entry is passed to the analyzer as starting features.
#
# Croaks if the training file cannot be opened, or if the companion file
# cannot be loaded or does not evaluate to a reference.
sub next {
    my $self = shift;

    my $file = $self->iterator->match;
    return if !defined $file;

    my $category = $self->get_category($file);

    open( my $fh, '<:encoding(UTF-8)', $file )
        or croak("Unable to read the specified training file: $file: $!\n");
    my $content = do { local $/; <$fh> } // '';
    close $fh;

    my $initial_features = {};
    my $data_file        = "$file.data";
    if ( -f $data_file ) {

        # "do FILE" looks a relative path up in @INC unless it starts with
        # ./ or ../, and the current directory is no longer in @INC as of
        # perl 5.26. Loading through an absolute path avoids the lookup.
        my $abs_data_file = File::Spec->rel2abs($data_file);
        my $data          = do $abs_data_file;

        if ( !ref $data ) {
            my $reason
                = $@             ? $@
                : !defined $data ? ( "$!" || 'no value returned' )
                :                  'it did not return a hash reference';
            croak("Unable to load initial features from $data_file: $reason");
        }

        # Fall back to an empty feature set when the key is absent.
        $initial_features = $data->{initial_features} // {};
    }

    my $features = $self->analyzer->analyze( $content, $initial_features );

    return {
        file       => $file,
        features   => $features,
        categories => [$category],
    };
}
# Drain the iterator: every example returned by next() has its features
# normalized and re-weighted in place, then is handed to the learner with
# its features as attributes and its categories as labels.
sub teach_it {
    my ($self) = @_;

    my $learner = $self->learner;
    while ( my $example = $self->next ) {
        normalize( $example->{features} );
        $self->weight_terms($example);

        my %training_args = (
            attributes => $example->{features},
            labels => $example->{categories},
        );
        $learner->add_example(%training_args);
    }
}
# Train on all remaining examples, then wrap the learner's classifier and
# this object's analyzer in a new AI::Classifier::Text.
sub classifier {
    my ($self) = @_;

    $self->teach_it;

    my %components = (
        classifier => $self->learner->classifier,
        analyzer   => $self->analyzer,
    );
    return AI::Classifier::Text->new(%components);
}
# Re-weight the feature values of $doc->{features} in place according to
# the term_weighting attribute:
#   undef - leave the values untouched
#   'n'   - scale each value to 0.5 + 0.5 * value / largest value
#   'b'   - binary: 1 for a true value, 0 otherwise
# Croaks on any other weighting type. Returns nothing.
sub weight_terms {
    my ( $self, $doc ) = @_;

    my $scheme = $self->term_weighting;
    return if !defined $scheme;

    my $f = $doc->{features};

    if ( $scheme eq 'n' ) {
        my $max_tf = max values %$f;

        # No features, or every value is zero: there is nothing to scale,
        # and dividing by $max_tf would die.
        return if !$max_tf;

        $_ = 0.5 + 0.5 * $_ / $max_tf for values %$f;
    }
    elsif ( $scheme eq 'b' ) {
        $_ = $_ ? 1 : 0 for values %$f;
    }
    else {
        croak 'Unknown weighting type: ' . $scheme;
    }

    return;
}
view all matches for this distribution
view release on metacpan - search on metacpan
( run in 0.621 second using v1.00-cache-2.02-grep-82fe00e-cpan-2c419f77a38b )