AI-Classifier
view release on metacpan or search on metacpan
MANIFEST
META.json
META.yml
Makefile.PL
README
README.pod
dist.ini
lib/AI/Classifier/Text.pm
lib/AI/Classifier/Text/Analyzer.pm
lib/AI/Classifier/Text/FileLearner.pm
t/data/training_cache/predictor
t/data/training_initial_features/ham/1
t/data/training_initial_features/ham/1.data
t/data/training_set_ordered/ham/2
t/data/training_set_ordered/spam/1
t/file_reader.t
t/model.dat
t/release-pod-coverage.t
t/release-pod-syntax.t
t/state.t
t/text.t
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
# Optional read-only string; it has no default and is not required.
has term_weighting => (
    isa => 'Str',
    is  => 'ro',
);

# Feature extractor; a fresh AI::Classifier::Text::Analyzer is created
# unless the caller supplies one.
has analyzer => (
    is      => 'ro',
    default => sub { return AI::Classifier::Text::Analyzer->new },
);

# Object that receives the training examples; a fresh
# AI::NaiveBayes::Learner is created unless the caller supplies one.
has learner => (
    is      => 'ro',
    default => sub { return AI::NaiveBayes::Learner->new },
);

# Root directory of the training set.  Mandatory constructor argument.
has training_dir => (
    isa      => 'Str',
    is       => 'ro',
    required => 1,
);

# File iterator over training_dir, constructed on first access by
# _build_iterator.
has iterator => (
    is         => 'ro',
    lazy_build => 1,
);
# Builder for the lazily constructed "iterator" attribute.
#
# Returns a File::Find::Rule object that has already been started on
# training_dir.  The rule accepts plain files only and rejects any file
# whose name matches '*.data'.  Paths are then pulled from the returned
# object one at a time with ->match.
sub _build_iterator {
    my ($self) = @_;

    # Every call in the chain hands back the rule object itself, so the
    # value returned by start() is the configured, started rule.
    return File::Find::Rule->new
        ->file
        ->not_name('*.data')
        ->start( $self->training_dir );
}
sub get_category {
my( $self, $file ) = @_;
my $training_dir = $self->training_dir;
my $rest = File::Spec->abs2rel( $file, $training_dir );
my @dirs = File::Spec->splitdir( $rest );
return $dirs[0]
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
my $file = $self->iterator->match;
return if !defined($file);
my $category = $self->get_category( $file );
open(my $fh, "<:encoding(UTF-8)", $file )
|| Carp::croak(
"Unable to read the specified training file: $file\n");
my $content = join('', <$fh>);
close $fh;
my $initial_features = {};
if( -f "$file.data" ){
my $data = do "$file.data";
$initial_features = $data->{initial_features}
}
my $features = $self->analyzer->analyze( $content, $initial_features );
return {
file => $file,
features => $features,
categories => [ $category ],
};
}
# Feed every remaining training document to the learner.
#
# Repeatedly calls $self->next until it returns a false value.  For each
# record the features are passed through normalize() and
# $self->weight_terms(), then the record is handed to the learner with
# its features as "attributes" and its categories as "labels".
#
# Callers in this file use it for its side effect on the learner only;
# the return value is not meaningful.
sub teach_it {
    my ($self) = @_;

    # Fetch the learner once; every example goes to the same object.
    my $trainer = $self->learner;

    while ( my $example = $self->next ) {
        normalize( $example->{features} );
        $self->weight_terms($example);

        # Read the features back from the record at this point, after
        # both preprocessing calls have run.
        my %example_args = (
            attributes => $example->{features},
            labels     => $example->{categories},
        );
        $trainer->add_example(%example_args);
    }
}
sub classifier {
my $self = shift;
$self->teach_it;
return AI::Classifier::Text->new(
classifier => $self->learner->classifier,
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
return $length ? scale($attrs, 1/$length) : $attrs;
}
1;
=pod
=head1 NAME
AI::Classifier::Text::FileLearner - Training data reader for AI::NaiveBayes
=head1 VERSION
version 0.03
=head1 SYNOPSIS
use AI::Classifier::Text::FileLearner;
my $learner = AI::Classifier::Text::FileLearner->new( training_dir => 't/data/training_set_ordered/' );
my $classifier = $learner->classifier;
=head1 DESCRIPTION
This is a trainer of text classifiers. It traverses a directory filled with training files,
interprets the subdirectories in it as category names, reads all files in them and adds them
as examples for the classifier being trained.
=head1 METHODS
=over 4
=item next
Internal method for traversing the training data directory.
=item classifier
Returns a trained classifier.
=back
=head1 AUTHOR
Zbigniew Lukasiak <zlukasiak@opera.com>, Tadeusz Sośnierz <tsosnierz@opera.com>
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
This software is copyright (c) 2012 by Opera Software ASA.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut
__END__
# ABSTRACT: Training data reader for AI::NaiveBayes
t/file_reader.t view on Meta::CPAN
my $iterator = AI::Classifier::Text::FileLearner->new(
training_dir => File::Spec->catdir( qw( something something else ) )
);
is(
$iterator->get_category( File::Spec->catdir( qw( something something else aaa bbb ) ) ),
'aaa',
'get_category'
);
}
my @training_dirs = qw( t data training_set_ordered );
my $iterator = AI::Classifier::Text::FileLearner->new(
training_dir => File::Spec->catdir( @training_dirs ) );
my %hash;
while( my $doc = $iterator->next ){
$hash{$doc->{file}} = $doc;
}
my $target = {
File::Spec->catfile( @training_dirs, 'spam', '1' ) => {
t/file_reader.t view on Meta::CPAN
}
};
is_deeply( \%hash, $target );
my $classifier = AI::Classifier::Text::FileLearner->new( training_dir => File::Spec->catdir( @training_dirs ) )->classifier;
ok( $classifier, 'Classifier created' );
ok( $classifier->classifier->model()->{prior_probs}{ham}, 'ham prior probs' );
ok( $classifier->classifier->model()->{prior_probs}{spam}, 'spam prior probs' );
{
    # Fixture: t/data/training_initial_features holds the document ham/1
    # together with a companion file ham/1.data.  This case pins the
    # record that next() yields for that document.
    my @fixture_dir = qw( t data training_initial_features );

    # Build the expected path once so the hash key and the 'file' field
    # cannot drift apart.
    my $doc_file = File::Spec->catfile( @fixture_dir, 'ham', '1' );

    my $iterator = AI::Classifier::Text::FileLearner->new(
        training_dir => File::Spec->catdir(@fixture_dir),
    );

    # Drain the iterator, indexing every returned record by its file path.
    my %hash;
    while ( my $doc = $iterator->next ) {
        $hash{ $doc->{file} } = $doc;
    }

    my $target = {
        $doc_file => {
            file       => $doc_file,
            categories => ['ham'],
            features   => { trala => 1, some_tag => 3, NO_URLS => 2 },
        },
    };

    # Named assertion, so a failure is identifiable in the TAP output.
    is_deeply( \%hash, $target,
        'records read from training_initial_features' );
}
{
{
package TestLearner;
( run in 0.452 second using v1.01-cache-2.11-cpan-8d75d55dd25 )