AI-Classifier

 view release on metacpan or  search on metacpan

MANIFEST  view on Meta::CPAN

MANIFEST
META.json
META.yml
Makefile.PL
README
README.pod
dist.ini
lib/AI/Classifier/Text.pm
lib/AI/Classifier/Text/Analyzer.pm
lib/AI/Classifier/Text/FileLearner.pm
t/data/training_cache/predictor
t/data/training_initial_features/ham/1
t/data/training_initial_features/ham/1.data
t/data/training_set_ordered/ham/2
t/data/training_set_ordered/spam/1
t/file_reader.t
t/model.dat
t/release-pod-coverage.t
t/release-pod-syntax.t
t/state.t
t/text.t

lib/AI/Classifier/Text/FileLearner.pm  view on Meta::CPAN


# Term-weighting mode, kept as a plain string (optional).
has term_weighting => (
    is  => 'ro',
    isa => 'Str',
);

# Text analyzer; a fresh AI::Classifier::Text::Analyzer unless one is supplied.
has analyzer => (
    is      => 'ro',
    default => sub { AI::Classifier::Text::Analyzer->new },
);

# Learner that receives the examples; a fresh AI::NaiveBayes::Learner by default.
has learner => (
    is      => 'ro',
    default => sub { AI::NaiveBayes::Learner->new },
);

# Root directory of the training data (mandatory).
has training_dir => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# File iterator, created on first use by _build_iterator.
has iterator => (
    is         => 'ro',
    lazy_build => 1,
);
# Builder for the lazy `iterator` attribute.
#
# Sets up a File::Find::Rule that matches plain files whose names do not
# end in ".data", starts it on the training directory, and returns the
# rule object so callers can pull results from it one at a time.
sub _build_iterator {
    my ($self) = @_;

    my $finder = File::Find::Rule->new;
    $finder->file->not_name('*.data');

    # start() primes the rule for iterator-style use on training_dir.
    $finder->start( $self->training_dir );

    return $finder;
}

sub get_category {
    my( $self, $file ) = @_;
    my $training_dir = $self->training_dir;
    my $rest = File::Spec->abs2rel( $file, $training_dir );
    my @dirs = File::Spec->splitdir( $rest );
    return $dirs[0]

lib/AI/Classifier/Text/FileLearner.pm  view on Meta::CPAN


    my $file = $self->iterator->match;
    return if !defined($file);
    my $category = $self->get_category( $file );
    open(my $fh, "<:encoding(UTF-8)", $file )
    || Carp::croak(
                "Unable to read the specified training file: $file\n");
    my $content = join('', <$fh>);
    close $fh;
    my $initial_features = {};
    if( -f "$file.data" ){
        my $data = do "$file.data";
        $initial_features = $data->{initial_features}
    }
    my $features = $self->analyzer->analyze( $content, $initial_features );

    return { 
        file => $file, 
        features => $features, 
        categories => [ $category ],
    };
}

# Feeds every document produced by next() to the learner.
#
# For each document the feature hash is passed through normalize() and
# weight_terms() before being handed to the learner as one example,
# labelled with the document's categories.  The loop ends as soon as
# next() returns a false value.
sub teach_it {
    my ($self) = @_;
    my $learner = $self->learner;

    while (1) {
        my $example = $self->next or last;

        # NOTE(review): normalize()'s return value is discarded, so it is
        # presumably expected to adjust the hash in place -- confirm.
        normalize( $example->{features} );
        $self->weight_terms($example);

        $learner->add_example(
            attributes => $example->{features},
            labels     => $example->{categories}
        );
    }
}


sub classifier {
    my $self = shift;
    $self->teach_it;
    return AI::Classifier::Text->new(
        classifier => $self->learner->classifier,

lib/AI/Classifier/Text/FileLearner.pm  view on Meta::CPAN


    return $length ? scale($attrs, 1/$length) : $attrs;
}

1;

=pod

=head1 NAME

AI::Classifier::Text::FileLearner - Training data reader for AI::NaiveBayes

=head1 VERSION

version 0.03

=head1 SYNOPSIS

    use AI::Classifier::Text::FileLearner;

    my $learner = AI::Classifier::Text::FileLearner->new( training_dir => 't/data/training_set_ordered/' );

    my $classifier = $learner->classifier;

=head1 DESCRIPTION

This is a trainer of text classifiers.  It traverses a training directory,
interprets the subdirectories in it as category names, reads all files in them and adds them
as examples for the classifier being trained.

=head1 METHODS

=over 4

=item next

Internal method for traversing the training data directory.

=item classifier

Returns a trained classifier.

=back

=head1 AUTHOR

Zbigniew Lukasiak <zlukasiak@opera.com>, Tadeusz Sośnierz <tsosnierz@opera.com>

lib/AI/Classifier/Text/FileLearner.pm  view on Meta::CPAN


This software is copyright (c) 2012 by Opera Software ASA.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut

__END__

# ABSTRACT: Training data reader for AI::NaiveBayes

t/file_reader.t  view on Meta::CPAN

    my $iterator = AI::Classifier::Text::FileLearner->new( 
        training_dir => File::Spec->catdir( qw( something something else ) ) 
    );
    is( 
        $iterator->get_category( File::Spec->catdir( qw( something something else aaa bbb ) ) ),
        'aaa',
        'get_category' 
    );
}

# Path components of the ordered training set used by the checks below.
my @training_dirs = ( 't', 'data', 'training_set_ordered' );
my $iterator = AI::Classifier::Text::FileLearner->new(
    training_dir => File::Spec->catdir(@training_dirs),
);

# Drain the iterator, indexing every returned document by its file path.
my %hash;
while ( my $doc = $iterator->next ) {
    my $path = $doc->{file};
    $hash{$path} = $doc;
}
my $target = {
    File::Spec->catfile( @training_dirs, 'spam', '1' ) => {

t/file_reader.t  view on Meta::CPAN

    }
};
is_deeply( \%hash, $target );

# Train a classifier on the ordered training set and check that the model
# carries a prior probability for each of the two categories.
my $classifier = AI::Classifier::Text::FileLearner->new(
    training_dir => File::Spec->catdir(@training_dirs),
)->classifier;

ok( $classifier, 'Classifier created' );
for my $label (qw( ham spam )) {
    ok( $classifier->classifier->model()->{prior_probs}{$label}, "$label prior probs" );
}
{
    # Read the training_initial_features tree and compare everything next()
    # returns against the single expected document, ham/1.
    my @dir_parts     = qw( t data training_initial_features );
    my $expected_file = File::Spec->catfile( @dir_parts, qw( ham 1 ) );

    my $iterator = AI::Classifier::Text::FileLearner->new(
        training_dir => File::Spec->catdir(@dir_parts),
    );

    # Index every returned document by its file path.
    my %hash;
    while ( my $doc = $iterator->next ) {
        $hash{ $doc->{file} } = $doc;
    }

    my $target = {
        $expected_file => {
            file       => $expected_file,
            categories => ['ham'],
            features   => { trala => 1, some_tag => 3, NO_URLS => 2 },
        },
    };

    # A description makes a failure identifiable in the TAP output.
    is_deeply( \%hash, $target, 'documents read from training_initial_features' );
}

{
    {
        package TestLearner;



( run in 0.382 second using v1.01-cache-2.11-cpan-8d75d55dd25 )