AI-Classifier

 view release on metacpan or  search on metacpan

lib/AI/Classifier/Text/FileLearner.pm  view on Meta::CPAN

package AI::Classifier::Text::FileLearner;
{
  $AI::Classifier::Text::FileLearner::VERSION = '0.03';
}
use strict;
use warnings;
use 5.010;

use Moose;
use File::Find::Rule;
use File::Spec;
use List::Util 'max';
use Carp 'croak';
use AI::NaiveBayes::Learner;
use AI::Classifier::Text;
use AI::Classifier::Text::Analyzer;

has term_weighting => (is => 'ro', isa => 'Str');
has analyzer => ( is => 'ro', default => sub{ AI::Classifier::Text::Analyzer->new() } );
has learner => ( is => 'ro', default => sub{ AI::NaiveBayes::Learner->new() } );
has training_dir => ( is => 'ro', isa => 'Str', required => 1 );
has iterator => ( is => 'ro', lazy_build => 1 );
sub _build_iterator {
    my $self = shift;
    my $rule = File::Find::Rule->new( );
    $rule->file;
    $rule->not_name('*.data');
    $rule->start( $self->training_dir );
    return $rule;
}

sub get_category {
    my( $self, $file ) = @_;
    my $training_dir = $self->training_dir;
    my $rest = File::Spec->abs2rel( $file, $training_dir );
    my @dirs = File::Spec->splitdir( $rest );
    return $dirs[0]
}


sub next {
    my $self = shift;

    my $file = $self->iterator->match;
    return if !defined($file);
    my $category = $self->get_category( $file );
    open(my $fh, "<:encoding(UTF-8)", $file )
    || Carp::croak(
                "Unable to read the specified training file: $file\n");
    my $content = join('', <$fh>);
    close $fh;
    my $initial_features = {};
    if( -f "$file.data" ){
        my $data = do "$file.data";
        $initial_features = $data->{initial_features}
    }
    my $features = $self->analyzer->analyze( $content, $initial_features );

    return { 
        file => $file, 
        features => $features, 
        categories => [ $category ],
    };
}

sub teach_it {
    my $self = shift;
    my $learner = $self->learner;
    while ( my $data  = $self->next ) {
        normalize( $data->{features} );
        $self->weight_terms($data);
        $learner->add_example( 
            attributes => $data->{features},
            labels     => $data->{categories}
        );
    }
}


sub classifier {
    my $self = shift;
    $self->teach_it;
    return AI::Classifier::Text->new(
        classifier => $self->learner->classifier,
        analyzer => $self->analyzer,
    );
}


sub weight_terms {
    my ( $self, $doc ) = @_;
    my $f = $doc->{features};
    given ($self->term_weighting) {
        when ('n') {
            my $max_tf = max values %$f;
            $_ = 0.5 + 0.5 * $_ / $max_tf for values %$f;
        }
        when ('b') {
            $_ = $_ ? 1 : 0 for values %$f;
        }
        when (undef){
        }
        default {
            croak 'Unknown weighting type: '.$self->term_weighting;
        }
    }
}

 view all matches for this distribution
 view release on metacpan -  search on metacpan

( run in 0.561 second using v1.00-cache-2.02-grep-82fe00e-cpan-2c419f77a38b )