AI-Classifier

 view release on metacpan or  search on metacpan

README.pod  view on Meta::CPAN

package AI::Classifier::Text;

use strict;
use warnings;
use 5.010;
use Moose;
use MooseX::Storage;

use AI::Classifier::Text::Analyzer;
use Module::Load (); # don't overwrite our sub load() with Module::Load::load()

with Storage(format => 'Storable', io => 'File');

has classifier => (is => 'ro', required => 1 );
has analyzer => ( is => 'ro', default => sub{ AI::Classifier::Text::Analyzer->new() } );
# for store/load only, don't touch unless you really know what you're doing
has classifier_class => (is => 'bare');

before store => sub {
    my $self = shift;
    $self->{classifier_class} = $self->classifier->meta->name;
};

around load => sub {
    my ($orig, $class) = (shift, shift);
    my $self = $class->$orig(@_);
    Module::Load::load($self->{classifier_class});
    return $self;
};

sub classify {
    my( $self, $text, $features ) = @_;
    return $self->classifier->classify( $self->analyzer->analyze( $text, $features ) );
}

__PACKAGE__->meta->make_immutable;

1;

__END__

lib/AI/Classifier/Text.pm  view on Meta::CPAN

  $AI::Classifier::Text::VERSION = '0.03';
}

use strict;
use warnings;
use 5.010;
use Moose;
use MooseX::Storage;

use AI::Classifier::Text::Analyzer;
use Module::Load (); # don't overwrite our sub load() with Module::Load::load()

with Storage(format => 'Storable', io => 'File');

has classifier => (is => 'ro', required => 1 );
has analyzer => ( is => 'ro', default => sub{ AI::Classifier::Text::Analyzer->new() } );
# for store/load only, don't touch unless you really know what you're doing
has classifier_class => (is => 'bare');

before store => sub {
    my $self = shift;
    $self->{classifier_class} = $self->classifier->meta->name;
};

around load => sub {
    my ($orig, $class) = (shift, shift);
    my $self = $class->$orig(@_);
    Module::Load::load($self->{classifier_class});
    return $self;
};

sub classify {
    my( $self, $text, $features ) = @_;
    return $self->classifier->classify( $self->analyzer->analyze( $text, $features ) );
}

__PACKAGE__->meta->make_immutable;

1;

=pod

lib/AI/Classifier/Text/Analyzer.pm  view on Meta::CPAN

use strict;
use warnings;
use 5.010;
use Moose;

use Text::WordCounter;

has word_counter => ( is => 'ro', default => sub{ Text::WordCounter->new() } );
has global_feature_weight => ( is => 'ro', isa => 'Num', default => 2 );

sub analyze_urls {
    my ( $self, $text, $features ) = @_;
    my @urls;
    my $p = URI::Find->new(
        sub {
            my ($uri, $t) = @_;
            push @urls, $uri;
            eval{
                my $host = $uri->host;
                $host =~ s/^www\.//;
                $features->{ lc $host }++;
                for (split /\//, $uri->path) {
                    if (length $_ > 3 ) {
                        $features->{ lc $_}++;
                    }

lib/AI/Classifier/Text/Analyzer.pm  view on Meta::CPAN

        my %urls;
        for my $url ( @urls ) {
            if( $urls{$url}++ > 3 ){
                $features->{REPEATED_URLS} = $weight;
                last;
            }
        }
    }
}

sub filter {
    my ( $self, $text ) = @_;
    $text =~ s/<[^>]+>//g;
    return $text;
}

sub analyze {
    my( $self, $text, $features ) = @_;
    $features ||= {};
    $self->analyze_urls( \$text, $features );
    $text = $self->filter( $text );
    $self->word_counter->word_count( $text, $features );
    return $features;
}

__PACKAGE__->meta->make_immutable;

lib/AI/Classifier/Text/FileLearner.pm  view on Meta::CPAN

use Carp 'croak';
use AI::NaiveBayes::Learner;
use AI::Classifier::Text;
use AI::Classifier::Text::Analyzer;

has term_weighting => (is => 'ro', isa => 'Str');
has analyzer => ( is => 'ro', default => sub{ AI::Classifier::Text::Analyzer->new() } );
has learner => ( is => 'ro', default => sub{ AI::NaiveBayes::Learner->new() } );
has training_dir => ( is => 'ro', isa => 'Str', required => 1 );
has iterator => ( is => 'ro', lazy_build => 1 );
sub _build_iterator {
    my $self = shift;
    my $rule = File::Find::Rule->new( );
    $rule->file;
    $rule->not_name('*.data');
    $rule->start( $self->training_dir );
    return $rule;
}

sub get_category {
    my( $self, $file ) = @_;
    my $training_dir = $self->training_dir;
    my $rest = File::Spec->abs2rel( $file, $training_dir );
    my @dirs = File::Spec->splitdir( $rest );
    return $dirs[0]
}


sub next {
    my $self = shift;

    my $file = $self->iterator->match;
    return if !defined($file);
    my $category = $self->get_category( $file );
    open(my $fh, "<:encoding(UTF-8)", $file )
    || Carp::croak(
                "Unable to read the specified training file: $file\n");
    my $content = join('', <$fh>);
    close $fh;

lib/AI/Classifier/Text/FileLearner.pm  view on Meta::CPAN

    }
    my $features = $self->analyzer->analyze( $content, $initial_features );

    return { 
        file => $file, 
        features => $features, 
        categories => [ $category ],
    };
}

sub teach_it {
    my $self = shift;
    my $learner = $self->learner;
    while ( my $data  = $self->next ) {
        normalize( $data->{features} );
        $self->weight_terms($data);
        $learner->add_example( 
            attributes => $data->{features},
            labels     => $data->{categories}
        );
    }
}


sub classifier {
    my $self = shift;
    $self->teach_it;
    return AI::Classifier::Text->new(
        classifier => $self->learner->classifier,
        analyzer => $self->analyzer,
    );
}


sub weight_terms {
    my ( $self, $doc ) = @_;
    my $f = $doc->{features};
    given ($self->term_weighting) {
        when ('n') {
            my $max_tf = max values %$f;
            $_ = 0.5 + 0.5 * $_ / $max_tf for values %$f;
        }
        when ('b') {
            $_ = $_ ? 1 : 0 for values %$f;
        }

lib/AI/Classifier/Text/FileLearner.pm  view on Meta::CPAN

        }
        default {
            croak 'Unknown weighting type: '.$self->term_weighting;
        }
    }
}

# this doesn't quite fit the current model (it requires the entire collection
# of documents to be in memory at once), but it may be useful to someone, someday
# so let's just leave it here
sub collection_weighting {
    my (@documents, $subtrahend) = @_;
    $subtrahend //= 0;

    my $num_docs   = +@documents;

    my %frequency;
    for my $doc (@documents) {
        for my $k (keys %{$doc->{attributes}}) {
            $frequency{$k}++;
        }
    }

    foreach my $doc (@documents) {
        my $f = $doc->{attributes};
        for (keys %$f) {
            $f->{$_} *= log($num_docs / ($frequency{$_} // 0) - $subtrahend);
        }
    }
}

sub euclidean_length {
    my $f = shift;

    my $total = 0;
    foreach (values %$f) {
        $total += $_**2;
    }

    return sqrt($total);
}

sub scale {
    my ($f, $scalar) = @_;

    $_ *= $scalar foreach values %$f;

    return $f;
}

sub normalize {
    my $attrs = shift;

    my $length = euclidean_length($attrs);

    return $length ? scale($attrs, 1/$length) : $attrs;
}

1;

=pod

t/file_reader.t  view on Meta::CPAN

            features => { trala => 1, some_tag => 3, NO_URLS => 2 }
        },
    };
    is_deeply( \%hash, $target );
}

{
    {
        package TestLearner;

        sub new { bless { examples => [] } };
        sub add_example {
            my ( $self, @example ) = @_;
            push @{ $self->{examples} }, \@example;
        }

    }

    my $internal_learner = TestLearner->new();
    my $learner = AI::Classifier::Text::FileLearner->new( 
        training_dir => File::Spec->catdir( @training_dirs ),
        learner => $internal_learner

t/state.t  view on Meta::CPAN

$tp->store($file);
is -e $file, 1;

# Restore
$tp = AI::Classifier::Text->load($file);
ok $tp;
isa_ok( $tp, 'AI::Classifier::Text' );


################################################################
sub _hash { +{ map {$_,1} @_ } }



( run in 0.291 second using v1.01-cache-2.11-cpan-4d50c553e7e )