AI-Classifier
view release on metacpan or search on metacpan
package AI::Classifier::Text;
use strict;
use warnings;
use 5.010;
use Moose;
use MooseX::Storage;
use AI::Classifier::Text::Analyzer;
use Module::Load (); # don't overwrite our sub load() with Module::Load::load()
# Compose the MooseX::Storage role using the Storable format and file-based
# I/O; the store/load method modifiers further down wrap what it provides.
with Storage(format => 'Storable', io => 'File');
# The wrapped classifier object (mandatory, read-only); classify() delegates
# to its classify() method.
has classifier => (is => 'ro', required => 1 );
# The object that turns text into features (read-only); a fresh
# AI::Classifier::Text::Analyzer is created when none is supplied.
has analyzer => ( is => 'ro', default => sub{ AI::Classifier::Text::Analyzer->new() } );
# for store/load only, don't touch unless you really know what you're doing
# ('bare' generates no accessor; the slot is written and read directly
# through the object hash by the store/load modifiers.)
has classifier_class => (is => 'bare');
# Before the object is stored, remember the class name of the wrapped
# classifier in the bare 'classifier_class' slot so that it can be loaded
# again when the object is restored.
before store => sub {
    my ($self) = @_;
    my $classifier_class = $self->classifier->meta->name;
    $self->{classifier_class} = $classifier_class;
};
# Wrap load(): restore the object through the original method, then load the
# module named in its 'classifier_class' slot before handing the object back.
around load => sub {
    my $orig  = shift;
    my $class = shift;

    my $restored = $class->$orig(@_);
    Module::Load::load( $restored->{classifier_class} );

    return $restored;
};
# classify( $text, $features )
#
# Runs $text (and the optional $features) through the analyzer and passes
# whatever the analyzer returns to the wrapped classifier's classify().
# Returns the classifier's result unchanged.
sub classify {
    my $self = shift;
    my ( $text, $features ) = @_;

    my $classifier = $self->classifier;
    my @analysis   = $self->analyzer->analyze( $text, $features );

    return $classifier->classify(@analysis);
}
__PACKAGE__->meta->make_immutable;
1;
__END__
lib/AI/Classifier/Text.pm view on Meta::CPAN
$AI::Classifier::Text::VERSION = '0.03';
}
use strict;
use warnings;
use 5.010;
use Moose;
use MooseX::Storage;
use AI::Classifier::Text::Analyzer;
use Module::Load (); # don't overwrite our sub load() with Module::Load::load()
# Compose the MooseX::Storage role using the Storable format and file-based
# I/O; the store/load method modifiers further down wrap what it provides.
with Storage(format => 'Storable', io => 'File');
# The wrapped classifier object (mandatory, read-only); classify() delegates
# to its classify() method.
has classifier => (is => 'ro', required => 1 );
# The object that turns text into features (read-only); a fresh
# AI::Classifier::Text::Analyzer is created when none is supplied.
has analyzer => ( is => 'ro', default => sub{ AI::Classifier::Text::Analyzer->new() } );
# for store/load only, don't touch unless you really know what you're doing
# ('bare' generates no accessor; the slot is written and read directly
# through the object hash by the store/load modifiers.)
has classifier_class => (is => 'bare');
# Before the object is stored, remember the class name of the wrapped
# classifier in the bare 'classifier_class' slot so that it can be loaded
# again when the object is restored.
before store => sub {
    my ($self) = @_;
    my $classifier_class = $self->classifier->meta->name;
    $self->{classifier_class} = $classifier_class;
};
# Wrap load(): restore the object through the original method, then load the
# module named in its 'classifier_class' slot before handing the object back.
around load => sub {
    my $orig  = shift;
    my $class = shift;

    my $restored = $class->$orig(@_);
    Module::Load::load( $restored->{classifier_class} );

    return $restored;
};
# classify( $text, $features )
#
# Runs $text (and the optional $features) through the analyzer and passes
# whatever the analyzer returns to the wrapped classifier's classify().
# Returns the classifier's result unchanged.
sub classify {
    my $self = shift;
    my ( $text, $features ) = @_;

    my $classifier = $self->classifier;
    my @analysis   = $self->analyzer->analyze( $text, $features );

    return $classifier->classify(@analysis);
}
__PACKAGE__->meta->make_immutable;
1;
=pod
lib/AI/Classifier/Text/Analyzer.pm view on Meta::CPAN
use strict;
use warnings;
use 5.010;
use Moose;
use Text::WordCounter;
# Word-counting helper (read-only); analyze() calls its word_count() method.
# Defaults to a fresh Text::WordCounter.
has word_counter => ( is => 'ro', default => sub{ Text::WordCounter->new() } );
# Numeric weight, default 2.
# NOTE(review): presumably the value given to document-level marker features
# such as REPEATED_URLS -- the code that reads it is not visible here; confirm.
has global_feature_weight => ( is => 'ro', isa => 'Num', default => 2 );
sub analyze_urls {
my ( $self, $text, $features ) = @_;
my @urls;
my $p = URI::Find->new(
sub {
my ($uri, $t) = @_;
push @urls, $uri;
eval{
my $host = $uri->host;
$host =~ s/^www\.//;
$features->{ lc $host }++;
for (split /\//, $uri->path) {
if (length $_ > 3 ) {
$features->{ lc $_}++;
}
lib/AI/Classifier/Text/Analyzer.pm view on Meta::CPAN
my %urls;
for my $url ( @urls ) {
if( $urls{$url}++ > 3 ){
$features->{REPEATED_URLS} = $weight;
last;
}
}
}
}
# filter( $text )
#
# Returns a copy of $text with every "<...>" run (a '<', one or more
# characters other than '>', then '>') removed.
sub filter {
    my $self = shift;
    ( my $stripped = shift ) =~ s/<[^>]+>//g;
    return $stripped;
}
# analyze( $text, $features )
#
# Builds the feature hash for $text. $features is an optional hash reference
# to add to; a new empty hash is used when it is missing (or false).
# Steps: let analyze_urls() inspect the text (it receives a reference to the
# text), pass the text through filter(), then let the word counter add its
# counts. Returns the feature hash reference.
sub analyze {
    my $self     = shift;
    my $text     = shift;
    my $features = shift || {};

    # URLs are analysed first, on the unfiltered text.
    $self->analyze_urls( \$text, $features );

    my $filtered = $self->filter($text);
    $self->word_counter->word_count( $filtered, $features );

    return $features;
}
__PACKAGE__->meta->make_immutable;
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
use Carp 'croak';
use AI::NaiveBayes::Learner;
use AI::Classifier::Text;
use AI::Classifier::Text::Analyzer;
# Selects the term weighting applied by weight_terms(); that method
# recognises at least 'n' and 'b' and croaks on an unknown value.
has term_weighting => (is => 'ro', isa => 'Str');
# Turns file contents into a feature hash; defaults to a fresh
# AI::Classifier::Text::Analyzer.
has analyzer => ( is => 'ro', default => sub{ AI::Classifier::Text::Analyzer->new() } );
# Receives the training examples via add_example() and later supplies the
# trained classifier; defaults to a fresh AI::NaiveBayes::Learner.
has learner => ( is => 'ro', default => sub{ AI::NaiveBayes::Learner->new() } );
# Root directory of the training files (mandatory). The first directory
# level below it is used as the category of a file.
has training_dir => ( is => 'ro', isa => 'Str', required => 1 );
# File iterator over training_dir, built on first use by _build_iterator().
has iterator => ( is => 'ro', lazy_build => 1 );
# Builder for the 'iterator' attribute.
#
# Creates a File::Find::Rule that matches plain files whose names do not
# match '*.data', starts it on the training directory and returns the rule
# object so that files can be pulled from it one at a time.
sub _build_iterator {
    my ($self) = @_;

    my $finder = File::Find::Rule->new->file->not_name('*.data');
    $finder->start( $self->training_dir );

    return $finder;
}
# get_category( $file )
#
# Returns the first path component of $file relative to the training
# directory, i.e. the name of the top-level entry below training_dir that
# contains the file.
sub get_category {
    my $self = shift;
    my ($file) = @_;

    my $relative = File::Spec->abs2rel( $file, $self->training_dir );
    my ($category) = File::Spec->splitdir($relative);

    return $category;
}
sub next {
my $self = shift;
my $file = $self->iterator->match;
return if !defined($file);
my $category = $self->get_category( $file );
open(my $fh, "<:encoding(UTF-8)", $file )
|| Carp::croak(
"Unable to read the specified training file: $file\n");
my $content = join('', <$fh>);
close $fh;
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
}
my $features = $self->analyzer->analyze( $content, $initial_features );
return {
file => $file,
features => $features,
categories => [ $category ],
};
}
# teach_it()
#
# Feeds every remaining training example to the learner: each example
# returned by next() has its feature vector normalized and term-weighted,
# and is then handed to the learner's add_example() with its features as
# 'attributes' and its categories as 'labels'.
sub teach_it {
    my ($self) = @_;
    my $learner = $self->learner;

    while ( my $example = $self->next ) {
        my $features = $example->{features};
        normalize($features);
        $self->weight_terms($example);
        $learner->add_example(
            attributes => $example->{features},
            labels     => $example->{categories},
        );
    }
}
# classifier()
#
# Trains the learner on the training files (via teach_it) and returns a new
# AI::Classifier::Text that wraps the learner's classifier together with
# this object's analyzer.
sub classifier {
    my ($self) = @_;

    $self->teach_it;

    my %args = (
        classifier => $self->learner->classifier,
        analyzer   => $self->analyzer,
    );
    return AI::Classifier::Text->new(%args);
}
sub weight_terms {
my ( $self, $doc ) = @_;
my $f = $doc->{features};
given ($self->term_weighting) {
when ('n') {
my $max_tf = max values %$f;
$_ = 0.5 + 0.5 * $_ / $max_tf for values %$f;
}
when ('b') {
$_ = $_ ? 1 : 0 for values %$f;
}
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
}
default {
croak 'Unknown weighting type: '.$self->term_weighting;
}
}
}
# this doesn't quite fit the current model (it requires the entire collection
# of documents to be in memory at once), but it may be useful to someone, someday
# so let's just leave it here
# collection_weighting( @documents [, $subtrahend] )
#
# Plain function (not a method). Each document is a hash reference whose
# 'attributes' entry maps terms to weights. Every weight is multiplied in
# place by
#
#     log( N / df(term) - $subtrahend )
#
# where N is the number of documents and df(term) is the number of documents
# whose attributes contain the term.
#
# $subtrahend is optional and defaults to 0. It is recognised as a trailing
# argument that is not a reference (documents are always references). The
# previous signature, "my (@documents, $subtrahend) = @_", let the array
# swallow every argument, so the subtrahend could never be supplied.
#
# Dies (from log) when the expression inside log() is not positive, e.g. a
# term present in every document combined with a subtrahend of 1.
sub collection_weighting {
    my @documents = @_;

    my $subtrahend = 0;
    if ( @documents && !ref $documents[-1] ) {
        $subtrahend = pop(@documents) // 0;
    }

    my $num_docs = scalar @documents;

    # Document frequency: in how many documents does each term occur?
    my %frequency;
    for my $doc (@documents) {
        $frequency{$_}++ for keys %{ $doc->{attributes} };
    }

    for my $doc (@documents) {
        my $attributes = $doc->{attributes};
        for my $term ( keys %$attributes ) {
            # Every term seen here was counted above, so its frequency is >= 1.
            $attributes->{$term}
                *= log( $num_docs / $frequency{$term} - $subtrahend );
        }
    }
}
# euclidean_length( $vector )
#
# Plain function. Returns the square root of the sum of the squared values
# of the hash referenced by $vector (0 for an empty hash).
sub euclidean_length {
    my ($vector) = @_;

    my $sum_of_squares = 0;
    $sum_of_squares += $_ ** 2 for values %$vector;

    return sqrt $sum_of_squares;
}
# scale( $vector, $scalar )
#
# Plain function. Multiplies every value of the hash referenced by $vector
# by $scalar, modifying the hash in place, and returns the same reference.
sub scale {
    my ( $vector, $factor ) = @_;

    for my $key ( keys %$vector ) {
        $vector->{$key} *= $factor;
    }

    return $vector;
}
# normalize( $attrs )
#
# Plain function. Scales the hash referenced by $attrs in place so that its
# Euclidean length becomes 1 and returns the reference. A vector of length 0
# is returned untouched (there is nothing to divide by).
sub normalize {
    my ($attrs) = @_;

    my $length = euclidean_length($attrs);
    return $attrs unless $length;

    return scale( $attrs, 1 / $length );
}
1;
=pod
t/file_reader.t view on Meta::CPAN
features => { trala => 1, some_tag => 3, NO_URLS => 2 }
},
};
is_deeply( \%hash, $target );
}
{
{
    # Minimal stand-in for a learner: it only records the examples it is given.
    package TestLearner;

    # Constructor. Uses two-argument bless so the object is blessed into the
    # class it was called on (subclasses included); falls back to this
    # package when called as a plain function, as the one-argument form did.
    sub new {
        my $invocant = shift;
        my $class = ref($invocant) || $invocant || __PACKAGE__;
        return bless { examples => [] }, $class;
    }

    # Stores the argument list of each call as an array reference in
    # $self->{examples}.
    sub add_example {
        my ( $self, @example ) = @_;
        push @{ $self->{examples} }, \@example;
    }
}
my $internal_learner = TestLearner->new();
my $learner = AI::Classifier::Text::FileLearner->new(
training_dir => File::Spec->catdir( @training_dirs ),
learner => $internal_learner
$tp->store($file);
is -e $file, 1;
# Restore
$tp = AI::Classifier::Text->load($file);
ok $tp;
isa_ok( $tp, 'AI::Classifier::Text' );
################################################################
# _hash( @keys ): returns a reference to a new hash mapping every given key to 1.
sub _hash {
    my %set = map { ( $_ => 1 ) } @_;
    return \%set;
}
( run in 0.291 second using v1.01-cache-2.11-cpan-4d50c553e7e )