AI-Classifier
view release on metacpan or search on metacpan
use 5.010;
use Moose;
use MooseX::Storage;
use AI::Classifier::Text::Analyzer;
use Module::Load (); # don't overwrite our sub load() with Module::Load::load()
# Compose the MooseX::Storage role: Storable serialisation, file-based I/O.
with Storage( format => 'Storable', io => 'File' );

# The underlying classifier object; must be passed to the constructor.
has classifier => (
    is       => 'ro',
    required => 1,
);

# Lexical analyzer; a fresh AI::Classifier::Text::Analyzer unless one is supplied.
has analyzer => (
    is      => 'ro',
    default => sub { return AI::Classifier::Text::Analyzer->new; },
);

# for store/load only, don't touch unless you really know what you're doing
has classifier_class => ( is => 'bare' );
# Runs just before store(): records the classifier's class name (as reported by
# its metaclass) in the classifier_class slot. Presumably this lets load() find
# the classifier class again -- the load side is not fully visible here.
before store => sub {
    my ($self) = @_;
    my $classifier_meta = $self->classifier->meta;
    $self->{classifier_class} = $classifier_meta->name;
};
around load => sub {
my ($orig, $class) = (shift, shift);
my $cl = AI::Classifier::Text->new(classifier => AI::NaiveBayes->new(...));
my $res = $cl->classify("do cats eat bats?");
$res = $cl->classify("do cats eat bats?", { new_user => 1 });
$cl->store('some-file');
# later
my $cl = AI::Classifier::Text->load('some-file');
my $res = $cl->classify("do cats eat bats?");
=head1 DESCRIPTION
AI::Classifier::Text combines a lexical analyzer (by default being
L<AI::Classifier::Text::Analyzer>) and a classifier (like AI::NaiveBayes) to
perform text classification.
This is partially based on AI::TextCategorizer.
=head1 ATTRIBUTES
=over 4
=item C<classifier>
An object that'll perform classification of supplied feature vectors. Has to
define a C<classify()> method, which accepts a hash reference. The return value of
C<< AI::Classifier::Text->classify() >> will be the return value of C<classifier>'s
C<classify()> method.
This attribute has to be supplied to the C<new()> method during object creation.
=item C<analyzer>
The object performing lexical analysis of the text in order to produce a feature
vector. This defaults to a new C<AI::Classifier::Text::Analyzer> instance.
=back
=head1 METHODS
=over 4
=item C<< new(classifier => $foo) >>
Creates a new C<AI::Classifier::Text> object. The classifier argument is mandatory.
lib/AI/Classifier/Text.pm view on Meta::CPAN
use 5.010;
use Moose;
use MooseX::Storage;
use AI::Classifier::Text::Analyzer;
use Module::Load (); # don't overwrite our sub load() with Module::Load::load()
# Compose the MooseX::Storage role: Storable serialisation, file-based I/O.
with Storage( format => 'Storable', io => 'File' );

# The underlying classifier object; must be passed to the constructor.
has classifier => (
    is       => 'ro',
    required => 1,
);

# Lexical analyzer; a fresh AI::Classifier::Text::Analyzer unless one is supplied.
has analyzer => (
    is      => 'ro',
    default => sub { return AI::Classifier::Text::Analyzer->new; },
);

# for store/load only, don't touch unless you really know what you're doing
has classifier_class => ( is => 'bare' );
# Runs just before store(): records the classifier's class name (as reported by
# its metaclass) in the classifier_class slot. Presumably this lets load() find
# the classifier class again -- the load side is not fully visible here.
before store => sub {
    my ($self) = @_;
    my $classifier_meta = $self->classifier->meta;
    $self->{classifier_class} = $classifier_meta->name;
};
around load => sub {
my ($orig, $class) = (shift, shift);
lib/AI/Classifier/Text.pm view on Meta::CPAN
my $cl = AI::Classifier::Text->new(classifier => AI::NaiveBayes->new(...));
my $res = $cl->classify("do cats eat bats?");
$res = $cl->classify("do cats eat bats?", { new_user => 1 });
$cl->store('some-file');
# later
my $cl = AI::Classifier::Text->load('some-file');
my $res = $cl->classify("do cats eat bats?");
=head1 DESCRIPTION
AI::Classifier::Text combines a lexical analyzer (by default being
L<AI::Classifier::Text::Analyzer>) and a classifier (like AI::NaiveBayes) to
perform text classification.
This is partially based on AI::TextCategorizer.
=head1 ATTRIBUTES
=over 4
=item C<classifier>
lib/AI/Classifier/Text.pm view on Meta::CPAN
An object that'll perform classification of supplied feature vectors. Has to
define a C<classify()> method, which accepts a hash reference. The return value of
C<< AI::Classifier::Text->classify() >> will be the return value of C<classifier>'s
C<classify()> method.
This attribute has to be supplied to the C<new()> method during object creation.
=item C<analyzer>
The object performing lexical analysis of the text in order to produce a feature
vector. This defaults to a new C<AI::Classifier::Text::Analyzer> instance.
=back
=head1 METHODS
=over 4
=item C<< new(classifier => $foo) >>
Creates a new C<AI::Classifier::Text> object. The classifier argument is mandatory.
lib/AI/Classifier/Text/Analyzer.pm view on Meta::CPAN
$AI::Classifier::Text::Analyzer::VERSION = '0.03';
}
use strict;
use warnings;
use 5.010;
use Moose;
use Text::WordCounter;
# Word-frequency counter; a fresh Text::WordCounter unless one is supplied.
has word_counter => (
    is      => 'ro',
    default => sub { return Text::WordCounter->new; },
);

# Numeric weight given to computed features of a document; 2 unless overridden.
has global_feature_weight => (
    is      => 'ro',
    isa     => 'Num',
    default => 2,
);
sub analyze_urls {
my ( $self, $text, $features ) = @_;
my @urls;
my $p = URI::Find->new(
sub {
my ($uri, $t) = @_;
push @urls, $uri;
eval{
my $host = $uri->host;
lib/AI/Classifier/Text/Analyzer.pm view on Meta::CPAN
use AI::Classifier::Text::Analyzer;
my $analyzer = AI::Classifier::Text::Analyzer->new();
my $features = $analyzer->analyze( 'aaaa http://www.example.com/bbb?xx=yy&bb=cc;dd=ff' );
=head1 DESCRIPTION
Computes feature vectors of text using some heuristics and adds word counts
(using L<Text::WordCounter> by default).
The object is immutable - but some methods use a second parameter as an accumulator for the
features found in given text.
It uses some specific values and methods that work for our case - but are not guaranteed
to bring good results universally - see the source for details!
=head1 ATTRIBUTES
=over 4
=item C<word_counter>
Object with a word_count method that will calculate the frequency of words in a text document.
By default L<Text::WordCounter>.
=item C<global_feature_weight>
The weight assigned for computed features of the text document. By default 2.
=back
=head1 METHODS
=over 4
=item C<< new(word_counter => $foo, global_feature_weight => 3) >>
Creates a new AI::Classifier::Text::Analyzer object. Both arguments are optional.
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
use Moose;
use File::Find::Rule;
use File::Spec;
use List::Util 'max';
use Carp 'croak';
use AI::NaiveBayes::Learner;
use AI::Classifier::Text;
use AI::Classifier::Text::Analyzer;
# Optional term-weighting scheme name. The weighting code in this file handles
# 'n', 'b' and undef, and croaks on anything else.
has term_weighting => (
    is  => 'ro',
    isa => 'Str',
);

# Lexical analyzer; a fresh AI::Classifier::Text::Analyzer unless one is supplied.
has analyzer => (
    is      => 'ro',
    default => sub { return AI::Classifier::Text::Analyzer->new; },
);

# Learner object; a fresh AI::NaiveBayes::Learner unless one is supplied.
has learner => (
    is      => 'ro',
    default => sub { return AI::NaiveBayes::Learner->new; },
);

# Directory holding the training files; mandatory constructor argument.
has training_dir => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# File iterator, built on first access by _build_iterator().
has iterator => (
    is         => 'ro',
    lazy_build => 1,
);
# Builder for the lazy 'iterator' attribute.
# Returns a File::Find::Rule object, already started on training_dir, that
# matches plain files whose names do not match '*.data'.
sub _build_iterator {
    my ($self) = @_;

    # Criteria methods return the rule itself, so they can be chained.
    my $finder = File::Find::Rule->new->file->not_name('*.data');
    $finder->start( $self->training_dir );

    return $finder;
}
lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN
given ($self->term_weighting) {
when ('n') {
my $max_tf = max values %$f;
$_ = 0.5 + 0.5 * $_ / $max_tf for values %$f;
}
when ('b') {
$_ = $_ ? 1 : 0 for values %$f;
}
when (undef){
}
default {
croak 'Unknown weighting type: '.$self->term_weighting;
}
}
}
# this doesn't quite fit the current model (it requires the entire collection
# of documents to be in memory at once), but it may be useful to someone, someday
# so let's just leave it here
sub collection_weighting {
my (@documents, $subtrahend) = @_;
( run in 0.440 second using v1.01-cache-2.11-cpan-0a6323c29d9 )