AI-Classifier

 view release on metacpan or  search on metacpan

MANIFEST  view on Meta::CPAN

META.json
META.yml
Makefile.PL
README
README.pod
dist.ini
lib/AI/Classifier/Text.pm
lib/AI/Classifier/Text/Analyzer.pm
lib/AI/Classifier/Text/FileLearner.pm
t/data/training_cache/predictor
t/data/training_initial_features/ham/1
t/data/training_initial_features/ham/1.data
t/data/training_set_ordered/ham/2
t/data/training_set_ordered/spam/1
t/file_reader.t
t/model.dat
t/release-pod-coverage.t
t/release-pod-syntax.t
t/state.t
t/text.t

README.pod  view on Meta::CPAN

};

# Wraps load(): once the wrapped load() has built the object, make sure the
# module named in its classifier_class slot is loaded before handing it back.
around load => sub {
    my $orig   = shift;
    my $class  = shift;
    my $loaded = $class->$orig(@_);

    Module::Load::load( $loaded->{classifier_class} );

    return $loaded;
};

# Classifies a document.
#
#   $text     - the document text, handed to the analyzer
#   $features - optional initial features, handed to the analyzer as well
#
# Returns whatever the classifier object's classify() returns when given the
# analyzer's output.
sub classify {
    my ( $self, $text, $features ) = @_;

    # Kept in list context so the analyzer's result is forwarded untouched.
    my @analysis = $self->analyzer->analyze( $text, $features );

    return $self->classifier->classify(@analysis);
}

__PACKAGE__->meta->make_immutable;

1;

__END__

# ABSTRACT: A convenient class for text classification

README.pod  view on Meta::CPAN

perform text classification.

This is partially based on AI::TextCategorizer.

=head1 ATTRIBUTES

=over 4

=item C<classifier>

An object that'll perform classification of supplied feature vectors. Has to
define a C<classify()> method, which accepts a hash reference. The return value of
C<< AI::Classifier::Text->classify() >> will be the return value of C<classifier>'s
C<classify()> method.

This attribute has to be supplied to the C<new()> method during object creation.

=item C<analyzer>

The class performing lexical analysis of the text in order to produce a feature
vector. This defaults to C<AI::Classifier::Text::Analyzer>.

=back

=head1 METHODS

=over 4

=item C<< new(classifier => $foo) >>

Creates a new C<AI::Classifier::Text> object. The classifier argument is mandatory.

=item C<classify($document, $features)>

Categorize the given document. A lexical analyzer will be used to extract
features from C<$document>, and in addition to that the features from
C<$features> hash reference will be added. The return value comes directly from
the C<classifier> object's C<classify> method.

=back

=head1 SEE ALSO

AI::NaiveBayes (3), AI::Categorizer(3)

=cut

lib/AI/Classifier/Text.pm  view on Meta::CPAN

};

# Wraps load(): once the wrapped load() has built the object, make sure the
# module named in its classifier_class slot is loaded before handing it back.
around load => sub {
    my $orig   = shift;
    my $class  = shift;
    my $loaded = $class->$orig(@_);

    Module::Load::load( $loaded->{classifier_class} );

    return $loaded;
};

# Classifies a document.
#
#   $text     - the document text, handed to the analyzer
#   $features - optional initial features, handed to the analyzer as well
#
# Returns whatever the classifier object's classify() returns when given the
# analyzer's output.
sub classify {
    my ( $self, $text, $features ) = @_;

    # Kept in list context so the analyzer's result is forwarded untouched.
    my @analysis = $self->analyzer->analyze( $text, $features );

    return $self->classifier->classify(@analysis);
}

__PACKAGE__->meta->make_immutable;

1;

=pod

=head1 NAME

lib/AI/Classifier/Text.pm  view on Meta::CPAN

perform text classification.

This is partially based on AI::TextCategorizer.

=head1 ATTRIBUTES

=over 4

=item C<classifier>

An object that'll perform classification of supplied feature vectors. Has to
define a C<classify()> method, which accepts a hash reference. The return value of
C<< AI::Classifier::Text->classify() >> will be the return value of C<classifier>'s
C<classify()> method.

This attribute has to be supplied to the C<new()> method during object creation.

=item C<analyzer>

The class performing lexical analysis of the text in order to produce a feature
vector. This defaults to C<AI::Classifier::Text::Analyzer>.

=back

=head1 METHODS

=over 4

=item C<< new(classifier => $foo) >>

Creates a new C<AI::Classifier::Text> object. The classifier argument is mandatory.

=item C<classify($document, $features)>

Categorize the given document. A lexical analyzer will be used to extract
features from C<$document>, and in addition to that the features from
C<$features> hash reference will be added. The return value comes directly from
the C<classifier> object's C<classify> method.

=back

=head1 SEE ALSO

AI::NaiveBayes (3), AI::Categorizer(3)

=head1 AUTHOR

lib/AI/Classifier/Text/Analyzer.pm  view on Meta::CPAN

}

use strict;
use warnings;
use 5.010;
use Moose;

use Text::WordCounter;

has word_counter => ( is => 'ro', default => sub{ Text::WordCounter->new() } );
has global_feature_weight => ( is => 'ro', isa => 'Num', default => 2 );

# Collects URL-derived features of a text into the $features accumulator.
#
# Parameters:
#   $text     - a REFERENCE to the text scalar (it is handed to
#               URI::Find->find, which works on a scalar reference).
#   $features - hash reference used as an accumulator; modified in place.
#
# Features produced:
#   <host>         - incremented per URL found, lower-cased, leading "www."
#                    stripped.
#   <path segment> - incremented for every path segment longer than 3
#                    characters, lower-cased.
#   NO_URLS        - set to global_feature_weight when no URL was found.
#   MANY_URLS      - set to global_feature_weight when there is more than one
#                    URL per 120 characters of text.
#   REPEATED_URLS  - set to global_feature_weight once the same URL is seen
#                    for the fifth time.
sub analyze_urls {
    my ( $self, $text, $features ) = @_;

    # $text is a scalar reference, so the document has to be dereferenced to
    # be measured; length($text) would measure the stringified reference
    # ("SCALAR(0x...)") and make MANY_URLS fire for any text with a URL.
    # The length is taken before find() runs because, per the URI::Find
    # documentation, find() substitutes the callback's return value for each
    # URI in the referenced text.
    my $text_length = length ${$text};

    my @urls;
    # NOTE(review): URI::Find is not loaded in the visible part of this
    # file - confirm it is loaded elsewhere before this method runs.
    my $p = URI::Find->new(
        sub {
            my ($uri, $t) = @_;
            push @urls, $uri;
            # Best effort: not every URI found has a host/path; failures are
            # deliberately ignored. This eval stays the last statement of the
            # callback so the value handed back to URI::Find is unchanged.
            eval{
                my $host = $uri->host;
                $host =~ s/^www\.//;
                $features->{ lc $host }++;
                for (split /\//, $uri->path) {
                    if (length $_ > 3 ) {
                        $features->{ lc $_}++;
                    }
                }
            }
        }
    );
    $p->find($text);

    my $weight = $self->global_feature_weight;
    if (!@urls) {
        $features->{NO_URLS} = $weight;
    }
    if (scalar @urls > $text_length / 120 ) {
        $features->{MANY_URLS} = $weight;
    }
    {
        my %seen_count;
        for my $url ( @urls ) {
            # Post-increment compares the count *before* this occurrence, so
            # the condition first holds on the fifth occurrence of a URL.
            if( $seen_count{$url}++ > 3 ){
                $features->{REPEATED_URLS} = $weight;
                last;
            }
        }
    }
}

# Returns a copy of $text with every "<...>" run (an opening angle bracket,
# one or more non-">" characters, a closing angle bracket) removed.
sub filter {
    my ( $self, $text ) = @_;
    ( my $stripped = $text ) =~ s/<[^>]+>//g;
    return $stripped;
}

# Builds the feature vector of a document.
#
#   $text     - the document text
#   $features - optional hash reference used as accumulator; a fresh hash is
#               used when it is missing (or false)
#
# Runs the URL analysis, filters the text, then lets the word counter add
# its counts. Returns the accumulator hash reference.
sub analyze {
    my ( $self, $text, $features ) = @_;
    $features = {} unless $features;

    # analyze_urls() gets a reference to the local copy of the text, so any
    # change it makes to the text is what filter() sees afterwards.
    $self->analyze_urls( \$text, $features );

    my $filtered = $self->filter( $text );
    $self->word_counter->word_count( $filtered, $features );

    return $features;
}

__PACKAGE__->meta->make_immutable;

1;

=pod

=head1 NAME

AI::Classifier::Text::Analyzer - computing feature vectors from documents

=head1 VERSION

version 0.03

=head1 SYNOPSIS

    use AI::Classifier::Text::Analyzer;

    my $analyzer = AI::Classifier::Text::Analyzer->new();
    
    my $features = $analyzer->analyze( 'aaaa http://www.example.com/bbb?xx=yy&bb=cc;dd=ff' );

=head1 DESCRIPTION

Computes feature vectors of text using some heuristics and adds word counts
(using L<Text::WordCounter> by default).

The object is immutable - but some methods use a second parameter as an accumulator for the
features found in given text.

It uses some specific values and methods that work for our case - but are not guaranteed
to bring good results universally - see the source for details!

=head1 ATTRIBUTES

=over 4

=item C<word_counter>

Object with a word_count method that will calculate the frequency of words in a text document.
By default L<Text::WordCounter>.

=item C<global_feature_weight>

The weight assigned for computed features of the text document. By default 2.

=back

=head1 METHODS

=over 4

=item C<< new(word_counter => $foo, global_feature_weight => 3) >>

Creates a new AI::Classifier::Text::Analyzer object. Both arguments are optional.

=item C<analyze($document, $features)>

Computes the feature vector of the given document. The features are added to the
optional C<$features> hash reference (a new hash is used if none is given), which is returned.

=item C<analyze_urls($document, $features)>

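Computes a vector of special URL-related features of the given text and adds them to
C<$features>; currently the C<NO_URLS>, C<MANY_URLS> and C<REPEATED_URLS> features are used,
along with features for the hosts and path segments of the URLs found. Note that
C<$document> has to be a reference to the text.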

=item C<filter($document)>

Removes HTML tags from the text and returns the result.

=back

=head1 SEE ALSO

AI::NaiveBayes (3), AI::Classifier::Text(3)

lib/AI/Classifier/Text/Analyzer.pm  view on Meta::CPAN


This software is copyright (c) 2012 by Opera Software ASA.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut

__END__

# ABSTRACT: computing feature vectors from documents

lib/AI/Classifier/Text/FileLearner.pm  view on Meta::CPAN

    my $self = shift;

    my $file = $self->iterator->match;
    return if !defined($file);
    my $category = $self->get_category( $file );
    open(my $fh, "<:encoding(UTF-8)", $file )
    || Carp::croak(
                "Unable to read the specified training file: $file\n");
    my $content = join('', <$fh>);
    close $fh;
    my $initial_features = {};
    if( -f "$file.data" ){
        my $data = do "$file.data";
        $initial_features = $data->{initial_features}
    }
    my $features = $self->analyzer->analyze( $content, $initial_features );

    return { 
        file => $file, 
        features => $features, 
        categories => [ $category ],
    };
}

# Feeds every remaining training document to the learner.
#
# Each document from next() has its features normalized and term-weighted,
# then is added to the learner as an example with the features as attributes
# and the categories as labels.
sub teach_it {
    my ($self) = @_;
    my $learner = $self->learner;

    while ( my $example = $self->next ) {
        normalize( $example->{features} );
        $self->weight_terms($example);

        my %example_args = (
            attributes => $example->{features},
            labels     => $example->{categories},
        );
        $learner->add_example(%example_args);
    }
}


# Trains the learner on the training documents and returns a new
# AI::Classifier::Text built from the learner's classifier and this object's
# analyzer.
sub classifier {
    my ($self) = @_;

    $self->teach_it;

    my %args = (
        classifier => $self->learner->classifier,
        analyzer   => $self->analyzer,
    );
    return AI::Classifier::Text->new(%args);
}


sub weight_terms {
    my ( $self, $doc ) = @_;
    my $f = $doc->{features};
    given ($self->term_weighting) {
        when ('n') {
            my $max_tf = max values %$f;
            $_ = 0.5 + 0.5 * $_ / $max_tf for values %$f;
        }
        when ('b') {
            $_ = $_ ? 1 : 0 for values %$f;
        }
        when (undef){
        }

t/data/training_initial_features/ham/1.data  view on Meta::CPAN

{
    initial_features => { some_tag => 3 },
}

t/file_reader.t  view on Meta::CPAN

# Iterate over the training directory built from @training_dirs
# (defined earlier in this test file, outside this excerpt).
my $iterator = AI::Classifier::Text::FileLearner->new( 
    training_dir => File::Spec->catdir( @training_dirs ) );


# Drain the iterator, indexing every returned document by its file path.
my %hash;
while( my $doc = $iterator->next ){
    $hash{$doc->{file}} = $doc;
}
# Expected documents: one per training file, each carrying its path, its
# feature vector and the category it was found under.
my $target = {
    File::Spec->catfile( @training_dirs, 'spam', '1' ) => {
        'features' => { ccccc => 1, NO_URLS => 2 },
        'file' => File::Spec->catfile( @training_dirs, 'spam', '1' ),
        'categories' => [ 'spam' ]
    },
    File::Spec->catfile( @training_dirs, 'ham', '2' ) => {
        'features' => { ccccc => 1, aaaa => 1, NO_URLS => 2 },
        'file' => File::Spec->catfile( @training_dirs, 'ham', '2' ),
        'categories' => [ 'ham' ]
    }
};
is_deeply( \%hash, $target );

# A fresh learner over the same directory can be turned into a classifier.
my $classifier = AI::Classifier::Text::FileLearner->new( training_dir => File::Spec->catdir( @training_dirs ) )->classifier;

ok( $classifier, 'Classifier created' );
# The model of the wrapped classifier has a true prior probability for both
# categories.
ok( $classifier->classifier->model()->{prior_probs}{ham}, 'ham prior probs' );
ok( $classifier->classifier->model()->{prior_probs}{spam}, 'spam prior probs' );
{
    # Training file with a companion "1.data" file: its initial_features
    # (some_tag => 3) are expected to show up in the document's features.
    my $iterator = AI::Classifier::Text::FileLearner->new( training_dir => File::Spec->catdir( qw( t data training_initial_features ) ) );

    my %hash;
    while( my $doc = $iterator->next ){
        $hash{$doc->{file}} = $doc;
    }
    my $target = {
        File::Spec->catfile( qw( t data training_initial_features ham 1 ) ) => {
            'file' => File::Spec->catfile( qw( t data training_initial_features ham 1 ) ),
            'categories' => [ 'ham' ],
            features => { trala => 1, some_tag => 3, NO_URLS => 2 }
        },
    };
    is_deeply( \%hash, $target );
}

{
    {
        package TestLearner;

        sub new { bless { examples => [] } };

t/text.t  view on Meta::CPAN

use strict;
use warnings;

use Test::More;
use AI::Classifier::Text::Analyzer;

# Analyzer built with its defaults (no constructor arguments).
my $analyzer = AI::Classifier::Text::Analyzer->new();

ok( $analyzer, 'Analyzer created' );

# analyze() fills the hash reference supplied by the caller.
my $features = {};
$analyzer->analyze( 'aaaa http://www.example.com/bbb?xx=yy&bb=cc;dd=ff', $features );
is_deeply(
    $features,
    { aaaa => 1, 'example.com' => 1, MANY_URLS => 2 },
    'analyze() adds word, host and MANY_URLS features to the supplied hash'
);

# Without an accumulator, analyze() returns a newly created feature hash.
$features = $analyzer->analyze( 'nothing special' );
is_deeply(
    $features,
    { nothing => 1, special => 1, NO_URLS => 2 },
    'analyze() returns word counts and NO_URLS for text without URLs'
);

# analyze_urls() takes a reference to the text; five copies of the same URL
# are enough to trigger REPEATED_URLS.
my $text = 'http://www.hungry.birds! http://www.hungry.birds! http://www.hungry.birds! '
      . 'http://www.hungry.birds! http://www.hungry.birds!';
$features = {};
$analyzer->analyze_urls( \$text, $features );
is_deeply(
    $features,
    {
        'hungry.birds!' => 5,
        REPEATED_URLS => 2,
        MANY_URLS => 2,
    },
    'analyze_urls() counts the host and flags repeated and many URLs'
);

done_testing;



( run in 0.333 second using v1.01-cache-2.11-cpan-a5abf4f5562 )