sub results from the CPAN

AI-Classifier

view release on metacpan or search on metacpan

    received the program in object code or executable form alone.)

Source code for a work means the preferred form of the work for making
modifications to it.  For an executable file, complete source code means
all the source code for all modules it contains; but, as a special
exception, it need not include source code for modules which are standard
libraries that accompany the operating system on which the executable
file runs, or for standard header files or definitions files that
accompany that operating system.

  4. You may not copy, modify, sublicense, distribute or transfer the
Program except as expressly provided under this General Public License.
Any attempt otherwise to copy, modify, sublicense, distribute or transfer
the Program is void, and will automatically terminate your rights to use
the Program under this License.  However, parties who have received
copies, or rights to use copies, from you under this General Public
License will not have their licenses terminated so long as such parties
remain in full compliance.

  5. By copying, distributing or modifying the Program (or any work based
on the Program) you indicate your acceptance of this license to do so,
and all its terms and conditions.

  6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the original
licensor to copy, distribute or modify the Program subject to these
terms and conditions.  You may not impose any further restrictions on the
recipients' exercise of the rights granted herein.

  7. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time.  Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

Each version is given a distinguishing version number.  If the Program
specifies a version number of the license which applies to it and "any

LICENSE view on Meta::CPAN

may not charge a fee for this Package itself. However, you may distribute this
Package in aggregate with other (possibly commercial) programs as part of a
larger (possibly commercial) software distribution provided that you do not
advertise this Package as a product of your own.

6. The scripts and library files supplied as input to or produced as output
from the programs of this Package do not automatically fall under the copyright
of this Package, but belong to whomever generated them, and may be sold
commercially, and may be aggregated with this Package.

7. C or perl subroutines supplied by you and linked into this Package shall not
be considered part of this Package.

8. The name of the Copyright Holder may not be used to endorse or promote
products derived from this software without specific prior written permission.

9. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.

The End

README.pod view on Meta::CPAN

package AI::Classifier::Text;

use strict;
use warnings;
use 5.010;
use Moose;
use MooseX::Storage;

use AI::Classifier::Text::Analyzer;
use Module::Load (); # don't overwrite our sub load() with Module::Load::load()

with Storage(format => 'Storable', io => 'File');

has classifier => (is => 'ro', required => 1 );
has analyzer => ( is => 'ro', default => sub{ AI::Classifier::Text::Analyzer->new() } );
# for store/load only, don't touch unless you really know what you're doing
has classifier_class => (is => 'bare');

before store => sub {
    my $self = shift;
    $self->{classifier_class} = $self->classifier->meta->name;
};

around load => sub {
    my ($orig, $class) = (shift, shift);
    my $self = $class->$orig(@_);
    Module::Load::load($self->{classifier_class});
    return $self;
};

sub classify {
    my( $self, $text, $features ) = @_;
    return $self->classifier->classify( $self->analyzer->analyze( $text, $features ) );
}

__PACKAGE__->meta->make_immutable;

1;

__END__

lib/AI/Classifier/Text.pm view on Meta::CPAN

  $AI::Classifier::Text::VERSION = '0.03';
}

use strict;
use warnings;
use 5.010;
use Moose;
use MooseX::Storage;

use AI::Classifier::Text::Analyzer;
use Module::Load (); # don't overwrite our sub load() with Module::Load::load()

with Storage(format => 'Storable', io => 'File');

has classifier => (is => 'ro', required => 1 );
has analyzer => ( is => 'ro', default => sub{ AI::Classifier::Text::Analyzer->new() } );
# for store/load only, don't touch unless you really know what you're doing
has classifier_class => (is => 'bare');

before store => sub {
    my $self = shift;
    $self->{classifier_class} = $self->classifier->meta->name;
};

around load => sub {
    my ($orig, $class) = (shift, shift);
    my $self = $class->$orig(@_);
    Module::Load::load($self->{classifier_class});
    return $self;
};

sub classify {
    my( $self, $text, $features ) = @_;
    return $self->classifier->classify( $self->analyzer->analyze( $text, $features ) );
}

__PACKAGE__->meta->make_immutable;

1;

=pod

lib/AI/Classifier/Text/Analyzer.pm view on Meta::CPAN

  $AI::Classifier::Text::Analyzer::VERSION = '0.03';
}

use strict;
use warnings;
use 5.010;
use Moose;

use Text::WordCounter;

has word_counter => ( is => 'ro', default => sub{ Text::WordCounter->new() } );
has global_feature_weight => ( is => 'ro', isa => 'Num', default => 2 );

sub analyze_urls {
    my ( $self, $text, $features ) = @_;
    my @urls;
    my $p = URI::Find->new(
        sub {
            my ($uri, $t) = @_;
            push @urls, $uri;
            eval{
                my $host = $uri->host;
                $host =~ s/^www\.//;
                $features->{ lc $host }++;
                for (split /\//, $uri->path) {
                    if (length $_ > 3 ) {
                        $features->{ lc $_}++;
                    }

lib/AI/Classifier/Text/Analyzer.pm view on Meta::CPAN

        my %urls;
        for my $url ( @urls ) {
            if( $urls{$url}++ > 3 ){
                $features->{REPEATED_URLS} = $weight;
                last;
            }
        }
    }
}

sub filter {
    my ( $self, $text ) = @_;
    $text =~ s/<[^>]+>//g;
    return $text;
}

sub analyze {
    my( $self, $text, $features ) = @_;
    $features ||= {};
    $self->analyze_urls( \$text, $features );
    $text = $self->filter( $text );
    $self->word_counter->word_count( $text, $features );
    return $features;
}

__PACKAGE__->meta->make_immutable;

lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN

use Moose;
use File::Find::Rule;
use File::Spec;
use List::Util 'max';
use Carp 'croak';
use AI::NaiveBayes::Learner;
use AI::Classifier::Text;
use AI::Classifier::Text::Analyzer;

has term_weighting => (is => 'ro', isa => 'Str');
has analyzer => ( is => 'ro', default => sub{ AI::Classifier::Text::Analyzer->new() } );
has learner => ( is => 'ro', default => sub{ AI::NaiveBayes::Learner->new() } );
has training_dir => ( is => 'ro', isa => 'Str', required => 1 );
has iterator => ( is => 'ro', lazy_build => 1 );
sub _build_iterator {
    my $self = shift;
    my $rule = File::Find::Rule->new( );
    $rule->file;
    $rule->not_name('*.data');
    $rule->start( $self->training_dir );
    return $rule;
}

sub get_category {
    my( $self, $file ) = @_;
    my $training_dir = $self->training_dir;
    my $rest = File::Spec->abs2rel( $file, $training_dir );
    my @dirs = File::Spec->splitdir( $rest );
    return $dirs[0]
}


sub next {
    my $self = shift;

    my $file = $self->iterator->match;
    return if !defined($file);
    my $category = $self->get_category( $file );
    open(my $fh, "<:encoding(UTF-8)", $file )
    || Carp::croak(
                "Unable to read the specified training file: $file\n");
    my $content = join('', <$fh>);
    close $fh;

lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN

    }
    my $features = $self->analyzer->analyze( $content, $initial_features );

    return { 
        file => $file, 
        features => $features, 
        categories => [ $category ],
    };
}

sub teach_it {
    my $self = shift;
    my $learner = $self->learner;
    while ( my $data  = $self->next ) {
        normalize( $data->{features} );
        $self->weight_terms($data);
        $learner->add_example( 
            attributes => $data->{features},
            labels     => $data->{categories}
        );
    }
}


sub classifier {
    my $self = shift;
    $self->teach_it;
    return AI::Classifier::Text->new(
        classifier => $self->learner->classifier,
        analyzer => $self->analyzer,
    );
}


sub weight_terms {
    my ( $self, $doc ) = @_;
    my $f = $doc->{features};
    given ($self->term_weighting) {
        when ('n') {
            my $max_tf = max values %$f;
            $_ = 0.5 + 0.5 * $_ / $max_tf for values %$f;
        }
        when ('b') {
            $_ = $_ ? 1 : 0 for values %$f;
        }

lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN

        }
        default {
            croak 'Unknown weighting type: '.$self->term_weighting;
        }
    }
}

# this doesn't quite fit the current model (it requires the entire collection
# of documents to be in memory at once), but it may be useful to someone, someday
# so let's just leave it here
sub collection_weighting {
    my (@documents, $subtrahend) = @_;
    $subtrahend //= 0;

    my $num_docs   = +@documents;

    my %frequency;
    for my $doc (@documents) {
        for my $k (keys %{$doc->{attributes}}) {
            $frequency{$k}++;
        }
    }

    foreach my $doc (@documents) {
        my $f = $doc->{attributes};
        for (keys %$f) {
            $f->{$_} *= log($num_docs / ($frequency{$_} // 0) - $subtrahend);
        }
    }
}

sub euclidean_length {
    my $f = shift;

    my $total = 0;
    foreach (values %$f) {
        $total += $_**2;
    }

    return sqrt($total);
}

sub scale {
    my ($f, $scalar) = @_;

    $_ *= $scalar foreach values %$f;

    return $f;
}

sub normalize {
    my $attrs = shift;

    my $length = euclidean_length($attrs);

    return $length ? scale($attrs, 1/$length) : $attrs;
}

1;

=pod

lib/AI/Classifier/Text/FileLearner.pm view on Meta::CPAN


    use AI::Classifier::Text::FileLearner;

    my $learner = AI::Classifier::Text::FileLearner->new( training_dir => 't/data/training_set_ordered/' );

    my $classifier = $learner->classifier;

=head1 DESCRIPTION

This is a trainer of text classifiers.  It traverses a directory filled,
interprets the subdirectories in it as category names, reads all files in them and adds them
as examples for the classifier being trained.

head1 METHODS

=over 4

=item next

Internal method for traversing the training data directory.

t/file_reader.t view on Meta::CPAN

            features => { trala => 1, some_tag => 3, NO_URLS => 2 }
        },
    };
    is_deeply( \%hash, $target );
}

{
    {
        package TestLearner;

        sub new { bless { examples => [] } };
        sub add_example {
            my ( $self, @example ) = @_;
            push @{ $self->{examples} }, \@example;
        }

    }

    my $internal_learner = TestLearner->new();
    my $learner = AI::Classifier::Text::FileLearner->new( 
        training_dir => File::Spec->catdir( @training_dirs ),
        learner => $internal_learner

t/state.t view on Meta::CPAN

$tp->store($file);
is -e $file, 1;

# Restore
$tp = AI::Classifier::Text->load($file);
ok $tp;
isa_ok( $tp, 'AI::Classifier::Text' );


################################################################
sub _hash { +{ map {$_,1} @_ } }

( run in 0.431 second using v1.01-cache-2.11-cpan-88abd93f124 )