AI-Classifier
    
    
  
  
  
view release on metacpan or search on metacpan
lib/AI/Classifier/Text/Analyzer.pm view on Meta::CPAN
}
use strict;
use warnings;
use 5.010;
use Moose;
use Text::WordCounter;
# Object used to count word frequencies in a document; when the caller
# supplies none, a fresh Text::WordCounter is built for each instance.
has 'word_counter' => (
    is      => 'ro',
    default => sub { return Text::WordCounter->new },
);

# Numeric weight assigned to computed (non-word) features such as
# NO_URLS and MANY_URLS; defaults to 2.
has 'global_feature_weight' => (
    is      => 'ro',
    isa     => 'Num',
    default => 2,
);
sub analyze_urls {
    my ( $self, $text, $features ) = @_;
    my @urls;
    my $p = URI::Find->new(
        sub {
            my ($uri, $t) = @_;
            push @urls, $uri;
            eval{
                my $host = $uri->host;
lib/AI/Classifier/Text/Analyzer.pm view on Meta::CPAN
                $features->{ lc $host }++;
                for (split /\//, $uri->path) {
                    if (length $_ > 3 ) {
                        $features->{ lc $_}++;
                    }
                }
            }
        }
    );
    $p->find($text);
    my $weight = $self->global_feature_weight;
    if (!@urls) {
        $features->{NO_URLS} = $weight;
    }
    if (scalar @urls > length( $text ) / 120 ) {
        $features->{MANY_URLS} = $weight;
    }
    {
        my %urls;
        for my $url ( @urls ) {
            if( $urls{$url}++ > 3 ){
lib/AI/Classifier/Text/Analyzer.pm view on Meta::CPAN
=head1 ATTRIBUTES
=over 4
=item C<word_counter>
Object with a word_count method that will calculate the frequency of words in a text document.
By default L<Text::WordCounter>.
=item C<global_feature_weight>
The weight assigned for computed features of the text document. By default 2.
=back
=head1 METHODS
=over 4
=item C<< new(word_counter => $foo, global_feature_weight => 3) >>
Creates a new AI::Classifier::Text::Analyzer object. Both arguments are optional.
=item C<analyze($document, $features)>
Computes the feature vector of the given document and adds the initial vector of C<$features>.
=item C<analyze_urls($document, $features)>
Computes a vector of special URL-related features of a given text - currently there are used 
( run in 0.265 second using v1.01-cache-2.11-cpan-c21f80fb71c )