AI-Classifier
view release on metacpan or search on metacpan
lib/AI/Classifier/Text/Analyzer.pm view on Meta::CPAN
package AI::Classifier::Text::Analyzer;
{
$AI::Classifier::Text::Analyzer::VERSION = '0.03';
}
use strict;
use warnings;
use 5.010;
use Moose;
use Text::WordCounter;
has word_counter => ( is => 'ro', default => sub{ Text::WordCounter->new() } );
has global_feature_weight => ( is => 'ro', isa => 'Num', default => 2 );
sub analyze_urls {
my ( $self, $text, $features ) = @_;
my @urls;
my $p = URI::Find->new(
sub {
my ($uri, $t) = @_;
push @urls, $uri;
eval{
my $host = $uri->host;
$host =~ s/^www\.//;
$features->{ lc $host }++;
for (split /\//, $uri->path) {
if (length $_ > 3 ) {
$features->{ lc $_}++;
}
}
}
}
);
$p->find($text);
my $weight = $self->global_feature_weight;
if (!@urls) {
$features->{NO_URLS} = $weight;
}
if (scalar @urls > length( $text ) / 120 ) {
$features->{MANY_URLS} = $weight;
}
{
my %urls;
for my $url ( @urls ) {
if( $urls{$url}++ > 3 ){
$features->{REPEATED_URLS} = $weight;
last;
}
}
}
}
sub filter {
my ( $self, $text ) = @_;
$text =~ s/<[^>]+>//g;
return $text;
}
sub analyze {
my( $self, $text, $features ) = @_;
$features ||= {};
$self->analyze_urls( \$text, $features );
$text = $self->filter( $text );
$self->word_counter->word_count( $text, $features );
return $features;
}
__PACKAGE__->meta->make_immutable;
1;
=pod
=head1 NAME
AI::Classifier::Text::Analyzer - computing feature vectors from documents
=head1 VERSION
version 0.03
=head1 SYNOPSIS
use AI::Classifier::Text::Analyzer;
my $analyzer = AI::Classifier::Text::Analyzer->new();
my $features = $analyzer->analyze( 'aaaa http://www.example.com/bbb?xx=yy&bb=cc;dd=ff' );
=head1 DESCRIPTION
Computes feature vectors of text using some heuristics and adds words count
(using L<Text::WordCounter> by default).
The object is immutable - but some methods use a second parameter as an accumulator for the
features found in given text.
It uses some specific values and methods that work for our case - but are not guaranteed
to bring good results universally - see the source for details!
( run in 1.141 second using v1.01-cache-2.11-cpan-39bf76dae61 )