AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/Document.pm view on Meta::CPAN
return $tokens_in;
}
# Turn a list of tokens into a hash of accumulated per-token weights.
#
# Arguments:
#   $tokens - array reference of tokens (features), in document order
#   $weight - base weight contributed by each occurrence of a token
#
# Returns a reference to a hash mapping each distinct token to the sum of
# the weights of all its occurrences.
#
# When $self->{front_bias} is numerically zero, every occurrence simply
# counts $weight.  Otherwise the bias must lie strictly between -1 and 1
# (the method dies if it does not), and the occurrence at position $pos out
# of $total tokens counts $scale * $ratio ** ($pos / $total), where
#   $ratio = ($bias - 1)**2 / ($bias + 1)
#   $scale = $weight * log($ratio) / ($ratio - 1)
# A positive bias yields $ratio < 1, so earlier tokens weigh more; a
# negative bias yields $ratio > 1, so later tokens weigh more.
sub _weigh_tokens {
  my ($self, $tokens, $weight) = @_;

  my %weight_of;
  my $bias = 0+$self->{front_bias};

  # No positional bias requested: flat weighting, nothing else to compute.
  unless ($bias) {
    $weight_of{$_} += $weight for @$tokens;
    return \%weight_of;
  }

  die "'front_bias' value must be between -1 and 1"
    unless -1 < $bias and $bias < 1;

  my $total = @$tokens;
  my $ratio = ($bias - 1)**2 / ($bias + 1);

  # (r - 1) / ln(r) is the integral of r**x over [0, 1]; dividing by it
  # presumably keeps the average per-token weight close to $weight.
  my $scale = $weight * log($ratio) / ($ratio - 1);

  for my $pos (0 .. $#$tokens) {
    $weight_of{ $tokens->[$pos] } += $scale * $ratio ** ($pos / $total);
  }

  return \%weight_of;
}
# Convert a token list into a hash of weighted feature counts.
#
# Arguments (named):
#   tokens - array reference of tokens to weigh
#   weight - base weight handed on to _weigh_tokens()
#
# When $self->{stem_stopwords} is true, the keys of $self->{stopwords} are
# first passed through stem_tokens(), the stopword hash is replaced by one
# keyed on the result, and the tokens are run through _filter_tokens()
# (presumably dropping stopwords -- confirm against that method).
#
# Returns whatever _weigh_tokens() returns for the resulting tokens.
sub vectorize {
  my ($self, %args) = @_;

  my $tokens = $args{tokens};

  if ($self->{stem_stopwords}) {
    my $stemmed = $self->stem_tokens([ keys %{ $self->{stopwords} } ]);
    my %stopword = map { ($_ => 1) } @$stemmed;
    $self->{stopwords} = \%stopword;
    $tokens = $self->_filter_tokens($tokens);
  }

  return $self->_weigh_tokens($tokens, $args{weight});
}
# Class method: build a document object from the file at the given path.
#
# Arguments (named):
#   path - file to read (required); it is removed from the argument list
#   All remaining arguments are passed unchanged to new().
#
# The file is opened read-only, handed to parse_handle() as the 'handle'
# argument and closed; finish() is then called on the new object, which is
# returned.  Dies if 'path' is missing or empty, or if the file cannot be
# opened.
sub read {
  my ($class, %args) = @_;

  # Test for defined/non-empty rather than truth so that a file literally
  # named "0" is accepted.
  my $path = delete $args{path};
  die "Must specify 'path' argument to read()"
    unless defined $path and length $path;

  my $self = $class->new(%args);

  # Three-argument open: the path is taken literally, so surrounding
  # whitespace is preserved and names such as "-" or "&STDIN" are not
  # interpreted as special by open().
  open my $fh, '<', $path or die "$path: $!";
  $self->parse_handle(handle => $fh);
  close $fh;

  $self->finish;
  return $self;
}
# Write this document's features to a file, one "name<TAB>value" per line.
#
# Arguments (named):
#   path - file to create or overwrite (required)
#
# The features are obtained via $self->features->as_hash and written in the
# hash's own (unsorted) order.  Dies if 'path' is missing or empty, if the
# file cannot be created, or if closing it reports a write failure.
# Returns nothing.
sub dump_features {
  my ($self, %args) = @_;

  # Test for defined/non-empty rather than truth so that a file literally
  # named "0" is accepted.
  my $path = $args{path};
  die "No 'path' argument given to dump_features()"
    unless defined $path and length $path;

  # Three-argument open: the path is taken literally instead of being
  # parsed together with the mode.
  open my $fh, '>', $path or die "Can't create $path: $!";

  my $features = $self->features->as_hash;

  # keys() resets the hash iterator, so a half-finished each() elsewhere on
  # the same hash cannot make us skip entries.
  for my $name (keys %$features) {
    print {$fh} "$name\t$features->{$name}\n";
  }

  # Buffered write errors (e.g. disk full) only surface at close time.
  close $fh or die "Can't write $path: $!";
  return;
}
1;
__END__
=head1 NAME
AI::Categorizer::Document - Embodies a document
=head1 SYNOPSIS
use AI::Categorizer::Document;
# Simplest way to create a document:
my $d = AI::Categorizer::Document->new(name => $string,
content => $string);
# Other parameters are accepted:
my $d = AI::Categorizer::Document->new(name => $string,
categories => \@category_objects,
content => { subject => $string,
body => $string2, ... },
content_weights => { subject => 3,
body => 1, ... },
stopwords => \%skip_these_words,
stemming => $string,
front_bias => $float,
use_features => $feature_vector,
);
# Specify explicit feature vector:
my $d = AI::Categorizer::Document->new(name => $string);
$d->features( $feature_vector );
# Now pass the document to a categorization algorithm:
my $learner = AI::Categorizer::Learner::NaiveBayes->restore_state($path);
my $hypothesis = $learner->categorize($d);
=head1 DESCRIPTION
The Document class embodies the data in a single document, and
contains methods for turning this data into a FeatureVector. Usually
documents are plain text, but subclasses of the Document class may
handle any kind of data.
=head1 METHODS
=over 4
=item new(%parameters)
Creates a new Document object. Document objects are used during
training (for the training documents), testing (for the test
documents), and when categorizing new unseen documents in an
application (for the unseen documents). However, you'll typically
only call C<new()> in the last case, since the KnowledgeSet or
Collection classes will create Document objects for you in the first
two cases.
( run in 0.967 second using v1.01-cache-2.11-cpan-39bf76dae61 )