AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/Document.pm view on Meta::CPAN
package AI::Categorizer::Document;
use strict;
use Class::Container;
use base qw(Class::Container);
use Params::Validate qw(:types);
use AI::Categorizer::ObjectSet;
use AI::Categorizer::FeatureVector;
# Constructor-parameter spec, enforced by Params::Validate through the
# Class::Container framework when new() runs.
__PACKAGE__->valid_params
(
# Unique identifier for this document (auto-generated in new() if omitted).
name => {
type => SCALAR,
},
# Category objects this document belongs to; converted to an ObjectSet
# in new().  Not settable from the public constructor (public => 0).
categories => {
type => ARRAYREF,
default => [],
callbacks => { 'all are Category objects' =>
sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'), @{$_[0]} },
},
public => 0,
},
# Words to exclude from the feature vector; an array ref is normalized
# to a hash ref by _fix_stopwords().
stopwords => {
type => ARRAYREF|HASHREF,
default => {},
},
# Document body: either a plain string or a hashref of named sections
# (e.g. { subject => ..., body => ... }).
content => {
type => HASHREF|SCALAR,
default => undef,
},
# Raw text to be run through the subclass's parse() method.
parse => {
type => SCALAR,
optional => 1,
},
# Filehandle whose contents are slurped and passed to parse().
parse_handle => {
type => HANDLE,
optional => 1,
},
# Pre-built feature vector, bypassing create_feature_vector().
features => {
isa => 'AI::Categorizer::FeatureVector',
optional => 1,
},
# Per-section weight multipliers keyed by content-section name.
content_weights => {
type => HASHREF,
default => {},
},
# Exponential weighting toward the front (positive) or back (negative)
# of the document; must lie strictly between -1 and 1.  See _weigh_tokens().
front_bias => {
type => SCALAR,
default => 0,
},
# Optional restriction: only tokens present in this feature set survive
# _filter_tokens().
use_features => {
type => HASHREF|UNDEF,
default => undef,
},
# Stemming algorithm: 'porter', 'none', or undef (no stemming).
stemming => {
type => SCALAR|UNDEF,
optional => 1,
},
# When stopwords are applied relative to stemming:
# 'stem' (stem the stopwords too), 'no_stem', or 'pre_stemmed'.
stopword_behavior => {
type => SCALAR,
default => "stem",
},
);
# The FeatureVector is a delayed contained object: it is only
# instantiated (via create_delayed_object) when create_feature_vector()
# actually runs.
__PACKAGE__->contained_objects
(
features => { delayed => 1,
class => 'AI::Categorizer::FeatureVector' },
);
### Constructors

# Counter for auto-generated document names; Perl's magic string
# increment yields 'a', 'b', ... 'z', 'aa', 'ab', ...
my $NAME = 'a';

# new(%params) - construct a Document.  Parameters are validated
# against valid_params() by Class::Container.  Content may arrive as
# 'content' (string or hashref of sections), as raw 'parse' text, or
# via a 'parse_handle' filehandle; whichever is present initializes
# $self->{content}, after which finish() builds the feature vector.
sub new {
my $pkg = shift;
my $self = $pkg->SUPER::new(name => $NAME++, # Use a default name
@_);
# Get efficient internal data structures
$self->{categories} = new AI::Categorizer::ObjectSet( @{$self->{categories}} );
$self->_fix_stopwords;
# A few different ways for the caller to initialize the content
if (exists $self->{parse}) {
$self->parse(content => delete $self->{parse});
} elsif (exists $self->{parse_handle}) {
$self->parse_handle(handle => delete $self->{parse_handle});
} elsif (defined $self->{content}) {
# Allow a simple string as the content
$self->{content} = { body => $self->{content} } unless ref $self->{content};
}
# Only build the feature vector once some content exists.
$self->finish if $self->{content};
return $self;
}
# Normalize the stopword list to a hash, and stem it when the
# configuration calls for stemmed stopword matching.
sub _fix_stopwords {
  my $self = shift;

  # An array-ref stopword list becomes a lookup hash.
  if (UNIVERSAL::isa($self->{stopwords}, 'ARRAY')) {
    my %lookup = map { ($_ => 1) } @{ $self->{stopwords} };
    $self->{stopwords} = \%lookup;
  }
  my $stopwords = $self->{stopwords};

  # Nothing further to do for an empty set, a non-'stem' behavior, a
  # non-stemming configuration, or a set that was already stemmed.
  return unless %$stopwords;
  return unless $self->{stopword_behavior} eq 'stem';
  return if !defined($self->{stemming}) or $self->{stemming} eq 'none';
  return if $stopwords->{___stemmed};

  # Replace the set with its stemmed equivalent, in place.
  my @words = keys %$stopwords;
  %$stopwords = ();
  $self->stem_words(\@words);
  $stopwords->{$_} = 1 for @words;

  # The flag lives on the shared structure itself so that other
  # documents holding the same hashref skip re-stemming.
  $stopwords->{___stemmed} = 1;
}
# Build the feature vector, then drop the raw-content fields that are
# no longer needed once the vector exists.
sub finish {
  my $self = shift;
  $self->create_feature_vector;
  # Hash slice delete of everything content-related.
  delete @{$self}{qw(content content_weights stopwords use_features)};
}
# Parse a document format - a virtual method
# The bare declaration only predeclares the name; subclasses must
# supply the implementation (called with content => $string).
sub parse;
# Slurp an open filehandle and delegate to the (subclass-provided)
# parse() method.  Returns whatever parse() returns.
sub parse_handle {
  my ($self, %args) = @_;
  my $handle = $args{handle}
    or die "No 'handle' argument given to parse_handle()";
  my $content = join '', <$handle>;
  return $self->parse(content => $content);
}
### Accessors
# Read-only accessor: the document's unique name.
sub name { my $self = shift; return $self->{name} }
# Read-only accessor: when stopwords are applied relative to stemming
# ('stem', 'no_stem', or 'pre_stemmed').
sub stopword_behavior { my $self = shift; return $self->{stopword_behavior} }
# Combined accessor/mutator for the document's FeatureVector: acts as
# a setter when given an argument, a getter otherwise.
sub features {
  my $self = shift;
  $self->{features} = shift if @_;
  return $self->{features};
}
# In list context, returns the Category objects this document belongs
# to; in scalar context, returns how many there are.
sub categories {
  my $set = $_[0]->{categories};
  return wantarray ? $set->members : $set->size;
}
### Workers
# Turn the raw content sections into the document's FeatureVector:
# tokenize, optionally stem, filter stopwords (before or after stemming
# depending on stopword_behavior), weigh, and accumulate.
sub create_feature_vector {
  my $self = shift;
  my $content = $self->{content};
  my $weights = $self->{content_weights};

  # BUGFIX: the original pattern /^stem|no_stem|pre_stemmed$/ parsed as
  # (^stem)|(no_stem)|(pre_stemmed$) and accepted bogus values such as
  # 'stemmed' or 'xno_stem'.  Group the alternation so the anchors apply
  # to every alternative.
  die "'stopword_behavior' must be one of 'stem', 'no_stem', or 'pre_stemmed'"
    unless $self->{stopword_behavior} =~ /^(?:stem|no_stem|pre_stemmed)$/;

  $self->{features} = $self->create_delayed_object('features');
  while (my ($name, $data) = each %$content) {
    my $t = $self->tokenize($data);
    # 'no_stem': the stopword list is unstemmed, so filter before stemming.
    $t = $self->_filter_tokens($t) if $self->{stopword_behavior} eq 'no_stem';
    $self->stem_words($t);
    # 'stem'/'pre_stemmed': stopwords match stemmed tokens, filter after.
    $t = $self->_filter_tokens($t) if $self->{stopword_behavior} =~ /^(?:stem|pre_stemmed)$/;
    my $h = $self->vectorize(tokens => $t,
                             weight => exists($weights->{$name}) ? $weights->{$name} : 1);
    $self->{features}->add($h);
  }
}
# True if the document belongs to the given category, which may be
# either a Category object or a plain category name.
sub is_in_category {
  my ($self, $category) = @_;
  return ref($category)
    ? $self->{categories}->includes($category)
    : $self->{categories}->includes_name($category);
}
# Split text into lowercase word tokens.  Runs of word characters and
# hyphens count as candidate tokens; tokens with no letters at all are
# dropped, and leading non-letters are trimmed (helps with ordinals
# like '3rd' -> 'rd').  Returns an array ref of tokens.
sub tokenize {
  my $self = shift;
  my @words;
  for my $match ($_[0] =~ /([-\w]+)/g) {
    my $token = lc $match;
    next unless $token =~ /[a-z]/;
    $token =~ s/^[^a-z]+//;
    push @words, $token;
  }
  return \@words;
}
# Stem the given tokens in place according to $self->{stemming}.
# 'none' or an unset value is a no-op; 'porter' uses Lingua::Stem;
# anything else is a fatal configuration error.
sub stem_words {
  my ($self, $tokens) = @_;
  my $mode = $self->{stemming};
  return if !$mode or $mode eq 'none';
  die "Unknown stemming option '$self->{stemming}' - options are 'porter' or 'none'"
    unless $mode eq 'porter';
  # Lingua::Stem is loaded lazily so non-stemming users don't need it.
  eval {require Lingua::Stem; 1}
    or die "Porter stemming requires the Lingua::Stem module, available from CPAN.\n";
  @$tokens = @{ Lingua::Stem::stem(@$tokens) };
}
# Filter a token list: when a 'use_features' vector is configured, keep
# only tokens it contains; otherwise drop any configured stopwords.
# Returns the (possibly new) array ref of surviving tokens.
sub _filter_tokens {
  my ($self, $tokens) = @_;
  if ($self->{use_features}) {
    my $allowed = $self->{use_features}->as_hash;
    return [ grep { exists $allowed->{$_} } @$tokens ];
  }
  my $stopwords = $self->{stopwords};
  if ($stopwords and keys %$stopwords) {
    return [ grep { !exists $stopwords->{$_} } @$tokens ];
  }
  # Nothing to filter against: hand back the input unchanged.
  return $tokens;
}
# Accumulate per-token weights.  With front_bias == 0 every occurrence
# adds $weight; otherwise occurrences are scaled exponentially by
# position so the front (positive bias) or back (negative bias) of the
# document counts more, normalized so total mass stays comparable.
sub _weigh_tokens {
  my ($self, $tokens, $weight) = @_;
  my %freq;
  my $bias = 0 + $self->{front_bias};
  if ($bias) {
    die "'front_bias' value must be between -1 and 1"
      unless -1 < $bias and $bias < 1;
    my $total = @$tokens;
    my $ratio = ($bias - 1)**2 / ($bias + 1);
    my $scale = $weight * log($ratio) / ($ratio - 1);
    my $pos   = 0;
    for my $token (@$tokens) {
      $freq{$token} += $scale * $ratio**($pos / $total);
      $pos++;
    }
  } else {
    $freq{$_} += $weight for @$tokens;
  }
  return \%freq;
}
# Turn a token list into a {token => weight} hash.  If 'stem_stopwords'
# is set, the stopword list is stemmed and applied here first.
sub vectorize {
  my ($self, %args) = @_;
  if ($self->{stem_stopwords}) {
    # BUGFIX: the original called $self->stem_tokens(), a method that
    # does not exist in this class; stem_words() (which stems its
    # argument list in place) is the intended helper.
    my @stopwords = keys %{ $self->{stopwords} };
    $self->stem_words(\@stopwords);
    $self->{stopwords} = { map { ($_ => 1) } @stopwords };
    $args{tokens} = $self->_filter_tokens($args{tokens});
  }
  return $self->_weigh_tokens($args{tokens}, $args{weight});
}
# Class method: construct a document from a file on disk.  All other
# %args are forwarded to new(); the file contents go through
# parse_handle(), then finish() builds the feature vector.
sub read {
  my ($class, %args) = @_;
  my $path = delete $args{path} or die "Must specify 'path' argument to read()";
  my $self = $class->new(%args);
  # Three-arg open: the two-arg form ("< $path") lets a hostile path
  # inject an open mode or a pipe.
  open my $fh, '<', $path or die "$path: $!";
  $self->parse_handle(handle => $fh);
  close $fh;
  $self->finish;
  return $self;
}
# Write the feature vector to a file, one "token<TAB>weight" pair per
# line.  Dies if the file cannot be created or fully written.
sub dump_features {
  my ($self, %args) = @_;
  my $path = $args{path} or die "No 'path' argument given to dump_features()";
  # Three-arg open (the two-arg "> $path" form is mode-injectable).
  open my $fh, '>', $path or die "Can't create $path: $!";
  my $f = $self->features->as_hash;
  while (my ($k, $v) = each %$f) {
    print $fh "$k\t$v\n";
  }
  # Buffered write errors (e.g. disk full) only surface at close.
  close $fh or die "Can't write $path: $!";
}
1;
__END__
=head1 NAME
AI::Categorizer::Document - Embodies a document
=head1 SYNOPSIS
use AI::Categorizer::Document;
# Simplest way to create a document:
my $d = new AI::Categorizer::Document(name => $string,
content => $string);
# Other parameters are accepted:
my $d = new AI::Categorizer::Document(name => $string,
categories => \@category_objects,
content => { subject => $string,
body => $string2, ... },
content_weights => { subject => 3,
body => 1, ... },
stopwords => \%skip_these_words,
stemming => $string,
front_bias => $float,
use_features => $feature_vector,
);
# Specify explicit feature vector:
my $d = new AI::Categorizer::Document(name => $string);
$d->features( $feature_vector );
# Now pass the document to a categorization algorithm:
my $learner = AI::Categorizer::Learner::NaiveBayes->restore_state($path);
my $hypothesis = $learner->categorize($document);
=head1 DESCRIPTION
The Document class embodies the data in a single document, and
contains methods for turning this data into a FeatureVector. Usually
documents are plain text, but subclasses of the Document class may
handle any kind of data.
=head1 METHODS
=over 4
=item new(%parameters)
Creates a new Document object. Document objects are used during
( run in 1.227 second using v1.01-cache-2.11-cpan-39bf76dae61 )