AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/Document.pm view on Meta::CPAN
default => "stem",
},
);
# Declare the 'features' contained object for this class.  It is flagged as
# delayed; create_feature_vector() is the place that later instantiates it,
# through create_delayed_object('features').
__PACKAGE__->contained_objects(
    features => {
        class   => 'AI::Categorizer::FeatureVector',
        delayed => 1,
    },
);
### Constructors
# Source of default document names.  Perl's magical string increment steps
# it through 'a', 'b', ..., 'z', 'aa', ...  It is advanced on every call to
# new(), whether or not the caller supplies its own name.
my $NAME = 'a';

# new(%args) - construct a document.
#
# All arguments are forwarded to the parent constructor, preceded by a
# generated default 'name'.  (A caller-supplied 'name' comes later in the
# list and presumably overrides the default; that depends on SUPER::new,
# which is not visible here.)
#
# The content can be supplied in one of three ways, checked in this order:
#   parse        => $string  passed to $self->parse(content => $string)
#   parse_handle => $fh      passed to $self->parse_handle(handle => $fh)
#   content      => $string or \%hash
#                            a plain string is wrapped as { body => $string }
#
# Whenever $self->{content} is true afterwards, finish() is called, which
# builds the feature vector and then drops the raw content.
#
# Returns the new object.
sub new {
    my $pkg  = shift;
    my $self = $pkg->SUPER::new(
        name => $NAME++,    # Use a default name
        @_,
    );

    # Get efficient internal data structures.  This is an explicit class
    # method call: the indirect object form ("new Class(...)") is resolved
    # by parser heuristics and can silently turn into a plain function call
    # when a sub called new() is visible in the current package.
    $self->{categories} =
        AI::Categorizer::ObjectSet->new( @{ $self->{categories} } );
    $self->_fix_stopwords;

    # A few different ways for the caller to initialize the content
    if ( exists $self->{parse} ) {
        $self->parse( content => delete $self->{parse} );
    }
    elsif ( exists $self->{parse_handle} ) {
        $self->parse_handle( handle => delete $self->{parse_handle} );
    }
    elsif ( defined $self->{content} ) {

        # Allow a simple string as the content
        $self->{content} = { body => $self->{content} }
            unless ref $self->{content};
    }

    $self->finish if $self->{content};
    return $self;
}
# _fix_stopwords() - normalise $self->{stopwords} into a word => 1 lookup
# hash and, when the configuration asks for it, replace the entries with
# their stemmed forms (via $self->stem_words).
sub _fix_stopwords {
    my ($self) = @_;

    # A list of stopwords is turned into a lookup table.
    if ( UNIVERSAL::isa( $self->{stopwords}, 'ARRAY' ) ) {
        my %lookup;
        $lookup{$_} = 1 for @{ $self->{stopwords} };
        $self->{stopwords} = \%lookup;
    }
    my $stop = $self->{stopwords};

    # Stemming the stopwords is only needed when there are some, when the
    # behavior is 'stem', and when a stemming algorithm is configured.
    return unless keys %$stop;
    return if $self->{stopword_behavior} ne 'stem';
    my $stemming = $self->{stemming};
    return unless defined $stemming and $stemming ne 'none';

    # The marker lives inside the stopword hash itself, so a hash that has
    # already been stemmed is recognised and left alone.
    return if $stop->{___stemmed};

    # Empty the hash in place (keeping the same reference), stem the words,
    # then put the stemmed forms back.
    my @words = keys %$stop;
    %$stop = ();
    $self->stem_words( \@words );
    @{$stop}{@words} = (1) x @words;

    # This flag is attached to the stopword structure itself so that
    # other documents will notice it.
    $stop->{___stemmed} = 1;
}
# finish() - build the feature vector, then throw away the raw inputs that
# were only needed to build it.
sub finish {
    my ($self) = @_;

    $self->create_feature_vector;

    # Now we're done with all the content stuff
    delete @$self{qw(content content_weights stopwords use_features)};
}
# Parse a document format - a virtual method.
# Only a forward declaration appears here; no body is supplied at this point
# (presumably subclasses provide one - confirm against the subclasses).
# Within this file it is invoked as $self->parse(content => $string), both
# from new() and from parse_handle().
sub parse;
# parse_handle(handle => $fh) - read everything remaining on the filehandle
# and pass it to parse() as one string.
#
# Dies if no true 'handle' argument is given.
# Returns whatever parse() returns.
sub parse_handle {
    my $self = shift;
    my %args = @_;

    die "No 'handle' argument given to parse_handle()" unless $args{handle};
    my $fh = $args{handle};

    my $text = join '', <$fh>;
    return $self->parse( content => $text );
}
### Accessors
# name() - read-only accessor for the document's name.
sub name {
    my ($self) = @_;
    return $self->{name};
}
# stopword_behavior() - read-only accessor for the 'stopword_behavior'
# setting.
sub stopword_behavior {
    my ($self) = @_;
    return $self->{stopword_behavior};
}
# features([$new]) - combined getter/setter for the 'features' slot.
# With an argument (even an undefined one) the slot is overwritten first;
# the current value of the slot is always returned.
sub features {
    my ( $self, @new ) = @_;
    $self->{features} = $new[0] if @new;
    return $self->{features};
}
# categories() - context-sensitive view of the document's category set.
# In list context returns the set's members(); otherwise returns its size().
sub categories {
    my $set = $_[0]->{categories};
    if (wantarray) {
        return $set->members;
    }
    return $set->size;
}
### Workers
sub create_feature_vector {
my $self = shift;
my $content = $self->{content};
my $weights = $self->{content_weights};
die "'stopword_behavior' must be one of 'stem', 'no_stem', or 'pre_stemmed'"
unless $self->{stopword_behavior} =~ /^stem|no_stem|pre_stemmed$/;
$self->{features} = $self->create_delayed_object('features');
while (my ($name, $data) = each %$content) {
my $t = $self->tokenize($data);
$t = $self->_filter_tokens($t) if $self->{stopword_behavior} eq 'no_stem';
$self->stem_words($t);
$t = $self->_filter_tokens($t) if $self->{stopword_behavior} =~ /^stem|pre_stemmed$/;
my $h = $self->vectorize(tokens => $t, weight => exists($weights->{$name}) ? $weights->{$name} : 1 );
$self->{features}->add($h);
}
( run in 1.398 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )