AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/Document.pm view on Meta::CPAN
default => [],
callbacks => { 'all are Category objects' =>
sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'), @{$_[0]} },
},
public => 0,
},
stopwords => {
type => ARRAYREF|HASHREF,
default => {},
},
content => {
type => HASHREF|SCALAR,
default => undef,
},
parse => {
type => SCALAR,
optional => 1,
},
parse_handle => {
type => HANDLE,
optional => 1,
},
features => {
isa => 'AI::Categorizer::FeatureVector',
optional => 1,
},
content_weights => {
type => HASHREF,
default => {},
},
front_bias => {
type => SCALAR,
default => 0,
},
use_features => {
type => HASHREF|UNDEF,
default => undef,
},
stemming => {
type => SCALAR|UNDEF,
optional => 1,
},
stopword_behavior => {
type => SCALAR,
default => "stem",
},
);
__PACKAGE__->contained_objects
(
features => { delayed => 1,
class => 'AI::Categorizer::FeatureVector' },
);
### Constructors
my $NAME = 'a';
sub new {
my $pkg = shift;
my $self = $pkg->SUPER::new(name => $NAME++, # Use a default name
@_);
# Get efficient internal data structures
$self->{categories} = new AI::Categorizer::ObjectSet( @{$self->{categories}} );
$self->_fix_stopwords;
# A few different ways for the caller to initialize the content
if (exists $self->{parse}) {
$self->parse(content => delete $self->{parse});
} elsif (exists $self->{parse_handle}) {
$self->parse_handle(handle => delete $self->{parse_handle});
} elsif (defined $self->{content}) {
# Allow a simple string as the content
$self->{content} = { body => $self->{content} } unless ref $self->{content};
}
$self->finish if $self->{content};
return $self;
}
sub _fix_stopwords {
my $self = shift;
# Convert to hash
$self->{stopwords} = { map {($_ => 1)} @{ $self->{stopwords} } }
if UNIVERSAL::isa($self->{stopwords}, 'ARRAY');
my $s = $self->{stopwords};
# May need to perform stemming on the stopwords
return unless keys %$s; # No point in doing anything if there are no stopwords
return unless $self->{stopword_behavior} eq 'stem';
return if !defined($self->{stemming}) or $self->{stemming} eq 'none';
return if $s->{___stemmed};
my @keys = keys %$s;
%$s = ();
$self->stem_words(\@keys);
$s->{$_} = 1 foreach @keys;
# This flag is attached to the stopword structure itself so that
# other documents will notice it.
$s->{___stemmed} = 1;
}
sub finish {
my $self = shift;
$self->create_feature_vector;
# Now we're done with all the content stuff
delete @{$self}{'content', 'content_weights', 'stopwords', 'use_features'};
}
# Parse a document format - a virtual method
sub parse;
( run in 2.378 seconds using v1.01-cache-2.11-cpan-437f7b0c052 )