view release on metacpan or search on metacpan
}
# Category information is mandatory: abort unless the category file exists,
# otherwise pass its path along in %params.
die "$cats not found - can't proceed without category information.\n"
    unless -e $cats;
$params{category_file} = $cats;
# In a real-world application these Collection objects could be of any
# type (any Collection subclass).  You could also build each Document
# object by hand, or let the KnowledgeSet create the Collection objects
# for you.
for my $set ($training, $test) {
    # $set aliases the outer variable, so the path it holds is replaced
    # in place by the Collection built from that path.
    $set = AI::Categorizer::Collection::Files->new( path => $set, %params );
}

# We turn on verbose mode so you can watch the progress of loading &
# training.  This looks nicer if you have Time::Progress installed!
print "Loading training set\n";
lib/AI/Categorizer.pm view on Meta::CPAN
package AI::Categorizer;
$VERSION = '0.09';
use strict;
use Class::Container;
use base qw(Class::Container);
use Params::Validate qw(:types);
use File::Spec;
use AI::Categorizer::Learner;
use AI::Categorizer::Document;
use AI::Categorizer::Category;
use AI::Categorizer::Collection;
use AI::Categorizer::Hypothesis;
use AI::Categorizer::KnowledgeSet;
# Parameters accepted by the AI::Categorizer constructor, declared with
# Params::Validate type constants.  knowledge_set and learner are checked
# by class (isa); the remaining entries are plain scalars/booleans.
__PACKAGE__->valid_params(
    progress_file => { type => SCALAR,  default  => 'save' },
    knowledge_set => { isa  => 'AI::Categorizer::KnowledgeSet' },
    learner       => { isa  => 'AI::Categorizer::Learner' },
    verbose       => { type => BOOLEAN, default  => 0 },
    training_set  => { type => SCALAR,  optional => 1 },
    test_set      => { type => SCALAR,  optional => 1 },
    data_root     => { type => SCALAR,  optional => 1 },
);
__PACKAGE__->contained_objects
(
knowledge_set => { class => 'AI::Categorizer::KnowledgeSet' },
learner => { class => 'AI::Categorizer::Learner::NaiveBayes' },
experiment => { class => 'AI::Categorizer::Experiment',
delayed => 1 },
collection => { class => 'AI::Categorizer::Collection::Files',
delayed => 1 },
lib/AI/Categorizer/Category.pm view on Meta::CPAN
package AI::Categorizer::Category;
use strict;
use AI::Categorizer::ObjectSet;
use Class::Container;
use base qw(Class::Container);
use Params::Validate qw(:types);
use AI::Categorizer::FeatureVector;
# Constructor parameters for a Category.
#   name      - scalar naming the category
#   documents - array reference whose elements must all be
#               AI::Categorizer::Document objects; defaults to an empty list
__PACKAGE__->valid_params(
    name      => { type => SCALAR, public => 0 },
    documents => {
        type      => ARRAYREF,
        default   => [],
        callbacks => {
            # A Params::Validate callback receives the parameter's value as
            # its first argument (here the array reference), followed by the
            # collection of all parameters.  The elements to inspect are
            # therefore @{ $_[0] }; grepping over @_ itself would test the
            # references and reject every explicitly supplied list.
            'all are Document objects' => sub {
                !grep { !UNIVERSAL::isa( $_, 'AI::Categorizer::Document' ) } @{ $_[0] };
            },
        },
        public => 0,
    },
);
__PACKAGE__->contained_objects
(
lib/AI/Categorizer/Collection.pm view on Meta::CPAN
package AI::Categorizer::Collection;
use strict;
use Params::Validate qw(:types);
use Class::Container;
use base qw(Class::Container);
# Parameters common to every Collection: a verbosity setting, an optional
# stopword file, a category hash (empty by default) and an optional
# category file.
__PACKAGE__->valid_params(
    verbose       => { type => SCALAR,  default  => 0 },
    stopword_file => { type => SCALAR,  optional => 1 },
    category_hash => { type => HASHREF, default  => {} },
    category_file => { type => SCALAR,  optional => 1 },
);

# Documents are created on demand (delayed); the default document class
# is AI::Categorizer::Document::Text.
__PACKAGE__->contained_objects(
    document => {
        class   => 'AI::Categorizer::Document::Text',
        delayed => 1,
    },
);
sub new {
my ($class, %args) = @_;
lib/AI/Categorizer/Collection.pm view on Meta::CPAN
be parsed and then fed as the C<stopwords> parameter to the
Document C<new()> method.
=item verbose
If true, some status/debugging information will be printed to
C<STDOUT> during operation.
=item document_class
The class indicating what type of Document object should be created.
This generally specifies the format that the documents are stored in.
The default is C<AI::Categorizer::Document::Text>.
=back
=item next()
Returns the next Document object in the Collection.
=item rewind()
lib/AI/Categorizer/Collection/DBI.pm view on Meta::CPAN
package AI::Categorizer::Collection::DBI;
use strict;
use DBI;
use AI::Categorizer::Collection;
use base qw(AI::Categorizer::Collection);
use Params::Validate qw(:types);
# Parameters specific to the DBI-backed Collection: either a connection
# string or an existing DBI::db handle, plus the SELECT statement to run
# (the default statement is shown below).
__PACKAGE__->valid_params(
    connection_string => { type => SCALAR,    default => undef },
    dbh               => { isa  => 'DBI::db', default => undef },
    select_statement  => { type => SCALAR,    default => "SELECT text FROM documents" },
);

# Documents are created on demand (delayed) using the base Document class.
__PACKAGE__->contained_objects(
    document => {
        class   => 'AI::Categorizer::Document',
        delayed => 1,
    },
);
sub new {
my $class = shift;
lib/AI/Categorizer/Collection/Files.pm view on Meta::CPAN
package AI::Categorizer::Collection::Files;
use strict;
use AI::Categorizer::Collection;
use base qw(AI::Categorizer::Collection);
use Params::Validate qw(:types);
use File::Spec;
# Parameters specific to the file-based Collection:
#   path    - a single path (scalar) or an array reference of paths
#   recurse - boolean flag, off by default
__PACKAGE__->valid_params(
    path    => { type => SCALAR | ARRAYREF },
    recurse => { type => BOOLEAN, default => 0 },
);
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
$self->{dir_fh} = do {local *FH; *FH}; # double *FH avoids a warning
# Documents are contained in a directory, or list of directories
$self->{path} = [$self->{path}] unless ref $self->{path};
lib/AI/Categorizer/Collection/InMemory.pm view on Meta::CPAN
package AI::Categorizer::Collection::InMemory;
use strict;
use AI::Categorizer::Collection;
use base qw(AI::Categorizer::Collection);
use Params::Validate qw(:types);
# The in-memory Collection takes a single parameter: a hash reference
# holding the document data.
__PACKAGE__->valid_params(
    data => { type => HASHREF },
);
sub new {
my $self = shift()->SUPER::new(@_);
while (my ($name, $params) = each %{$self->{data}}) {
foreach (@{$params->{categories}}) {
next if ref $_;
$_ = AI::Categorizer::Category->by_name(name => $_);
}
lib/AI/Categorizer/Collection/SingleFile.pm view on Meta::CPAN
package AI::Categorizer::Collection::SingleFile;
use strict;
use AI::Categorizer::Collection;
use base qw(AI::Categorizer::Collection);
use Params::Validate qw(:types);
# Parameters specific to the single-file Collection:
#   path       - a single path (scalar) or an array reference of paths
#   categories - hash reference or undef (undef by default)
#   delimiter  - scalar
__PACKAGE__->valid_params(
    path       => { type => SCALAR | ARRAYREF },
    categories => { type => HASHREF | UNDEF, default => undef },
    delimiter  => { type => SCALAR },
);

# Documents are created on demand (delayed); the default document class
# is AI::Categorizer::Document::Text.
__PACKAGE__->contained_objects(
    document => {
        class   => 'AI::Categorizer::Document::Text',
        delayed => 1,
    },
);
sub new {
my $class = shift;
lib/AI/Categorizer/Document.pm view on Meta::CPAN
package AI::Categorizer::Document;
use strict;
use Class::Container;
use base qw(Class::Container);
use Params::Validate qw(:types);
use AI::Categorizer::ObjectSet;
use AI::Categorizer::FeatureVector;
# Constructor parameters for a Document, declared with Params::Validate
# type constants.  Entries are listed as "name => spec"; a spec carries
# the accepted type(s) and, where present, a default or optional flag.
__PACKAGE__->valid_params(
    name => { type => SCALAR },

    # Every element of the list must be an AI::Categorizer::Category.
    categories => {
        type      => ARRAYREF,
        default   => [],
        callbacks => {
            'all are Category objects' => sub {
                my ($list) = @_;
                return !grep { !UNIVERSAL::isa( $_, 'AI::Categorizer::Category' ) } @$list;
            },
        },
        public => 0,
    },

    stopwords => { type => ARRAYREF | HASHREF, default => {} },

    # Raw content: either a plain string or a hash reference.
    content => { type => HASHREF | SCALAR, default => undef },

    parse        => { type => SCALAR, optional => 1 },
    parse_handle => { type => HANDLE, optional => 1 },

    features => {
        isa      => 'AI::Categorizer::FeatureVector',
        optional => 1,
    },

    content_weights   => { type => HASHREF,         default  => {} },
    front_bias        => { type => SCALAR,          default  => 0 },
    use_features      => { type => HASHREF | UNDEF, default  => undef },
    stemming          => { type => SCALAR | UNDEF,  optional => 1 },
    stopword_behavior => { type => SCALAR,          default  => "stem" },
);

# Feature vectors are created on demand (delayed).
__PACKAGE__->contained_objects(
    features => {
        delayed => 1,
        class   => 'AI::Categorizer::FeatureVector',
    },
);
lib/AI/Categorizer/Document.pm view on Meta::CPAN
=over 4
=item name
A string that identifies this document. Required.
=item content
The raw content of this document. May be specified as either a string
or as a hash reference, allowing structured document types.
=item content_weights
A hash reference indicating the weights that should be assigned to
features in different sections of a structured document when creating
its feature vector. The weight is a multiplier of the feature vector
values. For instance, if a C<subject> section has a weight of 3 and a
C<body> section has a weight of 1, and word counts are used as feature
vector values, then it will be as if all words appearing in the
C<subject> appeared 3 times.
lib/AI/Categorizer/Document/Text.pm view on Meta::CPAN
package AI::Categorizer::Document::Text;
use strict;
use AI::Categorizer::Document;
use base qw(AI::Categorizer::Document);
#use Params::Validate qw(:types);
#use AI::Categorizer::ObjectSet;
#use AI::Categorizer::FeatureVector;
### Constructors
# Store the supplied content as the "body" section of this document.
# Takes named arguments; only "content" is read.  Returns the newly
# stored content hash reference.
sub parse {
    my $self = shift;
    my %args = @_;
    return $self->{content} = { body => $args{content} };
}
lib/AI/Categorizer/Experiment.pm view on Meta::CPAN
package AI::Categorizer::Experiment;
use strict;
use Class::Container;
use AI::Categorizer::Storable;
use Statistics::Contingency;
use base qw(Class::Container AI::Categorizer::Storable Statistics::Contingency);
use Params::Validate qw(:types);
# Constructor parameters for an Experiment:
#   categories - array reference or hash reference of categories
#   sig_figs   - scalar, 4 by default
__PACKAGE__->valid_params(
    categories => { type => ARRAYREF | HASHREF },
    sig_figs   => { type => SCALAR, default => 4 },
);
sub new {
my $package = shift;
my $self = $package->Class::Container::new(@_);
$self->{$_} = 0 foreach qw(a b c d);
my $c = delete $self->{categories};
$self->{categories} = { map {($_ => {a=>0, b=>0, c=>0, d=>0})}
UNIVERSAL::isa($c, 'HASH') ? keys(%$c) : @$c
lib/AI/Categorizer/FeatureSelector.pm view on Meta::CPAN
package AI::Categorizer::FeatureSelector;
use strict;
use Class::Container;
use base qw(Class::Container);
use Params::Validate qw(:types);
use AI::Categorizer::FeatureVector;
use AI::Categorizer::Util;
use Carp qw(croak);
# Constructor parameters for a FeatureSelector:
#   features_kept - scalar, 0.2 by default
#   verbose       - scalar, 0 by default
__PACKAGE__->valid_params(
    features_kept => { type => SCALAR, default => 0.2 },
    verbose       => { type => SCALAR, default => 0 },
);
# Combined getter/setter for the verbosity setting.  When called with an
# argument, that value is stored first; the current value is always
# returned.
sub verbose {
    my ($self, @new_value) = @_;
    if (@new_value) {
        $self->{verbose} = $new_value[0];
    }
    return $self->{verbose};
}
lib/AI/Categorizer/FeatureSelector.pm view on Meta::CPAN
when training the Learner or categorizing new documents. May be
specified as a positive integer (e.g. 2000) indicating the absolute
number of features to be kept, or as a decimal between 0 and 1
(e.g. 0.2) indicating the fraction of the total number of features to
be kept, or as 0 to indicate that no feature selection should be done
and that the entire set of features should be used. The default is
0.2.
=item feature_selection
A string indicating the type of feature selection that should be
performed. Currently the only option is also the default option:
C<document_frequency>.
=item tfidf_weighting
Specifies how document word counts should be converted to vector
values. Uses the three-character specification strings from Salton &
Buckley's paper "Term-weighting approaches in automatic text
retrieval". The three characters indicate the three factors that will
be multiplied for each feature to find the final vector value for that
lib/AI/Categorizer/FeatureSelector/CategorySelector.pm view on Meta::CPAN
package AI::Categorizer::FeatureSelector::CategorySelector;
use strict;
use AI::Categorizer::FeatureSelector;
use base qw(AI::Categorizer::FeatureSelector);
use Params::Validate qw(:types);
# Feature vectors are created on demand (delayed) from
# AI::Categorizer::FeatureVector.
__PACKAGE__->contained_objects(
    features => {
        class   => 'AI::Categorizer::FeatureVector',
        delayed => 1,
    },
);
1;
lib/AI/Categorizer/FeatureSelector/ChiSquare.pm view on Meta::CPAN
package AI::Categorizer::FeatureSelector::ChiSquare;
use strict;
use AI::Categorizer::FeatureSelector;
use base qw(AI::Categorizer::FeatureSelector::CategorySelector);
use Params::Validate qw(:types);
# Chi-Square function
# NB: this could probably be optimised a bit...
sub reduction_function {
my ($self,$term,$N,$allFeaturesSum,
$coll_features,$cat_features,$cat_features_sum) = @_;
my $CHI2SUM = 0;
my $nbcats = 0;
foreach my $catname (keys %{$cat_features}) {
lib/AI/Categorizer/FeatureSelector/DocFrequency.pm view on Meta::CPAN
package AI::Categorizer::FeatureSelector::DocFrequency;
use strict;
use AI::Categorizer::FeatureSelector;
use base qw(AI::Categorizer::FeatureSelector);
use Params::Validate qw(:types);
use Carp qw(croak);
# Feature vectors are created on demand (delayed) from
# AI::Categorizer::FeatureVector.
__PACKAGE__->contained_objects(
    features => {
        class   => 'AI::Categorizer::FeatureVector',
        delayed => 1,
    },
);
# The KnowledgeSet keeps track of document frequency, so just use that.
sub rank_features {
lib/AI/Categorizer/Hypothesis.pm view on Meta::CPAN
package AI::Categorizer::Hypothesis;
use strict;
use Class::Container;
use base qw(Class::Container);
use Params::Validate qw(:types);
# Constructor parameters for a Hypothesis:
#   all_categories - array reference
#   scores         - hash reference
#   threshold      - scalar
#   document_name  - optional scalar
__PACKAGE__->valid_params(
    all_categories => { type => ARRAYREF },
    scores         => { type => HASHREF },
    threshold      => { type => SCALAR },
    document_name  => { type => SCALAR, optional => 1 },
);
# Returns the contents of the all_categories array (its element count in
# scalar context).
sub all_categories {
    my ($self) = @_;
    return @{ $self->{all_categories} };
}
# Returns the stored document_name value (undef when none was set).
sub document_name {
    my ($self) = @_;
    return $self->{document_name};
}
# Returns the stored threshold value.
sub threshold {
    my ($self) = @_;
    return $self->{threshold};
}
sub best_category {
my ($self) = @_;
my $sc = $self->{scores};
return unless %$sc;
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
package AI::Categorizer::KnowledgeSet;
use strict;
use Class::Container;
use AI::Categorizer::Storable;
use base qw(Class::Container AI::Categorizer::Storable);
use Params::Validate qw(:types);
use AI::Categorizer::ObjectSet;
use AI::Categorizer::Document;
use AI::Categorizer::Category;
use AI::Categorizer::FeatureVector;
use AI::Categorizer::Util;
use Carp qw(croak);
# Constructor parameters for a KnowledgeSet, declared with
# Params::Validate type constants.
__PACKAGE__->valid_params(
    # Every element of the list must be an AI::Categorizer::Category.
    categories => {
        type      => ARRAYREF,
        default   => [],
        callbacks => {
            'all are Category objects' => sub {
                my ($list) = @_;
                return !grep { !UNIVERSAL::isa( $_, 'AI::Categorizer::Category' ) } @$list;
            },
        },
    },

    # Every element of the list must be an AI::Categorizer::Document.
    documents => {
        type      => ARRAYREF,
        default   => [],
        callbacks => {
            'all are Document objects' => sub {
                my ($list) = @_;
                return !grep { !UNIVERSAL::isa( $_, 'AI::Categorizer::Document' ) } @$list;
            },
        },
    },

    scan_first       => { type => BOOLEAN, default => 1 },
    feature_selector => { isa  => 'AI::Categorizer::FeatureSelector' },

    # Weighting settings; the three component settings each default to 'x'.
    tfidf_weighting      => { type => SCALAR, optional => 1 },
    term_weighting       => { type => SCALAR, default  => 'x' },
    collection_weighting => { type => SCALAR, default  => 'x' },
    normalize_weighting  => { type => SCALAR, default  => 'x' },

    verbose => { type => SCALAR, default => 0 },
);
__PACKAGE__->contained_objects
(
document => { delayed => 1,
class => 'AI::Categorizer::Document' },
category => { delayed => 1,
class => 'AI::Categorizer::Category' },
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
my ($self, $args) = @_;
return $args->{collection} || $self->create_delayed_object('collection', %$args);
}
sub scan_stats {
# Should determine:
# - number of documents
# - number of categories
# - avg. number of categories per document (whole corpus)
# - avg. number of tokens per document (whole corpus)
# - avg. number of types per document (whole corpus)
# - number of documents, tokens, & types for each category
# - "category skew index" (% variance?) by num. documents, tokens, and types
my ($self, %args) = @_;
my $collection = $self->_make_collection(\%args);
my $pb = $self->prog_bar($collection);
my %stats;
while (my $doc = $collection->next) {
$pb->();
$stats{category_count_with_duplicates} += $doc->categories;
my ($sum, $length) = ($doc->features->sum, $doc->features->length);
$stats{document_count}++;
$stats{token_count} += $sum;
$stats{type_count} += $length;
foreach my $cat ($doc->categories) {
#warn $doc->name, ": ", $cat->name, "\n";
$stats{categories}{$cat->name}{document_count}++;
$stats{categories}{$cat->name}{token_count} += $sum;
$stats{categories}{$cat->name}{type_count} += $length;
}
}
print "\n" if $self->verbose;
my @cats = keys %{ $stats{categories} };
$stats{category_count} = @cats;
$stats{categories_per_document} = $stats{category_count_with_duplicates} / $stats{document_count};
$stats{tokens_per_document} = $stats{token_count} / $stats{document_count};
$stats{types_per_document} = $stats{type_count} / $stats{document_count};
foreach my $thing ('type', 'token', 'document') {
$stats{"${thing}s_per_category"} = AI::Categorizer::Util::average
( map { $stats{categories}{$_}{"${thing}_count"} } @cats );
next unless @cats;
# Compute the skews
my $ssum;
foreach my $cat (@cats) {
$ssum += ($stats{categories}{$cat}{"${thing}_count"} - $stats{"${thing}s_per_category"}) ** 2;
}
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
when training the Learner or categorizing new documents. May be
specified as a positive integer (e.g. 2000) indicating the absolute
number of features to be kept, or as a decimal between 0 and 1
(e.g. 0.2) indicating the fraction of the total number of features to
be kept, or as 0 to indicate that no feature selection should be done
and that the entire set of features should be used. The default is
0.2.
=item feature_selection
A string indicating the type of feature selection that should be
performed. Currently the only option is also the default option:
C<document_frequency>.
=item tfidf_weighting
Specifies how document word counts should be converted to vector
values. Uses the three-character specification strings from Salton &
Buckley's paper "Term-weighting approaches in automatic text
retrieval". The three characters indicate the three factors that will
be multiplied for each feature to find the final vector value for that
lib/AI/Categorizer/Learner.pm view on Meta::CPAN
package AI::Categorizer::Learner;
use strict;
use Class::Container;
use AI::Categorizer::Storable;
use base qw(Class::Container AI::Categorizer::Storable);
use Params::Validate qw(:types);
use AI::Categorizer::ObjectSet;
# Constructor parameters shared by all Learners: an optional
# AI::Categorizer::KnowledgeSet object and a verbosity setting (0 by
# default).
__PACKAGE__->valid_params(
    knowledge_set => { isa  => 'AI::Categorizer::KnowledgeSet', optional => 1 },
    verbose       => { type => SCALAR, default => 0 },
);
__PACKAGE__->contained_objects
(
hypothesis => {
class => 'AI::Categorizer::Hypothesis',
delayed => 1,
},
experiment => {
class => 'AI::Categorizer::Experiment',
lib/AI/Categorizer/Learner/Boolean.pm view on Meta::CPAN
package AI::Categorizer::Learner::Boolean;
use strict;
use AI::Categorizer::Learner;
use base qw(AI::Categorizer::Learner);
use Params::Validate qw(:types);
use AI::Categorizer::Util qw(random_elements);
# Parameters added by the Boolean learner:
#   max_instances - scalar, 0 by default
#   threshold     - scalar, 0.5 by default
__PACKAGE__->valid_params(
    max_instances => { type => SCALAR, default => 0 },
    threshold     => { type => SCALAR, default => 0.5 },
);
sub create_model {
my $self = shift;
my $m = $self->{model} ||= {};
my $mi = $self->{max_instances};
foreach my $cat ($self->knowledge_set->categories) {
my (@p, @n);
foreach my $doc ($self->knowledge_set->documents) {
lib/AI/Categorizer/Learner/Boolean.pm view on Meta::CPAN
into multi-valued categorizers. For instance, the decision tree
categorizer C<AI::Categorizer::Learner::DecisionTree> maintains a
decision tree for each category, then uses it to decide whether a
certain document belongs to the given category.
Any class that inherits from this class should implement the following
methods:
=head2 create_boolean_model()
Used during training to create a category-specific model. The type of
model you create is up to you - it should be returned as a scalar.
Whatever you return will be available to you in the
C<get_boolean_score()> method, so put any information you'll need
during categorization in this scalar.
In addition to C<$self>, this method will be passed three arguments.
The first argument is a reference to an array of B<positive> examples,
i.e. documents that belong to the given category. The next argument
is a reference to an array of B<negative> examples, i.e. documents
that do I<not> belong to the given category. The final argument is
lib/AI/Categorizer/Learner/KNN.pm view on Meta::CPAN
package AI::Categorizer::Learner::KNN;
use strict;
use AI::Categorizer::Learner;
use base qw(AI::Categorizer::Learner);
use Params::Validate qw(:types);
# Parameters added by the KNN learner:
#   threshold     - scalar, 0.4 by default
#   k_value       - scalar, 20 by default
#   knn_weighting - scalar, 'score' by default
#   max_instances - scalar, 0 by default
__PACKAGE__->valid_params(
    threshold     => { type => SCALAR, default => 0.4 },
    k_value       => { type => SCALAR, default => 20 },
    knn_weighting => { type => SCALAR, default => 'score' },
    max_instances => { type => SCALAR, default => 0 },
);
# Prepare the KNN "model": normalize the feature vector of every
# document in the knowledge set, then ask the knowledge set for its
# features so they are initialized.  Returns whatever that final
# features() call returns.
sub create_model {
    my ($self) = @_;

    $_->features->normalize for $self->knowledge_set->documents;

    return $self->knowledge_set->features;    # Initialize
}
lib/AI/Categorizer/Learner/NaiveBayes.pm view on Meta::CPAN
package AI::Categorizer::Learner::NaiveBayes;
use strict;
use AI::Categorizer::Learner;
use base qw(AI::Categorizer::Learner);
use Params::Validate qw(:types);
use Algorithm::NaiveBayes;
# The NaiveBayes learner adds one parameter: a scalar threshold, 0.3 by
# default.
__PACKAGE__->valid_params(
    threshold => { type => SCALAR, default => 0.3 },
);
sub create_model {
my $self = shift;
my $m = $self->{model} = Algorithm::NaiveBayes->new;
foreach my $d ($self->knowledge_set->documents) {
$m->add_instance(attributes => $d->features->as_hash,
label => [ map $_->name, $d->categories ]);
}
lib/AI/Categorizer/Learner/Rocchio.pm view on Meta::CPAN
package AI::Categorizer::Learner::Rocchio;
$VERSION = '0.01';
use strict;
use Params::Validate qw(:types);
use AI::Categorizer::FeatureVector;
use AI::Categorizer::Learner::Boolean;
use base qw(AI::Categorizer::Learner::Boolean);
# Parameters added by the Rocchio learner:
#   positive_setting - scalar, 16 by default
#   negative_setting - scalar, 4 by default
#   threshold        - scalar, 0.1 by default
__PACKAGE__->valid_params(
    positive_setting => { type => SCALAR, default => 16 },
    negative_setting => { type => SCALAR, default => 4 },
    threshold        => { type => SCALAR, default => 0.1 },
);
sub create_model {
my $self = shift;
foreach my $doc ($self->knowledge_set->documents) {
$doc->features->normalize;
}
$self->{model}{all_features} = $self->knowledge_set->features(undef);
$self->SUPER::create_model(@_);
lib/AI/Categorizer/Learner/SVM.pm view on Meta::CPAN
package AI::Categorizer::Learner::SVM;
$VERSION = '0.01';
use strict;
use AI::Categorizer::Learner::Boolean;
use base qw(AI::Categorizer::Learner::Boolean);
use Algorithm::SVM;
use Algorithm::SVM::DataSet;
use Params::Validate qw(:types);
use File::Spec;
# The SVM learner adds one parameter: the kernel type, a scalar that is
# 'linear' by default.
__PACKAGE__->valid_params(
    svm_kernel => { type => SCALAR, default => 'linear' },
);
# Build the two lookup tables stored in the model before delegating to
# the parent class:
#   feature_map         - feature name => integer index
#   feature_map_reverse - array of feature names, position = index
# The names are the keys of the knowledge set's feature hash.  Remaining
# arguments are passed through to the parent's create_model(), whose
# result is returned.
sub create_model {
    my $self = shift;

    my @feature_names = keys %{ $self->knowledge_set->features->as_hash };

    # Number the features 0..N-1 in the order keys() produced them.
    my %index_of;
    @index_of{@feature_names} = ( 0 .. $#feature_names );

    $self->{model}{feature_map}         = \%index_of;
    $self->{model}{feature_map_reverse} = \@feature_names;

    return $self->SUPER::create_model(@_);
}
lib/AI/Categorizer/Learner/SVM.pm view on Meta::CPAN
=head2 new()
Creates a new SVM Learner and returns it. In addition to the
parameters accepted by the C<AI::Categorizer::Learner> class, the
SVM subclass accepts the following parameters:
=over 4
=item svm_kernel
Specifies what type of kernel should be used when building the SVM.
Default is 'linear'. Possible values are 'linear', 'polynomial',
'radial' and 'sigmoid'.
=back
=head2 train(knowledge_set => $k)
Trains the categorizer. This prepares it for later use in
categorizing documents. The C<knowledge_set> parameter must provide
an object of the class C<AI::Categorizer::KnowledgeSet> (or a subclass
lib/AI/Categorizer/Learner/Weka.pm view on Meta::CPAN
package AI::Categorizer::Learner::Weka;
use strict;
use AI::Categorizer::Learner::Boolean;
use base qw(AI::Categorizer::Learner::Boolean);
use Params::Validate qw(:types);
use File::Spec;
use File::Copy;
use File::Path ();
use File::Temp ();
# Parameters added by the Weka learner.  java_args and weka_args may be
# given either as a scalar or as an array reference; tmpdir defaults to
# the value of File::Spec->tmpdir at the time this declaration runs.
__PACKAGE__->valid_params(
    java_path       => { type => SCALAR,            default  => 'java' },
    java_args       => { type => SCALAR | ARRAYREF, optional => 1 },
    weka_path       => { type => SCALAR,            optional => 1 },
    weka_classifier => { type => SCALAR,            default  => 'weka.classifiers.NaiveBayes' },
    weka_args       => { type => SCALAR | ARRAYREF, optional => 1 },
    tmpdir          => { type => SCALAR,            default  => File::Spec->tmpdir },
);

# Feature vectors are created on demand (delayed).
__PACKAGE__->contained_objects(
    features => { class => 'AI::Categorizer::FeatureVector', delayed => 1 },
);
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
lib/AI/Categorizer/ObjectSet.pm view on Meta::CPAN
return values %{$_[0]};
}
# Returns the number of keys in the set's underlying hash.
sub size {
    my ($self) = @_;
    my $count = keys %$self;
    return $count;
}
# Add each given element to the set, keyed by the value of its name()
# method.  An element whose name is already present replaces the
# existing entry.
sub insert {
    my ($self, @elements) = @_;
    $self->{ $_->name } = $_ for @elements;
}
# Returns the element stored under the given name (undef if absent).
sub retrieve {
    my ($self, $name) = @_;
    return $self->{$name};
}
# True if the set holds an entry under the given element's name().
sub includes {
    my ($self, $element) = @_;
    return exists $self->{ $element->name };
}
# True if the set holds an entry under the given name.
sub includes_name {
    my ($self, $name) = @_;
    return exists $self->{$name};
}
1;