view release on metacpan or search on metacpan
lib/AI/Categorizer.pm view on Meta::CPAN
my $package = shift;
my %args = @_;
my %defaults;
if (exists $args{data_root}) {
$defaults{training_set} = File::Spec->catfile($args{data_root}, 'training');
$defaults{test_set} = File::Spec->catfile($args{data_root}, 'test');
$defaults{category_file} = File::Spec->catfile($args{data_root}, 'cats.txt');
delete $args{data_root};
}
return $package->SUPER::new(%defaults, %args);
}
#sub dump_parameters {
# my $p = shift()->SUPER::dump_parameters;
# delete $p->{stopwords} if $p->{stopword_file};
# return $p;
#}
# Accessor: returns the value stored under the object's 'knowledge_set' key.
sub knowledge_set {
    my $self = shift;
    return $self->{knowledge_set};
}
# Accessor: returns the value stored under the object's 'learner' key.
sub learner {
    my $self = shift;
    return $self->{learner};
}
# Combines several methods in one sub
sub run_experiment {
my $self = shift;
lib/AI/Categorizer/Category.pm view on Meta::CPAN
(
features => {
class => 'AI::Categorizer::FeatureVector',
delayed => 1,
},
);
# Maps a category name to its object: new() stores every object it creates
# here, and by_name() returns the stored object when the name is already known.
my %REGISTRY = ();
# Constructor.
#
# Builds the object through the parent class, replaces the 'documents'
# list with an AI::Categorizer::ObjectSet holding the same members, and
# records the new object in %REGISTRY under its 'name' so that by_name()
# can return it later.
#
# Arguments: passed through unchanged to the parent constructor.
# Returns:   the new object.
sub new {
    my $self = shift()->SUPER::new(@_);

    # Direct method-call syntax instead of the indirect "new Class(...)"
    # form, which perl can mis-parse and which is unavailable under
    # "use v5.36" and later.
    $self->{documents} = AI::Categorizer::ObjectSet->new( @{ $self->{documents} } );

    $REGISTRY{ $self->{name} } = $self;
    return $self;
}
# Look a category up by name, creating it on a miss.
#
# Arguments: named parameters; 'name' is the registry key.
# Returns:   the object already stored in %REGISTRY under that name, or a
#            freshly constructed one built from the same arguments.
sub by_name {
    my ($class, %args) = @_;
    my $wanted = $args{name};

    if (exists $REGISTRY{$wanted}) {
        return $REGISTRY{$wanted};
    }

    return $class->new(%args);
}
lib/AI/Categorizer/Collection.pm view on Meta::CPAN
);
sub new {
my ($class, %args) = @_;
# Optimize so every document doesn't have to convert the stopword list to a hash
if ($args{stopwords} and UNIVERSAL::isa($args{stopwords}, 'ARRAY')) {
$args{stopwords} = { map {+$_ => 1} @{ $args{stopwords} } };
}
my $self = $class->SUPER::new(%args);
if ($self->{category_file}) {
local *FH;
open FH, $self->{category_file} or die "Can't open $self->{category_file}: $!";
while (<FH>) {
my ($doc, @cats) = split;
$self->{category_hash}{$doc} = \@cats;
}
close FH;
}
lib/AI/Categorizer/Collection/DBI.pm view on Meta::CPAN
);
# Declare the contained 'document' object, whose class is
# AI::Categorizer::Document.
# NOTE(review): 'delayed => 1' presumably defers creation of the object
# until it is explicitly requested -- confirm against the container base class.
__PACKAGE__->contained_objects
(
document => { class => 'AI::Categorizer::Document',
delayed => 1 },
);
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
die "Must provide 'dbh' or 'connection_string' arguments"
unless $self->{dbh} or $self->{connection_string};
unless ($self->{dbh}) {
$self->{dbh} = DBI->connect($self->{connection_string}, '', '', {RaiseError => 1})
or die DBI->errstr;
delete $self->{connection_string};
}
lib/AI/Categorizer/Collection/Files.pm view on Meta::CPAN
use File::Spec;
# Constructor parameters declared by this class:
#   path    - a scalar or an array reference (new() wraps a scalar in an
#             array reference)
#   recurse - boolean, defaults to 0
#             NOTE(review): presumably controls descending into
#             subdirectories -- confirm where it is read
__PACKAGE__->valid_params
(
path => { type => SCALAR|ARRAYREF },
recurse => { type => BOOLEAN, default => 0 },
);
# Constructor.
#
# Builds the object through the parent class, then prepares the state
# used to walk the document directories and positions the collection on
# the first path via _next_path.
#
# Arguments: passed through unchanged to the parent constructor.
# Returns:   the new object.
sub new {
    my $self = shift()->SUPER::new(@_);

    # The documents live in one directory or in a list of directories;
    # normalise a single path to a one-element list.
    if (!ref $self->{path}) {
        $self->{path} = [ $self->{path} ];
    }
    $self->{used} = [];

    # Give this object its own glob to use as the directory handle.
    $self->{dir_fh} = do { local *FH; *FH };    # double *FH avoids a warning

    $self->_next_path;
    return $self;
}
lib/AI/Categorizer/Collection/InMemory.pm view on Meta::CPAN
use base qw(AI::Categorizer::Collection);
use Params::Validate qw(:types);
# Constructor parameter declared by this class:
#   data - a hash reference; new() walks its values and reads a
#          'categories' array from each of them
__PACKAGE__->valid_params
(
data => { type => HASHREF },
);
# Constructor.
#
# Builds the object through the parent class, then walks every entry of
# the 'data' hash and replaces each plain (non-reference) member of its
# 'categories' list, in place, with the object returned by
# AI::Categorizer::Category->by_name for that name.  Members that are
# already references are left untouched.
#
# Arguments: passed through unchanged to the parent constructor.
# Returns:   the new object.
sub new {
    my $self = shift()->SUPER::new(@_);

    # Iterate with values() rather than each(): the key is not needed, and
    # values() resets the hash iterator, so no entry can be skipped if the
    # iterator was left part-way through by earlier code.
    for my $params (values %{ $self->{data} }) {
        # $category aliases the array element, so the assignment below
        # updates the list itself.
        for my $category (@{ $params->{categories} }) {
            next if ref $category;
            $category = AI::Categorizer::Category->by_name(name => $category);
        }
    }

    return $self;
}
lib/AI/Categorizer/Collection/SingleFile.pm view on Meta::CPAN
);
# Declare the contained 'document' object, whose class is
# AI::Categorizer::Document::Text.
# NOTE(review): 'delayed => 1' presumably defers creation of the object
# until it is explicitly requested -- confirm against the container base class.
__PACKAGE__->contained_objects
(
document => { class => 'AI::Categorizer::Document::Text',
delayed => 1 },
);
# Constructor.
#
# Builds the object through the parent class, then prepares the state
# used to read the document files and positions the collection on the
# first path via _next_path.
#
# Arguments: passed through unchanged to the parent constructor.
# Returns:   the new object.
sub new {
    my $self = shift()->SUPER::new(@_);

    # Give this object its own glob to use as the file handle.
    $self->{fh} = do { local *FH; *FH };    # double *FH avoids a warning

    # The documents live in one file or in a list of files; normalise a
    # single path to a one-element list.
    if (!ref $self->{path}) {
        $self->{path} = [ $self->{path} ];
    }
    $self->{used} = [];

    $self->_next_path;
    return $self;
}
lib/AI/Categorizer/Document.pm view on Meta::CPAN
features => { delayed => 1,
class => 'AI::Categorizer::FeatureVector' },
);
### Constructors
my $NAME = 'a';
sub new {
my $pkg = shift;
my $self = $pkg->SUPER::new(name => $NAME++, # Use a default name
@_);
# Get efficient internal data structures
$self->{categories} = new AI::Categorizer::ObjectSet( @{$self->{categories}} );
$self->_fix_stopwords;
# A few different ways for the caller to initialize the content
if (exists $self->{parse}) {
$self->parse(content => delete $self->{parse});
lib/AI/Categorizer/Document/XML.pm view on Meta::CPAN
use strict;
use base qw(XML::SAX::Base);
# Input: a hash of arguments; its 'weights' entry holds the element weights
# Output: an object of this class
# Description: this is the constructor
sub new{
my ($class, %args) = @_;
# call super class such as XML::SAX::Base
my $self = $class->SUPER::new;
# Save the element weights, a hash of <elementName, weight> pairs.
# The weight is the number of times the content of the corresponding element is duplicated.
# It is supplied by the caller (as one of the constructor parameters), and
# we must keep it so that the duplication can be done in end_element.
$self->{weightHash} = $args{weights};
# It is storage to store the data produced by Text, CDataSection and etc.
$self->{content} = '';
lib/AI/Categorizer/Document/XML.pm view on Meta::CPAN
# The level(depth) of the last called element in XML tree
# Calling of start_element is the preorder of the tree traversal.
# The level is the level of current visiting element in tree.
# the first element is 0-level
$self->{levelPointer} = 0;
# all data will be saved into here, initially, it is an empty
$self->{content} = "";
#$self->SUPER::start_document($doc);
}
# Input: the handler object and the document argument supplied by the parser
# Output: nothing meaningful
# Description:
# Called when the parser reaches the end of the document.
# It is called once per document (per the original note).
# There is nothing to do here; the call to the parent handler is left commented out.
sub end_document{
my ($self, $doc)= @_;
#$self->SUPER::end_document($doc);
}
# Input
# LocalName: $el->{LocalName}
# NamespaceURI: $el->{NamespaceURI}
# Name $el->{Name}
# Prefix $el->{Prefix}
# Attributes $el->{Attributes}
# for each attribute
# LocalName: $el->{LocalName}
lib/AI/Categorizer/Document/XML.pm view on Meta::CPAN
# its meaning is to append the new data at this location
my $location= length $self->{content};
# save the last location of the current content
# so that at end_element the starting location of data of this element can be known
$self->{locationArray}[$self->{levelPointer}] = $location;
# for the next element, increase levelPointer
$self->{levelPointer}++;
#$self->SUPER::start_document($el);
}
# Input: the element that is being closed
# Output: None
# Description:
# it is called whenever the parser reaches the end of an element
sub end_element{
my ($self, $el)= @_;
$self->{levelPointer}--;
lib/AI/Categorizer/Document/XML.pm view on Meta::CPAN
# n - duplicate data by n times
# get new content
my $newContent= substr($self->{content}, $location);
# start to copy
for(my $i=1; $i<$weight;$i++){
$self->{content} .= $newContent;
}
#$self->SUPER::end_document($el);
}
# Input: a hash which consists of the pair <Data, Value>
# Output: None
# Description:
# it is called whenever the parser encounters character data (from Text, CDataSection, etc.)
# The Value must be saved into the content buffer.
sub characters{
my ($self, $args)= @_;
lib/AI/Categorizer/Experiment.pm view on Meta::CPAN
# Record one hypothesis as an experimental result.
#
# Arguments:
#   $h       - hypothesis object; must be true, and must provide
#              document_name and categories methods
#   $correct - passed through to add_result as the second argument
#   $name    - optional result name; defaults to $h->document_name
# Returns: whatever add_result returns.
# Dies when no hypothesis is given.
sub add_hypothesis {
    my ($self, $h, $correct, $name) = @_;

    $h or die "No hypothesis given to add_hypothesis()";

    if (!defined $name) {
        $name = $h->document_name;
    }

    my @assigned = $h->categories;
    return $self->add_result(\@assigned, $correct, $name);
}
# Returns the parent class's stats table, passing it this object's
# 'sig_figs' value.
sub stats_table {
    my ($self) = @_;
    my $sig_figs = $self->{sig_figs};
    return $self->SUPER::stats_table($sig_figs);
}
1;
__END__
=head1 NAME
AI::Categorizer::Experiment - Coordinate experimental results
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
sub new {
my ($pkg, %args) = @_;
# Shortcuts
if ($args{tfidf_weighting}) {
@args{'term_weighting', 'collection_weighting', 'normalize_weighting'} = split '', $args{tfidf_weighting};
delete $args{tfidf_weighting};
}
my $self = $pkg->SUPER::new(%args);
# Convert to AI::Categorizer::ObjectSet sets
$self->{categories} = new AI::Categorizer::ObjectSet( @{$self->{categories}} );
$self->{documents} = new AI::Categorizer::ObjectSet( @{$self->{documents}} );
if ($self->{load}) {
my $args = ref($self->{load}) ? $self->{load} : { path => $self->{load} };
$self->load(%$args);
delete $self->{load};
}
lib/AI/Categorizer/Learner/DecisionTree.pm view on Meta::CPAN
package AI::Categorizer::Learner::DecisionTree;
$VERSION = '0.01';
use strict;
use AI::DecisionTree;
use AI::Categorizer::Learner::Boolean;
use base qw(AI::Categorizer::Learner::Boolean);
# Build the model through the parent class, then release the tree kept
# under $self->{model}{first_tree}.
#
# The entry is removed from the model and, when one was present, its
# do_purge method is called.  A missing entry is tolerated instead of
# dying with "Can't call method on an undefined value" (presumably this
# happens when the parent built no trees at all -- confirm).
#
# Returns: the removed tree, or undef when there was none.
sub create_model {
    my $self = shift;
    $self->SUPER::create_model;

    my $first_tree = delete $self->{model}{first_tree};
    $first_tree->do_purge if defined $first_tree;

    return $first_tree;
}
sub create_boolean_model {
my ($self, $positives, $negatives, $cat) = @_;
my $t = new AI::DecisionTree(noise_mode => 'pick_best',
verbose => $self->verbose);
lib/AI/Categorizer/Learner/DecisionTree.pm view on Meta::CPAN
}
print STDERR "\nBuilding tree for category '", $cat->name, "'" if $self->verbose;
$t->train;
return $t;
}
# Score a document through the parent class.
#
# For the duration of the call, $self->{current_doc} holds the
# document's features as a boolean hash, where get_boolean_score reads
# them; the previous value is restored on return.
sub get_scores {
    my ($self, $doc) = @_;

    my $attributes = $doc->features->as_boolean_hash;
    local $self->{current_doc} = $attributes;

    return $self->SUPER::get_scores($doc);
}
# Ask one tree for its result on the current document.
#
# Arguments:
#   $doc - not used here; the attributes come from $self->{current_doc}
#   $t   - tree object providing get_result
# Returns: the tree's result when it is true, otherwise 0.
sub get_boolean_score {
    my ($self, $doc, $t) = @_;
    my $result = $t->get_result(attributes => $self->{current_doc});
    return $result ? $result : 0;
}
1;
__END__
lib/AI/Categorizer/Learner/KNN.pm view on Meta::CPAN
return $self->{threshold};
}
# Categorize a collection through the parent class.
#
# Beforehand, when the knowledge set's 'features' class provides an
# all_features method, it is called with the list of the knowledge set's
# feature names.
#
# Arguments: passed through unchanged to the parent method.
# Returns:   whatever the parent method returns.
sub categorize_collection {
    my $self = shift;

    my $feature_class = $self->knowledge_set->contained_class('features');
    if ($feature_class->can('all_features')) {
        my @feature_names = $self->knowledge_set->features->names;
        $feature_class->all_features(\@feature_names);
    }

    return $self->SUPER::categorize_collection(@_);
}
sub get_scores {
my ($self, $newdoc) = @_;
my $currentDocName = $newdoc->name;
#print "classifying $currentDocName\n";
my $features = $newdoc->features->intersection($self->knowledge_set->features)->normalize;
my $q = AI::Categorizer::Learner::KNN::Queue->new(size => $self->{k_value});
lib/AI/Categorizer/Learner/NaiveBayes.pm view on Meta::CPAN
# Combined getter/setter for the 'threshold' value.
#
# With an argument (even an undefined one) the value is stored first.
# Returns: the current threshold.
sub threshold {
    my $self = shift;

    if (@_) {
        $self->{threshold} = shift;
    }

    return $self->{threshold};
}
# Save the learner's state through the parent class.
#
# The 'knowledge_set' entry is blanked for the duration of the call,
# because it is not needed to categorize, and restored afterwards.
#
# Arguments: passed through unchanged to the parent method.
# Returns:   whatever the parent method returns.
sub save_state {
    my $self = shift;

    local $self->{knowledge_set} = undef;

    return $self->SUPER::save_state(@_);
}
# Returns one category per label of the model, each obtained from
# AI::Categorizer::Category->by_name, in the order the model lists its
# labels.
sub categories {
    my $self = shift;

    my @found;
    for my $label ($self->{model}->labels) {
        push @found, AI::Categorizer::Category->by_name(name => $label);
    }

    return @found;
}
1;
__END__
lib/AI/Categorizer/Learner/Rocchio.pm view on Meta::CPAN
threshold => {type => SCALAR, default => 0.1},
);
# Build the model.
#
# Steps, in order:
#   1. normalize the features of every document in the knowledge set;
#   2. store the knowledge set's features (requested with an explicit
#      undef argument) under $self->{model}{all_features};
#   3. let the parent class build the model;
#   4. drop the knowledge set from the object.
#
# Arguments: passed through unchanged to the parent method.
# Returns:   the knowledge set that was removed.
sub create_model {
    my $self = shift;

    my @documents = $self->knowledge_set->documents;
    for my $document (@documents) {
        $document->features->normalize;
    }

    $self->{model}{all_features} = $self->knowledge_set->features(undef);

    $self->SUPER::create_model(@_);

    return delete $self->{knowledge_set};
}
sub create_boolean_model {
my ($self, $positives, $negatives, $cat) = @_;
my $posdocnum = @$positives;
my $negdocnum = @$negatives;
my $beta = $self->{positive_setting};
my $gamma = $self->{negative_setting};
lib/AI/Categorizer/Learner/SVM.pm view on Meta::CPAN
(
svm_kernel => {type => SCALAR, default => 'linear'},
);
# Build the model.
#
# Before delegating to the parent class, every feature name of the
# knowledge set is given a numeric index:
#   $self->{model}{feature_map}         - hash from feature name to index
#   $self->{model}{feature_map_reverse} - array from index to feature name
#
# Arguments: passed through unchanged to the parent method.
# Returns:   whatever the parent method returns.
sub create_model {
    my $self = shift;

    my $features      = $self->knowledge_set->features->as_hash;
    my @feature_names = keys %$features;

    # A hash slice pairs each name with its position in @feature_names.
    my %index_of;
    @index_of{@feature_names} = (0 .. $#feature_names);

    $self->{model}{feature_map}         = \%index_of;
    $self->{model}{feature_map_reverse} = \@feature_names;

    return $self->SUPER::create_model(@_);
}
sub _doc_2_dataset {
my ($self, $doc, $label, $fm) = @_;
my $ds = new Algorithm::SVM::DataSet(Label => $label);
my $f = $doc->features->as_hash;
while (my ($k, $v) = each %$f) {
next unless exists $fm->{$k};
$ds->attribute( $fm->{$k}, $v );
lib/AI/Categorizer/Learner/SVM.pm view on Meta::CPAN
push @neg, $self->_doc_2_dataset($doc, 0, $self->{model}{feature_map});
}
$svm->train(@pos, @neg);
return $svm;
}
# Score a document through the parent class.
#
# For the duration of the call, $self->{current_doc} holds the document
# converted by _doc_2_dataset (label -1, using the model's feature map),
# where get_boolean_score reads it; the previous value is restored on
# return.
sub get_scores {
    my ($self, $doc) = @_;

    my $feature_map = $self->{model}{feature_map};
    local $self->{current_doc} = $self->_doc_2_dataset($doc, -1, $feature_map);

    return $self->SUPER::get_scores($doc);
}
# Ask one SVM for its prediction on the current document.
#
# Arguments:
#   $doc - not used here; the data set comes from $self->{current_doc}
#   $svm - object providing predict
# Returns: whatever $svm->predict returns.
sub get_boolean_score {
    my ($self, undef, $svm) = @_;
    my $dataset = $self->{current_doc};
    return $svm->predict($dataset);
}
# Save the learner's state under $path.
#
# The parent class saves the object first, with the 'knowledge_set' and
# the model's 'learners' entries blanked for the duration of that call.
# Afterwards each entry of $self->{model}{learners} is written with its
# own save method to a file named after its key, inside an 'svms'
# directory created below $path.
#
# Dies when the 'svms' directory cannot be created.
sub save_state {
    my ($self, $path) = @_;

    {
        local $self->{model}{learners};
        local $self->{knowledge_set};
        $self->SUPER::save_state($path);
    }

    if (!$self->{model}) {
        return;
    }

    my $svm_dir = File::Spec->catdir($path, 'svms');
    unless (mkdir($svm_dir, 0777)) {
        die "Couldn't create $svm_dir: $!";
    }

    while (my ($learner_name, $svm) = each %{$self->{model}{learners}}) {
        my $svm_file = File::Spec->catfile($svm_dir, $learner_name);
        $svm->save($svm_file);
    }
}
sub restore_state {
my ($self, $path) = @_;
$self = $self->SUPER::restore_state($path);
my $svm_dir = File::Spec->catdir($path, 'svms');
return $self unless -e $svm_dir;
opendir my($dh), $svm_dir or die "Can't open directory $svm_dir: $!";
while (defined (my $file = readdir $dh)) {
my $full_file = File::Spec->catfile($svm_dir, $file);
next if -d $full_file;
$self->{model}{learners}{$file} = new Algorithm::SVM(Model => $full_file);
}
return $self;
lib/AI/Categorizer/Learner/Weka.pm view on Meta::CPAN
tmpdir => {type => SCALAR, default => File::Spec->tmpdir},
);
# Declare the contained 'features' object, whose class is
# AI::Categorizer::FeatureVector; create_model() obtains one through
# create_delayed_object('features').
# NOTE(review): 'delayed => 1' presumably defers creation of the object
# until it is explicitly requested -- confirm against the container base class.
__PACKAGE__->contained_objects
(
features => {class => 'AI::Categorizer::FeatureVector', delayed => 1},
);
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
for ('java_args', 'weka_args') {
$self->{$_} = [] unless defined $self->{$_};
$self->{$_} = [$self->{$_}] unless UNIVERSAL::isa($self->{$_}, 'ARRAY');
}
if (defined $self->{weka_path}) {
push @{$self->{java_args}}, '-classpath', $self->{weka_path};
delete $self->{weka_path};
}
lib/AI/Categorizer/Learner/Weka.pm view on Meta::CPAN
# Build the model.
#
# Prepares $self->{model} (created when absent) before delegating to the
# parent class:
#   all_features - array of the knowledge set's feature names
#   _in_dir      - a new temporary directory created below $self->{tmpdir}
#   dummy_file   - an ARFF file named "dummy", written from a single
#                  freshly created 'features' object paired with 0
#
# Arguments: passed through unchanged to the parent method.
# Returns:   whatever the parent method returns.
sub create_model {
    my $self = shift;

    $self->{model} ||= {};
    my $model = $self->{model};

    my @feature_names = $self->knowledge_set->features->names;
    $model->{all_features} = \@feature_names;

    $model->{_in_dir} = File::Temp::tempdir(DIR => $self->{tmpdir});

    # WEKA insists on a test file, so write a dummy one in ARFF format.
    my $placeholder = $self->create_delayed_object('features');
    $model->{dummy_file} = $self->create_arff_file('dummy', [ [ $placeholder, 0 ] ]);

    return $self->SUPER::create_model(@_);
}
sub create_boolean_model {
my ($self, $pos, $neg, $cat) = @_;
my @docs = (map([$_->features, 1], @$pos),
map([$_->features, 0], @$neg));
my $train_file = $self->create_arff_file($cat->name . '_train', \@docs);
my %info = (machine_file => $cat->name . '_model');
lib/AI/Categorizer/Learner/Weka.pm view on Meta::CPAN
}
return $filename;
}
# Save the learner's state under $path.
#
# The parent class saves the object first, with the 'knowledge_set'
# entry blanked for the duration of that call.  When the object has a
# model, the machine file of every entry in $self->{model}{learners} is
# then copied from the model's current '_in_dir' into a 'models'
# directory created below $path, as "<key>_model", and '_in_dir' is
# pointed at that new directory.
#
# Returns: the new 'models' directory, or nothing when there is no model.
# Dies when the 'models' directory cannot be created or when a machine
# file cannot be copied.
sub save_state {
    my ($self, $path) = @_;

    {
        local $self->{knowledge_set};
        $self->SUPER::save_state($path);
    }
    return unless $self->{model};

    my $model_dir = File::Spec->catdir($path, 'models');
    mkdir($model_dir, 0777) or die "Couldn't create $model_dir: $!";

    for my $name (keys %{ $self->{model}{learners} }) {
        my $learner = $self->{model}{learners}{$name};

        # Both paths name files, so both are built with catfile.
        my $oldpath = File::Spec->catfile($self->{model}{_in_dir}, $learner->{machine_file});
        my $newpath = File::Spec->catfile($model_dir, "${name}_model");

        # copy() returns false on failure.  Without this check '_in_dir'
        # would be switched to a directory that lacks the machine file.
        File::Copy::copy($oldpath, $newpath)
            or die "Couldn't copy $oldpath to $newpath: $!";
    }

    $self->{model}{_in_dir} = $model_dir;
}
# Restore a learner saved under $path.
#
# The parent class rebuilds the object.  When a 'models' directory
# exists below $path, the model's '_in_dir' is pointed at it.
#
# Returns: the restored object.
sub restore_state {
    my ($pkg, $path) = @_;

    my $self      = $pkg->SUPER::restore_state($path);
    my $model_dir = File::Spec->catdir($path, 'models');

    if (-e $model_dir) {
        $self->{model}{_in_dir} = $model_dir;
    }

    return $self;
}
1;