eg/categorizer view on Meta::CPAN
my ($opt, $do_stage, $outfile) = parse_command_line(@ARGV);
@ARGV = grep !/^-\d$/, @ARGV;
my $c = eval {new AI::Categorizer(%$opt)};
if ($@ and $@ =~ /^The following parameter/) {
die "$@\nPlease see the AI::Categorizer documentation for a description of parameters accepted.\n";
}
die $@ if $@;
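# Default to running all five stages if none were requested on the command line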
%$do_stage = map {$_, 1} 1..5 unless keys %$do_stage;
my $out_fh;
if ($outfile) {
open $out_fh, '>>', $outfile or die "Can't open $outfile for appending: $!";
# Enable autoflush on the output filehandle
select((select($out_fh), $|=1)[0]);
if (keys(%$do_stage) > 1) {
print $out_fh "~~~~~~~~~~~~~~~~", scalar(localtime), "~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
if ($HAVE_YAML) {
print {$out_fh} YAML::Dump($c->dump_parameters);
} else {
lib/AI/Categorizer/Collection.pm view on Meta::CPAN
(
document => { class => 'AI::Categorizer::Document::Text',
delayed => 1 },
);
sub new {
my ($class, %args) = @_;
# Optimize so every document doesn't have to convert the stopword list to a hash
if ($args{stopwords} and UNIVERSAL::isa($args{stopwords}, 'ARRAY')) {
$args{stopwords} = { map {+$_ => 1} @{ $args{stopwords} } };
}
my $self = $class->SUPER::new(%args);
if ($self->{category_file}) {
local *FH;
open FH, $self->{category_file} or die "Can't open $self->{category_file}: $!";
while (<FH>) {
my ($doc, @cats) = split;
$self->{category_hash}{$doc} = \@cats;
lib/AI/Categorizer/Collection.pm view on Meta::CPAN
=item new()
Creates a new Collection object and returns it. Accepts the following
parameters:
=over 4
=item category_hash
Indicates a reference to a hash which maps document names to category
names. The keys of the hash are the document names; each value should
be a reference to an array containing the names of the categories to
which that document belongs.
=item category_file
Indicates a file which should be read in order to create the
C<category_hash>. Each line of the file should list a document's
name, followed by a list of category names, all separated by
whitespace.
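For example, here is a minimal sketch of reading a categorized corpus with
the C<Files> subclass (the C<path> parameter and the file layout shown here
are illustrative assumptions based on C<AI::Categorizer::Collection::Files>):

  use AI::Categorizer::Collection::Files;

  my $c = AI::Categorizer::Collection::Files->new
    (
     path          => 'corpus/training',
     category_file => 'corpus/categories.txt',  # "doc_name cat1 cat2 ..." per line
    );

  while (my $doc = $c->next) {
    print $doc->name, ": ", join(', ', map $_->name, $doc->categories), "\n";
  }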
lib/AI/Categorizer/Collection/Files.pm view on Meta::CPAN
push @{$self->{used}}, $self->{cur_dir};
opendir $self->{dir_fh}, $self->{cur_dir} or die "$self->{cur_dir}: $!";
}
sub next {
my $self = shift;
my $file = $self->_read_file;
return unless defined $file;
warn "No category information about '$file'" unless defined $self->{category_hash}{$file};
my @cats = map AI::Categorizer::Category->by_name(name => $_), @{ $self->{category_hash}{$file} || [] };
return $self->call_method('document', 'read',
path => File::Spec->catfile($self->{cur_dir}, $file),
name => $file,
categories => \@cats,
);
}
sub _read_file {
my ($self) = @_;
lib/AI/Categorizer/Document.pm view on Meta::CPAN
}
$self->finish if $self->{content};
return $self;
}
sub _fix_stopwords {
my $self = shift;
# Convert to hash
$self->{stopwords} = { map {($_ => 1)} @{ $self->{stopwords} } }
if UNIVERSAL::isa($self->{stopwords}, 'ARRAY');
my $s = $self->{stopwords};
# May need to perform stemming on the stopwords
return unless keys %$s; # No point in doing anything if there are no stopwords
return unless $self->{stopword_behavior} eq 'stem';
return if !defined($self->{stemming}) or $self->{stemming} eq 'none';
return if $s->{___stemmed};
lib/AI/Categorizer/Document.pm view on Meta::CPAN
}
}
return \%counts;
}
sub vectorize {
my ($self, %args) = @_;
if ($self->{stem_stopwords}) {
my $s = $self->stem_tokens([keys %{$self->{stopwords}}]);
$self->{stopwords} = { map {+$_, 1} @$s };
$args{tokens} = $self->_filter_tokens($args{tokens});
}
return $self->_weigh_tokens($args{tokens}, $args{weight});
}
sub read {
my ($class, %args) = @_;
my $path = delete $args{path} or die "Must specify 'path' argument to read()";
my $self = $class->new(%args);
lib/AI/Categorizer/Document/SMART.pm view on Meta::CPAN
or die "Malformed record: $args{content}";
my ($id, $categories, $title) = ($1, $2, $3);
$self->{name} = $id;
$self->{content} = { title => $title,
body => $args{content} };
my @categories = $categories =~ m/(.*?)\s+\d+[\s;]*/g;
@categories = map AI::Categorizer::Category->by_name(name => $_), @categories;
$self->{categories} = \@categories;
}
1;
lib/AI/Categorizer/Experiment.pm view on Meta::CPAN
categories => { type => ARRAYREF|HASHREF },
sig_figs => { type => SCALAR, default => 4 },
);
sub new {
my $package = shift;
my $self = $package->Class::Container::new(@_);
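# a, b, c, and d are per-category contingency-table counts (conventionally
# a = true positives, b = false positives, c = false negatives, and
# d = true negatives).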
$self->{$_} = 0 foreach qw(a b c d);
my $c = delete $self->{categories};
$self->{categories} = { map {($_ => {a=>0, b=>0, c=>0, d=>0})}
UNIVERSAL::isa($c, 'HASH') ? keys(%$c) : @$c
};
return $self;
}
sub add_hypothesis {
my ($self, $h, $correct, $name) = @_;
die "No hypothesis given to add_hypothesis()" unless $h;
$name = $h->document_name unless defined $name;
lib/AI/Categorizer/Experiment.pm view on Meta::CPAN
AI::Categorizer::Experiment - Coordinate experimental results
=head1 SYNOPSIS
use AI::Categorizer::Experiment;
my $e = new AI::Categorizer::Experiment(categories => \%categories);
my $l = AI::Categorizer::Learner->restore_state(...path...);
while (my $d = ... get document ...) {
my $h = $l->categorize($d); # A Hypothesis
$e->add_hypothesis($h, [map $_->name, $d->categories]);
}
print "Micro F1: ", $e->micro_F1, "\n"; # Access a single statistic
print $e->stats_table; # Show several stats in table form
=head1 DESCRIPTION
The C<AI::Categorizer::Experiment> class helps you organize the
results of categorization experiments. As you get lots of
categorization results (Hypotheses) back from the Learner, you can
lib/AI/Categorizer/FeatureSelector.pm view on Meta::CPAN
AI::Categorizer::FeatureSelector - Abstract Feature Selection class
=head1 SYNOPSIS
...
=head1 DESCRIPTION
The FeatureSelector class is an abstract base class for the feature
selection strategies used by AI::Categorizer. Subclasses such as
C<AI::Categorizer::FeatureSelector::DocFrequency> implement particular
methods of deciding which document features (tokens) are kept.
=head1 METHODS
=over 4
=item new()
Creates a new feature-selector object and returns it. Accepts the following
lib/AI/Categorizer/FeatureSelector/DocFrequency.pm view on Meta::CPAN
AI::Categorizer::FeatureSelector::DocFrequency - Select features by document frequency
=head1 SYNOPSIS
...
=head1 DESCRIPTION
This class selects features according to their document frequency, i.e.
the number of documents in the KnowledgeSet in which each feature
appears. It is a concrete subclass of the abstract
C<AI::Categorizer::FeatureSelector> class.
=head1 METHODS
=over 4
=item new()
Creates a new feature-selector object and returns it. Accepts the following
lib/AI/Categorizer/FeatureVector.pm view on Meta::CPAN
}
sub scale {
my ($self, $scalar) = @_;
$_ *= $scalar foreach values %{$self->{features}};
return $self;
}
sub as_boolean_hash {
my $self = shift;
return { map {($_ => 1)} keys %{$self->{features}} };
}
sub length {
my $self = shift;
return scalar keys %{$self->{features}};
}
sub clone {
my $self = shift;
return ref($self)->new( features => { %{$self->{features}} } );
}
sub intersection {
my ($self, $other) = @_;
$other = $other->as_hash if UNIVERSAL::isa($other, __PACKAGE__);
my $common;
if (UNIVERSAL::isa($other, 'ARRAY')) {
$common = {map {exists $self->{features}{$_} ? ($_ => $self->{features}{$_}) : ()} @$other};
} elsif (UNIVERSAL::isa($other, 'HASH')) {
$common = {map {exists $self->{features}{$_} ? ($_ => $self->{features}{$_}) : ()} keys %$other};
}
return ref($self)->new( features => $common );
}
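# Illustrative example (feature names and weights are hypothetical):
#
#   my $v1 = AI::Categorizer::FeatureVector->new(features => {perl => 3, ai => 1});
#   my $v2 = AI::Categorizer::FeatureVector->new(features => {perl => 2, cpan => 5});
#   my $common = $v1->intersection($v2);   # contains only perl => 3
#   my $sum = $v1->clone;
#   $sum->add($v2);                        # now perl => 5, ai => 1, cpan => 5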
sub add {
my ($self, $other) = @_;
$other = $other->as_hash if UNIVERSAL::isa($other, __PACKAGE__);
while (my ($k,$v) = each %$other) {
$self->{features}{$k} += $v;
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
my @cats = keys %{ $stats{categories} };
$stats{category_count} = @cats;
$stats{categories_per_document} = $stats{category_count_with_duplicates} / $stats{document_count};
$stats{tokens_per_document} = $stats{token_count} / $stats{document_count};
$stats{types_per_document} = $stats{type_count} / $stats{document_count};
foreach my $thing ('type', 'token', 'document') {
$stats{"${thing}s_per_category"} = AI::Categorizer::Util::average
( map { $stats{categories}{$_}{"${thing}_count"} } @cats );
next unless @cats;
# Compute the skews
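# (the "skew" computed here is the standard deviation of the per-category
# counts divided by their mean, i.e. a coefficient of variation)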
my $ssum;
foreach my $cat (@cats) {
$ssum += ($stats{categories}{$cat}{"${thing}_count"} - $stats{"${thing}s_per_category"}) ** 2;
}
$stats{"${thing}_skew_by_category"} = sqrt($ssum/@cats) / $stats{"${thing}s_per_category"};
}
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
while (@sizes > 1) {
my $size = int ($num_docs * shift @sizes);
push @groups, [];
# Move $size randomly chosen documents into this group
for (1..$size) {
push @{ $groups[-1] }, splice @docs, rand(@docs), 1;
}
}
push @groups, \@docs;
return map { ref($self)->new( documents => $_ ) } @groups;
}
sub make_document {
my ($self, %args) = @_;
my $cats = delete $args{categories};
my @cats = map { $self->call_method('category', 'by_name', name => $_) } @$cats;
my $d = $self->create_delayed_object('document', %args, categories => \@cats);
$self->add_document($d);
}
sub add_document {
my ($self, $doc) = @_;
foreach ($doc->categories) {
$_->add_document($doc);
}
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
=head1 SYNOPSIS
use AI::Categorizer::KnowledgeSet;
my $k = new AI::Categorizer::KnowledgeSet(...parameters...);
my $nb = new AI::Categorizer::Learner::NaiveBayes(...parameters...);
$nb->train(knowledge_set => $k);
=head1 DESCRIPTION
The KnowledgeSet class provides an interface to a set of documents, a
set of categories, and a mapping between the two. Many parameters for
controlling the processing of documents are managed by the KnowledgeSet
class.
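A knowledge set can also be populated by hand before training. A minimal
sketch, assuming the C<name> and C<content> parameters are passed through
C<make_document()> to the Document class:

  my $k = AI::Categorizer::KnowledgeSet->new();

  $k->make_document(name       => 'doc1',
                    content    => 'Some sample text about Perl.',
                    categories => ['perl', 'programming']);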
=head1 METHODS
=over 4
=item new()
Creates a new KnowledgeSet and returns it. Accepts the following
lib/AI/Categorizer/Learner.pm view on Meta::CPAN
}
sub train {
my ($self, %args) = @_;
$self->{knowledge_set} = $args{knowledge_set} if $args{knowledge_set};
die "No knowledge_set provided" unless $self->{knowledge_set};
$self->{knowledge_set}->finish;
$self->create_model; # Creates $self->{model}
$self->delayed_object_params('hypothesis',
all_categories => [map $_->name, $self->categories],
);
}
sub prog_bar {
my ($self, $count) = @_;
return sub { print STDERR '.' } unless eval "use Time::Progress; 1";
my $pb = 'Time::Progress'->new;
$pb->attr(max => $count);
lib/AI/Categorizer/Learner.pm view on Meta::CPAN
}
print STDERR $pb->report("%50b %p ($i/$count)$string\r", $i);
return $i;
};
}
sub categorize_collection {
my ($self, %args) = @_;
my $c = $args{collection} or die "No collection provided";
my @all_cats = map $_->name, $self->categories;
my $experiment = $self->create_delayed_object('experiment', categories => \@all_cats);
my $pb = $self->verbose ? $self->prog_bar($c->count_documents) : sub {};
while (my $d = $c->next) {
my $h = $self->categorize($d);
$experiment->add_hypothesis($h, [map $_->name, $d->categories]);
$pb->($experiment);
if ($self->verbose > 1) {
printf STDERR ("%s: assigned=(%s) correct=(%s)\n",
$d->name,
join(', ', $h->categories),
join(', ', map $_->name, $d->categories));
}
}
print STDERR "\n" if $self->verbose;
return $experiment;
}
sub categorize {
my ($self, $doc) = @_;
lib/AI/Categorizer/Learner/Boolean.pm view on Meta::CPAN
sub get_boolean_score; # Abstract method
sub threshold {
my $self = shift;
$self->{threshold} = shift if @_;
return $self->{threshold};
}
sub categories {
my $self = shift;
return map AI::Categorizer::Category->by_name( name => $_ ), keys %{ $self->{model}{learners} };
}
1;
__END__
=head1 NAME
AI::Categorizer::Learner::Boolean - Abstract class for boolean categorizers
=head1 SYNOPSIS
lib/AI/Categorizer/Learner/KNN.pm view on Meta::CPAN
# Use the whole corpus
@docset = $self->knowledge_set->documents;
}
foreach my $doc (@docset) {
my $score = $doc->features->dot( $features );
warn "Score for ", $doc->name, " (", ($doc->categories)[0]->name, "): $score" if $self->verbose > 1;
$q->add($doc, $score);
}
my %scores = map {+$_->name, 0} $self->categories;
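# Each of the k nearest documents votes for its categories: the vote is the
# similarity score when knn_weighting is 'score', otherwise 1.  Dividing by
# k_value below normalizes the totals.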
foreach my $e (@{$q->entries}) {
foreach my $cat ($e->{thing}->categories) {
$scores{$cat->name} += ($self->{knn_weighting} eq 'score' ? $e->{score} : 1); #increment cat score
}
}
$_ /= $self->{k_value} foreach values %scores;
return (\%scores, $self->{threshold});
}
lib/AI/Categorizer/Learner/NaiveBayes.pm view on Meta::CPAN
(
threshold => {type => SCALAR, default => 0.3},
);
sub create_model {
my $self = shift;
my $m = $self->{model} = Algorithm::NaiveBayes->new;
foreach my $d ($self->knowledge_set->documents) {
$m->add_instance(attributes => $d->features->as_hash,
label => [ map $_->name, $d->categories ]);
}
$m->train;
}
sub get_scores {
my ($self, $newdoc) = @_;
return ($self->{model}->predict( attributes => $newdoc->features->as_hash ),
$self->{threshold});
}
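# Usage sketch (illustrative; the variable names are hypothetical, and the
# new()/train()/categorize() calls follow the Learner interface shown
# elsewhere in this distribution):
#
#   my $nb = AI::Categorizer::Learner::NaiveBayes->new(threshold => 0.3);
#   $nb->train(knowledge_set => $knowledge_set);
#   my $hypothesis = $nb->categorize($document);
#   print join(', ', $hypothesis->categories), "\n";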
lib/AI/Categorizer/Learner/NaiveBayes.pm view on Meta::CPAN
}
sub save_state {
my $self = shift;
local $self->{knowledge_set}; # Don't need the knowledge_set to categorize
$self->SUPER::save_state(@_);
}
sub categories {
my $self = shift;
return map AI::Categorizer::Category->by_name( name => $_ ), $self->{model}->labels;
}
1;
__END__
=head1 NAME
AI::Categorizer::Learner::NaiveBayes - Naive Bayes Algorithm For AI::Categorizer
lib/AI/Categorizer/Learner/SVM.pm view on Meta::CPAN
use File::Spec;
__PACKAGE__->valid_params
(
svm_kernel => {type => SCALAR, default => 'linear'},
);
sub create_model {
my $self = shift;
my $f = $self->knowledge_set->features->as_hash;
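# Algorithm::SVM::DataSet attributes are addressed by integer index, so build
# a stable name-to-index map for the features (and keep the reverse map).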
my $rmap = [ keys %$f ];
$self->{model}{feature_map} = { map { $rmap->[$_], $_ } 0..$#$rmap };
$self->{model}{feature_map_reverse} = $rmap;
$self->SUPER::create_model(@_);
}
sub _doc_2_dataset {
my ($self, $doc, $label, $fm) = @_;
my $ds = new Algorithm::SVM::DataSet(Label => $label);
my $f = $doc->features->as_hash;
while (my ($k, $v) = each %$f) {
next unless exists $fm->{$k};
lib/AI/Categorizer/Learner/SVM.pm view on Meta::CPAN
}
return $ds;
}
sub create_boolean_model {
my ($self, $positives, $negatives, $cat) = @_;
my $svm = new Algorithm::SVM(Kernel => $self->{svm_kernel});
my (@pos, @neg);
foreach my $doc (@$positives) {
push @pos, $self->_doc_2_dataset($doc, 1, $self->{model}{feature_map});
}
foreach my $doc (@$negatives) {
push @neg, $self->_doc_2_dataset($doc, 0, $self->{model}{feature_map});
}
$svm->train(@pos, @neg);
return $svm;
}
sub get_scores {
my ($self, $doc) = @_;
local $self->{current_doc} = $self->_doc_2_dataset($doc, -1, $self->{model}{feature_map});
return $self->SUPER::get_scores($doc);
}
sub get_boolean_score {
my ($self, $doc, $svm) = @_;
return $svm->predict($self->{current_doc});
}
sub save_state {
my ($self, $path) = @_;
lib/AI/Categorizer/Learner/Weka.pm view on Meta::CPAN
# Create a dummy test file $dummy_file in ARFF format (a kludgey WEKA requirement)
my $dummy_features = $self->create_delayed_object('features');
$m->{dummy_file} = $self->create_arff_file("dummy", [[$dummy_features, 0]]);
$self->SUPER::create_model(@_);
}
sub create_boolean_model {
my ($self, $pos, $neg, $cat) = @_;
my @docs = (map([$_->features, 1], @$pos),
map([$_->features, 0], @$neg));
my $train_file = $self->create_arff_file($cat->name . '_train', \@docs);
my %info = (machine_file => $cat->name . '_model');
my $outfile = File::Spec->catfile($self->{model}{_in_dir}, $info{machine_file});
my @args = ($self->{java_path},
@{$self->{java_args}},
$self->{weka_classifier},
@{$self->{weka_args}},
'-t', $train_file,
lib/AI/Categorizer/Learner/Weka.pm view on Meta::CPAN
}
sub categorize_collection {
my ($self, %args) = @_;
my $c = $args{collection} or die "No collection provided";
my @alldocs;
while (my $d = $c->next) {
push @alldocs, $d;
}
my $doc_file = $self->create_arff_file("docs", [map [$_->features, 0], @alldocs]);
my @assigned;
my $l = $self->{model}{learners};
foreach my $cat (keys %$l) {
my $machine_file = File::Spec->catfile($self->{model}{_in_dir}, "${cat}_model");
my @args = ($self->{java_path},
@{$self->{java_args}},
$self->{weka_classifier},
'-l', $machine_file,
lib/AI/Categorizer/Learner/Weka.pm view on Meta::CPAN
warn "Can't parse line $line";
next;
}
my ($index, $predicted, $score) = ($1, $2, $3);
$assigned[$index]{$cat} = $score if $predicted; # Not sure what weka's scores represent
print STDERR "$index: assigned=($predicted) correct=(", $alldocs[$index]->is_in_category($cat) ? 1 : 0, ")\n"
if $self->verbose;
}
}
my $experiment = $self->create_delayed_object('experiment', categories => [map $_->name, $self->categories]);
foreach my $i (0..$#alldocs) {
$experiment->add_result([keys %{$assigned[$i]}], [map $_->name, $alldocs[$i]->categories], $alldocs[$i]->name);
}
return $experiment;
}
sub do_cmd {
my ($self, @cmd) = @_;
print STDERR " % @cmd\n" if $self->verbose;
lib/AI/Categorizer/Learner/Weka.pm view on Meta::CPAN
SUFFIX => '.arff',
);
print $fh "\@RELATION foo\n\n";
my $feature_names = $self->{model}{all_features};
foreach my $name (@$feature_names) {
print $fh "\@ATTRIBUTE feature-$name REAL\n";
}
print $fh "\@ATTRIBUTE category {1, 0}\n\n";
my %feature_indices = map {$feature_names->[$_], $_} 0..$#{$feature_names};
my $last_index = keys %feature_indices;
# We use the 'sparse' format, see http://www.cs.waikato.ac.nz/~ml/weka/arff.html
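# For example, a document with nonzero weights for features 3 and 17 in the
# positive class would produce a line like "{3 2, 17 1.5, 42 '1'}", where 42
# is $last_index (the category column); the numbers are illustrative.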
print $fh "\@DATA\n";
foreach my $doc (@$docs) {
my ($features, $cat) = @$doc;
my $f = $features->as_hash;
my @ordered_keys = (sort {$feature_indices{$a} <=> $feature_indices{$b}}
grep {exists $feature_indices{$_}}
keys %$f);
print $fh ("{",
join(', ', map("$feature_indices{$_} $f->{$_}", @ordered_keys), "$last_index '$cat'"),
"}\n"
);
}
return $filename;
}
sub save_state {
my ($self, $path) = @_;
lib/AI/Categorizer/Util.pm view on Meta::CPAN
my ($one, $two) = @_;
$two = _hashify($two);
return UNIVERSAL::isa($one, 'HASH') ? # Accept hash or array for $one
grep {exists $two->{$_}} keys %$one :
grep {exists $two->{$_}} @$one;
}
sub _hashify {
return $_[0] if UNIVERSAL::isa($_[0], 'HASH');
return {map {$_=>1} @{$_[0]}};
}
sub random_elements {
my ($a_ref, $n) = @_;
return @$a_ref if $n >= @$a_ref;
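# Work with whichever set is smaller: the indices to keep or the indices to
# drop.  Pick that many distinct random positions, then either return exactly
# those elements ('include') or everything except them ('exclude').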
my ($select, $mode) = ($n < @$a_ref/2) ? ($n, 'include') : (@$a_ref - $n, 'exclude');
my %i;
$i{int rand @$a_ref} = 1 while keys(%i) < $select;
return @{$a_ref}[keys %i] if $mode eq 'include';
return map {$i{$_} ? () : $a_ref->[$_]} 0..$#$a_ref;
}
1;