eg/categorizer
run_section('scan_features', 1, $do_stage);
run_section('read_training_set', 2, $do_stage);
run_section('train', 3, $do_stage);
run_section('evaluate_test_set', 4, $do_stage);
if ($do_stage->{5}) {
my $result = $c->stats_table;
print $result if $c->verbose;
print $out_fh $result if $out_fh;
}
sub run_section {
my ($section, $stage, $do_stage) = @_;
return unless $do_stage->{$stage};
if (keys %$do_stage > 1) {
print " % $0 @ARGV -$stage\n" if $c->verbose;
die "$0 is not executable, please change its execution permissions"
unless -x $0;
system($0, @ARGV, "-$stage") == 0
or die "$0 returned nonzero status, \$?=$?";
return;
}
my $start = new Benchmark;
$c->$section();
my $end = new Benchmark;
my $summary = timestr(timediff($end, $start));
my ($rss, $vsz) = memory_usage();
print "$summary (memory: rss=$rss, vsz=$vsz)\n" if $c->verbose;
print $out_fh "Stage $stage: $summary (memory: rss=$rss, vsz=$vsz)\n" if $out_fh;
}
sub parse_command_line {
my (%opt, %do_stage);
while (@_) {
if ($_[0] =~ /^-(\d+)$/) {
shift;
$do_stage{$1} = 1;
} elsif ( $_[0] eq '--config_file' ) {
die "--config_file requires the YAML module from CPAN to be installed.\n" unless $HAVE_YAML;
shift;
eg/categorizer
}
my $outfile;
unless ($outfile = delete $opt{outfile}) {
$outfile = $opt{progress_file} ? "$opt{progress_file}-results.txt" : "results.txt";
}
return (\%opt, \%do_stage, $outfile);
}
sub usage {
return <<EOF;
Usage:
$0 --parameter_1 <value_1> --parameter_2 <value_2>
# You may specify a YAML config file as follows:
$0 --config_file <path> --parameter_3 <value_3>
# Or, to run only step 3 (of 5)
$0 --config_file <path> -3
--parameter_1, --parameter_2, etc. are parameters accepted by
AI::Categorizer objects' new() methods.
EOF
}
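# For illustration only -- a sketch of a YAML config file this script could
# consume (the file name and values are invented; keys are passed through to
# AI::Categorizer->new(), and 'data_root', 'progress_file' and 'verbose' are
# parameters used elsewhere in this distribution):
#
#   # categorizer.yml
#   data_root:     /data/reuters
#   progress_file: /tmp/reuters-run
#   verbose:       1
#
#   % ./categorizer --config_file categorizer.yml
#   % ./categorizer --config_file categorizer.yml -3   # run only stage 3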
sub memory_usage {
my ($rss, $vsz);
if ($^O eq 'darwin' or $^O eq 'linux') {
($rss, $vsz) = `ps -eo rss,vsz -p $$` =~ /(\d+)\s+(\d+)/;
} elsif ($^O eq 'solaris') {
($rss, $vsz) = `ps -o rss,vsz -p $$` =~ /(\d+)\s+(\d+)/;
} else {
warn "Unknown system, can't get memory usage";
}
return ($rss, $vsz);
}
lib/AI/Categorizer.pm
__PACKAGE__->contained_objects
(
knowledge_set => { class => 'AI::Categorizer::KnowledgeSet' },
learner => { class => 'AI::Categorizer::Learner::NaiveBayes' },
experiment => { class => 'AI::Categorizer::Experiment',
delayed => 1 },
collection => { class => 'AI::Categorizer::Collection::Files',
delayed => 1 },
);
sub new {
my $package = shift;
my %args = @_;
my %defaults;
if (exists $args{data_root}) {
$defaults{training_set} = File::Spec->catfile($args{data_root}, 'training');
$defaults{test_set} = File::Spec->catfile($args{data_root}, 'test');
$defaults{category_file} = File::Spec->catfile($args{data_root}, 'cats.txt');
delete $args{data_root};
}
return $package->SUPER::new(%defaults, %args);
}
#sub dump_parameters {
# my $p = shift()->SUPER::dump_parameters;
# delete $p->{stopwords} if $p->{stopword_file};
# return $p;
#}
sub knowledge_set { shift->{knowledge_set} }
sub learner { shift->{learner} }
# Combines several methods in one sub
sub run_experiment {
my $self = shift;
$self->scan_features;
$self->read_training_set;
$self->train;
$self->evaluate_test_set;
print $self->stats_table;
}
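# A minimal usage sketch (the data_root path is invented for illustration):
# given 'data_root', new() derives training_set, test_set and category_file
# defaults as shown above, and run_experiment() drives the whole pipeline.
#
#   use AI::Categorizer;
#   my $c = AI::Categorizer->new(data_root => '/data/corpus', verbose => 1);
#   # training_set  => '/data/corpus/training'
#   # test_set      => '/data/corpus/test'
#   # category_file => '/data/corpus/cats.txt'
#   $c->run_experiment;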
sub scan_features {
my $self = shift;
return unless $self->knowledge_set->scan_first;
$self->knowledge_set->scan_features( path => $self->{training_set} );
$self->knowledge_set->save_features( "$self->{progress_file}-01-features" );
}
sub read_training_set {
my $self = shift;
$self->knowledge_set->restore_features( "$self->{progress_file}-01-features" )
if -e "$self->{progress_file}-01-features";
$self->knowledge_set->read( path => $self->{training_set} );
$self->_save_progress( '02', 'knowledge_set' );
return $self->knowledge_set;
}
sub train {
my $self = shift;
$self->_load_progress( '02', 'knowledge_set' );
$self->learner->train( knowledge_set => $self->{knowledge_set} );
$self->_save_progress( '03', 'learner' );
return $self->learner;
}
sub evaluate_test_set {
my $self = shift;
$self->_load_progress( '03', 'learner' );
my $c = $self->create_delayed_object('collection', path => $self->{test_set} );
$self->{experiment} = $self->learner->categorize_collection( collection => $c );
$self->_save_progress( '04', 'experiment' );
return $self->{experiment};
}
sub stats_table {
my $self = shift;
$self->_load_progress( '04', 'experiment' );
return $self->{experiment}->stats_table;
}
sub progress_file {
shift->{progress_file};
}
sub verbose {
shift->{verbose};
}
sub _save_progress {
my ($self, $stage, $node) = @_;
return unless $self->{progress_file};
my $file = "$self->{progress_file}-$stage-$node";
warn "Saving to $file\n" if $self->{verbose};
$self->{$node}->save_state($file);
}
sub _load_progress {
my ($self, $stage, $node) = @_;
return unless $self->{progress_file};
my $file = "$self->{progress_file}-$stage-$node";
warn "Loading $file\n" if $self->{verbose};
$self->{$node} = $self->contained_class($node)->restore_state($file);
}
1;
__END__
lib/AI/Categorizer/Category.pm
use Params::Validate qw(:types);
use AI::Categorizer::FeatureVector;
__PACKAGE__->valid_params
(
name => {type => SCALAR, public => 0},
documents => {
type => ARRAYREF,
default => [],
callbacks => { 'all are Document objects' =>
sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Document'), @_ },
},
public => 0,
},
);
__PACKAGE__->contained_objects
(
features => {
class => 'AI::Categorizer::FeatureVector',
delayed => 1,
},
);
my %REGISTRY = ();
sub new {
my $self = shift()->SUPER::new(@_);
$self->{documents} = new AI::Categorizer::ObjectSet( @{$self->{documents}} );
$REGISTRY{$self->{name}} = $self;
return $self;
}
sub by_name {
my ($class, %args) = @_;
return $REGISTRY{$args{name}} if exists $REGISTRY{$args{name}};
return $class->new(%args);
}
sub name { $_[0]->{name} }
sub documents {
my $d = $_[0]->{documents};
return wantarray ? $d->members : $d->size;
}
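# Context sketch (the category name is invented): documents() returns the
# member list in list context and the member count in scalar context.
#
#   my $cat  = AI::Categorizer::Category->by_name(name => 'sports');
#   my @docs = $cat->documents;   # Document objects
#   my $n    = $cat->documents;   # count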
sub contains_document {
return $_[0]->{documents}->includes( $_[1] );
}
sub add_document {
my $self = shift;
$self->{documents}->insert( $_[0] );
delete $self->{features}; # Could be more efficient?
}
sub features {
my $self = shift;
if (@_) {
$self->{features} = shift;
}
return $self->{features} if $self->{features};
my $v = $self->create_delayed_object('features');
return $self->{features} = $v unless $self->documents;
lib/AI/Categorizer/Collection.pm
category_hash => { type => HASHREF, default => {} },
category_file => { type => SCALAR, optional => 1 },
);
__PACKAGE__->contained_objects
(
document => { class => 'AI::Categorizer::Document::Text',
delayed => 1 },
);
sub new {
my ($class, %args) = @_;
# Optimize so every document doesn't have to convert the stopword list to a hash
if ($args{stopwords} and UNIVERSAL::isa($args{stopwords}, 'ARRAY')) {
$args{stopwords} = { map {+$_ => 1} @{ $args{stopwords} } };
}
my $self = $class->SUPER::new(%args);
if ($self->{category_file}) {
lib/AI/Categorizer/Collection.pm
close FH;
$self->delayed_object_params('document', stopwords => \%stopwords);
}
return $self;
}
# This should usually be replaced in subclasses with a faster version that doesn't
# need to create actual documents each time through
sub count_documents {
my $self = shift;
return $self->{document_count} if exists $self->{document_count};
$self->rewind;
my $count = 0;
$count++ while $self->next;
$self->rewind;
return $self->{document_count} = $count;
}
# Abstract methods
sub next;
sub rewind;
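# Iterator sketch: every Collection subclass supports this protocol (how
# $collection is constructed depends on the subclass; see the examples in the
# subclasses below).
#
#   $collection->rewind;
#   while (my $doc = $collection->next) {
#       printf "%s (%d categories)\n", $doc->name, scalar $doc->categories;
#   }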
1;
__END__
=head1 NAME
AI::Categorizer::Collection - Access stored documents
=head1 SYNOPSIS
lib/AI/Categorizer/Collection/DBI.pm
dbh => {isa => 'DBI::db', default => undef},
select_statement => {type => SCALAR, default => "SELECT text FROM documents"},
);
__PACKAGE__->contained_objects
(
document => { class => 'AI::Categorizer::Document',
delayed => 1 },
);
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
die "Must provide 'dbh' or 'connection_string' arguments"
unless $self->{dbh} or $self->{connection_string};
unless ($self->{dbh}) {
$self->{dbh} = DBI->connect($self->{connection_string}, '', '', {RaiseError => 1})
or die DBI->errstr;
delete $self->{connection_string};
}
$self->rewind;
return $self;
}
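# Construction sketch (the DSN and SQL are invented): next() below expects the
# select statement to return name, category and content columns.
#
#   my $c = AI::Categorizer::Collection::DBI->new(
#       connection_string => 'dbi:SQLite:dbname=corpus.db',
#       select_statement  => 'SELECT name, category, text FROM documents',
#   );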
sub dbh { shift()->{dbh} }
sub rewind {
my $self = shift;
if (!$self->{sth}) {
$self->{sth} = $self->dbh->prepare($self->{select_statement});
}
if ($self->{sth}{Active}) {
$self->{sth}->finish;
}
$self->{sth}->execute;
}
sub next {
my $self = shift;
my @result = $self->{sth}->fetchrow_array;
return undef unless @result;
return $self->create_delayed_object('document',
name => $result[0],
categories => [$result[1]],
content => $result[2],
);
lib/AI/Categorizer/Collection/Files.pm
use Params::Validate qw(:types);
use File::Spec;
__PACKAGE__->valid_params
(
path => { type => SCALAR|ARRAYREF },
recurse => { type => BOOLEAN, default => 0 },
);
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
$self->{dir_fh} = do {local *FH; *FH}; # double *FH avoids a warning
# Documents are contained in a directory, or list of directories
$self->{path} = [$self->{path}] unless ref $self->{path};
$self->{used} = [];
$self->_next_path;
return $self;
}
sub _next_path {
my $self = shift;
closedir $self->{dir_fh} if $self->{cur_dir};
$self->{cur_dir} = shift @{$self->{path}};
push @{$self->{used}}, $self->{cur_dir};
opendir $self->{dir_fh}, $self->{cur_dir} or die "$self->{cur_dir}: $!";
}
sub next {
my $self = shift;
my $file = $self->_read_file;
return unless defined $file;
warn "No category information about '$file'" unless defined $self->{category_hash}{$file};
my @cats = map AI::Categorizer::Category->by_name(name => $_), @{ $self->{category_hash}{$file} || [] };
return $self->call_method('document', 'read',
path => File::Spec->catfile($self->{cur_dir}, $file),
name => $file,
categories => \@cats,
);
}
sub _read_file {
my ($self) = @_;
my $file = readdir $self->{dir_fh};
if (!defined $file) { # Directory has been exhausted
return undef unless @{$self->{path}};
$self->_next_path;
return $self->_read_file;
} elsif ($file eq '.' or $file eq '..') {
return $self->_read_file;
} elsif (-d (my $path = File::Spec->catdir($self->{cur_dir}, $file))) {
push @{$self->{path}}, $path # Add for later processing
if $self->{recurse} and !grep {$_ eq $path} @{$self->{path}}, @{$self->{used}};
return $self->_read_file;
}
return $file;
}
sub rewind {
my $self = shift;
push @{$self->{path}}, @{$self->{used}};
@{$self->{used}} = ();
$self->_next_path;
}
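# Construction sketch (the path is invented): documents are read from one or
# more directories, optionally recursing into subdirectories.
#
#   my $c = AI::Categorizer::Collection::Files->new(
#       path    => '/data/corpus/training',
#       recurse => 1,
#   );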
# This should share an iterator with next()
sub count_documents {
my $self = shift;
return $self->{document_count} if defined $self->{document_count};
$self->rewind;
my $count = 0;
$count++ while defined $self->_read_file;
$self->rewind;
return $self->{document_count} = $count;
lib/AI/Categorizer/Collection/InMemory.pm
use AI::Categorizer::Collection;
use base qw(AI::Categorizer::Collection);
use Params::Validate qw(:types);
__PACKAGE__->valid_params
(
data => { type => HASHREF },
);
sub new {
my $self = shift()->SUPER::new(@_);
while (my ($name, $params) = each %{$self->{data}}) {
foreach (@{$params->{categories}}) {
next if ref $_;
$_ = AI::Categorizer::Category->by_name(name => $_);
}
}
return $self;
}
sub next {
my $self = shift;
my ($name, $params) = each %{$self->{data}} or return;
return AI::Categorizer::Document->new(name => $name, %$params);
}
sub rewind {
my $self = shift;
scalar keys %{$self->{data}};
return;
}
sub count_documents {
my $self = shift;
return scalar keys %{$self->{data}};
}
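# Construction sketch (names and content invented): 'data' maps document names
# to the parameters handed to AI::Categorizer::Document->new(); category names
# are turned into Category objects by new() above.
#
#   my $c = AI::Categorizer::Collection::InMemory->new(
#       data => {
#           doc1 => { categories => ['sports'],  content => 'final score ...' },
#           doc2 => { categories => ['finance'], content => 'stocks fell ...' },
#       },
#   );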
1;
lib/AI/Categorizer/Collection/SingleFile.pm
categories => { type => HASHREF|UNDEF, default => undef },
delimiter => { type => SCALAR },
);
__PACKAGE__->contained_objects
(
document => { class => 'AI::Categorizer::Document::Text',
delayed => 1 },
);
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
$self->{fh} = do {local *FH; *FH}; # double *FH avoids a warning
# Documents are contained in a file, or list of files
$self->{path} = [$self->{path}] unless ref $self->{path};
$self->{used} = [];
$self->_next_path;
return $self;
}
sub _next_path {
my $self = shift;
close $self->{fh} if $self->{cur_file};
push @{$self->{used}}, shift @{$self->{path}};
$self->{cur_file} = $self->{used}[-1];
open $self->{fh}, "< $self->{cur_file}" or die "$self->{cur_file}: $!";
}
sub next {
my $self = shift;
my $fh = $self->{fh}; # Must put in a simple scalar
my $content = do {local $/ = $self->{delimiter}; <$fh>};
if (!defined $content) { # File has been exhausted
unless (@{$self->{path}}) { # All files have been exhausted
$self->{fh} = undef;
return undef;
}
lib/AI/Categorizer/Collection/SingleFile.pm
return $self->next;
} elsif ($content =~ /^\s*$self->{delimiter}$/) { # Skip empty docs
return $self->next;
}
#warn "doc is $content";
#warn "creating document=>@{[ %{$self->{container}{delayed}{document}} ]}";
return $self->create_delayed_object('document', content => $content);
}
sub count_documents {
my ($self) = @_;
return $self->{document_count} if defined $self->{document_count};
$self->rewind;
my $count = 0;
local $/ = $self->{delimiter};
my $fh = $self->{fh};
while (1) {
$count++ while <$fh>;
last unless @{$self->{path}};
$self->_next_path;
}
$self->rewind;
return $self->{document_count} = $count;
}
sub rewind {
my ($self) = @_;
close $self->{fh} if $self->{cur_file};
unshift @{$self->{path}}, @{$self->{used}};
$self->{used} = [];
$self->_next_path;
}
1;
lib/AI/Categorizer/Document.pm
__PACKAGE__->valid_params
(
name => {
type => SCALAR,
},
categories => {
type => ARRAYREF,
default => [],
callbacks => { 'all are Category objects' =>
sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'), @{$_[0]} },
},
public => 0,
},
stopwords => {
type => ARRAYREF|HASHREF,
default => {},
},
content => {
type => HASHREF|SCALAR,
default => undef,
lib/AI/Categorizer/Document.pm
__PACKAGE__->contained_objects
(
features => { delayed => 1,
class => 'AI::Categorizer::FeatureVector' },
);
### Constructors
my $NAME = 'a';
sub new {
my $pkg = shift;
my $self = $pkg->SUPER::new(name => $NAME++, # Use a default name
@_);
# Get efficient internal data structures
$self->{categories} = new AI::Categorizer::ObjectSet( @{$self->{categories}} );
$self->_fix_stopwords;
# A few different ways for the caller to initialize the content
lib/AI/Categorizer/Document.pm
} elsif (defined $self->{content}) {
# Allow a simple string as the content
$self->{content} = { body => $self->{content} } unless ref $self->{content};
}
$self->finish if $self->{content};
return $self;
}
sub _fix_stopwords {
my $self = shift;
# Convert to hash
$self->{stopwords} = { map {($_ => 1)} @{ $self->{stopwords} } }
if UNIVERSAL::isa($self->{stopwords}, 'ARRAY');
my $s = $self->{stopwords};
# May need to perform stemming on the stopwords
return unless keys %$s; # No point in doing anything if there are no stopwords
lib/AI/Categorizer/Document.pm
my @keys = keys %$s;
%$s = ();
$self->stem_words(\@keys);
$s->{$_} = 1 foreach @keys;
# This flag is attached to the stopword structure itself so that
# other documents will notice it.
$s->{___stemmed} = 1;
}
sub finish {
my $self = shift;
$self->create_feature_vector;
# Now we're done with all the content stuff
delete @{$self}{'content', 'content_weights', 'stopwords', 'use_features'};
}
# Parse a document format - a virtual method
sub parse;
sub parse_handle {
my ($self, %args) = @_;
my $fh = $args{handle} or die "No 'handle' argument given to parse_handle()";
return $self->parse( content => join '', <$fh> );
}
### Accessors
sub name { $_[0]->{name} }
sub stopword_behavior { $_[0]->{stopword_behavior} }
sub features {
my $self = shift;
if (@_) {
$self->{features} = shift;
}
return $self->{features};
}
sub categories {
my $c = $_[0]->{categories};
return wantarray ? $c->members : $c->size;
}
### Workers
sub create_feature_vector {
my $self = shift;
my $content = $self->{content};
my $weights = $self->{content_weights};
die "'stopword_behavior' must be one of 'stem', 'no_stem', or 'pre_stemmed'"
unless $self->{stopword_behavior} =~ /^stem|no_stem|pre_stemmed$/;
$self->{features} = $self->create_delayed_object('features');
while (my ($name, $data) = each %$content) {
my $t = $self->tokenize($data);
$t = $self->_filter_tokens($t) if $self->{stopword_behavior} eq 'no_stem';
$self->stem_words($t);
$t = $self->_filter_tokens($t) if $self->{stopword_behavior} =~ /^(?:stem|pre_stemmed)$/;
my $h = $self->vectorize(tokens => $t, weight => exists($weights->{$name}) ? $weights->{$name} : 1 );
$self->{features}->add($h);
}
}
sub is_in_category {
return (ref $_[1]
? $_[0]->{categories}->includes( $_[1] )
: $_[0]->{categories}->includes_name( $_[1] ));
}
sub tokenize {
my $self = shift;
my @tokens;
while ($_[0] =~ /([-\w]+)/g) {
my $word = lc $1;
next unless $word =~ /[a-z]/;
$word =~ s/^[^a-z]+//; # Trim leading non-alpha characters (helps with ordinals)
push @tokens, $word;
}
return \@tokens;
}
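# Behaviour sketch (the sample text is invented): tokenize() lowercases, keeps
# word characters and hyphens, drops tokens with no letters, and trims leading
# non-alphabetic characters (so ordinals like "3rd" survive as "rd...").
#
#   my $tokens = $doc->tokenize("The 3rd-quarter results, 2002!");
#   # $tokens is ['the', 'rd-quarter', 'results']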
sub stem_words {
my ($self, $tokens) = @_;
return unless $self->{stemming};
return if $self->{stemming} eq 'none';
die "Unknown stemming option '$self->{stemming}' - options are 'porter' or 'none'"
unless $self->{stemming} eq 'porter';
eval {require Lingua::Stem; 1}
or die "Porter stemming requires the Lingua::Stem module, available from CPAN.\n";
@$tokens = @{ Lingua::Stem::stem(@$tokens) };
}
sub _filter_tokens {
my ($self, $tokens_in) = @_;
if ($self->{use_features}) {
my $f = $self->{use_features}->as_hash;
return [ grep exists($f->{$_}), @$tokens_in ];
} elsif ($self->{stopwords} and keys %{$self->{stopwords}}) {
my $s = $self->{stopwords};
return [ grep !exists($s->{$_}), @$tokens_in ];
}
return $tokens_in;
}
sub _weigh_tokens {
my ($self, $tokens, $weight) = @_;
my %counts;
if (my $b = 0+$self->{front_bias}) {
die "'front_bias' value must be between -1 and 1"
unless -1 < $b and $b < 1;
my $n = @$tokens;
my $r = ($b-1)**2 / ($b+1);
my $mult = $weight * log($r)/($r-1);
lib/AI/Categorizer/Document.pm
} else {
foreach my $feature (@$tokens) {
$counts{$feature} += $weight;
}
}
return \%counts;
}
sub vectorize {
my ($self, %args) = @_;
if ($self->{stem_stopwords}) {
my $s = $self->stem_tokens([keys %{$self->{stopwords}}]);
$self->{stopwords} = { map {+$_, 1} @$s };
$args{tokens} = $self->_filter_tokens($args{tokens});
}
return $self->_weigh_tokens($args{tokens}, $args{weight});
}
sub read {
my ($class, %args) = @_;
my $path = delete $args{path} or die "Must specify 'path' argument to read()";
my $self = $class->new(%args);
open my($fh), "< $path" or die "$path: $!";
$self->parse_handle(handle => $fh);
close $fh;
$self->finish;
return $self;
}
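# Usage sketch (the path is invented): read() builds a document from a file,
# letting the subclass's parse()/parse_handle() interpret the raw text.
#
#   my $doc = AI::Categorizer::Document::Text->read(
#       path => '/data/corpus/training/doc42.txt',
#       name => 'doc42',
#   );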
sub dump_features {
my ($self, %args) = @_;
my $path = $args{path} or die "No 'path' argument given to dump_features()";
open my($fh), "> $path" or die "Can't create $path: $!";
my $f = $self->features->as_hash;
while (my ($k, $v) = each %$f) {
print $fh "$k\t$v\n";
}
}
1;
lib/AI/Categorizer/Document/SMART.pm
package AI::Categorizer::Document::SMART;
use strict;
use AI::Categorizer::Document;
use base qw(AI::Categorizer::Document);
sub parse {
my ($self, %args) = @_;
$args{content} =~ s{
^(?:\.I)?\s+(\d+)\n # ID number - becomes document name
\.C\n
([^\n]+)\n # Categories
\.T\n
(.+)\n+ # Title
\.W\n
}
lib/AI/Categorizer/Document/Text.pm
use strict;
use AI::Categorizer::Document;
use base qw(AI::Categorizer::Document);
#use Params::Validate qw(:types);
#use AI::Categorizer::ObjectSet;
#use AI::Categorizer::FeatureVector;
### Constructors
sub parse {
my ($self, %args) = @_;
$self->{content} = { body => $args{content} };
}
1;
lib/AI/Categorizer/Document/XML.pm
use base qw(AI::Categorizer::Document);
use XML::SAX;
__PACKAGE__->contained_objects
(
xml_handler => 'AI::Categorizer::Document::XML::Handler',
);
### Constructors
sub parse {
my ($self, %args) = @_;
# the raw XML content, as a string
my $body = $args{content};
# a hash mapping element names to their weights
my $elementWeight = $args{elementWeight};
# Construct a handler that receives element, character-data, comment and
# processing-instruction events, and accumulates their text into a buffer
lib/AI/Categorizer/Document/XML.pm
}
##########################################################################
package AI::Categorizer::Document::XML::Handler;
use strict;
use base qw(XML::SAX::Base);
# Input: a hash of element weights
# Output: an object of this class
# Description: constructor
sub new {
my ($class, %args) = @_;
# Call the superclass constructor (XML::SAX::Base)
my $self = $class->SUPER::new;
# Save the element weights, a hash of <elementName, weight> pairs.  A weight
# is the number of times the corresponding element's text is duplicated; it
# is supplied by the caller at construction time and is used later, when the
# duplication is performed in end_element().
$self->{weightHash} = $args{weights};
lib/AI/Categorizer/Document/XML.pm
return $self;
}
# Input: None
# Output: None
# Description:
#  Called once, when the parser starts the document.
#  Currently it just initializes the handler's state.
sub start_document {
my ($self, $doc) = @_;
# Depth of the element currently being visited in the XML tree.
# start_element() is called in preorder, so this tracks the level of the
# current element; the root element is level 0.
$self->{levelPointer} = 0;
# All character data is accumulated here; initially empty
$self->{content} = "";
#$self->SUPER::start_document($doc);
}
# Input: None
# Output: None
# Description:
#  Called once, when the parser finishes the document.  Nothing to do.
sub end_document {
my ($self, $doc) = @_;
#$self->SUPER::end_document($doc);
}
# Input
# LocalName: $el->{LocalName}
# NamespaceURI: $el->{NamespaceURI}
# Name $el->{Name}
# Prefix $el->{Prefix}
# Attributes $el->{Attributes}
# for each attribute
# LocalName: $el->{LocalName}
# NamespaceURI: $el->{NamespaceURI}
# Name $el->{Name}
# Prefix $el->{Prefix}
# Value $el->{Value}
# Output: None
# Description:
#  Called whenever the parser encounters the start of an element.
sub start_element {
my ($self, $el) = @_;
# Current end of the content buffer; new character data is appended here
my $location = length $self->{content};
# Remember where this element's data starts, so that end_element() can find
# the text belonging to this element
$self->{locationArray}[$self->{levelPointer}] = $location;
# Descend one level for the next element
$self->{levelPointer}++;
#$self->SUPER::start_element($el);
}
# Input: None
# Output: None
# Description:
#  Called whenever the parser encounters the end of an element.
sub end_element {
my ($self, $el) = @_;
$self->{levelPointer}--;
# Where this element's character data begins in the content buffer
my $location = $self->{locationArray}[$self->{levelPointer}];
# The element's name, used to look up its weight
my $elementName = $el->{Name};
# Default weight if none was configured for this element
my $weight = 1;
lib/AI/Categorizer/Document/XML.pm
}
#$self->SUPER::end_element($el);
}
# Input: a hash containing a Data entry
# Output: None
# Description:
#  Called whenever the parser encounters character data (text, CDATA
#  sections, etc.).  The data is appended to the content buffer.
sub characters {
my ($self, $args) = @_;
# Append the data plus a newline to the content buffer
$self->{content} .= "$args->{Data}\n";
}
# Input: a hash containing a Data entry
# Output: None
# Description:
#  Called whenever the parser encounters a comment.  Currently ignored.
sub comment {
my ($self, $args) = @_;
}
# Input: a hash containing Data and Target entries
# Output: None
# Description:
#  Called whenever the parser encounters a processing instruction.
#  Currently ignored.
sub processing_instruction {
my ($self, $args) = @_;
}
# Input: None
# Output: the accumulated content string
# Description:
#  Return the content collected so far.
sub getContent {
my ($self) = @_;
return $self->{content};
}
1;
__END__
lib/AI/Categorizer/Experiment.pm
use base qw(Class::Container AI::Categorizer::Storable Statistics::Contingency);
use Params::Validate qw(:types);
__PACKAGE__->valid_params
(
categories => { type => ARRAYREF|HASHREF },
sig_figs => { type => SCALAR, default => 4 },
);
sub new {
my $package = shift;
my $self = $package->Class::Container::new(@_);
$self->{$_} = 0 foreach qw(a b c d);
my $c = delete $self->{categories};
$self->{categories} = { map {($_ => {a=>0, b=>0, c=>0, d=>0})}
UNIVERSAL::isa($c, 'HASH') ? keys(%$c) : @$c
};
return $self;
}
sub add_hypothesis {
my ($self, $h, $correct, $name) = @_;
die "No hypothesis given to add_hypothesis()" unless $h;
$name = $h->document_name unless defined $name;
$self->add_result([$h->categories], $correct, $name);
}
sub stats_table {
my $self = shift;
$self->SUPER::stats_table($self->{sig_figs});
}
1;
__END__
=head1 NAME
lib/AI/Categorizer/FeatureSelector.pm
features_kept => {
type => SCALAR,
default => 0.2,
},
verbose => {
type => SCALAR,
default => 0,
},
);
sub verbose {
my $self = shift;
$self->{verbose} = shift if @_;
return $self->{verbose};
}
sub reduce_features {
# Takes a feature vector whose weights are "feature scores", and
# chops to the highest n features. n is specified by the
# 'features_kept' parameter. If it's zero, all features are kept.
# If it's between 0 and 1, we multiply by the present number of
# features. If it's greater than 1, we treat it as the number of
# features to use.
my ($self, $f, %args) = @_;
my $kept = defined $args{features_kept} ? $args{features_kept} : $self->{features_kept};
return $f unless $kept;
lib/AI/Categorizer/FeatureSelector.pm
my $features = $f->as_hash;
my @new_features = (sort {$features->{$b} <=> $features->{$a}} keys %$features)
[0 .. $num_kept-1];
my $result = $f->intersection( \@new_features );
print "Finished trimming features - # features = " . $result->length . "\n" if $self->verbose;
return $result;
}
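# Interpretation sketch for 'features_kept', as described above, assuming
# 10,000 candidate features:
#
#   features_kept => 0      # keep all 10,000
#   features_kept => 0.2    # keep the best 2,000 (a fraction of the total)
#   features_kept => 500    # keep the best 500 (an absolute count)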
# Abstract methods
sub rank_features;
sub scan_features;
sub select_features {
my ($self, %args) = @_;
die "No knowledge_set parameter provided to select_features()"
unless $args{knowledge_set};
my $f = $self->rank_features( knowledge_set => $args{knowledge_set} );
return $self->reduce_features( $f, features_kept => $args{features_kept} );
}
lib/AI/Categorizer/FeatureSelector/CategorySelector.pm
__PACKAGE__->contained_objects
(
features => { class => 'AI::Categorizer::FeatureVector',
delayed => 1 },
);
1;
sub reduction_function;
# figure out the feature set before reading collection (default)
sub scan_features {
my ($self, %args) = @_;
my $c = $args{collection} or
die "No 'collection' parameter provided to scan_features()";
return unless $self->{features_kept};
my %cat_features;
my $coll_features = $self->create_delayed_object('features');
my $nbDocuments = 0;
lib/AI/Categorizer/FeatureSelector/CategorySelector.pm
\%cat_features,\%cat_features_sum);
}
print STDERR "\n" if $self->verbose;
my $new_features = $self->reduce_features($r_features);
return $coll_features->intersection( $new_features );
}
# calculate feature set after reading collection (scan_first=0)
sub rank_features {
die "CategorySelector->rank_features is not implemented yet!";
# my ($self, %args) = @_;
#
# my $k = $args{knowledge_set}
# or die "No knowledge_set parameter provided to rank_features()";
#
# my %freq_counts;
# foreach my $name ($k->features->names) {
# $freq_counts{$name} = $k->document_frequency($name);
# }
# return $self->create_delayed_object('features', features => \%freq_counts);
}
# copied from KnowledgeSet->prog_bar by Ken Williams
sub prog_bar {
my ($self, $count) = @_;
return sub {} unless $self->verbose;
return sub { print STDERR '.' } unless eval "use Time::Progress; 1";
my $pb = 'Time::Progress'->new;
$pb->attr(max => $count);
my $i = 0;
return sub {
$i++;
return if $i % 25;
print STDERR $pb->report("%50b %p ($i/$count)\r", $i);
};
}
__END__
=head1 NAME
lib/AI/Categorizer/FeatureSelector/ChiSquare.pm
use strict;
use AI::Categorizer::FeatureSelector;
use base qw(AI::Categorizer::FeatureSelector::CategorySelector);
use Params::Validate qw(:types);
# Chi-Square function
# NB: this could probably be optimised a bit...
sub reduction_function {
my ($self,$term,$N,$allFeaturesSum,
$coll_features,$cat_features,$cat_features_sum) = @_;
my $CHI2SUM = 0;
my $nbcats = 0;
foreach my $catname (keys %{$cat_features}) {
# while ( my ($catname,$catfeatures) = each %{$cat_features}) {
my ($A,$B,$C,$D); # A = number of times where t and c co-occur
# B = " " " t occurs without c
# C = " " " c occurs without t
# D = " " " neither c nor t occur
lib/AI/Categorizer/FeatureSelector/DocFrequency.pm
use Params::Validate qw(:types);
use Carp qw(croak);
__PACKAGE__->contained_objects
(
features => { class => 'AI::Categorizer::FeatureVector',
delayed => 1 },
);
# The KnowledgeSet keeps track of document frequency, so just use that.
sub rank_features {
my ($self, %args) = @_;
my $k = $args{knowledge_set} or die "No knowledge_set parameter provided to rank_features()";
my %freq_counts;
foreach my $name ($k->features->names) {
$freq_counts{$name} = $k->document_frequency($name);
}
return $self->create_delayed_object('features', features => \%freq_counts);
}
sub scan_features {
my ($self, %args) = @_;
my $c = $args{collection} or die "No 'collection' parameter provided to scan_features()";
my $doc_freq = $self->create_delayed_object('features');
while (my $doc = $c->next) {
$args{prog_bar}->() if $args{prog_bar};
$doc_freq->add( $doc->features->as_boolean_hash );
}
print "\n" if $self->verbose;
lib/AI/Categorizer/FeatureVector.pm
package AI::Categorizer::FeatureVector;
sub new {
my ($package, %args) = @_;
$args{features} ||= {};
return bless {features => $args{features}}, $package;
}
sub names {
my $self = shift;
return keys %{$self->{features}};
}
sub set {
my $self = shift;
$self->{features} = (ref $_[0] ? $_[0] : {@_});
}
sub as_hash {
my $self = shift;
return $self->{features};
}
sub euclidean_length {
my $self = shift;
my $f = $self->{features};
my $total = 0;
foreach (values %$f) {
$total += $_**2;
}
return sqrt($total);
}
sub normalize {
my $self = shift;
my $length = $self->euclidean_length;
return $length ? $self->scale(1/$length) : $self;
}
sub scale {
my ($self, $scalar) = @_;
$_ *= $scalar foreach values %{$self->{features}};
return $self;
}
sub as_boolean_hash {
my $self = shift;
return { map {($_ => 1)} keys %{$self->{features}} };
}
sub length {
my $self = shift;
return scalar keys %{$self->{features}};
}
sub clone {
my $self = shift;
return ref($self)->new( features => { %{$self->{features}} } );
}
sub intersection {
my ($self, $other) = @_;
$other = $other->as_hash if UNIVERSAL::isa($other, __PACKAGE__);
my $common;
if (UNIVERSAL::isa($other, 'ARRAY')) {
$common = {map {exists $self->{features}{$_} ? ($_ => $self->{features}{$_}) : ()} @$other};
} elsif (UNIVERSAL::isa($other, 'HASH')) {
$common = {map {exists $self->{features}{$_} ? ($_ => $self->{features}{$_}) : ()} keys %$other};
}
return ref($self)->new( features => $common );
}
sub add {
my ($self, $other) = @_;
$other = $other->as_hash if UNIVERSAL::isa($other, __PACKAGE__);
while (my ($k,$v) = each %$other) {
$self->{features}{$k} += $v;
}
}
sub dot {
my ($self, $other) = @_;
$other = $other->as_hash if UNIVERSAL::isa($other, __PACKAGE__);
my $sum = 0;
my $f = $self->{features};
while (my ($k, $v) = each %$f) {
$sum += $other->{$k} * $v if exists $other->{$k};
}
return $sum;
}
sub sum {
my ($self) = @_;
# Return total of values in this vector
my $total = 0;
$total += $_ foreach values %{ $self->{features} };
return $total;
}
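# Usage sketch of the operations above (feature names and weights invented):
#
#   my $v1 = AI::Categorizer::FeatureVector->new(features => {cat => 2, dog => 1});
#   my $v2 = AI::Categorizer::FeatureVector->new(features => {cat => 1, fish => 4});
#   my $cosine = $v1->clone->normalize->dot($v2->clone->normalize);
#   $v1->add($v2);   # $v1 now holds {cat => 3, dog => 1, fish => 4}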
sub includes {
return exists $_[0]->{features}{$_[1]};
}
sub value {
return $_[0]->{features}{$_[1]};
}
sub values {
my $self = shift;
return @{ $self->{features} }{ @_ };
}
1;
__END__
=head1 NAME
AI::Categorizer::FeatureVector - Features vs. Values
lib/AI/Categorizer/Hypothesis.pm
use Params::Validate qw(:types);
__PACKAGE__->valid_params
(
all_categories => {type => ARRAYREF},
scores => {type => HASHREF},
threshold => {type => SCALAR},
document_name => {type => SCALAR, optional => 1},
);
sub all_categories { @{$_[0]->{all_categories}} }
sub document_name { $_[0]->{document_name} }
sub threshold { $_[0]->{threshold} }
sub best_category {
my ($self) = @_;
my $sc = $self->{scores};
return unless %$sc;
my ($best_cat, $best_score) = each %$sc;
while (my ($key, $val) = each %$sc) {
($best_cat, $best_score) = ($key, $val) if $val > $best_score;
}
return $best_cat;
}
sub in_category {
my ($self, $cat) = @_;
return '' unless exists $self->{scores}{$cat};
return $self->{scores}{$cat} > $self->{threshold};
}
sub categories {
my $self = shift;
return @{$self->{cats}} if $self->{cats};
$self->{cats} = [sort {$self->{scores}{$b} <=> $self->{scores}{$a}}
grep {$self->{scores}{$_} >= $self->{threshold}}
keys %{$self->{scores}}];
return @{$self->{cats}};
}
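# Usage sketch: a Hypothesis is normally produced by a Learner's categorize()
# method (the category name 'sports' is invented).
#
#   my $h = $learner->categorize($doc);
#   print "best: ", $h->best_category, "\n";
#   print "assigned: ", join(', ', $h->categories), "\n";
#   print "in sports\n" if $h->in_category('sports');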
sub scores {
my $self = shift;
return @{$self->{scores}}{@_};
}
1;
__END__
=head1 NAME
lib/AI/Categorizer/KnowledgeSet.pm
use AI::Categorizer::FeatureVector;
use AI::Categorizer::Util;
use Carp qw(croak);
__PACKAGE__->valid_params
(
categories => {
type => ARRAYREF,
default => [],
callbacks => { 'all are Category objects' =>
sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'),
@{$_[0]} },
},
},
documents => {
type => ARRAYREF,
default => [],
callbacks => { 'all are Document objects' =>
sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Document'),
@{$_[0]} },
},
},
scan_first => {
type => BOOLEAN,
default => 1,
},
feature_selector => {
isa => 'AI::Categorizer::FeatureSelector',
},
lib/AI/Categorizer/KnowledgeSet.pm
class => 'AI::Categorizer::Document' },
category => { delayed => 1,
class => 'AI::Categorizer::Category' },
collection => { delayed => 1,
class => 'AI::Categorizer::Collection::Files' },
features => { delayed => 1,
class => 'AI::Categorizer::FeatureVector' },
feature_selector => 'AI::Categorizer::FeatureSelector::DocFrequency',
);
sub new {
my ($pkg, %args) = @_;
# Shortcuts
if ($args{tfidf_weighting}) {
@args{'term_weighting', 'collection_weighting', 'normalize_weighting'} = split '', $args{tfidf_weighting};
delete $args{tfidf_weighting};
}
my $self = $pkg->SUPER::new(%args);
lib/AI/Categorizer/KnowledgeSet.pm
$self->{documents} = new AI::Categorizer::ObjectSet( @{$self->{documents}} );
if ($self->{load}) {
my $args = ref($self->{load}) ? $self->{load} : { path => $self->{load} };
$self->load(%$args);
delete $self->{load};
}
return $self;
}
sub features {
my $self = shift;
if (@_) {
$self->{features} = shift;
$self->trim_doc_features if $self->{features};
}
return $self->{features} if $self->{features};
# Create a feature vector encompassing the whole set of documents
my $v = $self->create_delayed_object('features');
foreach my $document ($self->documents) {
$v->add( $document->features );
}
return $self->{features} = $v;
}
sub categories {
my $c = $_[0]->{categories};
return wantarray ? $c->members : $c->size;
}
sub documents {
my $d = $_[0]->{documents};
return wantarray ? $d->members : $d->size;
}
sub document {
my ($self, $name) = @_;
return $self->{documents}->retrieve($name);
}
sub feature_selector { $_[0]->{feature_selector} }
sub scan_first { $_[0]->{scan_first} }
sub verbose {
my $self = shift;
$self->{verbose} = shift if @_;
return $self->{verbose};
}
sub trim_doc_features {
my ($self) = @_;
foreach my $doc ($self->documents) {
$doc->features( $doc->features->intersection($self->features) );
}
}
sub prog_bar {
my ($self, $collection) = @_;
return sub {} unless $self->verbose;
return sub { print STDERR '.' } unless eval "use Time::Progress; 1";
my $count = $collection->can('count_documents') ? $collection->count_documents : 0;
my $pb = 'Time::Progress'->new;
$pb->attr(max => $count);
my $i = 0;
return sub {
$i++;
return if $i % 25;
print STDERR $pb->report("%50b %p ($i/$count)\r", $i);
};
}
# A little utility method for several other methods like scan_stats(),
# load(), read(), etc.
sub _make_collection {
my ($self, $args) = @_;
return $args->{collection} || $self->create_delayed_object('collection', %$args);
}
sub scan_stats {
# Should determine:
# - number of documents
# - number of categories
# - avg. number of categories per document (whole corpus)
# - avg. number of tokens per document (whole corpus)
# - avg. number of types per document (whole corpus)
# - number of documents, tokens, & types for each category
# - "category skew index" (% variance?) by num. documents, tokens, and types
my ($self, %args) = @_;
lib/AI/Categorizer/KnowledgeSet.pm
my $ssum;
foreach my $cat (@cats) {
$ssum += ($stats{categories}{$cat}{"${thing}_count"} - $stats{"${thing}s_per_category"}) ** 2;
}
$stats{"${thing}_skew_by_category"} = sqrt($ssum/@cats) / $stats{"${thing}s_per_category"};
}
return \%stats;
}
sub load {
my ($self, %args) = @_;
my $c = $self->_make_collection(\%args);
if ($self->{features_kept}) {
# Read the whole thing in, then reduce
$self->read( collection => $c );
$self->select_features;
} elsif ($self->{scan_first}) {
# Figure out the feature set first, then read data in
$self->scan_features( collection => $c );
$c->rewind;
$self->read( collection => $c );
} else {
# Don't do any feature reduction, just read the data
$self->read( collection => $c );
}
}
sub read {
my ($self, %args) = @_;
my $collection = $self->_make_collection(\%args);
my $pb = $self->prog_bar($collection);
while (my $doc = $collection->next) {
$pb->();
$self->add_document($doc);
}
print "\n" if $self->verbose;
}
sub finish {
my $self = shift;
return if $self->{finished}++;
$self->weigh_features;
}
sub weigh_features {
# This could be made more efficient by figuring out an execution
# plan in advance
my $self = shift;
if ( $self->{term_weighting} =~ /^(t|x)$/ ) {
# Nothing to do
} elsif ( $self->{term_weighting} eq 'l' ) {
foreach my $doc ($self->documents) {
my $f = $doc->features->as_hash;
lib/AI/Categorizer/KnowledgeSet.pm
if ( $self->{normalize_weighting} eq 'x' ) {
# Nothing to do
} elsif ( $self->{normalize_weighting} eq 'c' ) {
$_->features->normalize foreach $self->documents;
} else {
die "normalize_weighting must be one of 'x' or 'c'";
}
}
sub document_frequency {
my ($self, $term) = @_;
unless (exists $self->{doc_freq_vector}) {
die "No corpus has been scanned for features" unless $self->documents;
my $doc_freq = $self->create_delayed_object('features', features => {});
foreach my $doc ($self->documents) {
$doc_freq->add( $doc->features->as_boolean_hash );
}
$self->{doc_freq_vector} = $doc_freq->as_hash;
}
return exists $self->{doc_freq_vector}{$term} ? $self->{doc_freq_vector}{$term} : 0;
}
sub scan_features {
my ($self, %args) = @_;
my $c = $self->_make_collection(\%args);
my $pb = $self->prog_bar($c);
my $ranked_features = $self->{feature_selector}->scan_features( collection => $c, prog_bar => $pb );
$self->delayed_object_params('document', use_features => $ranked_features);
$self->delayed_object_params('collection', use_features => $ranked_features);
return $ranked_features;
}
sub select_features {
my $self = shift;
my $f = $self->feature_selector->select_features(knowledge_set => $self);
$self->features($f);
}
sub partition {
my ($self, @sizes) = @_;
my $num_docs = my @docs = $self->documents;
my @groups;
while (@sizes > 1) {
my $size = int ($num_docs * shift @sizes);
push @groups, [];
for (1..$size) {
push @{ $groups[-1] }, splice @docs, rand(@docs), 1;
}
}
push @groups, \@docs;
return map { ref($self)->new( documents => $_ ) } @groups;
}
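# Usage sketch: partition() splits the documents into new KnowledgeSets by
# fractional size, e.g. a 70/30 train/test split (the fractions are invented):
#
#   my ($train_ks, $test_ks) = $ks->partition(0.7, 0.3);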
sub make_document {
my ($self, %args) = @_;
my $cats = delete $args{categories};
my @cats = map { $self->call_method('category', 'by_name', name => $_) } @$cats;
my $d = $self->create_delayed_object('document', %args, categories => \@cats);
$self->add_document($d);
}
sub add_document {
my ($self, $doc) = @_;
foreach ($doc->categories) {
$_->add_document($doc);
}
$self->{documents}->insert($doc);
$self->{categories}->insert($doc->categories);
}
sub save_features {
my ($self, $file) = @_;
my $f = ($self->{features} || { $self->delayed_object_params('document') }->{use_features})
or croak "No features to save";
open my($fh), "> $file" or croak "Can't create $file: $!";
my $h = $f->as_hash;
print $fh "# Total: ", $f->length, "\n";
foreach my $k (sort {$h->{$b} <=> $h->{$a}} keys %$h) {
print $fh "$k\t$h->{$k}\n";
}
close $fh;
}
sub restore_features {
my ($self, $file, $n) = @_;
open my($fh), "< $file" or croak "Can't open $file: $!";
my %hash;
while (<$fh>) {
next if /^#/;
/^(.*)\t([\d.]+)$/ or croak "Malformed line: $_";
$hash{$1} = $2;
last if defined $n and $. >= $n;
lib/AI/Categorizer/Learner.pm
class => 'AI::Categorizer::Hypothesis',
delayed => 1,
},
experiment => {
class => 'AI::Categorizer::Experiment',
delayed => 1,
},
);
# Subclasses must override these virtual methods:
sub get_scores;
sub create_model;
# Optional virtual method for on-line learning:
sub add_knowledge;
sub verbose {
my $self = shift;
if (@_) {
$self->{verbose} = shift;
}
return $self->{verbose};
}
sub knowledge_set {
my $self = shift;
if (@_) {
$self->{knowledge_set} = shift;
}
return $self->{knowledge_set};
}
sub categories {
my $self = shift;
return $self->knowledge_set->categories;
}
sub train {
my ($self, %args) = @_;
$self->{knowledge_set} = $args{knowledge_set} if $args{knowledge_set};
die "No knowledge_set provided" unless $self->{knowledge_set};
$self->{knowledge_set}->finish;
$self->create_model; # Creates $self->{model}
$self->delayed_object_params('hypothesis',
all_categories => [map $_->name, $self->categories],
);
}
sub prog_bar {
my ($self, $count) = @_;
return sub { print STDERR '.' } unless eval "use Time::Progress; 1";
my $pb = 'Time::Progress'->new;
$pb->attr(max => $count);
my $i = 0;
return sub {
$i++;
return if $i % 25;
my $string = '';
if (@_) {
my $e = shift;
$string = sprintf " (maF1=%.03f, miF1=%.03f)", $e->macro_F1, $e->micro_F1;
}
print STDERR $pb->report("%50b %p ($i/$count)$string\r", $i);
return $i;
};
}
sub categorize_collection {
my ($self, %args) = @_;
my $c = $args{collection} or die "No collection provided";
my @all_cats = map $_->name, $self->categories;
my $experiment = $self->create_delayed_object('experiment', categories => \@all_cats);
my $pb = $self->verbose ? $self->prog_bar($c->count_documents) : sub {};
while (my $d = $c->next) {
my $h = $self->categorize($d);
$experiment->add_hypothesis($h, [map $_->name, $d->categories]);
$pb->($experiment);
if ($self->verbose > 1) {
printf STDERR ("%s: assigned=(%s) correct=(%s)\n",
$d->name,
join(', ', $h->categories),
join(', ', map $_->name, $d->categories));
}
}
print STDERR "\n" if $self->verbose;
return $experiment;
}
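# Usage sketch of the train/evaluate cycle these methods implement (object
# construction elided; see AI::Categorizer for how the pieces are assembled):
#
#   $learner->train(knowledge_set => $ks);
#   my $experiment = $learner->categorize_collection(collection => $test_collection);
#   print $experiment->stats_table;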
sub categorize {
my ($self, $doc) = @_;
my ($scores, $threshold) = $self->get_scores($doc);
if ($self->verbose > 2) {
warn "scores: @{[ %$scores ]}" if $self->verbose > 3;
foreach my $key (sort {$scores->{$b} <=> $scores->{$a}} keys %$scores) {
print "$key: $scores->{$key}\n";
}
lib/AI/Categorizer/Learner/Boolean.pm
use base qw(AI::Categorizer::Learner);
use Params::Validate qw(:types);
use AI::Categorizer::Util qw(random_elements);
__PACKAGE__->valid_params
(
max_instances => {type => SCALAR, default => 0},
threshold => {type => SCALAR, default => 0.5},
);
sub create_model {
my $self = shift;
my $m = $self->{model} ||= {};
my $mi = $self->{max_instances};
foreach my $cat ($self->knowledge_set->categories) {
my (@p, @n);
foreach my $doc ($self->knowledge_set->documents) {
if ($doc->is_in_category($cat)) {
push @p, $doc;
} else {
lib/AI/Categorizer/Learner/Boolean.pm
@n = random_elements(\@n, @n * $ratio);
warn "Limiting to ". @p ." positives and ". @n ." negatives\n" if $self->verbose;
}
warn "Creating model for ", $cat->name, "\n" if $self->verbose;
$m->{learners}{ $cat->name } = $self->create_boolean_model(\@p, \@n, $cat);
}
}
sub create_boolean_model; # Abstract method
sub get_scores {
my ($self, $doc) = @_;
my $m = $self->{model};
my %scores;
foreach my $cat (keys %{$m->{learners}}) {
$scores{$cat} = $self->get_boolean_score($doc, $m->{learners}{$cat});
}
return (\%scores, $self->{threshold});
}
sub get_boolean_score; # Abstract method
sub threshold {
my $self = shift;
$self->{threshold} = shift if @_;
return $self->{threshold};
}
sub categories {
my $self = shift;
return map AI::Categorizer::Category->by_name( name => $_ ), keys %{ $self->{model}{learners} };
}
1;
__END__
=head1 NAME
AI::Categorizer::Learner::Boolean - Abstract class for boolean categorizers
=head1 SYNOPSIS
package AI::Categorizer::Learner::SomethingNew;
use AI::Categorizer::Learner::Boolean;
@ISA = qw(AI::Categorizer::Learner::Boolean);
sub create_boolean_model {
my ($self, $positives, $negatives, $category) = @_;
...
return $something_helpful;
}
sub get_boolean_score {
my ($self, $document, $something_helpful) = @_;
...
return $score;
}
=head1 DESCRIPTION
This is an abstract class which turns boolean categorizers
(categorizers based on algorithms that can just provide yes/no
categorization decisions for a single document and single category)
lib/AI/Categorizer/Learner/DecisionTree.pm
package AI::Categorizer::Learner::DecisionTree;
$VERSION = '0.01';
use strict;
use AI::DecisionTree;
use AI::Categorizer::Learner::Boolean;
use base qw(AI::Categorizer::Learner::Boolean);
sub create_model {
my $self = shift;
$self->SUPER::create_model;
$self->{model}{first_tree}->do_purge;
delete $self->{model}{first_tree};
}
sub create_boolean_model {
my ($self, $positives, $negatives, $cat) = @_;
my $t = new AI::DecisionTree(noise_mode => 'pick_best',
verbose => $self->verbose);
my %results;
for ($positives, $negatives) {
foreach my $doc (@$_) {
$results{$doc->name} = $_ eq $positives ? 1 : 0;
}
lib/AI/Categorizer/Learner/DecisionTree.pm
}
$t->purge(0);
$self->{model}{first_tree} = $t;
}
print STDERR "\nBuilding tree for category '", $cat->name, "'" if $self->verbose;
$t->train;
return $t;
}
sub get_scores {
my ($self, $doc) = @_;
local $self->{current_doc} = $doc->features->as_boolean_hash;
return $self->SUPER::get_scores($doc);
}
sub get_boolean_score {
my ($self, $doc, $t) = @_;
return $t->get_result( attributes => $self->{current_doc} ) || 0;
}
1;
__END__
=head1 NAME
AI::Categorizer::Learner::DecisionTree - Decision Tree Learner
lib/AI/Categorizer/Learner/Guesser.pm
package AI::Categorizer::Learner::Guesser;
use strict;
use AI::Categorizer::Learner;
use base qw(AI::Categorizer::Learner);
sub create_model {
my $self = shift;
my $k = $self->knowledge_set;
my $num_docs = $k->documents;
foreach my $cat ($k->categories) {
next unless $cat->documents;
$self->{model}{$cat->name} = $cat->documents / $num_docs;
}
}
sub get_scores {
my ($self, $newdoc) = @_;
my %scores;
while (my ($cat, $prob) = each %{$self->{model}}) {
$scores{$cat} = 0.5 + $prob - rand();
}
return (\%scores, 0.5);
}
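# Usage sketch: the Guesser scores each category at roughly its training-set
# frequency plus noise, which makes it a convenient random baseline.
#
#   my $learner = AI::Categorizer::Learner::Guesser->new;
#   $learner->train(knowledge_set => $ks);
#   my $h = $learner->categorize($test_doc);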
lib/AI/Categorizer/Learner/KNN.pm
use Params::Validate qw(:types);
__PACKAGE__->valid_params
(
threshold => {type => SCALAR, default => 0.4},
k_value => {type => SCALAR, default => 20},
knn_weighting => {type => SCALAR, default => 'score'},
max_instances => {type => SCALAR, default => 0},
);
sub create_model {
my $self = shift;
foreach my $doc ($self->knowledge_set->documents) {
$doc->features->normalize;
}
$self->knowledge_set->features; # Initialize
}
sub threshold {
my $self = shift;
$self->{threshold} = shift if @_;
return $self->{threshold};
}
sub categorize_collection {
my $self = shift;
my $f_class = $self->knowledge_set->contained_class('features');
if ($f_class->can('all_features')) {
$f_class->all_features([$self->knowledge_set->features->names]);
}
$self->SUPER::categorize_collection(@_);
}
sub get_scores {
my ($self, $newdoc) = @_;
my $currentDocName = $newdoc->name;
#print "classifying $currentDocName\n";
my $features = $newdoc->features->intersection($self->knowledge_set->features)->normalize;
my $q = AI::Categorizer::Learner::KNN::Queue->new(size => $self->{k_value});
my @docset;
if ($self->{max_instances}) {
# Use (approximately) max_instances documents, chosen randomly from corpus
lib/AI/Categorizer/Learner/KNN.pm
}
$_ /= $self->{k_value} foreach values %scores;
return (\%scores, $self->{threshold});
}
###################################################################
package AI::Categorizer::Learner::KNN::Queue;
sub new {
my ($pkg, %args) = @_;
return bless {
size => $args{size},
entries => [],
}, $pkg;
}
sub add {
my ($self, $thing, $score) = @_;
# scores may be (0.2, 0.4, 0.4, 0.8) - ascending
return unless (@{$self->{entries}} < $self->{size} # Queue not filled
or $score > $self->{entries}[0]{score}); # Found a better entry
my $i;
if (!@{$self->{entries}}) {
$i = 0;
lib/AI/Categorizer/Learner/KNN.pm
$i = @{$self->{entries}};
} else {
for ($i = 0; $i < @{$self->{entries}}; $i++) {
last if $score < $self->{entries}[$i]{score};
}
}
splice @{$self->{entries}}, $i, 0, { thing => $thing, score => $score};
shift @{$self->{entries}} if @{$self->{entries}} > $self->{size};
}
sub entries {
return shift->{entries};
}
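# Behaviour sketch of the queue above: it keeps at most 'size' entries in
# ascending score order, discarding the lowest score on overflow (the
# thing/score pairs here are invented).
#
#   my $q = AI::Categorizer::Learner::KNN::Queue->new(size => 3);
#   $q->add(@$_) for ['a', 0.2], ['b', 0.9], ['c', 0.5], ['d', 0.7];
#   # entries now hold 'c', 'd', 'b' (scores 0.5, 0.7, 0.9)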
1;
__END__
=head1 NAME
AI::Categorizer::Learner::KNN - K Nearest Neighbour Algorithm For AI::Categorizer
lib/AI/Categorizer/Learner/NaiveBayes.pm
use AI::Categorizer::Learner;
use base qw(AI::Categorizer::Learner);
use Params::Validate qw(:types);
use Algorithm::NaiveBayes;
__PACKAGE__->valid_params
(
threshold => {type => SCALAR, default => 0.3},
);
sub create_model {
my $self = shift;
my $m = $self->{model} = Algorithm::NaiveBayes->new;
foreach my $d ($self->knowledge_set->documents) {
$m->add_instance(attributes => $d->features->as_hash,
label => [ map $_->name, $d->categories ]);
}
$m->train;
}
sub get_scores {
my ($self, $newdoc) = @_;
return ($self->{model}->predict( attributes => $newdoc->features->as_hash ),
$self->{threshold});
}
sub threshold {
my $self = shift;
$self->{threshold} = shift if @_;
return $self->{threshold};
}
sub save_state {
my $self = shift;
local $self->{knowledge_set}; # Don't need the knowledge_set to categorize
$self->SUPER::save_state(@_);
}
sub categories {
my $self = shift;
return map AI::Categorizer::Category->by_name( name => $_ ), $self->{model}->labels;
}
1;
__END__
=head1 NAME
lib/AI/Categorizer/Learner/Rocchio.pm
use AI::Categorizer::Learner::Boolean;
use base qw(AI::Categorizer::Learner::Boolean);
__PACKAGE__->valid_params
(
positive_setting => {type => SCALAR, default => 16 },
negative_setting => {type => SCALAR, default => 4 },
threshold => {type => SCALAR, default => 0.1},
);
sub create_model {
my $self = shift;
foreach my $doc ($self->knowledge_set->documents) {
$doc->features->normalize;
}
$self->{model}{all_features} = $self->knowledge_set->features(undef);
$self->SUPER::create_model(@_);
delete $self->{knowledge_set};
}
sub create_boolean_model {
my ($self, $positives, $negatives, $cat) = @_;
my $posdocnum = @$positives;
my $negdocnum = @$negatives;
my $beta = $self->{positive_setting};
my $gamma = $self->{negative_setting};
my $profile = $self->{model}{all_features}->clone->scale(-$gamma/$negdocnum);
my $f = $cat->features(undef)->clone->scale( $beta/$posdocnum + $gamma/$negdocnum );
$profile->add($f);
return $profile->normalize;
}
sub get_boolean_score {
my ($self, $newdoc, $profile) = @_;
return $newdoc->features->normalize->dot($profile);
}
1;
lib/AI/Categorizer/Learner/SVM.pm
use Algorithm::SVM;
use Algorithm::SVM::DataSet;
use Params::Validate qw(:types);
use File::Spec;
__PACKAGE__->valid_params
(
svm_kernel => {type => SCALAR, default => 'linear'},
);
sub create_model {
my $self = shift;
my $f = $self->knowledge_set->features->as_hash;
my $rmap = [ keys %$f ];
$self->{model}{feature_map} = { map { $rmap->[$_], $_ } 0..$#$rmap };
$self->{model}{feature_map_reverse} = $rmap;
$self->SUPER::create_model(@_);
}
sub _doc_2_dataset {
my ($self, $doc, $label, $fm) = @_;
my $ds = new Algorithm::SVM::DataSet(Label => $label);
my $f = $doc->features->as_hash;
while (my ($k, $v) = each %$f) {
next unless exists $fm->{$k};
$ds->attribute( $fm->{$k}, $v );
}
return $ds;
}
sub create_boolean_model {
my ($self, $positives, $negatives, $cat) = @_;
my $svm = new Algorithm::SVM(Kernel => $self->{svm_kernel});
my (@pos, @neg);
foreach my $doc (@$positives) {
push @pos, $self->_doc_2_dataset($doc, 1, $self->{model}{feature_map});
}
foreach my $doc (@$negatives) {
push @neg, $self->_doc_2_dataset($doc, 0, $self->{model}{feature_map});
}
$svm->train(@pos, @neg);
return $svm;
}
sub get_scores {
my ($self, $doc) = @_;
local $self->{current_doc} = $self->_doc_2_dataset($doc, -1, $self->{model}{feature_map});
return $self->SUPER::get_scores($doc);
}
sub get_boolean_score {
my ($self, $doc, $svm) = @_;
return $svm->predict($self->{current_doc});
}
sub save_state {
my ($self, $path) = @_;
{
local $self->{model}{learners};
local $self->{knowledge_set};
$self->SUPER::save_state($path);
}
return unless $self->{model};
my $svm_dir = File::Spec->catdir($path, 'svms');
mkdir($svm_dir, 0777) or die "Couldn't create $svm_dir: $!";
while (my ($name, $learner) = each %{$self->{model}{learners}}) {
my $path = File::Spec->catfile($svm_dir, $name);
$learner->save($path);
}
}
sub restore_state {
my ($self, $path) = @_;
$self = $self->SUPER::restore_state($path);
my $svm_dir = File::Spec->catdir($path, 'svms');
return $self unless -e $svm_dir;
opendir my($dh), $svm_dir or die "Can't open directory $svm_dir: $!";
while (defined (my $file = readdir $dh)) {
my $full_file = File::Spec->catfile($svm_dir, $file);
next if -d $full_file;
$self->{model}{learners}{$file} = new Algorithm::SVM(Model => $full_file);
lib/AI/Categorizer/Learner/Weka.pm
weka_classifier => {type => SCALAR, default => 'weka.classifiers.NaiveBayes'},
weka_args => {type => SCALAR|ARRAYREF, optional => 1},
tmpdir => {type => SCALAR, default => File::Spec->tmpdir},
);
__PACKAGE__->contained_objects
(
features => {class => 'AI::Categorizer::FeatureVector', delayed => 1},
);
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
for ('java_args', 'weka_args') {
$self->{$_} = [] unless defined $self->{$_};
$self->{$_} = [$self->{$_}] unless UNIVERSAL::isa($self->{$_}, 'ARRAY');
}
if (defined $self->{weka_path}) {
push @{$self->{java_args}}, '-classpath', $self->{weka_path};
delete $self->{weka_path};
}
return $self;
}
# java -classpath /Applications/Science/weka-3-2-3/weka.jar weka.classifiers.NaiveBayes -t /tmp/train_file.arff -d /tmp/weka-machine
sub create_model {
my ($self) = shift;
my $m = $self->{model} ||= {};
$m->{all_features} = [ $self->knowledge_set->features->names ];
$m->{_in_dir} = File::Temp::tempdir( DIR => $self->{tmpdir} );
# Create a dummy test file $dummy_file in ARFF format (a kludgey WEKA requirement)
my $dummy_features = $self->create_delayed_object('features');
$m->{dummy_file} = $self->create_arff_file("dummy", [[$dummy_features, 0]]);
$self->SUPER::create_model(@_);
}
sub create_boolean_model {
my ($self, $pos, $neg, $cat) = @_;
my @docs = (map([$_->features, 1], @$pos),
map([$_->features, 0], @$neg));
my $train_file = $self->create_arff_file($cat->name . '_train', \@docs);
my %info = (machine_file => $cat->name . '_model');
my $outfile = File::Spec->catfile($self->{model}{_in_dir}, $info{machine_file});
my @args = ($self->{java_path},
lib/AI/Categorizer/Learner/Weka.pm
'-p', '0',
);
$self->do_cmd(@args);
unlink $train_file or warn "Couldn't remove $train_file: $!";
return \%info;
}
# java -classpath /Applications/Science/weka-3-2-3/weka.jar weka.classifiers.NaiveBayes -l out -T test.arff -p 0
sub get_boolean_score {
my ($self, $doc, $info) = @_;
# Create document file
my $doc_file = $self->create_arff_file('doc', [[$doc->features, 0]], $self->{tmpdir});
my $machine_file = File::Spec->catfile($self->{model}{_in_dir}, $info->{machine_file});
my @args = ($self->{java_path},
@{$self->{java_args}},
$self->{weka_classifier},
'-l', $machine_file,
lib/AI/Categorizer/Learner/Weka.pm
foreach (@output) {
# <doc> <category> <score> <real_category>
# 0 large.elem 0.4515551620220952 numberth.high
next unless my ($index, $predicted, $score) = /^([\d.]+)\s+(\S+)\s+([\d.]+)/;
$scores{$predicted} = $score;
}
return $scores{1} || 0; # Not sure what weka's scores represent...
}
sub categorize_collection {
my ($self, %args) = @_;
my $c = $args{collection} or die "No collection provided";
my @alldocs;
while (my $d = $c->next) {
push @alldocs, $d;
}
my $doc_file = $self->create_arff_file("docs", [map [$_->features, 0], @alldocs]);
my @assigned;
lib/AI/Categorizer/Learner/Weka.pm
my $experiment = $self->create_delayed_object('experiment', categories => [map $_->name, $self->categories]);
foreach my $i (0..$#alldocs) {
$experiment->add_result([keys %{$assigned[$i]}], [map $_->name, $alldocs[$i]->categories], $alldocs[$i]->name);
}
return $experiment;
}
sub do_cmd {
my ($self, @cmd) = @_;
print STDERR " % @cmd\n" if $self->verbose;
my @output;
local *KID_TO_READ;
my $pid = open(KID_TO_READ, "-|");
if ($pid) { # parent
@output = <KID_TO_READ>;
close(KID_TO_READ) or warn "@cmd exited $?";
} else { # child
exec(@cmd) or die "Can't exec @cmd: $!";
}
return @output;
}
sub create_arff_file {
my ($self, $name, $docs, $dir) = @_;
$dir = $self->{model}{_in_dir} unless defined $dir;
my ($fh, $filename) = File::Temp::tempfile(
$name . "_XXXX", # Template
DIR => $dir,
SUFFIX => '.arff',
);
print $fh "\@RELATION foo\n\n";
lib/AI/Categorizer/Learner/Weka.pm
print $fh ("{",
join(', ', map("$feature_indices{$_} $f->{$_}", @ordered_keys), "$last_index '$cat'"),
"}\n"
);
}
return $filename;
}
sub save_state {
my ($self, $path) = @_;
{
local $self->{knowledge_set};
$self->SUPER::save_state($path);
}
return unless $self->{model};
my $model_dir = File::Spec->catdir($path, 'models');
mkdir($model_dir, 0777) or die "Couldn't create $model_dir: $!";
while (my ($name, $learner) = each %{$self->{model}{learners}}) {
my $oldpath = File::Spec->catdir($self->{model}{_in_dir}, $learner->{machine_file});
my $newpath = File::Spec->catfile($model_dir, "${name}_model");
File::Copy::copy($oldpath, $newpath);
}
$self->{model}{_in_dir} = $model_dir;
}
sub restore_state {
my ($pkg, $path) = @_;
my $self = $pkg->SUPER::restore_state($path);
my $model_dir = File::Spec->catdir($path, 'models');
return $self unless -e $model_dir;
$self->{model}{_in_dir} = $model_dir;
return $self;
}
lib/AI/Categorizer/ObjectSet.pm
package AI::Categorizer::ObjectSet;
use strict;
sub new {
my $pkg = shift;
my $self = bless {}, $pkg;
$self->insert(@_) if @_;
return $self;
}
sub members {
return values %{$_[0]};
}
sub size {
return scalar keys %{$_[0]};
}
sub insert {
my $self = shift;
foreach my $element (@_) {
#warn "types are ", @_;
$self->{ $element->name } = $element;
}
}
sub retrieve { $_[0]->{$_[1]} }
sub includes { exists $_[0]->{ $_[1]->name } }
sub includes_name { exists $_[0]->{ $_[1] } }
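# Behaviour sketch: an ObjectSet is a set keyed by each member's name() (the
# members here could be Category or Document objects, for example).
#
#   my $set = AI::Categorizer::ObjectSet->new(@categories);
#   print $set->size, " members\n";
#   my $cat = $set->retrieve('sports');            # look up by name
#   print "present\n" if $set->includes_name('sports');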
1;