AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
my $self = shift;
$self->{verbose} = shift if @_;
return $self->{verbose};
}
# Restrict every document's feature vector to the features this
# KnowledgeSet has decided to keep, discarding all others.
sub trim_doc_features {
  my ($self) = @_;
  foreach my $document ($self->documents) {
    my $trimmed = $document->features->intersection($self->features);
    $document->features($trimmed);
  }
}
# Build a progress-indicator closure for iterating over $collection.
# Returns a no-op closure when not verbose; a plain dot-printer when
# Time::Progress is unavailable; otherwise a Time::Progress bar that
# refreshes every 25 documents on STDERR.
sub prog_bar {
  my ($self, $collection) = @_;

  return sub {} unless $self->verbose;

  # Block eval + require instead of a string eval: same "is the module
  # installed?" probe without compiling code from a string.
  return sub { print STDERR '.' } unless eval { require Time::Progress; 1 };

  # Not every Collection subclass can report its size up front.
  my $count = $collection->can('count_documents') ? $collection->count_documents : 0;

  my $pb = Time::Progress->new;
  $pb->attr(max => $count);
  my $i = 0;
  return sub {
    $i++;
    return if $i % 25;  # throttle: redraw only every 25th call
    print STDERR $pb->report("%50b %p ($i/$count)\r", $i);
  };
}
# A little utility method shared by scan_stats(), load(), read(), etc.:
# use the caller-supplied Collection object if one was passed, otherwise
# build one from the remaining arguments.
sub _make_collection {
  my ($self, $args) = @_;
  return $args->{collection}
      || $self->create_delayed_object('collection', %$args);
}
# Scan a Collection and return a hashref of corpus statistics:
#  - number of documents and categories
#  - avg. categories, tokens, and types per document
#  - per-category document/token/type counts
#  - "category skew index" (std. deviation / mean) by documents,
#    tokens, and types
sub scan_stats {
  my ($self, %args) = @_;
  my $collection = $self->_make_collection(\%args);
  my $pb = $self->prog_bar($collection);

  # Pre-initialize the counters so an empty collection yields clean zeros.
  my %stats = (document_count => 0, token_count => 0, type_count => 0,
               category_count_with_duplicates => 0);

  while (my $doc = $collection->next) {
    $pb->();
    # categories() in numeric context gives the per-document category count
    $stats{category_count_with_duplicates} += $doc->categories;

    my ($sum, $length) = ($doc->features->sum, $doc->features->length);
    $stats{document_count}++;
    $stats{token_count} += $sum;
    $stats{type_count}  += $length;

    foreach my $cat ($doc->categories) {
      #warn $doc->name, ": ", $cat->name, "\n";
      $stats{categories}{$cat->name}{document_count}++;
      $stats{categories}{$cat->name}{token_count} += $sum;
      $stats{categories}{$cat->name}{type_count}  += $length;
    }
  }
  print "\n" if $self->verbose;

  my @cats = keys %{ $stats{categories} };
  $stats{category_count} = @cats;

  # BUGFIX: guard the per-document averages so an empty collection
  # returns zeroed stats instead of dying with "Illegal division by zero".
  if ($stats{document_count}) {
    $stats{categories_per_document} = $stats{category_count_with_duplicates} / $stats{document_count};
    $stats{tokens_per_document}     = $stats{token_count} / $stats{document_count};
    $stats{types_per_document}      = $stats{type_count}  / $stats{document_count};
  }

  foreach my $thing ('type', 'token', 'document') {
    next unless @cats;  # no categories => no per-category stats to compute

    $stats{"${thing}s_per_category"} = AI::Categorizer::Util::average
      ( map { $stats{categories}{$_}{"${thing}_count"} } @cats );

    # Compute the skew (coefficient of variation across categories),
    # guarding against a zero mean.
    my $ssum = 0;
    foreach my $cat (@cats) {
      $ssum += ($stats{categories}{$cat}{"${thing}_count"} - $stats{"${thing}s_per_category"}) ** 2;
    }
    $stats{"${thing}_skew_by_category"} =
      $stats{"${thing}s_per_category"}
        ? sqrt($ssum/@cats) / $stats{"${thing}s_per_category"}
        : 0;
  }

  return \%stats;
}
# Load a Collection into this KnowledgeSet, applying whichever feature
# reduction strategy was configured at construction time.
sub load {
  my ($self, %args) = @_;
  my $coll = $self->_make_collection(\%args);

  if ($self->{features_kept}) {
    # Read the entire collection first, then prune to the kept features.
    $self->read( collection => $coll );
    $self->select_features;
  }
  elsif ($self->{scan_first}) {
    # Pass 1 determines the feature set; pass 2 loads the documents.
    $self->scan_features( collection => $coll );
    $coll->rewind;
    $self->read( collection => $coll );
  }
  else {
    # No feature reduction at all - just read the data in.
    $self->read( collection => $coll );
  }
}
# Read every document out of the given (or newly built) Collection and
# add it to this KnowledgeSet, ticking the progress bar as we go.
sub read {
  my ($self, %args) = @_;
  my $coll = $self->_make_collection(\%args);
  my $tick = $self->prog_bar($coll);

  while (defined(my $document = $coll->next)) {
    $tick->();
    $self->add_document($document);
  }
  print "\n" if $self->verbose;
}
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
my $c = $self->_make_collection(\%args);
my $pb = $self->prog_bar($c);
my $ranked_features = $self->{feature_selector}->scan_features( collection => $c, prog_bar => $pb );
$self->delayed_object_params('document', use_features => $ranked_features);
$self->delayed_object_params('collection', use_features => $ranked_features);
return $ranked_features;
}
# Ask the configured feature selector to pick the best features for
# this KnowledgeSet, then install the result via features().
sub select_features {
  my ($self) = @_;
  my $selected = $self->feature_selector->select_features(knowledge_set => $self);
  $self->features($selected);
}
# Split this KnowledgeSet's documents into several smaller
# KnowledgeSets.  Each element of @sizes except the last is a fraction
# of the total document count; the final group receives whatever
# documents remain.  Documents are drawn at random without replacement.
# Returns one new KnowledgeSet per group.
sub partition {
  my ($self, @sizes) = @_;
  my $num_docs = my @docs = $self->documents;
  my @groups;

  while (@sizes > 1) {
    my $size = int ($num_docs * shift @sizes);
    push @groups, [];
    # BUGFIX: this loop used to run (0..$size), moving $size+1 documents
    # into the group - off by one.  Run exactly $size times, and never
    # splice from an already-empty list.
    for (1 .. $size) {
      last unless @docs;
      push @{ $groups[-1] }, splice @docs, rand(@docs), 1;
    }
  }
  push @groups, \@docs;

  return map { ref($self)->new( documents => $_ ) } @groups;
}
# Build a Document object from the given constructor arguments, mapping
# the 'categories' list of names to Category objects first, then add the
# new document to this KnowledgeSet.
sub make_document {
  my ($self, %args) = @_;
  my $cat_names = delete $args{categories};
  my @categories =
    map { $self->call_method('category', 'by_name', name => $_) } @$cat_names;
  my $doc = $self->create_delayed_object('document', %args, categories => \@categories);
  $self->add_document($doc);
}
# Register a Document with this KnowledgeSet: link the document into
# each of its categories, then record both the document and its
# categories in our internal sets.
sub add_document {
  my ($self, $doc) = @_;
  foreach my $category ($doc->categories) {
    $category->add_document($doc);
  }
  $self->{documents}->insert($doc);
  $self->{categories}->insert($doc->categories);
}
# Write the current feature set (or the delayed document object's
# 'use_features' parameter) to $file: a "# Total: N" header followed by
# one "name<TAB>weight" pair per line, sorted by descending weight.
sub save_features {
  my ($self, $file) = @_;

  my $f = ($self->{features} || { $self->delayed_object_params('document') }->{use_features})
    or croak "No features to save";

  # BUGFIX: three-argument open; the old two-argument "> $file" form
  # misparses filenames containing whitespace or mode characters.
  open my $fh, '>', $file or croak "Can't create $file: $!";

  my $h = $f->as_hash;
  print $fh "# Total: ", $f->length, "\n";
  foreach my $k (sort {$h->{$b} <=> $h->{$a}} keys %$h) {
    print $fh "$k\t$h->{$k}\n";
  }

  # Buffered write errors only surface at close, so check it.
  close $fh or croak "Can't write $file: $!";
}
# Read a feature file previously written by save_features() and install
# the result as the 'use_features' parameter for future document and
# collection objects.  If $n is given, only the first $n features are
# restored.
sub restore_features {
  my ($self, $file, $n) = @_;

  # BUGFIX: three-argument open with a lexical handle (the two-argument
  # "< $file" form is unsafe with untrusted filenames).
  open my $fh, '<', $file or croak "Can't open $file: $!";

  my %hash;
  while (my $line = <$fh>) {
    next if $line =~ /^#/;  # skip the "# Total: N" header and comments
    $line =~ /^(.*)\t([\d.]+)$/ or croak "Malformed line: $line";
    $hash{$1} = $2;
    # BUGFIX: limit by features actually restored, not by raw input
    # lines ($.), so skipped comment lines don't eat into the $n quota.
    last if defined $n and keys(%hash) >= $n;
  }
  close $fh;

  my $features = $self->create_delayed_object('features', features => \%hash);
  $self->delayed_object_params('document',   use_features => $features);
  $self->delayed_object_params('collection', use_features => $features);
}
1;
__END__
=head1 NAME
AI::Categorizer::KnowledgeSet - Encapsulates set of documents
=head1 SYNOPSIS
use AI::Categorizer::KnowledgeSet;
my $k = new AI::Categorizer::KnowledgeSet(...parameters...);
my $nb = new AI::Categorizer::Learner::NaiveBayes(...parameters...);
$nb->train(knowledge_set => $k);
=head1 DESCRIPTION
The KnowledgeSet class provides an interface to a set of
documents, a set of categories, and a mapping between the two. Many
parameters for controlling the processing of documents are managed by
the KnowledgeSet class.
=head1 METHODS
=over 4
=item new()
Creates a new KnowledgeSet and returns it. Accepts the following
parameters:
=over 4
=item load
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
retrieval". The three characters indicate the three factors that will
be multiplied for each feature to find the final vector value for that
feature. The default weighting is C<xxx>.
The first character specifies the "term frequency" component, which
can take the following values:
=over 4
=item b
Binary weighting - 1 for terms present in a document, 0 for terms absent.
=item t
Raw term frequency - equal to the number of times a feature occurs in
the document.
=item x
A synonym for 't'.
=item n
Normalized term frequency - 0.5 + 0.5 * t/max(t). This is the same as
the 't' specification, but with term frequency normalized to lie
between 0.5 and 1.
=back
The second character specifies the "collection frequency" component, which
can take the following values:
=over 4
=item f
Inverse document frequency - multiply term C<t>'s value by C<log(N/n)>,
where C<N> is the total number of documents in the collection, and
C<n> is the number of documents in which term C<t> is found.
=item p
Probabilistic inverse document frequency - multiply term C<t>'s value
by C<log((N-n)/n)> (same variable meanings as above).
=item x
No change - multiply by 1.
=back
The third character specifies the "normalization" component, which
can take the following values:
=over 4
=item c
Apply cosine normalization - multiply by 1/length(document_vector).
=item x
No change - multiply by 1.
=back
The three components may alternatively be specified by the
C<term_weighting>, C<collection_weighting>, and C<normalize_weighting>
parameters respectively.
=item verbose
If set to a true value, some status/debugging information will be
output on C<STDOUT>.
=back
=item categories()
In a list context returns a list of all Category objects in this
KnowledgeSet. In a scalar context returns the number of such objects.
=item documents()
In a list context returns a list of all Document objects in this
KnowledgeSet. In a scalar context returns the number of such objects.
=item document()
Given a document name, returns the Document object with that name, or
C<undef> if no such Document object exists in this KnowledgeSet.
=item features()
Returns a FeatureSet object which represents the features of all the
documents in this KnowledgeSet.
=item verbose()
Returns the C<verbose> parameter of this KnowledgeSet, or sets it with
an optional argument.
=item scan_stats()
Scans all the documents of a Collection and returns a hash reference
containing several statistics about the Collection. (XXX need to describe stats)
=item scan_features()
This method scans through a Collection object and determines the
"best" features (words) to use when loading the documents and training
the Learner. This process is known as "feature selection", and it's a
very important part of categorization.
The Collection object should be specified as a C<collection> parameter,
or by giving the arguments to pass to the Collection's C<new()> method.
The process of feature selection is governed by the
( run in 0.874 second using v1.01-cache-2.11-cpan-39bf76dae61 )