AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
my $ssum;
foreach my $cat (@cats) {
$ssum += ($stats{categories}{$cat}{"${thing}_count"} - $stats{"${thing}s_per_category"}) ** 2;
}
$stats{"${thing}_skew_by_category"} = sqrt($ssum/@cats) / $stats{"${thing}s_per_category"};
}
return \%stats;
}
# Load documents from a collection into this KnowledgeSet, using the
# feature-reduction strategy selected by the object's settings.
#
# The named arguments are handed to _make_collection() as a hash
# reference to obtain the collection to read from.
sub load {
  my $self = shift;
  my %args = @_;
  my $collection = $self->_make_collection(\%args);

  # features_kept is set: read the whole thing in, then reduce.
  if ($self->{features_kept}) {
    $self->read( collection => $collection );
    return $self->select_features;
  }

  # scan_first is set: work out the feature set in a first pass, then
  # rewind the collection so that the data can be read in below.
  if ($self->{scan_first}) {
    $self->scan_features( collection => $collection );
    $collection->rewind;
  }

  # Either the feature set is already fixed by the scan above, or no
  # feature reduction is wanted at all: just read the data.
  return $self->read( collection => $collection );
}
# Pull every document out of a collection and add it to this
# KnowledgeSet, advancing the progress bar once per document.
#
# The named arguments are handed to _make_collection() as a hash
# reference to obtain the collection to iterate over.
sub read {
  my $self = shift;
  my %args = @_;

  my $source = $self->_make_collection(\%args);
  my $tick   = $self->prog_bar($source);

  # The loop ends as soon as next() returns a false value.
  while (my $document = $source->next) {
    $tick->();
    $self->add_document($document);
  }

  # Terminate the progress output line when running verbosely.
  $self->verbose and print "\n";
}
# Finalize the KnowledgeSet by weighing its features.  Only the first
# call does any work: the 'finished' counter is bumped on every call,
# and any call that finds it already nonzero returns immediately.
sub finish {
  my ($self) = @_;

  if ($self->{finished}++) {
    return;
  }

  return $self->weigh_features;
}
# Apply the configured term / collection / normalization weighting to the
# feature values of every document in this KnowledgeSet, in place.
#
# Three single-character codes are read from the object:
#
#   term_weighting        'x' or 't'  leave the values unchanged
#                         'l'         1 + log(value)
#                         'n'         0.5 + 0.5 * value / (largest value in
#                                     the same document)
#                         'b'         1 for a true value, 0 otherwise
#   collection_weighting  'x'         leave the values unchanged
#                         'f'         multiply by log(N/n)
#                         'p'         multiply by log(N/n - 1)
#   normalize_weighting   'x'         leave the values unchanged
#                         'c'         call normalize() on each feature vector
#
# where N is the number of documents and n is the entry for the feature in
# $self->{doc_freq_vector}.
#
# All three codes are checked before any document is modified, so an invalid
# code can never leave the documents partially weighted.  Dies with a
# "<name> must be one of ..." message for the first invalid (or undefined)
# code, checked in the order term, collection, normalize.
#
# Returns nothing useful.
#
# This could be made more efficient by figuring out an execution
# plan in advance
sub weigh_features {
  my $self = shift;

  my $term_w = $self->{term_weighting};
  my $coll_w = $self->{collection_weighting};
  my $norm_w = $self->{normalize_weighting};

  # Validate everything first: the passes below rewrite the documents'
  # feature values, and dying half-way through would leave them in a
  # state that cannot be told apart from the unweighted one.
  die "term_weighting must be one of 'x', 't', 'l', 'b', or 'n'"
    unless defined($term_w) && $term_w =~ /\A[xtlbn]\z/;
  die "collection_weighting must be one of 'x', 'f', or 'p'"
    unless defined($coll_w) && $coll_w =~ /\A[xfp]\z/;
  die "normalize_weighting must be one of 'x' or 'c'"
    unless defined($norm_w) && $norm_w =~ /\A[xc]\z/;

  # --- Term frequency component ('t' and 'x' need no work) ---
  # Each pass assigns through the aliases returned by values(), so it
  # relies on as_hash() handing back the vector's own hash rather than a
  # copy -- presumably that is the case; verify against the FeatureVector
  # class.
  if ($term_w eq 'l') {
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;
      $_ = 1 + log($_) foreach values %$f;
    }
  } elsif ($term_w eq 'n') {
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;
      my $max_tf = AI::Categorizer::Util::max(values %$f);
      $_ = 0.5 + 0.5 * $_ / $max_tf foreach values %$f;
    }
  } elsif ($term_w eq 'b') {
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;
      $_ = $_ ? 1 : 0 foreach values %$f;
    }
  }

  # --- Collection frequency component ('x' needs no work) ---
  if ($coll_w ne 'x') {
    my $subtrahend = ($coll_w eq 'f' ? 0 : 1);
    my $num_docs = $self->documents;  # scalar context: number of documents

    # Called only for its side effect of populating
    # $self->{doc_freq_vector}; the term passed is irrelevant.
    $self->document_frequency('foo');
    my $doc_freq = $self->{doc_freq_vector};

    # NOTE(review): with 'p', a feature found in every document gives
    # log(0), which is a fatal error in Perl -- confirm how that case
    # should be handled.
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;
      $f->{$_} *= log($num_docs / $doc_freq->{$_} - $subtrahend) foreach keys %$f;
    }
  }

  # --- Normalization component ('x' needs no work) ---
  if ($norm_w eq 'c') {
    $_->features->normalize foreach $self->documents;
  }

  return;
}
# Return the value recorded for $term in the document-frequency vector,
# or 0 if the term has no entry there.
#
# The vector is built on first use, by add()ing the as_boolean_hash() of
# every document's features into a fresh 'features' object, and is then
# cached in $self->{doc_freq_vector}.  Dies if the vector has to be
# built while the KnowledgeSet contains no documents.
sub document_frequency {
  my $self = shift;
  my $term = shift;

  if (!exists $self->{doc_freq_vector}) {
    $self->documents
      or die "No corpus has been scanned for features";

    my $tally = $self->create_delayed_object('features', features => {});
    foreach my $document ($self->documents) {
      $tally->add( $document->features->as_boolean_hash );
    }

    $self->{doc_freq_vector} = $tally->as_hash;
  }

  return $self->{doc_freq_vector}{$term}
    if exists $self->{doc_freq_vector}{$term};
  return 0;
}
# Have the feature selector scan a collection, then register the result
# as the 'use_features' parameter for subsequently created 'document'
# and 'collection' objects.
#
# The named arguments are handed to _make_collection() as a hash
# reference.  Returns whatever the selector's scan_features() returned.
sub scan_features {
  my $self = shift;
  my %args = @_;

  my $collection = $self->_make_collection(\%args);
  my $progress   = $self->prog_bar($collection);

  my $ranked = $self->{feature_selector}->scan_features(
    collection => $collection,
    prog_bar   => $progress,
  );

  foreach my $kind (qw(document collection)) {
    $self->delayed_object_params($kind, use_features => $ranked);
  }

  return $ranked;
}
# Ask the feature selector to choose features for this KnowledgeSet and
# store its answer through the features() accessor.
sub select_features {
  my ($self) = @_;

  my $selector = $self->feature_selector;
  my $chosen   = $selector->select_features(knowledge_set => $self);

  return $self->features($chosen);
}
sub partition {
my ($self, @sizes) = @_;
my $num_docs = my @docs = $self->documents;
my @groups;
while (@sizes > 1) {
my $size = int ($num_docs * shift @sizes);
push @groups, [];
for (0..$size) {
push @{ $groups[-1] }, splice @docs, rand(@docs), 1;
}
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
=item features_kept
A number indicating how many features (words) should be considered
when training the Learner or categorizing new documents. May be
specified as a positive integer (e.g. 2000) indicating the absolute
number of features to be kept, or as a decimal between 0 and 1
(e.g. 0.2) indicating the fraction of the total number of features to
be kept, or as 0 to indicate that no feature selection should be done
and that the entire set of features should be used. The default is
0.2.
=item feature_selection
A string indicating the type of feature selection that should be
performed. Currently the only option is also the default option:
C<document_frequency>.
=item tfidf_weighting
Specifies how document word counts should be converted to vector
values. Uses the three-character specification strings from Salton &
Buckley's paper "Term-weighting approaches in automatic text
retrieval". The three characters indicate the three factors that will
be multiplied for each feature to find the final vector value for that
feature. The default weighting is C<xxx>.
The first character specifies the "term frequency" component, which
can take the following values:
=over 4
=item b
Binary weighting - 1 for terms present in a document, 0 for terms absent.
=item t
Raw term frequency - equal to the number of times a feature occurs in
the document.
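=item x
A synonym for 't'.
=item l
Logarithmic term frequency - 1 + log(t), where t is the raw term
frequency and log is the natural logarithm.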
=item n
Normalized term frequency - 0.5 + 0.5 * t/max(t). This is the same as
the 't' specification, but with term frequency normalized to lie
between 0.5 and 1.
=back
The second character specifies the "collection frequency" component, which
can take the following values:
=over 4
=item f
Inverse document frequency - multiply term C<t>'s value by C<log(N/n)>,
where C<N> is the total number of documents in the collection, and
C<n> is the number of documents in which term C<t> is found.
=item p
Probabilistic inverse document frequency - multiply term C<t>'s value
by C<log((N-n)/n)> (same variable meanings as above). Note that this
value is undefined for a term that occurs in every document (C<n = N>).
=item x
No change - multiply by 1.
=back
The third character specifies the "normalization" component, which
can take the following values:
=over 4
=item c
Apply cosine normalization - multiply by 1/length(document_vector).
=item x
No change - multiply by 1.
=back
The three components may alternatively be specified by the
C<term_weighting>, C<collection_weighting>, and C<normalize_weighting>
parameters respectively.
=item verbose
If set to a true value, some status/debugging information will be
output on C<STDOUT>.
=back
=item categories()
In a list context returns a list of all Category objects in this
KnowledgeSet. In a scalar context returns the number of such objects.
=item documents()
In a list context returns a list of all Document objects in this
KnowledgeSet. In a scalar context returns the number of such objects.
=item document()
Given a document name, returns the Document object with that name, or
C<undef> if no such Document object exists in this KnowledgeSet.
=item features()
Returns a FeatureSet object which represents the features of all the
documents in this KnowledgeSet.
=item verbose()
Returns the C<verbose> parameter of this KnowledgeSet, or sets it with
an optional argument.
( run in 0.509 second using v1.01-cache-2.11-cpan-39bf76dae61 )