AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/Document/XML.pm view on Meta::CPAN
# Prefix $el->{Prefix}
# Value $el->{Value}
# Output: None
# Description:
# It is called whenever the parser encounters the start tag of an element.
# SAX callback fired at each element's start tag.  We snapshot the
# current length of the accumulated content so that the matching
# end_element can later recover where this element's text began.
sub start_element{
    my ($self, $el) = @_;

    # Offset in the accumulated content at which this element's data
    # will start being appended.
    my $start_offset = length $self->{content};

    # Record the offset for the current nesting depth; end_element
    # reads it back to slice out this element's character data.
    $self->{locationArray}[ $self->{levelPointer} ] = $start_offset;

    # Descend one nesting level for any child elements that follow.
    $self->{levelPointer}++;
    #$self->SUPER::start_document($el);
}
lib/AI/Categorizer/FeatureSelector.pm view on Meta::CPAN
# 'features_kept' parameter. If it's zero, all features are kept.
# If it's between 0 and 1, we multiply by the present number of
# features. If it's greater than 1, we treat it as the number of
# features to use.
my ($self, $f, %args) = @_;
my $kept = defined $args{features_kept} ? $args{features_kept} : $self->{features_kept};
return $f unless $kept;
my $num_kept = ($kept < 1 ?
$f->length * $kept :
$kept);
print "Trimming features - # features = " . $f->length . "\n" if $self->verbose;
# This is algorithmic overkill, but the sort seems fast enough. Will revisit later.
my $features = $f->as_hash;
my @new_features = (sort {$features->{$b} <=> $features->{$a}} keys %$features)
[0 .. $num_kept-1];
my $result = $f->intersection( \@new_features );
print "Finished trimming features - # features = " . $result->length . "\n" if $self->verbose;
return $result;
}
# Abstract methods
sub rank_features;
sub scan_features;
sub select_features {
my ($self, %args) = @_;
lib/AI/Categorizer/FeatureSelector.pm view on Meta::CPAN
=back
The third character specifies the "normalization" component, which
can take the following values:
=over 4
=item c
Apply cosine normalization - multiply by 1/length(document_vector).
=item x
No change - multiply by 1.
=back
The three components may alternatively be specified by the
C<term_weighting>, C<collection_weighting>, and C<normalize_weighting>
parameters respectively.
lib/AI/Categorizer/FeatureVector.pm view on Meta::CPAN
# Replace the feature set wholesale.  Accepts either a single hashref
# of feature => value pairs, or a flat key/value list.
sub set {
    my $self = shift;
    $self->{features} = ref($_[0]) ? $_[0] : { @_ };
}
# Expose the underlying feature => value hash.  This is a direct
# reference, not a copy: callers that mutate it mutate the vector.
sub as_hash {
    return $_[0]->{features};
}
# Euclidean (L2) norm of the vector: the square root of the sum of the
# squared feature values.  An empty vector yields 0.
sub euclidean_length {
    my $self = shift;
    my $sum_sq = 0;
    $sum_sq += $_ * $_ for values %{ $self->{features} };
    return sqrt $sum_sq;
}
# Scale the vector in place to unit Euclidean length.  A zero-length
# (e.g. empty) vector is returned unchanged, avoiding division by zero.
sub normalize {
    my $self = shift;
    my $norm = $self->euclidean_length;
    return $self unless $norm;
    return $self->scale(1 / $norm);
}
# Multiply every feature value by $scalar, modifying the vector in
# place.  Returns $self so calls can be chained.
sub scale {
    my ($self, $factor) = @_;
    for my $value (values %{ $self->{features} }) {
        $value *= $factor;   # values() aliases, so this writes back into the hash
    }
    return $self;
}
# Return a fresh hashref mapping every feature name to 1 — presence
# only; the original values are discarded.
sub as_boolean_hash {
    my $self = shift;
    my %present;
    $present{$_} = 1 for keys %{ $self->{features} };
    return \%present;
}
# Number of distinct features in the vector.  (Note: this method shadows
# CORE::length inside this package; callers use it as a method.)
sub length {
    my $self = shift;
    my @names = keys %{ $self->{features} };
    return scalar @names;
}
# Return a new object of the same class holding a shallow copy of the
# feature hash: top-level values are copied, but any nested references
# would still be shared.
sub clone {
    my $self = shift;
    my %copy = %{ $self->{features} };
    return ref($self)->new(features => \%copy);
}
sub intersection {
lib/AI/Categorizer/FeatureVector.pm view on Meta::CPAN
AI::Categorizer::FeatureVector - Features vs. Values
=head1 SYNOPSIS
my $f1 = new AI::Categorizer::FeatureVector
(features => {howdy => 2, doody => 3});
my $f2 = new AI::Categorizer::FeatureVector
(features => {doody => 1, whopper => 2});
@names = $f1->names;
$x = $f1->length;
$x = $f1->sum;
$x = $f1->includes('howdy');
$x = $f1->value('howdy');
$x = $f1->dot($f2);
$f3 = $f1->clone;
$f3 = $f1->intersection($f2);
$f3 = $f1->add($f2);
$h = $f1->as_hash;
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
my $collection = $self->_make_collection(\%args);
my $pb = $self->prog_bar($collection);
my %stats;
while (my $doc = $collection->next) {
$pb->();
$stats{category_count_with_duplicates} += $doc->categories;
my ($sum, $length) = ($doc->features->sum, $doc->features->length);
$stats{document_count}++;
$stats{token_count} += $sum;
$stats{type_count} += $length;
foreach my $cat ($doc->categories) {
#warn $doc->name, ": ", $cat->name, "\n";
$stats{categories}{$cat->name}{document_count}++;
$stats{categories}{$cat->name}{token_count} += $sum;
$stats{categories}{$cat->name}{type_count} += $length;
}
}
print "\n" if $self->verbose;
my @cats = keys %{ $stats{categories} };
$stats{category_count} = @cats;
$stats{categories_per_document} = $stats{category_count_with_duplicates} / $stats{document_count};
$stats{tokens_per_document} = $stats{token_count} / $stats{document_count};
$stats{types_per_document} = $stats{type_count} / $stats{document_count};
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
}
# Write the feature set to $file: a "# Total: N" header line, then one
# "name<TAB>value" line per feature, sorted by descending value.
# Croaks if no feature set is available or on any file error.
sub save_features {
    my ($self, $file) = @_;

    # Prefer the explicit feature vector; otherwise fall back to the
    # 'use_features' parameter stored for delayed document creation.
    my $f = ($self->{features} || { $self->delayed_object_params('document') }->{use_features})
        or croak "No features to save";

    # Three-argument open: the two-argument form ("> $file") lets a
    # hostile filename inject a different open mode.
    open my $fh, '>', $file or croak "Can't create $file: $!";

    my $h = $f->as_hash;
    print $fh "# Total: ", $f->length, "\n";
    foreach my $k (sort { $h->{$b} <=> $h->{$a} } keys %$h) {
        print $fh "$k\t$h->{$k}\n";
    }

    # Buffered write errors only surface at close, so it must be checked.
    close $fh or croak "Can't write $file: $!";
}
sub restore_features {
my ($self, $file, $n) = @_;
lib/AI/Categorizer/KnowledgeSet.pm view on Meta::CPAN
=back
The third character specifies the "normalization" component, which
can take the following values:
=over 4
=item c
Apply cosine normalization - multiply by 1/length(document_vector).
=item x
No change - multiply by 1.
=back
The three components may alternatively be specified by the
C<term_weighting>, C<collection_weighting>, and C<normalize_weighting>
parameters respectively.
t/11-feature_vector.t view on Meta::CPAN
ok $f1->dot($f2), 10;
ok $f2->dot($f1), 10;
} else {
skip "skip $pkg is not available", 1 for 1..5;
}
{
# Call normalize() on an empty vector
# NOTE(review): these appear to be legacy Test.pm-style two-argument
# ok(GOT, EXPECTED) calls, not Test::More's ok(BOOL, NAME) — confirm
# against the test file's preamble before converting.
my $f = AI::Categorizer::FeatureVector->new(features => {});
# An empty vector has Euclidean length 0.
ok $f->euclidean_length, 0;
# normalize() must not die on a zero-length vector (no divide-by-zero).
eval {$f->normalize};
ok $@, '';
# When the length is zero, normalize() returns the object unchanged.
ok $f->normalize, $f;
}
( run in 0.553 second using v1.01-cache-2.11-cpan-65fba6d93b7 )