AI-Categorizer

 view release on metacpan or  search on metacpan

lib/AI/Categorizer/Document/XML.pm  view on Meta::CPAN

#		Prefix		$el->{Prefix}
#		Value		$el->{Value}
# Output: None
# Description:
# 	it is called whenever the parser meets the element
# Handler invoked when the parser opens an element.
#   $self - the handler object (reads content, levelPointer; writes
#           locationArray, levelPointer)
#   $el   - the element description passed in by the parser (not used here)
# Records how long the accumulated content is at this nesting level, so that
# the matching end handler can tell where this element's own data begins,
# then moves one nesting level deeper.
sub start_element {
  my $self = shift;
  my $el   = shift;

  # Anything appended from now on lands after this offset.
  my $depth  = $self->{levelPointer};
  my $offset = length $self->{content};
  $self->{locationArray}[$depth] = $offset;

  # The next nested element gets the next slot.
  #$self->SUPER::start_document($el);
  $self->{levelPointer}++;
}

lib/AI/Categorizer/FeatureSelector.pm  view on Meta::CPAN

  # 'features_kept' parameter.  If it's zero, all features are kept.
  # If it's between 0 and 1, we multiply by the present number of
  # features.  If it's 1 or greater, we treat it as the number of
  # features to use.

  my ($self, $f, %args) = @_;
  my $kept = defined $args{features_kept} ? $args{features_kept} : $self->{features_kept};
  return $f unless $kept;

  my $num_kept = ($kept < 1 ? 
		  $f->length * $kept :
		  $kept);

  print "Trimming features - # features = " . $f->length . "\n" if $self->verbose;
  
  # This is algorithmic overkill, but the sort seems fast enough.  Will revisit later.
  my $features = $f->as_hash;
  my @new_features = (sort {$features->{$b} <=> $features->{$a}} keys %$features)
                      [0 .. $num_kept-1];

  my $result = $f->intersection( \@new_features );
  print "Finished trimming features - # features = " . $result->length . "\n" if $self->verbose;
  return $result;
}

# Abstract methods
# Forward declarations only - no bodies are defined here, so each concrete
# feature selector is expected to supply its own rank_features() and
# scan_features().
sub rank_features;
sub scan_features;

sub select_features {
  my ($self, %args) = @_;
  

lib/AI/Categorizer/FeatureSelector.pm  view on Meta::CPAN

=back


The third character specifies the "normalization" component, which
can take the following values:

=over 4

=item c

Apply cosine normalization - multiply by 1/length(document_vector),
where length is the Euclidean length of the vector (not its number of
features).

=item x

No change - multiply by 1.

=back

The three components may alternatively be specified by the
C<term_weighting>, C<collection_weighting>, and C<normalize_weighting>
parameters respectively.

lib/AI/Categorizer/FeatureVector.pm  view on Meta::CPAN

# Replace the whole feature table.
# Accepts either a single hash reference (stored as-is, not copied) or a
# flat list of name => value pairs (collected into a new hash).
# Returns the stored hash reference.
sub set {
  my ($self, @args) = @_;
  my $table = ref $args[0] ? $args[0] : { @args };
  return $self->{features} = $table;
}

# Return the underlying feature hash reference itself (not a copy), so
# changes made through it affect this vector.
sub as_hash {
  my ($self) = @_;
  my $features = $self->{features};
  return $features;
}

# Return the Euclidean (L2) norm of the vector: the square root of the sum
# of the squared feature values.  An empty vector yields 0.
sub euclidean_length {
  my ($self) = @_;

  my $sum_of_squares = 0;
  $sum_of_squares += $_ ** 2 for values %{ $self->{features} };

  return sqrt $sum_of_squares;
}

# Scale the vector in place to unit Euclidean length and return the result
# of scale().  A vector whose Euclidean length is zero is returned
# unchanged, which avoids a division by zero.
sub normalize {
  my ($self) = @_;

  my $norm = $self->euclidean_length;
  return $self unless $norm;

  return $self->scale(1 / $norm);
}

# Multiply every feature value by the given factor, modifying the vector in
# place.  Returns the vector itself so calls can be chained.
sub scale {
  my ($self, $factor) = @_;

  my $features = $self->{features};
  for my $name (keys %$features) {
    $features->{$name} *= $factor;
  }

  return $self;
}

# Return a new hash reference with one entry per feature name, each mapped
# to 1, regardless of the feature's stored value.
sub as_boolean_hash {
  my ($self) = @_;

  my %present;
  $present{$_} = 1 for keys %{ $self->{features} };

  return \%present;
}

# Return the number of features stored in the vector (a key count, not the
# Euclidean length).
sub length {
  my ($self) = @_;
  my $count = keys %{ $self->{features} };
  return $count;
}

# Build a new object of the same class holding a shallow copy of this
# vector's feature hash, so adding or changing entries in the clone does not
# affect the original.
sub clone {
  my ($self) = @_;

  my $class = ref $self;
  my %copy  = %{ $self->{features} };

  return $class->new(features => \%copy);
}

sub intersection {

lib/AI/Categorizer/FeatureVector.pm  view on Meta::CPAN

AI::Categorizer::FeatureVector - Features vs. Values

=head1 SYNOPSIS

  my $f1 = AI::Categorizer::FeatureVector->new
    (features => {howdy => 2, doody => 3});
  my $f2 = AI::Categorizer::FeatureVector->new
    (features => {doody => 1, whopper => 2});
   
  @names = $f1->names;
  $x = $f1->length;
  $x = $f1->sum;
  $x = $f1->includes('howdy');
  $x = $f1->value('howdy');
  $x = $f1->dot($f2);
  
  $f3 = $f1->clone;
  $f3 = $f1->intersection($f2);
  $f3 = $f1->add($f2);
  
  $h = $f1->as_hash;

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

  my $collection = $self->_make_collection(\%args);
  my $pb = $self->prog_bar($collection);

  my %stats;


  while (my $doc = $collection->next) {
    $pb->();
    $stats{category_count_with_duplicates} += $doc->categories;

    my ($sum, $length) = ($doc->features->sum, $doc->features->length);
    $stats{document_count}++;
    $stats{token_count} += $sum;
    $stats{type_count}  += $length;
    
    foreach my $cat ($doc->categories) {
#warn $doc->name, ": ", $cat->name, "\n";
      $stats{categories}{$cat->name}{document_count}++;
      $stats{categories}{$cat->name}{token_count} += $sum;
      $stats{categories}{$cat->name}{type_count}  += $length;
    }
  }
  print "\n" if $self->verbose;

  my @cats = keys %{ $stats{categories} };

  $stats{category_count}          = @cats;
  $stats{categories_per_document} = $stats{category_count_with_duplicates} / $stats{document_count};
  $stats{tokens_per_document}     = $stats{token_count} / $stats{document_count};
  $stats{types_per_document}      = $stats{type_count}  / $stats{document_count};

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

}

# Write the current feature set to a text file.
#   $file - path of the file to create (an existing file is overwritten)
# The features come from $self->{features} or, when that is unset, from the
# 'use_features' entry of the delayed 'document' object parameters.
# Output format: a "# Total: N" header line followed by one
# "name<TAB>score" line per feature, highest score first.
# Croaks if there are no features, or if the file cannot be created or
# fully written.
sub save_features {
  my ($self, $file) = @_;

  my $f = $self->{features};
  unless ($f) {
    # Only consult the delayed parameters when nothing is stored directly.
    my %doc_params = $self->delayed_object_params('document');
    $f = $doc_params{use_features};
  }
  croak "No features to save" unless $f;

  # Three-argument open: the file name is taken literally, so leading mode
  # characters or surrounding whitespace in $file cannot change what is opened.
  open my $fh, '>', $file or croak "Can't create $file: $!";
  my $h = $f->as_hash;
  print {$fh} "# Total: ", $f->length, "\n";

  # Ties on score are broken by name so the output order is reproducible.
  foreach my $k (sort { $h->{$b} <=> $h->{$a} or $a cmp $b } keys %$h) {
    print {$fh} "$k\t$h->{$k}\n";
  }

  # Buffered write errors only surface at close, so it must be checked.
  close $fh or croak "Can't write $file: $!";
}

sub restore_features {
  my ($self, $file, $n) = @_;
  

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

=back


The third character specifies the "normalization" component, which
can take the following values:

=over 4

=item c

Apply cosine normalization - multiply by 1/length(document_vector),
where length is the Euclidean length of the vector (not its number of
features).

=item x

No change - multiply by 1.

=back

The three components may alternatively be specified by the
C<term_weighting>, C<collection_weighting>, and C<normalize_weighting>
parameters respectively.

t/11-feature_vector.t  view on Meta::CPAN


  ok $f1->dot($f2), 10;
  ok $f2->dot($f1), 10;
} else {
  skip "skip $pkg is not available", 1 for 1..5;
}

{
  # normalize() on a vector with no features: the Euclidean length is zero,
  # the call must not die (no division by zero), and the vector itself
  # must be handed back.
  my $empty = AI::Categorizer::FeatureVector->new(features => {});
  ok $empty->euclidean_length, 0;

  eval { $empty->normalize };
  my $error = $@;
  ok $error, '';

  my $returned = $empty->normalize;
  ok $returned, $empty;
}



( run in 0.553 second using v1.01-cache-2.11-cpan-65fba6d93b7 )