AI-Categorizer

 view release on metacpan or  search on metacpan

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

  return $tokens_in;
}

sub _weigh_tokens {
  my ($self, $tokens, $weight) = @_;

  my %counts;
  if (my $b = 0+$self->{front_bias}) {
    die "'front_bias' value must be between -1 and 1"
      unless -1 < $b and $b < 1;
    
    my $n = @$tokens;
    my $r = ($b-1)**2 / ($b+1);
    my $mult = $weight * log($r)/($r-1);
    
    my $i = 0;
    foreach my $feature (@$tokens) {
      $counts{$feature} += $mult * $r**($i/$n);
      $i++;
    }
    
  } else {
    foreach my $feature (@$tokens) {
      $counts{$feature} += $weight;
    }
  }

  return \%counts;
}

sub vectorize {
  my ($self, %args) = @_;
  if ($self->{stem_stopwords}) {
    my $s = $self->stem_tokens([keys %{$self->{stopwords}}]);
    $self->{stopwords} = { map {+$_, 1} @$s };
    $args{tokens} = $self->_filter_tokens($args{tokens});
  }
  return $self->_weigh_tokens($args{tokens}, $args{weight});
}

sub read {
  my ($class, %args) = @_;
  my $path = delete $args{path} or die "Must specify 'path' argument to read()";
  
  my $self = $class->new(%args);
  
  open my($fh), "< $path" or die "$path: $!";
  $self->parse_handle(handle => $fh);
  close $fh;
  
  $self->finish;
  return $self;
}

sub dump_features {
  my ($self, %args) = @_;
  my $path = $args{path} or die "No 'path' argument given to dump_features()";
  open my($fh), "> $path" or die "Can't create $path: $!";
  my $f = $self->features->as_hash;
  while (my ($k, $v) = each %$f) {
    print $fh "$k\t$v\n";
  }
}

1;

__END__

=head1 NAME

AI::Categorizer::Document - Embodies a document

=head1 SYNOPSIS

 use AI::Categorizer::Document;
 
 # Simplest way to create a document:
 my $d = new AI::Categorizer::Document(name => $string,
                                       content => $string);
 
 # Other parameters are accepted:
 my $d = new AI::Categorizer::Document(name => $string,
                                       categories => \@category_objects,
                                       content => { subject => $string,
                                                    body => $string2, ... },
                                       content_weights => { subject => 3,
                                                            body => 1, ... },
                                       stopwords => \%skip_these_words,
                                       stemming => $string,
                                       front_bias => $float,
                                       use_features => $feature_vector,
                                      );
 
 # Specify explicit feature vector:
 my $d = new AI::Categorizer::Document(name => $string);
 $d->features( $feature_vector );
 
 # Now pass the document to a categorization algorithm:
 my $learner = AI::Categorizer::Learner::NaiveBayes->restore_state($path);
 my $hypothesis = $learner->categorize($document);

=head1 DESCRIPTION

The Document class embodies the data in a single document, and
contains methods for turning this data into a FeatureVector.  Usually
documents are plain text, but subclasses of the Document class may
handle any kind of data.

=head1 METHODS

=over 4

=item new(%parameters)

Creates a new Document object.  Document objects are used during
training (for the training documents), testing (for the test
documents), and when categorizing new unseen documents in an
application (for the unseen documents).  However, you'll typically
only call C<new()> in the latter case, since the KnowledgeSet or
Collection classes will create Document objects for you in the former
cases.



( run in 0.967 second using v1.01-cache-2.11-cpan-39bf76dae61 )