Algorithm-NGram
view release on metacpan or search on metacpan
lib/Algorithm/NGram.pm view on Meta::CPAN
my $ng = Algorithm::NGram->new(ngram_width => 3); # use trigrams
# feed in text
$ng->add_text($text1); # analyze $text1
$ng->add_text($text2); # analyze $text2
# feed in arbitrary sequence of tokens
$ng->add_start_token;
$ng->add_tokens(qw/token1 token2 token3/);
$ng->add_end_token;
my $output = $ng->generate_text;
=head1 DESCRIPTION
This is a module for analyzing token sequences with n-grams. You can
use it to parse a block of text, or feed in your own tokens. It can
generate new sequences of tokens from what has been fed in.
=head1 EXPORT
None.
=head1 METHODS
=over 4
=item new
Create a new n-gram analyzer instance.
B<Options:>
=over 4
=item ngram_width
This is the "window size": the number of tokens the analyzer keeps
track of at a time. An ngram_width of two makes a bigram, an ngram_width
of three makes a trigram, and so on. Defaults to 3 (trigrams).
=back
=cut
sub new {
    my $class = shift;
    my %opts  = @_;

    # Pull the recognised options out of %opts, substituting a default
    # for any that is missing or false: trigrams, an empty n-gram table
    # and an empty token list.
    my $self = bless {
        ngram_width => delete($opts{ngram_width}) || 3,
        token_table => delete($opts{token_table}) || {},
        tokens      => delete($opts{tokens})      || [],
    }, $class;

    # Flag the new instance as dirty (the accessor is defined outside
    # this section of the file).
    $self->dirty(1);

    return $self;
}
=item ngram_width
Returns the token window size (i.e. the "n" in n-gram).
=cut
=item token_table
Returns n-gram table
=cut
=item add_text
Splits a block of text up by whitespace and processes each word as a
token. Automatically calls C<add_start_token()> at the beginning of
the text and C<add_end_token()> at the end.
=cut
# Process a block of text, auto-tokenizing it on whitespace.
#
# The tokens are wrapped in a start token and an end token, so each
# call contributes one delimited sequence.
#
#   $text - string to tokenize; undef is treated as the empty string
sub add_text {
    my ($self, $text) = @_;
    $text = '' unless defined $text;

    $self->add_start_token;

    # split ' ' (a literal single-space string, not the regex / /) is
    # Perl's awk-style mode: it skips leading whitespace and splits on
    # runs of any whitespace, so tabs and newlines separate words too and
    # no empty fields are produced.  Every field is therefore a real
    # token and is added as-is; a truthiness filter here would wrongly
    # discard the word "0".
    foreach my $tok (split ' ', $text) {
        $self->add_token($tok);
    }

    $self->add_end_token;
}
=item add_tokens
Adds an arbitrary list of tokens.
=cut
# add_token is simply another name for add_tokens, so a single token
# and a list of tokens go through the same code path.
*add_token = \&add_tokens;

# Append the given tokens to the instance's token list and flag the
# instance as dirty.
sub add_tokens {
    my $self = shift;

    # Whatever remains in @_ is the list of tokens to append.
    push @{ $self->{tokens} }, @_;

    $self->dirty(1);
}
=item add_start_token
Adds the "start token." This is useful because you often will want to
mark the beginnings and ends of a token sequence so that when
( run in 0.474 second using v1.01-cache-2.11-cpan-39bf76dae61 )