Algorithm-NGram
view release on metacpan or search on metacpan
lib/Algorithm/NGram.pm view on Meta::CPAN
None.
=head1 METHODS
=over 4
=item new
Create a new n-gram analyzer instance.
B<Options:>
=over 4
=item ngram_width
This is the "window size": the number of tokens the analyzer keeps
track of at a time. An ngram_width of two makes a bigram, an
ngram_width of three makes a trigram, and so on.
=back
=cut
# Constructor: builds a new n-gram analyzer.
#
# Recognized options (each is replaced by its default when missing or false):
#   ngram_width - token window size; defaults to 3 (trigram)
#   token_table - hashref stored as the n-gram table; defaults to {}
#   tokens      - arrayref stored as the token list; defaults to []
#
# Returns the blessed analyzer object.
sub new {
    my ($class, %args) = @_;

    my $analyzer = bless {
        ngram_width => $args{ngram_width} || 3,
        token_table => $args{token_table} || {},
        tokens      => $args{tokens}      || [],
    }, $class;

    # Set the dirty flag through the accessor defined elsewhere.
    # NOTE(review): presumably this marks the table as needing to be
    # (re)generated -- confirm against analyze().
    $analyzer->dirty(1);

    return $analyzer;
}
=item ngram_width
Returns the token window size (i.e. the "n" in n-gram)
=cut
=item token_table
Returns n-gram table
=cut
=item add_text
Splits a block of text up by whitespace and processes each word as a
token. Automatically calls C<add_start_token()> at the beginning of
the text and C<add_end_token()> at the end.
=cut
# Process a block of text, auto-tokenizing it on whitespace.
#
# Calls add_start_token() first, then add_token() once per word, then
# add_end_token().
#
#   $text - the string to tokenize; undef is treated as empty text
#
# Returns whatever add_end_token() returns.
sub add_text {
    my ($self, $text) = @_;

    # Treat missing text as empty so split() does not warn about undef.
    $text = '' unless defined $text;

    $self->add_start_token;

    # The special ' ' pattern splits on runs of ANY whitespace (spaces,
    # tabs, newlines) and skips leading whitespace, so it never yields an
    # empty field. That means no emptiness filter is needed, and a token
    # such as "0" is kept rather than being dropped by a truthiness test.
    foreach my $word (split ' ', $text) {
        $self->add_token($word);
    }

    $self->add_end_token;
}
=item add_tokens
Adds an arbitrary list of tokens.
=cut
# add_token is simply another name for add_tokens, so a single token or a
# whole list can be passed through either spelling.
*add_token = \&add_tokens;

# Append the given tokens to the analyzer's token list and set the dirty
# flag.
#
#   @_ (after the invocant) - tokens to append, in order
#
# Returns whatever dirty(1) returns.
sub add_tokens {
    my $self = shift;

    # Everything left in @_ is a token.
    push @{ $self->{tokens} }, @_;

    return $self->dirty(1);
}
=item add_start_token
Adds the "start token." This is useful because you will often want to
mark the beginning and end of a token sequence, so that when
generating output the generator knows which tokens can start a
sequence and when to stop.
=cut
# Append the start-of-sequence marker (the START_TOK constant, defined
# elsewhere in this file) to the token stream.
#
# Returns whatever add_token() returns.
sub add_start_token {
    my $self = shift;

    return $self->add_token(START_TOK);
}
=item add_end_token
Adds the "end token." See C<add_start_token()>.
=cut
# Append the end-of-sequence marker (the END_TOK constant, defined
# elsewhere in this file) to the token stream.
#
# Returns whatever add_token() returns.
sub add_end_token {
    my $self = shift;

    return $self->add_token(END_TOK);
}
=item analyze
Generates an n-gram frequency table. Returns a hashref of
I<< N => tokens => count >>, where N is the number of tokens (will be from 2
to ngram_width). You will not normally need to call this unless you
( run in 0.898 second using v1.01-cache-2.11-cpan-13bb782fe5a )