Search-Fulltext-Tokenizer-Ngram
view release on metacpan - search on metacpan
view release on metacpan or search on metacpan
lib/Search/Fulltext/Tokenizer/Ngram.pm view on Meta::CPAN
package Search::Fulltext::Tokenizer::Ngram;
# ABSTRACT: Character n-gram tokenizer for Search::Fulltext
use strict;
use warnings;
use Carp ();
use Scalar::Util qw/looks_like_number/;
our $VERSION = 0.01;
# Constructor. Takes the n-gram length (number of characters per token).
#
#   my $tokenizer = Search::Fulltext::Tokenizer::Ngram->new(2);
#
# Croaks unless the length is a positive integer: the original check
# (`> 0`) let fractional values such as 0.5 or 1.5 through, which then
# reached substr()/offset arithmetic in create_token_iterator and yielded
# empty or misaligned tokens — contradicting the "must be 1+" message.
sub new {
    my ($class, $token_length) = @_;
    unless (looks_like_number($token_length)
        and $token_length >= 1
        and $token_length == int($token_length)) {
        Carp::croak('Token length must be 1+.');
    }
    return bless +{ token_length => $token_length } => $class;
}
# Returns a closure implementing the Search::Fulltext tokenizer protocol.
# Each call yields the next n-character token as the 5-element list
# (token, byte_length, start_offset, end_offset, position), or the empty
# list once the text is exhausted. Windows containing any whitespace
# character are skipped entirely.
sub create_token_iterator {
    my ($self, $text) = @_;
    my $n        = $self->token_length;
    my $position = -1;
    return sub {
        while (1) {
            $position++;
            # Stop once the window would run past the end of the text.
            return if $position + $n > length $text;
            my $candidate = substr $text, $position, $n;
            # Never emit a token that spans whitespace; slide the window on.
            next if $candidate =~ /\s/;
            return ($candidate, $n, $position, $position + $n, $position);
        }
    };
}
# Read-only accessor for the configured n-gram length.
sub token_length {
    my ($self) = @_;
    return $self->{token_length};
}
1;
__END__
=pod
=head1 NAME
Search::Fulltext::Tokenizer::Ngram - Character n-gram tokenizer for Search::Fulltext
=head1 VERSION
version 0.01
=head1 SYNOPSIS
use utf8;
use Search::Fulltext;
use Search::Fulltext::Tokenizer::Bigram;
my $searcher = Search::Fulltext->new(
docs => [
'ハンプティ・ダンプティ 塀の上',
'ハンプティ・ダンプティ 落っこちた',
'王様の馬みんなと 王様の家来みんなでも',
'ハンプティを元に 戻せなかった',
],
tokenizer => q/perl 'Search::Fulltext::Tokenizer::Bigram::get_tokenizer'/,
);
my $hit_document_ids = $searcher->search('ハンプティ'); # [0, 1, 3]
=head1 DESCRIPTION
This module provides character N-gram tokenizers for L<Search::Fulltext>.
By default, {1,2,3}-gram tokenizers are available.
=head1 CREATING AN N(> 3)-GRAM TOKENIZER
view all matches for this distributionview release on metacpan - search on metacpan
( run in 1.884 second using v1.00-cache-2.02-grep-82fe00e-cpan-c30982ac1bc3 )