Search-Fulltext-Tokenizer-Ngram

 view release on metacpan or  search on metacpan

lib/Search/Fulltext/Tokenizer/Ngram.pm  view on Meta::CPAN

package Search::Fulltext::Tokenizer::Ngram;

# ABSTRACT: Character n-gram tokenizer for Search::Fulltext

use strict;
use warnings;
use Carp ();
use Scalar::Util qw/looks_like_number/;

our $VERSION = 0.01;

# Constructor.
#
#   $class        - class name the tokenizer is blessed into.
#   $token_length - the N of the N-gram; must be a finite whole number >= 1.
#
# Croaks (from the caller's perspective) when $token_length is undefined,
# non-numeric, smaller than 1, fractional, or infinite.
# Returns the blessed tokenizer object.
sub new {
  my ($class, $token_length) = @_;

  my $is_valid = looks_like_number($token_length)
    && $token_length >= 1
    # substr() truncates a fractional length, so 0.5 would produce empty
    # tokens and 1.5 would produce 1-character tokens; reject fractions.
    && $token_length == int($token_length)
    # Inf - Inf is NaN, which never compares equal to anything, so this
    # term is false only for an infinite length.
    && $token_length - $token_length == 0;

  Carp::croak('Token length must be 1+.') unless $is_valid;

  return bless { token_length => $token_length }, $class;
}

# Builds an iterator (a closure) over every N-character window of $text,
# where N is this tokenizer's token_length.
#
# Each call to the returned closure slides the window one position to the
# right, skipping any window that contains whitespace, and returns the
# 5-element list
#
#   ($token, $n, $start, $end, $index)
#
# where $start is the window's offset in $text (as used by substr),
# $end is $start + $n, and $index is the same value as $start.
# Once the window no longer fits inside $text the closure returns an empty
# list (undef in scalar context).
sub create_token_iterator {
  my ($self, $text) = @_;

  my $width    = $self->token_length;
  my $position = -1;    # pre-incremented, so the first window starts at 0

  return sub {
    while (1) {
      $position++;
      my $window_end = $position + $width;

      # The window has run off the end of the text: nothing left to emit.
      return if $window_end > length $text;

      my $gram = substr($text, $position, $width);

      # N-grams that span whitespace are never emitted.
      next if $gram =~ /\s/;

      return ($gram, $width, $position, $window_end, $position);
    }
  };
}

# Read-only accessor: returns the N-gram length stored on the object
# under the 'token_length' key.
sub token_length {
  my ($self) = @_;
  return $self->{token_length};
}

1;

__END__

=pod

=head1 NAME

Search::Fulltext::Tokenizer::Ngram - Character n-gram tokenizer for Search::Fulltext

=head1 VERSION

version 0.01

=head1 SYNOPSIS

  use utf8;
  use Search::Fulltext;
  use Search::Fulltext::Tokenizer::Bigram;
  
  my $searcher = Search::Fulltext->new(
      docs => [
          'ハンプティ・ダンプティ 塀の上',
          'ハンプティ・ダンプティ 落っこちた',
          '王様の馬みんなと 王様の家来みんなでも',
          'ハンプティを元に 戻せなかった',
      ],
      tokenizer => q/perl 'Search::Fulltext::Tokenizer::Bigram::get_tokenizer'/,
  );
  my $hit_document_ids = $searcher->search('ハンプティ');  # [0, 1, 3]

=head1 DESCRIPTION

This module provides character N-gram tokenizers for L<Search::Fulltext>.

By default {1,2,3}-gram tokenizers are available.

=head1 CREATING A N(> 3)-GRAM TOKENIZER

 view all matches for this distribution
 view release on metacpan -  search on metacpan

( run in 1.884 second using v1.00-cache-2.02-grep-82fe00e-cpan-c30982ac1bc3 )