Algorithm-AhoCorasick-XS

 view release on metacpan or  search on metacpan

README.md  view on Meta::CPAN


If you pass Unicode strings to the matcher, they will be interpreted as a sequence
of UTF-8 bytes. This means the output of `matches`, `match_details` etc. will also
be in terms of bytes.

You can simply call ` decode('UTF-8', ...) ` on the substrings to get their
Unicode versions. The offsets will be in bytes though; converting them to character
offsets in the Unicode string is a little more tricky:

    use Encode qw(decode);
    use bytes ();   # loads bytes::substr without enabling byte semantics
    my $unicode_start = length(decode('UTF-8', bytes::substr($string, 0, $start)));
    my $unicode_end   = $unicode_start + length(decode('UTF-8', $word)) - 1;

This will be handled for you in a future version.

# CAVEATS

This is an early release and has not been tested thoroughly, use at your own risk.
The API is subject to change until version 1.0.

If your keyword list contains duplicates, you will get duplicate matches.

Trie.hpp  view on Meta::CPAN

            else {
                while (child) {
                    if (child->label == ch) return child;
                    child = child->next;
                }
                return nullptr;
            }
        }

        // Public entry point for inserting a word: hands the bytes of `s`
        // and its byte count to the private add_cstring() and returns
        // whatever that call returns.
        Trie *add_word(std::string s) {
            const char *bytes = s.data();
            return add_cstring(bytes, s.size());
        }

        private:

        Trie *add_cstring(const char *word, int len) {
            unsigned char first = *word;
            Trie *n = get_child(first);
            if (!n) {
                n = new Trie(first, this);
                int b = bucket(first);

lib/Algorithm/AhoCorasick/XS.pm  view on Meta::CPAN


If you pass Unicode strings to the matcher, they will be interpreted as a sequence
of UTF-8 bytes. This means the output of C<matches>, C<match_details> etc. will also
be in terms of bytes.

You can simply call C< decode('UTF-8', ...) > on the substrings to get their
Unicode versions. The offsets will be in bytes though; converting them to character
offsets in the Unicode string is a little more tricky:

 use Encode qw(decode);
 use bytes ();   # loads bytes::substr without enabling byte semantics
 my $unicode_start = length(decode('UTF-8', bytes::substr($string, 0, $start)));
 my $unicode_end   = $unicode_start + length(decode('UTF-8', $word)) - 1;

This will be handled for you in a future version.

=head1 CAVEATS

This is an early release and has not been tested thoroughly, use at your own risk.
The API is subject to change until version 1.0.

If your keyword list contains duplicates, you will get duplicate matches.

ppport.h  view on Meta::CPAN

mfree||5.007002|n
mg_clear|||
mg_copy|||
mg_dup|||
mg_find_mglob|||
mg_findext|5.013008||pn
mg_find|||n
mg_free_type||5.013006|
mg_free|||
mg_get|||
mg_length||5.005000|
mg_localize|||
mg_magical|||n
mg_set|||
mg_size||5.005000|
mini_mktime||5.007002|n
minus_v|||
missingterm|||
mode_from_discipline|||
modkids|||
more_bodies|||

ppport.h  view on Meta::CPAN

reg_named_buff_all||5.009005|
reg_named_buff_exists||5.009005|
reg_named_buff_fetch||5.009005|
reg_named_buff_firstkey||5.009005|
reg_named_buff_iter|||
reg_named_buff_nextkey||5.009005|
reg_named_buff_scalar||5.009005|
reg_named_buff|||
reg_node|||
reg_numbered_buff_fetch|||
reg_numbered_buff_length|||
reg_numbered_buff_store|||
reg_qr_package|||
reg_recode|||
reg_scan_name|||
reg_skipcomment|||n
reg_temp_copy|||
reganode|||
regatom|||
regbranch|||
regclass_swash||5.009004|

ppport.h  view on Meta::CPAN

unsharepvn||5.003070|
unwind_handler_stack|||
update_debugger_info|||
upg_version||5.009005|
usage|||
utf16_textfilter|||
utf16_to_utf8_reversed||5.006001|
utf16_to_utf8||5.006001|
utf8_distance||5.006000|
utf8_hop||5.006000|n
utf8_length||5.007001|
utf8_mg_len_cache_update|||
utf8_mg_pos_cache_update|||
utf8_to_bytes||5.006001|
utf8_to_uvchr_buf||5.015009|
utf8_to_uvchr||5.007001|
utf8_to_uvuni_buf||5.015009|
utf8_to_uvuni||5.007001|
utf8n_to_uvchr||5.007001|
utf8n_to_uvuni||5.007001|
utilize|||

ppport.h  view on Meta::CPAN

yylex|||
yyparse|||
yyunlex|||
yywarn|||
);

# --list-unsupported: print every API entry that carries a true 'todo'
# value, sorted case-insensitively, one per line with dot leaders padding
# the name out to 40 columns, followed by the formatted version; then exit.
if (exists $opt{'list-unsupported'}) {
  my $name;
  for $name (sort { lc $a cmp lc $b } keys %API) {
    my $todo = $API{$name}{todo};
    next unless $todo;
    my $leader = '.' x (40 - length($name));
    print "$name ", $leader, " ", format_version($todo), "\n";
  }
  exit 0;
}

# Scan for possible replacement candidates

my(%replace, %need, %hints, %warnings, %depends);
my $replace = 0;
my($hint, $define, $function);

ppport.h  view on Meta::CPAN


/* Older perls (<=5.003) lack AvFILLp */
#ifndef AvFILLp
#  define AvFILLp                        AvFILL
#endif
#ifndef ERRSV
#  define ERRSV                          get_sv("@",FALSE)
#endif

/* Hint: gv_stashpvn
 * This function's backport doesn't support the length parameter, but
 * rather ignores it. Portability can only be ensured if the length
 * parameter is used for speed reasons, but the length can always be
 * correctly computed from the string argument.
 */
#ifndef gv_stashpvn
#  define gv_stashpvn(str,len,create)    gv_stashpv(str,create)
#endif

/* Replace: 1 */
#ifndef get_cv
#  define get_cv                         perl_get_cv
#endif

ppport.h  view on Meta::CPAN

#endif

#define my_strlcat DPPP_(my_my_strlcat)
#define Perl_my_strlcat DPPP_(my_my_strlcat)

#if defined(NEED_my_strlcat) || defined(NEED_my_strlcat_GLOBAL)

/* Append src to the NUL-terminated string in dst, where size is the full
 * size of the dst buffer. At most size - strlen(dst) - 1 bytes are copied
 * and the result is NUL-terminated. Nothing is written when size is 0 or
 * dst already occupies size - 1 bytes or more.
 * Returns strlen(dst) + strlen(src) as measured on entry, so a return
 * value >= size tells the caller that the result was truncated.
 */
Size_t
DPPP_(my_my_strlcat)(char *dst, const char *src, Size_t size)
{
    const Size_t dst_len = strlen(dst);
    const Size_t src_len = strlen(src);

    if (size > 0 && dst_len < size - 1) {
        /* Bytes still free in dst, not counting the terminating NUL. */
        const Size_t room = size - dst_len - 1;
        const Size_t n = (src_len < room) ? src_len : room;

        memcpy(dst + dst_len, src, n);
        dst[dst_len + n] = '\0';
    }
    return dst_len + src_len;
}
#endif
#endif

#if !defined(my_strlcpy)
#if defined(NEED_my_strlcpy)
static Size_t DPPP_(my_my_strlcpy)(char * dst, const char * src, Size_t size);
static
#else
extern Size_t DPPP_(my_my_strlcpy)(char * dst, const char * src, Size_t size);
#endif

#define my_strlcpy DPPP_(my_my_strlcpy)
#define Perl_my_strlcpy DPPP_(my_my_strlcpy)

#if defined(NEED_my_strlcpy) || defined(NEED_my_strlcpy_GLOBAL)

/* Copy src into dst, where size is the full size of the dst buffer. At
 * most size - 1 bytes are copied and the result is NUL-terminated; nothing
 * is written when size is 0.
 * Returns strlen(src), so a return value >= size tells the caller that
 * the copy was truncated.
 */
Size_t
DPPP_(my_my_strlcpy)(char *dst, const char *src, Size_t size)
{
    const Size_t src_len = strlen(src);

    if (size > 0) {
        /* Leave one byte for the terminating NUL. */
        Size_t n = size - 1;

        if (src_len < size)
            n = src_len;
        memcpy(dst, src, n);
        dst[n] = '\0';
    }
    return src_len;
}

#endif
#endif
#ifndef PERL_PV_ESCAPE_QUOTE
#  define PERL_PV_ESCAPE_QUOTE           0x0001
#endif

#ifndef PERL_PV_PRETTY_QUOTE
#  define PERL_PV_PRETTY_QUOTE           PERL_PV_ESCAPE_QUOTE

typemap  view on Meta::CPAN

        ${var}[i] = std::string(\"\");
    }
  }
  else
    Perl_croak(aTHX_ \"%s: %s is not an array reference\",
               ${$ALIAS?\q[GvNAME(CvGV(cv))]:\qq[\"$pname\"]},
               \"$var\");

OUTPUT
T_STD_STRING
  $arg = sv_2mortal(newSVpvn($var.data(), $var.size()));

// Output a list.
T_STD_VECTOR_STRING
   {
        // Return each string of the vector as its own mortal SV, as a
        // flat list on the Perl stack.
        U32 ix_$var;
        size_t size_$var = $var.size();
        /* The stack-size expression below is written in a roundabout
         * way because g++ warns "comparison is always false" on the
         * obvious form
         *
         *     sizeof(a) > sizeof(b) && a > B_t_MAX
         *
         * whenever the left-hand condition is false.
         */
        SSize_t stack_slots =
            (size_$var > (sizeof(size_$var) > sizeof(SSize_t)
                            ? AV_SIZE_MAX : size_$var))
            ? -1 : (SSize_t)size_$var;
        EXTEND(SP, stack_slots);
        ix_$var = 0;
        while (ix_$var < size_$var) {
            ST(ix_$var) = sv_2mortal(
                newSVpvn(${var}[ix_$var].data(), ${var}[ix_$var].size()));
            ix_$var++;
        }
        // Return from here: xsubpp appends 'ST(0) = RETVALSV' after this
        // code, and RETVALSV is undef, which would spoil the list.
        XSRETURN(size_$var);
   }

// Output a list
T_STD_VECTOR_MATCH
  {
    // Return one hash reference per match as a flat list on the Perl
    // stack. Each hash has the keys word, start and end.
    U32 ix_$var;
    size_t size_$var = $var.size();
    // Same overflow-safe size computation as the string-vector typemap:
    // request -1 slots when the element count cannot be represented.
    SSize_t extend_size =
        (size_$var > (sizeof(size_$var) > sizeof(SSize_t)
                        ? AV_SIZE_MAX : size_$var))
        ? -1 : (SSize_t)size_$var;
    EXTEND(SP, extend_size);
    for (ix_$var = 0; ix_$var < size_$var; ix_$var++) {
        // Build a hashref from the match object
        const AhoCorasick::match &m = ${var}[ix_$var];
        HV *hv = newHV();

        STRLEN klen = m.keyword.length();

        // The third argument is the key length (4=strlen("word") etc.);
        // the trailing 0 tells hv_store to calculate the hash.
        // newSVpvn always honours the given length, whereas newSVpv
        // falls back to strlen when the length is 0.
        hv_store(hv, "word", 4, newSVpvn(m.keyword.c_str(), klen), 0);
        hv_store(hv, "start", 5, newSViv(m.start), 0);
        hv_store(hv, "end", 3, newSViv(m.end), 0);

        // Values returned on the stack must be mortal, otherwise the
        // reference (and the hash it owns) is never freed.
        ST(ix_$var) = sv_2mortal(newRV_noinc((SV *)hv));
    }
    // Return from here, before any code xsubpp appends after this block.
    XSRETURN(size_$var);
  }



( run in 0.610 second using v1.01-cache-2.11-cpan-65fba6d93b7 )