Algorithm-AhoCorasick-XS

 view release on metacpan or  search on metacpan

README.md  view on Meta::CPAN

75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
If you pass Unicode strings to the matcher, they will be interpreted as a sequence
of UTF-8 bytes. This means the output of `matches`, `match_details` etc. will also
be in terms of bytes.
 
You can simply call ` decode('UTF-8', ...) ` on the substrings to get their
Unicode versions. The offsets will be in bytes though; converting them to character
offsets in the Unicode string is a little more tricky:
 
    use bytes ();   # loads bytes::substr without enabling byte semantics
    use Encode qw(decode);
    my $unicode_start = length(decode('UTF-8', bytes::substr($string, 0, $start)));
    my $unicode_end   = $unicode_start + length(decode('UTF-8', $word)) - 1;
 
This will be handled for you in a future version.
 
# CAVEATS
 
This is an early release and has not been tested thoroughly, use at your own risk.
The API is subject to change until version 1.0.
 
If your keyword list contains duplicates, you will get duplicate matches.

Trie.hpp  view on Meta::CPAN

32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
    else {
        while (child) {
            if (child->label == ch) return child;
            child = child->next;
        }
        return nullptr;
    }
}
 
// Add the bytes of `s` to this trie by forwarding its buffer and length to
// add_cstring(); the node pointer add_cstring() returns is passed back as-is.
Trie *add_word(std::string s) {
    const char *bytes = s.data();
    return add_cstring(bytes, s.length());
}
 
private:
 
Trie *add_cstring(const char *word, int len) {
    unsigned char first = *word;
    Trie *n = get_child(first);
    if (!n) {
        n = new Trie(first, this);
        int b = bucket(first);

lib/Algorithm/AhoCorasick/XS.pm  view on Meta::CPAN

96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
If you pass Unicode strings to the matcher, they will be interpreted as a sequence
of UTF-8 bytes. This means the output of C<matches>, C<match_details> etc. will also
be in terms of bytes.
 
You can simply call C< decode('UTF-8', ...) > on the substrings to get their
Unicode versions. The offsets will be in bytes though; converting them to character
offsets in the Unicode string is a little more tricky:
 
 use bytes ();   # loads bytes::substr without enabling byte semantics
 use Encode qw(decode);
 my $unicode_start = length(decode('UTF-8', bytes::substr($string, 0, $start)));
 my $unicode_end   = $unicode_start + length(decode('UTF-8', $word)) - 1;
 
This will be handled for you in a future version.
 
=head1 CAVEATS
 
This is an early release and has not been tested thoroughly, use at your own risk.
The API is subject to change until version 1.0.
 
If your keyword list contains duplicates, you will get duplicate matches.

ppport.h  view on Meta::CPAN

1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
mfree||5.007002|n
mg_clear|||
mg_copy|||
mg_dup|||
mg_find_mglob|||
mg_findext|5.013008||pn
mg_find|||n
mg_free_type||5.013006|
mg_free|||
mg_get|||
mg_length||5.005000|
mg_localize|||
mg_magical|||n
mg_set|||
mg_size||5.005000|
mini_mktime||5.007002|n
minus_v|||
missingterm|||
mode_from_discipline|||
modkids|||
more_bodies|||

ppport.h  view on Meta::CPAN

2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
reg_named_buff_all||5.009005|
reg_named_buff_exists||5.009005|
reg_named_buff_fetch||5.009005|
reg_named_buff_firstkey||5.009005|
reg_named_buff_iter|||
reg_named_buff_nextkey||5.009005|
reg_named_buff_scalar||5.009005|
reg_named_buff|||
reg_node|||
reg_numbered_buff_fetch|||
reg_numbered_buff_length|||
reg_numbered_buff_store|||
reg_qr_package|||
reg_recode|||
reg_scan_name|||
reg_skipcomment|||n
reg_temp_copy|||
reganode|||
regatom|||
regbranch|||
regclass_swash||5.009004|

ppport.h  view on Meta::CPAN

2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
unsharepvn||5.003070|
unwind_handler_stack|||
update_debugger_info|||
upg_version||5.009005|
usage|||
utf16_textfilter|||
utf16_to_utf8_reversed||5.006001|
utf16_to_utf8||5.006001|
utf8_distance||5.006000|
utf8_hop||5.006000|n
utf8_length||5.007001|
utf8_mg_len_cache_update|||
utf8_mg_pos_cache_update|||
utf8_to_bytes||5.006001|
utf8_to_uvchr_buf||5.015009|
utf8_to_uvchr||5.007001|
utf8_to_uvuni_buf||5.015009|
utf8_to_uvuni||5.007001|
utf8n_to_uvchr||5.007001|
utf8n_to_uvuni||5.007001|
utilize|||

ppport.h  view on Meta::CPAN

2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
yylex|||
yyparse|||
yyunlex|||
yywarn|||
);
 
# --list-unsupported: print every API entry that carries a 'todo' version,
# sorted case-insensitively by name and padded with dots, then exit.
if (exists $opt{'list-unsupported'}) {
  my $name;
  for $name (sort { lc $a cmp lc $b } keys %API) {
    my $todo = $API{$name}{todo};
    next unless $todo;
    my $padding = '.' x (40 - length($name));
    print "$name ", $padding, " ", format_version($todo), "\n";
  }
  exit 0;
}
 
# Scan for possible replacement candidates
 
my(%replace, %need, %hints, %warnings, %depends);
my $replace = 0;
my($hint, $define, $function);

ppport.h  view on Meta::CPAN

4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
/* Older perls (<=5.003) lack AvFILLp */
#ifndef AvFILLp
#  define AvFILLp                        AvFILL
#endif
#ifndef ERRSV
#  define ERRSV                          get_sv("@",FALSE)
#endif
 
/* Hint: gv_stashpvn
 * This function's backport doesn't support the length parameter, but
 * rather ignores it. Portability can only be ensured if the length
 * parameter is used for speed reasons, but the length can always be
 * correctly computed from the string argument.
 */
#ifndef gv_stashpvn
#  define gv_stashpvn(str,len,create)    gv_stashpv(str,create)
#endif
 
/* Replace: 1 */
#ifndef get_cv
#  define get_cv                         perl_get_cv
#endif

ppport.h  view on Meta::CPAN

7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
#endif
 
#define my_strlcat DPPP_(my_my_strlcat)
#define Perl_my_strlcat DPPP_(my_my_strlcat)
 
#if defined(NEED_my_strlcat) || defined(NEED_my_strlcat_GLOBAL)
 
/*
 * Append src to the NUL-terminated string already in dst, where size is the
 * total capacity of dst in bytes.  Nothing is written when size is 0 or when
 * dst already holds size - 1 or more characters.  Otherwise as much of src
 * as fits is copied and the result is NUL-terminated.
 *
 * Returns strlen(dst) as it was on entry plus strlen(src), whether or not
 * the whole of src could be appended.
 */
Size_t
DPPP_(my_my_strlcat)(char *dst, const char *src, Size_t size)
{
    const Size_t dst_len = strlen(dst);
    const Size_t src_len = strlen(src);

    if (size > 0 && dst_len < size - 1) {
        /* Bytes available for payload, keeping one back for the NUL. */
        const Size_t room = size - dst_len - 1;
        const Size_t n = (src_len <= room) ? src_len : room;

        memcpy(dst + dst_len, src, n);
        dst[dst_len + n] = '\0';
    }

    return dst_len + src_len;
}
#endif
#endif
 
#if !defined(my_strlcpy)
#if defined(NEED_my_strlcpy)
static Size_t DPPP_(my_my_strlcpy)(char * dst, const char * src, Size_t size);
static
#else
extern Size_t DPPP_(my_my_strlcpy)(char * dst, const char * src, Size_t size);
#endif
 
#define my_strlcpy DPPP_(my_my_strlcpy)
#define Perl_my_strlcpy DPPP_(my_my_strlcpy)
 
#if defined(NEED_my_strlcpy) || defined(NEED_my_strlcpy_GLOBAL)
 
/*
 * Copy src into dst, where size is the capacity of dst in bytes.  At most
 * size - 1 characters are copied and the result is always NUL-terminated,
 * unless size is 0, in which case dst is left untouched.
 *
 * Returns strlen(src), whether or not the whole string was copied.
 */
Size_t
DPPP_(my_my_strlcpy)(char *dst, const char *src, Size_t size)
{
    const Size_t src_len = strlen(src);
    Size_t n = src_len;

    if (size == 0)
        return src_len;

    /* Truncate so that the terminating NUL still fits. */
    if (n >= size)
        n = size - 1;

    memcpy(dst, src, n);
    dst[n] = '\0';

    return src_len;
}
 
#endif
#endif
#ifndef PERL_PV_ESCAPE_QUOTE
#  define PERL_PV_ESCAPE_QUOTE           0x0001
#endif
 
#ifndef PERL_PV_PRETTY_QUOTE
#  define PERL_PV_PRETTY_QUOTE           PERL_PV_ESCAPE_QUOTE

typemap  view on Meta::CPAN

35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
        ${var}[i] = std::string(\"\");
    }
  }
  else
    Perl_croak(aTHX_ \"%s: %s is not an array reference\",
               ${$ALIAS?\q[GvNAME(CvGV(cv))]:\qq[\"$pname\"]},
               \"$var\");
 
OUTPUT
T_STD_STRING
  $arg = sv_2mortal(newSVpvn($var.c_str(), $var.length()));
 
// Output a list.
T_STD_VECTOR_STRING
   {
            U32 ix_$var;
        size_t size_$var = $var.size();
        SSize_t extend_size =
            /* The weird way this is written is because g++ is dumb
                * enough to warn "comparison is always false" on something
                * like:
                *
                * sizeof(a) > sizeof(b) && a > B_t_MAX
                *
                * (where the LH condition is false)
                */
            (size_$var > (sizeof(size_$var) > sizeof(SSize_t)
                            ? AV_SIZE_MAX : size_$var))
            ? -1 : (SSize_t)size_$var;
            EXTEND(SP, extend_size);
            for (ix_$var = 0; ix_$var < size_$var; ix_$var++) {
                ST(ix_$var) = sv_2mortal(newSVpvn(${var}[ix_$var].c_str(), ${var}[ix_$var].length()));
            }
        // xsubpp adds 'ST(0) = RETVALSV' which is undef, screwing this up
        XSRETURN(size_$var);
    }
 
// Output a list
T_STD_VECTOR_MATCH
  {
    U32 ix_$var;
    size_t size_$var = $var.size();
    SSize_t extend_size =
        (size_$var > (sizeof(size_$var) > sizeof(SSize_t)
                        ? AV_SIZE_MAX : size_$var))
        ? -1 : (SSize_t)size_$var;
    EXTEND(SP, extend_size);
    for (ix_$var = 0; ix_$var < size_$var; ix_$var++) {
        // Build a hashref from the match object
        const AhoCorasick::match &m = ${var}[ix_$var];
        HV *hv = newHV();
 
        STRLEN klen = m.keyword.length();
 
        // 7=strlen("keyword"), 0=tell hv_store to calculate the hash
        hv_store(hv, "word", 4, newSVpv(m.keyword.c_str(), klen), 0);
        hv_store(hv, "start", 5, newSViv(m.start), 0);
        hv_store(hv, "end", 3, newSViv(m.end), 0);
 
        ST(ix_$var) = newRV_noinc((SV *)hv);
    }
    XSRETURN(size_$var);
  }



( run in 0.287 second using v1.01-cache-2.11-cpan-87723dcf8b7 )