Algorithm-AhoCorasick-XS
view release on metacpan or search on metacpan
If you pass Unicode strings to the matcher, they will be interpreted as a sequence
of UTF-8 bytes. This means the output of `matches`, `match_details` etc. will also
be in terms of bytes.
You can simply call ` decode('UTF-8', ...) ` on the substrings to get their
Unicode versions. The offsets will be in bytes though; converting them to character
offsets in the Unicode string is a little trickier:
use Encode qw(decode);
my $unicode_start = length(decode('UTF-8', bytes::substr($string, 0, $start)));
my $unicode_end = $unicode_start + length(decode('UTF-8', $word)) - 1;
This will be handled for you in a future version.
# CAVEATS
This is an early release and has not been tested thoroughly; use at your own risk.
The API is subject to change until version 1.0.
If your keyword list contains duplicates, you will get duplicate matches.
else {
while (child) {
if (child->label == ch) return child;
child = child->next;
}
return nullptr;
}
}
// Add the bytes of s below this node by handing its buffer and length to
// add_cstring, and return whatever add_cstring returns.
Trie *add_word(std::string s) {
    const char *bytes = s.data();
    // add_cstring takes an int length, so the size_t length is narrowed here.
    const int nbytes = static_cast<int>(s.length());
    return add_cstring(bytes, nbytes);
}
private:
Trie *add_cstring(const char *word, int len) {
unsigned char first = *word;
Trie *n = get_child(first);
if (!n) {
n = new Trie(first, this);
int b = bucket(first);
lib/Algorithm/AhoCorasick/XS.pm view on Meta::CPAN
If you pass Unicode strings to the matcher, they will be interpreted as a sequence
of UTF-8 bytes. This means the output of C<matches>, C<match_details> etc. will also
be in terms of bytes.
You can simply call C< decode('UTF-8', ...) > on the substrings to get their
Unicode versions. The offsets will be in bytes though; converting them to character
offsets in the Unicode string is a little trickier:
use Encode qw(decode);
my $unicode_start = length(decode('UTF-8', bytes::substr($string, 0, $start)));
my $unicode_end = $unicode_start + length(decode('UTF-8', $word)) - 1;
This will be handled for you in a future version.
=head1 CAVEATS
This is an early release and has not been tested thoroughly; use at your own risk.
The API is subject to change until version 1.0.
If your keyword list contains duplicates, you will get duplicate matches.
mfree||5.007002|n
mg_clear|||
mg_copy|||
mg_dup|||
mg_find_mglob|||
mg_findext|5.013008||pn
mg_find|||n
mg_free_type||5.013006|
mg_free|||
mg_get|||
mg_length||5.005000|
mg_localize|||
mg_magical|||n
mg_set|||
mg_size||5.005000|
mini_mktime||5.007002|n
minus_v|||
missingterm|||
mode_from_discipline|||
modkids|||
more_bodies|||
reg_named_buff_all||5.009005|
reg_named_buff_exists||5.009005|
reg_named_buff_fetch||5.009005|
reg_named_buff_firstkey||5.009005|
reg_named_buff_iter|||
reg_named_buff_nextkey||5.009005|
reg_named_buff_scalar||5.009005|
reg_named_buff|||
reg_node|||
reg_numbered_buff_fetch|||
reg_numbered_buff_length|||
reg_numbered_buff_store|||
reg_qr_package|||
reg_recode|||
reg_scan_name|||
reg_skipcomment|||n
reg_temp_copy|||
reganode|||
regatom|||
regbranch|||
regclass_swash||5.009004|
unsharepvn||5.003070|
unwind_handler_stack|||
update_debugger_info|||
upg_version||5.009005|
usage|||
utf16_textfilter|||
utf16_to_utf8_reversed||5.006001|
utf16_to_utf8||5.006001|
utf8_distance||5.006000|
utf8_hop||5.006000|n
utf8_length||5.007001|
utf8_mg_len_cache_update|||
utf8_mg_pos_cache_update|||
utf8_to_bytes||5.006001|
utf8_to_uvchr_buf||5.015009|
utf8_to_uvchr||5.007001|
utf8_to_uvuni_buf||5.015009|
utf8_to_uvuni||5.007001|
utf8n_to_uvchr||5.007001|
utf8n_to_uvuni||5.007001|
utilize|||
yylex|||
yyparse|||
yyunlex|||
yywarn|||
);
# When the 'list-unsupported' option is present, print one line for every
# %API entry that has a true 'todo' value: the name, dot padding out to 40
# columns, and the formatted 'todo' version.  Names are listed in
# case-insensitive order, and the program exits with status 0 afterwards.
if (exists $opt{'list-unsupported'}) {
  my @names = sort { lc $a cmp lc $b } keys %API;
  my $f;
  for $f (@names) {
    my $todo = $API{$f}{todo};
    next unless $todo;
    my $dots = '.' x (40 - length($f));
    print "$f ", $dots, " ", format_version($todo), "\n";
  }
  exit 0;
}
# Scan for possible replacement candidates
my(%replace, %need, %hints, %warnings, %depends);
my $replace = 0;
my($hint, $define, $function);
/* Older perls (<=5.003) lack AvFILLp */
#ifndef AvFILLp
# define AvFILLp AvFILL
#endif
/* When ERRSV is not already defined, obtain it by looking up the scalar
   named "@" through get_sv, passing FALSE as the second argument. */
#ifndef ERRSV
# define ERRSV get_sv("@",FALSE)
#endif
/* Hint: gv_stashpvn
* This function's backport doesn't support the length parameter, but
* rather ignores it. Portability can only be ensured if the length
* parameter is used for speed reasons, but the length can always be
* correctly computed from the string argument.
*/
#ifndef gv_stashpvn
# define gv_stashpvn(str,len,create) gv_stashpv(str,create)
#endif
/* Replace: 1 */
/* When get_cv is not already defined, map it onto perl_get_cv. */
#ifndef get_cv
# define get_cv perl_get_cv
#endif
#endif
#define my_strlcat DPPP_(my_my_strlcat)
#define Perl_my_strlcat DPPP_(my_my_strlcat)
#if defined(NEED_my_strlcat) || defined(NEED_my_strlcat_GLOBAL)
/* Append src to the NUL-terminated string in dst, where size is the total
   capacity of the dst buffer.  Bytes are copied only while room remains for
   a terminating NUL; nothing is written when size is 0 or when dst already
   holds size - 1 bytes or more.  The return value is the length dst had on
   entry plus the length of src, whether or not everything was copied. */
Size_t
DPPP_(my_my_strlcat)(char *dst, const char *src, Size_t size)
{
    const Size_t dst_len = strlen(dst);
    const Size_t src_len = strlen(src);

    if (size > 0 && dst_len < size - 1) {
        /* Space left in dst after reserving one byte for the NUL. */
        const Size_t room = size - dst_len - 1;
        const Size_t n = (src_len > room) ? room : src_len;

        memcpy(dst + dst_len, src, n);
        dst[dst_len + n] = '\0';
    }
    return dst_len + src_len;
}
#endif
#endif
#if !defined(my_strlcpy)
#if defined(NEED_my_strlcpy)
static Size_t DPPP_(my_my_strlcpy)(char * dst, const char * src, Size_t size);
static
#else
extern Size_t DPPP_(my_my_strlcpy)(char * dst, const char * src, Size_t size);
#endif
#define my_strlcpy DPPP_(my_my_strlcpy)
#define Perl_my_strlcpy DPPP_(my_my_strlcpy)
#if defined(NEED_my_strlcpy) || defined(NEED_my_strlcpy_GLOBAL)
/* Copy src into dst, where size is the total capacity of the dst buffer.
   At most size - 1 bytes are copied and the result is NUL-terminated;
   nothing is written when size is 0.  The return value is always the full
   length of src, so a result of size or more means the copy was cut short. */
Size_t
DPPP_(my_my_strlcpy)(char *dst, const char *src, Size_t size)
{
    const Size_t src_len = strlen(src);

    if (size > 0) {
        /* Keep one byte of dst free for the terminating NUL. */
        const Size_t n = (src_len < size) ? src_len : size - 1;

        memcpy(dst, src, n);
        dst[n] = '\0';
    }
    return src_len;
}
#endif
#endif
#ifndef PERL_PV_ESCAPE_QUOTE
# define PERL_PV_ESCAPE_QUOTE 0x0001
#endif
#ifndef PERL_PV_PRETTY_QUOTE
# define PERL_PV_PRETTY_QUOTE PERL_PV_ESCAPE_QUOTE
${var}[i] = std::string(\"\");
}
}
else
Perl_croak(aTHX_ \"%s: %s is not an array reference\",
${$ALIAS?\q[GvNAME(CvGV(cv))]:\qq[\"$pname\"]},
\"$var\");
OUTPUT
# Output a std::string as a Perl string scalar built from its bytes and
# explicit length.  There is deliberately no sv_2mortal call here: for a
# RETVAL typemap that assigns a fresh scalar to arg, xsubpp emits the
# sv_2mortal call itself, so mortalising here as well would schedule the
# same scalar to be freed twice.
T_STD_STRING
	$arg = newSVpvn($var.c_str(), $var.length());
// Output a list.
T_STD_VECTOR_STRING
	{
	    // Return every element of the vector as its own mortal string
	    // scalar, placed on the stack in vector order.
	    const size_t n_$var = $var.size();
	    const SSize_t want_$var =
	        /* Kept in this roundabout shape on purpose: g++ warns
	         * "comparison is always false" about the direct form
	         *
	         *    sizeof(a) > sizeof(b) && a > B_t_MAX
	         *
	         * whenever the sizeof test on the left is false.
	         */
	        (n_$var > (sizeof(n_$var) > sizeof(SSize_t)
	                   ? AV_SIZE_MAX : n_$var))
	        ? -1 : (SSize_t)n_$var;
	    EXTEND(SP, want_$var);
	    U32 i_$var = 0;
	    while (i_$var < n_$var) {
	        const std::string &elem = ${var}[i_$var];
	        ST(i_$var) = sv_2mortal(newSVpvn(elem.c_str(), elem.length()));
	        ++i_$var;
	    }
	    // Return here: the 'ST(0) = RETVALSV' that xsubpp appends would
	    // otherwise overwrite the first element with undef.
	    XSRETURN(n_$var);
	}
// Output a list
T_STD_VECTOR_MATCH
	{
	    // Return one hash reference per match on the stack, in vector
	    // order. Each hash has the keys word, start and end.
	    U32 ix_$var;
	    size_t size_$var = $var.size();
	    // Ask EXTEND for -1 when the element count cannot be represented
	    // as an SSize_t; otherwise ask for exactly that many slots.
	    SSize_t extend_size =
	        (size_$var > (sizeof(size_$var) > sizeof(SSize_t)
	                      ? AV_SIZE_MAX : size_$var))
	        ? -1 : (SSize_t)size_$var;
	    EXTEND(SP, extend_size);
	    for (ix_$var = 0; ix_$var < size_$var; ix_$var++) {
	        // Build a hashref from the match object
	        const AhoCorasick::match &m = ${var}[ix_$var];
	        HV *hv = newHV();
	        // hv_store arguments: hash, key, key length in bytes, value,
	        // and 0 to let perl calculate the key hash itself.
	        // newSVpvn copies exactly keyword.length() bytes.
	        hv_store(hv, "word", 4, newSVpvn(m.keyword.c_str(), m.keyword.length()), 0);
	        hv_store(hv, "start", 5, newSViv(m.start), 0);
	        hv_store(hv, "end", 3, newSViv(m.end), 0);
	        // The reference takes over the only count on hv (noinc), and
	        // is made mortal so that it is released once the caller has
	        // taken its copy; without this every returned hashref leaks.
	        ST(ix_$var) = sv_2mortal(newRV_noinc((SV *)hv));
	    }
	    // Return here so that nothing emitted after this typemap code can
	    // overwrite the stack slots filled above.
	    XSRETURN(size_$var);
	}
( run in 0.610 second using v1.01-cache-2.11-cpan-65fba6d93b7 )