Algorithm-AhoCorasick-XS
view release on metacpan or search on metacpan
If you pass Unicode strings to the matcher, they will be interpreted as a sequence
of UTF-8 bytes. This means the output of `matches`, `match_details` etc. will also
be in terms of bytes.
You can simply call `decode('UTF-8', ...)` on the substrings to get their
Unicode versions. The offsets will be in bytes though; converting them to character
offsets in the Unicode string is a little trickier:

    my $unicode_start = length(decode('UTF-8', bytes::substr($string, 0, $start)));
    my $unicode_end   = $unicode_start + length(decode('UTF-8', $word)) - 1;

This will be handled for you in a future version.
# CAVEATS
The API is subject to change until version 1.0.
If your keyword list contains duplicates, you will get duplicate matches.
323334353637383940414243444546474849505152
else
{
while
(child) {
if
(child->label == ch)
return
child;
child = child->
next
;
}
return
nullptr;
}
}
// Insert the keyword held in `s` by handing its byte buffer and byte
// length to add_cstring(); the result of add_cstring() is returned as is.
Trie *add_word(std::string s) {
    const char *bytes = s.data();
    const int byte_count = s.length();
    return add_cstring(bytes, byte_count);
}
private:
Trie
*add_cstring
(const char
*word
,
int
len) {
unsigned char first =
*word
;
Trie
*n
= get_child(first);
if
(!n) {
n = new Trie(first, this);
int
b = bucket(first);
lib/Algorithm/AhoCorasick/XS.pm view on Meta::CPAN
If you pass Unicode strings to the matcher, they will be interpreted as a sequence
of UTF-8 bytes. This means the output of C<matches>, C<match_details> etc. will also
be in terms of bytes.
You can simply call C<decode('UTF-8', ...)> on the substrings to get their
Unicode versions. The offsets will be in bytes though; converting them to character
offsets in the Unicode string is a little trickier:

    my $unicode_start = length(decode('UTF-8', bytes::substr($string, 0, $start)));
    my $unicode_end   = $unicode_start + length(decode('UTF-8', $word)) - 1;

This will be handled for you in a future version.
=head1 CAVEATS
This is an early release and has not been tested thoroughly; use at your own risk.
The API is subject to change until version 1.0.
If your keyword list contains duplicates, you will get duplicate matches.
198919901991199219931994199519961997199819992000200120022003200420052006200720082009mfree||5.007002|n
mg_clear|||
mg_copy|||
mg_dup|||
mg_find_mglob|||
mg_findext|5.013008||pn
mg_find|||n
mg_free_type||5.013006|
mg_free|||
mg_get|||
mg_length||5.005000|
mg_localize|||
mg_magical|||n
mg_set|||
mg_size||5.005000|
mini_mktime||5.007002|n
minus_v|||
missingterm|||
mode_from_discipline|||
modkids|||
more_bodies|||
235223532354235523562357235823592360236123622363236423652366236723682369237023712372reg_named_buff_all||5.009005|
reg_named_buff_exists||5.009005|
reg_named_buff_fetch||5.009005|
reg_named_buff_firstkey||5.009005|
reg_named_buff_iter|||
reg_named_buff_nextkey||5.009005|
reg_named_buff_scalar||5.009005|
reg_named_buff|||
reg_node|||
reg_numbered_buff_fetch|||
reg_numbered_buff_length|||
reg_numbered_buff_store|||
reg_qr_package|||
reg_recode|||
reg_scan_name|||
reg_skipcomment|||n
reg_temp_copy|||
reganode|||
regatom|||
regbranch|||
regclass_swash||5.009004|
283828392840284128422843284428452846284728482849285028512852285328542855285628572858unsharepvn||5.003070|
unwind_handler_stack|||
update_debugger_info|||
upg_version||5.009005|
usage|||
utf16_textfilter|||
utf16_to_utf8_reversed||5.006001|
utf16_to_utf8||5.006001|
utf8_distance||5.006000|
utf8_hop||5.006000|n
utf8_length||5.007001|
utf8_mg_len_cache_update|||
utf8_mg_pos_cache_update|||
utf8_to_bytes||5.006001|
utf8_to_uvchr_buf||5.015009|
utf8_to_uvchr||5.007001|
utf8_to_uvuni_buf||5.015009|
utf8_to_uvuni||5.007001|
utf8n_to_uvchr||5.007001|
utf8n_to_uvuni||5.007001|
utilize|||
29072908290929102911291229132914291529162917291829192920292129222923292429252926yylex|||
yyparse|||
yyunlex|||
yywarn|||
);
# --list-unsupported: for every API name whose entry has a true {todo}
# value, print the name, a run of dots padding it to 40 columns, and
# format_version() of that {todo} value, one per line, sorted
# case-insensitively.  The script then exits with status 0.
if (exists $opt{'list-unsupported'}) {
  my $f;
  for $f (sort { lc $a cmp lc $b } keys %API) {
    next unless $API{$f}{todo};
    # Clamp the padding so a name longer than 40 characters never produces
    # a negative repeat count.
    my $pad = 40 - length($f);
    $pad = 0 if $pad < 0;
    # The formatted line has to be printed; as a bare expression list it was
    # evaluated and thrown away, so the option produced no output.
    print "$f ", '.' x $pad, " ", format_version($API{$f}{todo}), "\n";
  }
  exit 0;
}

# Scan for possible replacement candidates

# Bookkeeping for the scan: five hashes that start empty, a counter that
# starts at zero, and three scalars that start undefined.
my(%replace, %need, %hints, %warnings, %depends);
my $replace = 0;
my($hint, $define, $function);
/* Older perls (<=5.003) lack AvFILLp */
/* Alias AvFILLp to AvFILL when AvFILLp is not already defined. */
#ifndef AvFILLp
# define AvFILLp AvFILL
#endif
/* Fallback ERRSV: fetch the scalar named "@" through get_sv(), passing
 * FALSE as the second argument. */
#ifndef ERRSV
# define ERRSV get_sv("@",FALSE)
#endif
/* Hint: gv_stashpvn
 * This function's backport doesn't support the length parameter, but
 * rather ignores it. Portability can only be ensured if the length
 * parameter is used for speed reasons, but the length can always be
 * correctly computed from the string argument.
 */
#ifndef gv_stashpvn
# define gv_stashpvn(str,len,create) gv_stashpv(str,create)
#endif
/* Replace: 1 */
#ifndef get_cv
# define get_cv perl_get_cv
#endif
76017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659#endif
#define my_strlcat DPPP_(my_my_strlcat)
#define Perl_my_strlcat DPPP_(my_my_strlcat)
#if defined(NEED_my_strlcat) || defined(NEED_my_strlcat_GLOBAL)
/* Append src to the NUL-terminated string in dst, where size is the full
 * capacity of the dst buffer.  Bytes are copied only when size is non-zero
 * and dst currently holds fewer than size - 1 bytes; in that case the result
 * is truncated to fit and always NUL-terminated.
 *
 * Returns strlen(dst) as measured on entry plus strlen(src), whether or not
 * anything was copied.
 */
Size_t
DPPP_(my_my_strlcat)(char *dst, const char *src, Size_t size)
{
  const Size_t dst_len = strlen(dst);
  const Size_t src_len = strlen(src);

  if (size > 0 && dst_len < size - 1) {
    /* Space left once the terminating NUL is accounted for. */
    const Size_t room = size - dst_len - 1;
    Size_t n = src_len;

    if (n > room)
      n = room;

    memcpy(dst + dst_len, src, n);
    dst[dst_len + n] = '\0';
  }

  return dst_len + src_len;
}
#endif
#endif
#if !defined(my_strlcpy)
/* With NEED_my_strlcpy defined the prototype is static, and the bare
 * `static` keyword that follows it attaches to the function definition
 * further down.  Otherwise only an extern prototype is declared here. */
#if defined(NEED_my_strlcpy)
static Size_t DPPP_(my_my_strlcpy)(char * dst, const char * src, Size_t size);
static
#else
extern Size_t DPPP_(my_my_strlcpy)(char * dst, const char * src, Size_t size);
#endif

#define my_strlcpy DPPP_(my_my_strlcpy)
#define Perl_my_strlcpy DPPP_(my_my_strlcpy)

#if defined(NEED_my_strlcpy) || defined(NEED_my_strlcpy_GLOBAL)

/* Copy src into dst, where size is the full capacity of the dst buffer.
 * When size is non-zero, at most size - 1 bytes are copied and dst is
 * NUL-terminated; when size is zero, dst is left untouched.
 *
 * Returns strlen(src) in every case.
 */
Size_t
DPPP_(my_my_strlcpy)(char *dst, const char *src, Size_t size)
{
  const Size_t src_len = strlen(src);

  if (size > 0) {
    Size_t n = src_len;

    /* Leave one byte for the terminating NUL. */
    if (n >= size)
      n = size - 1;

    memcpy(dst, src, n);
    dst[n] = '\0';
  }

  return src_len;
}

#endif
#endif
/* Define PERL_PV_ESCAPE_QUOTE as 0x0001 when it is not already defined. */
#ifndef PERL_PV_ESCAPE_QUOTE
# define PERL_PV_ESCAPE_QUOTE 0x0001
#endif
#ifndef PERL_PV_PRETTY_QUOTE
# define PERL_PV_PRETTY_QUOTE PERL_PV_ESCAPE_QUOTE
353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
${var}[i] = std::string(\"\");
}
}
else
Perl_croak(aTHX_ \"
%s
:
%s
is not an array reference\",
${
$ALIAS
?\
q[GvNAME(CvGV(cv))]
:\
qq[\"$pname\"]
},
\"
$var
\");
OUTPUT
T_STD_STRING
    $arg = sv_2mortal(newSVpvn($var.data(), $var.size()));
    // The string is copied by explicit byte length into a new mortal SV.
// Output a list.
T_STD_VECTOR_STRING
    {
        const size_t count_$var = $var.size();
        /* Amount to EXTEND the stack by: -1 when the element count exceeds
         * AV_SIZE_MAX, otherwise the count itself.
         *
         * The odd shape of the expression is deliberate: g++ warns
         * "comparison is always false" on the straightforward form
         *
         *     sizeof(a) > sizeof(b) && a > B_t_MAX
         *
         * whenever the left-hand condition is false.
         */
        const SSize_t want_$var =
            (count_$var > (sizeof(count_$var) > sizeof(SSize_t)
                               ? AV_SIZE_MAX : count_$var))
            ? -1 : (SSize_t)count_$var;
        U32 pos_$var = 0;

        EXTEND(SP, want_$var);
        // One mortal SV per string, copied by explicit byte length.
        while (pos_$var < count_$var) {
            const std::string &item_$var = ${var}[pos_$var];
            ST(pos_$var) = sv_2mortal(newSVpvn(item_$var.c_str(), item_$var.length()));
            ++pos_$var;
        }
        // Return from here: xsubpp adds 'ST(0) = RETVALSV' after this code,
        // and RETVALSV is undef, which would clobber the first element.
        XSRETURN(count_$var);
    }
// Output a list
T_STD_VECTOR_MATCH
    {
        U32 ix_$var;
        size_t size_$var = $var.size();
        /* Amount to EXTEND the stack by: -1 when the element count exceeds
         * AV_SIZE_MAX, otherwise the count itself.  Written in the same
         * warning-avoiding form as the string-list typemap. */
        SSize_t extend_size =
            (size_$var > (sizeof(size_$var) > sizeof(SSize_t)
                              ? AV_SIZE_MAX : size_$var))
            ? -1 : (SSize_t)size_$var;
        EXTEND(SP, extend_size);
        for (ix_$var = 0; ix_$var < size_$var; ix_$var++) {
            // Build a hashref with the keys word, start and end from the match object
            const AhoCorasick::match &m = ${var}[ix_$var];
            HV *hv = newHV();
            STRLEN klen = m.keyword.length();
            // hv_store arguments: hash, key, key length in bytes (4 for "word"),
            // value, and 0 to have hv_store calculate the hash itself.
            // newSVpvn copies exactly klen bytes, matching the other string typemaps.
            hv_store(hv, "word", 4, newSVpvn(m.keyword.c_str(), klen), 0);
            hv_store(hv, "start", 5, newSViv(m.start), 0);
            hv_store(hv, "end", 3, newSViv(m.end), 0);
            // The reference must be mortal: nothing else owns it once it is on
            // the stack, so without sv_2mortal every returned hashref leaks.
            ST(ix_$var) = sv_2mortal(newRV_noinc((SV *)hv));
        }
        // Return the whole list from here, as the string-list typemap does.
        XSRETURN(size_$var);
    }
( run in 0.287 second using v1.01-cache-2.11-cpan-87723dcf8b7 )