Big5
view release on metacpan or search on metacpan
lib/Ebig5.pm view on Meta::CPAN
sub Ebig5::d_();
sub Ebig5::l_();
sub Ebig5::p_();
sub Ebig5::S_();
sub Ebig5::b_();
sub Ebig5::c_();
sub Ebig5::u_();
sub Ebig5::g_();
sub Ebig5::k_();
sub Ebig5::T_();
sub Ebig5::B_();
sub Ebig5::M_();
sub Ebig5::A_();
sub Ebig5::C_();
sub Ebig5::glob($);
sub Ebig5::glob_();
sub Ebig5::lstat(*);
sub Ebig5::lstat_();
sub Ebig5::opendir(*$);
sub Ebig5::stat(*);
sub Ebig5::stat_();
sub Ebig5::unlink(@);
sub Ebig5::chdir(;$);
sub Ebig5::do($);
sub Ebig5::require(;$);
sub Ebig5::telldir(*);
sub Big5::ord(;$);
sub Big5::ord_();
sub Big5::reverse(@);
sub Big5::getc(;*@);
sub Big5::length(;$);
sub Big5::substr($$;$$);
sub Big5::index($$;$);
sub Big5::rindex($$;$);
sub Big5::escape(;$);
#
# Regexp work
#
use vars qw(
$re_a
$re_t
$re_n
$re_r
);
#
# Character class
#
use vars qw(
$dot
$dot_s
$eD
$eS
$eW
$eH
$eV
$eR
$eN
$not_alnum
$not_alpha
$not_ascii
$not_blank
$not_cntrl
$not_digit
$not_graph
$not_lower
$not_lower_i
$not_print
$not_punct
$not_space
$not_upper
$not_upper_i
$not_word
$not_xdigit
$eb
$eB
);
use vars qw(
$anchor
$matched
);
${Ebig5::anchor} = qr{\G(?>[^\x81-\xFE]|[\x81-\xFE][\x00-\xFF])*?}oxms;
my $q_char_SADAHIRO_Tomoyuki_2002_01_17 = '';
# Quantifiers
# {n,m} --- Match at least n but not more than m times
#
# n and m are limited to non-negative integral values less than a
# preset limit defined when perl is built. This is usually 32766 on
# the most common platforms.
#
# The following code is an attempt to solve the above limitations
# in a multi-byte anchoring.
# avoid "Segmentation fault" and "Error: Parse exception"
# perl5101delta
# http://perldoc.perl.org/perl5101delta.html
# In 5.10.0, the * quantifier in patterns was sometimes treated as {0,32767}
# [RT #60034, #60464]. For example, this match would fail:
# ("ab" x 32768) =~ /^(ab)*$/
# SEE ALSO
#
# Complex regular subexpression recursion limit
# http://www.perlmonks.org/?node_id=810857
#
# regexp iteration limits
# http://www.nntp.perl.org/group/perl.perl5.porters/2009/02/msg144065.html
#
# latest Perl won't match certain regexes more than 32768 characters long
# http://stackoverflow.com/questions/26226630/latest-perl-wont-match-certain-regexes-more-than-32768-characters-long
#
# Break through the limitations of regular expressions of Perl
# http://d.hatena.ne.jp/gfx/20110212/1297512479
if (($] >= 5.010001) or
# ActivePerl 5.6 or later (include 5.10.0)
(defined($ActivePerl::VERSION) and ($ActivePerl::VERSION > 800)) or
(($^O eq 'MSWin32') and ($] =~ /\A 5\.006/oxms))
) {
my $sbcs = ''; # Single Byte Character Set
for my $range (@{ $range_tr{1} }) {
$sbcs .= sprintf('\\x%02X-\\x%02X', $range->[0], $range->[-1]);
}
if (0) {
}
# other encoding
else {
${Ebig5::q_char_SADAHIRO_Tomoyuki_2002_01_17} = qr{.*?[$sbcs](?:[^$sbcs][^$sbcs])*?}oxms;
# ******* octets not in multiple octet char (always char boundary)
lib/Ebig5.pm view on Meta::CPAN
# P.130 Advanced Use of \G with Perl
# in Chapter3: Over view of Regular Expression Features and Flavors
# of ISBN 0-596-00289-0 Mastering Regular Expressions, Second edition
# (2)
# P.255 Use leading anchors
# P.256 Expose ^ and \G at the front of expressions
# in Chapter6: Crafting an Efficient Expression
# of ISBN 0-596-00289-0 Mastering Regular Expressions, Second edition
# (3)
# P.138 Conditional: (? if then| else)
# in Chapter3: Over view of Regular Expression Features and Flavors
# of ISBN 0-596-00289-0 Mastering Regular Expressions, Second edition
# (4)
# perlre
# http://perldoc.perl.org/perlre.html
# The "*" quantifier is equivalent to {0,} , the "+" quantifier to {1,} ,
# and the "?" quantifier to {0,1}, ., n, and m are limited to non-negative
# integral values less than a preset limit defined when perl is built.
# This is usually 32766 on the most common platforms. The actual limit
# can be seen in the error message generated by code such as this:
# $_ **= $_ , / {$_} / for 2 .. 42;
# (5)
# P.1023 Multiple-Byte Anchoring
# in Appendix W Perl Code Examples
# of ISBN 1-56592-224-7 CJKV Information Processing
# (6)
# if string has only SBCS (Single Byte Character Set)
# (7)
# then .*? (isn't limited to 32766)
# (8)
# else Big5::Regexp::Const (SADAHIRO Tomoyuki)
# http://homepage1.nifty.com/nomenclator/perl/shiftjis.htm#long
# http://search.cpan.org/~sadahiro/Big5-Regexp/
# $PadA = ' (?:\A| [\x00-\x80\xA0-\xDF])(?:[\x81-\xFE]{2})*?';
# $PadG = '\G(?: |[\x00-\xFF]*?[\x00-\x80\xA0-\xDF])(?:[\x81-\xFE]{2})*?';
# $PadGA = '\G(?:\A|(?:[\x81-\xFE]{2})+?|[\x00-\xFF]*?[\x00-\x80\xA0-\xDF] (?:[\x81-\xFE]{2})*?)';
${Ebig5::dot} = qr{(?>[^\x81-\xFE\x0A]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::dot_s} = qr{(?>[^\x81-\xFE]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::eD} = qr{(?>[^\x81-\xFE0-9]|[\x81-\xFE][\x00-\xFF])};
# Vertical tabs are now whitespace
# \s in a regex now matches a vertical tab in all circumstances.
# http://search.cpan.org/dist/perl-5.18.0/pod/perldelta.pod#Vertical_tabs_are_now_whitespace
# ${Ebig5::eS} = qr{(?>[^\x81-\xFE\x09\x0A \x0C\x0D\x20]|[\x81-\xFE][\x00-\xFF])};
# ${Ebig5::eS} = qr{(?>[^\x81-\xFE\x09\x0A\x0B\x0C\x0D\x20]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::eS} = qr{(?>[^\x81-\xFE\s]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::eW} = qr{(?>[^\x81-\xFE0-9A-Z_a-z]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::eH} = qr{(?>[^\x81-\xFE\x09\x20]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::eV} = qr{(?>[^\x81-\xFE\x0A\x0B\x0C\x0D]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::eR} = qr{(?>\x0D\x0A|[\x0A\x0D])};
${Ebig5::eN} = qr{(?>[^\x81-\xFE\x0A]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_alnum} = qr{(?>[^\x81-\xFE\x30-\x39\x41-\x5A\x61-\x7A]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_alpha} = qr{(?>[^\x81-\xFE\x41-\x5A\x61-\x7A]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_ascii} = qr{(?>[^\x81-\xFE\x00-\x7F]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_blank} = qr{(?>[^\x81-\xFE\x09\x20]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_cntrl} = qr{(?>[^\x81-\xFE\x00-\x1F\x7F]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_digit} = qr{(?>[^\x81-\xFE\x30-\x39]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_graph} = qr{(?>[^\x81-\xFE\x21-\x7F]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_lower} = qr{(?>[^\x81-\xFE\x61-\x7A]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_lower_i} = qr{(?>[^\x81-\xFE\x41-\x5A\x61-\x7A]|[\x81-\xFE][\x00-\xFF])}; # Perl 5.16 compatible
# ${Ebig5::not_lower_i} = qr{(?>[^\x81-\xFE]|[\x81-\xFE][\x00-\xFF])}; # older Perl compatible
${Ebig5::not_print} = qr{(?>[^\x81-\xFE\x20-\x7F]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_punct} = qr{(?>[^\x81-\xFE\x21-\x2F\x3A-\x3F\x40\x5B-\x5F\x60\x7B-\x7E]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_space} = qr{(?>[^\x81-\xFE\s\x0B]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_upper} = qr{(?>[^\x81-\xFE\x41-\x5A]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_upper_i} = qr{(?>[^\x81-\xFE\x41-\x5A\x61-\x7A]|[\x81-\xFE][\x00-\xFF])}; # Perl 5.16 compatible
# ${Ebig5::not_upper_i} = qr{(?>[^\x81-\xFE]|[\x81-\xFE][\x00-\xFF])}; # older Perl compatible
${Ebig5::not_word} = qr{(?>[^\x81-\xFE\x30-\x39\x41-\x5A\x5F\x61-\x7A]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::not_xdigit} = qr{(?>[^\x81-\xFE\x30-\x39\x41-\x46\x61-\x66]|[\x81-\xFE][\x00-\xFF])};
${Ebig5::eb} = qr{(?:\A(?=[0-9A-Z_a-z])|(?<=[\x00-\x2F\x40\x5B-\x5E\x60\x7B-\xFF])(?=[0-9A-Z_a-z])|(?<=[0-9A-Z_a-z])(?=[\x00-\x2F\x40\x5B-\x5E\x60\x7B-\xFF]|\z))};
${Ebig5::eB} = qr{(?:(?<=[0-9A-Z_a-z])(?=[0-9A-Z_a-z])|(?<=[\x00-\x2F\x40\x5B-\x5E\x60\x7B-\xFF])(?=[\x00-\x2F\x40\x5B-\x5E\x60\x7B-\xFF]))};
# avoid: Name "Ebig5::foo" used only once: possible typo at here.
${Ebig5::dot} = ${Ebig5::dot};
${Ebig5::dot_s} = ${Ebig5::dot_s};
${Ebig5::eD} = ${Ebig5::eD};
${Ebig5::eS} = ${Ebig5::eS};
${Ebig5::eW} = ${Ebig5::eW};
${Ebig5::eH} = ${Ebig5::eH};
${Ebig5::eV} = ${Ebig5::eV};
${Ebig5::eR} = ${Ebig5::eR};
${Ebig5::eN} = ${Ebig5::eN};
${Ebig5::not_alnum} = ${Ebig5::not_alnum};
${Ebig5::not_alpha} = ${Ebig5::not_alpha};
${Ebig5::not_ascii} = ${Ebig5::not_ascii};
${Ebig5::not_blank} = ${Ebig5::not_blank};
${Ebig5::not_cntrl} = ${Ebig5::not_cntrl};
${Ebig5::not_digit} = ${Ebig5::not_digit};
${Ebig5::not_graph} = ${Ebig5::not_graph};
${Ebig5::not_lower} = ${Ebig5::not_lower};
${Ebig5::not_lower_i} = ${Ebig5::not_lower_i};
${Ebig5::not_print} = ${Ebig5::not_print};
${Ebig5::not_punct} = ${Ebig5::not_punct};
${Ebig5::not_space} = ${Ebig5::not_space};
${Ebig5::not_upper} = ${Ebig5::not_upper};
${Ebig5::not_upper_i} = ${Ebig5::not_upper_i};
${Ebig5::not_word} = ${Ebig5::not_word};
${Ebig5::not_xdigit} = ${Ebig5::not_xdigit};
${Ebig5::eb} = ${Ebig5::eb};
${Ebig5::eB} = ${Ebig5::eB};
#
# Big5 split
#
sub Ebig5::split(;$$$) {
# P.794 29.2.161. split
# in Chapter 29: Functions
# of ISBN 0-596-00027-8 Programming Perl Third Edition.
# P.951 split
# in Chapter 27: Functions
# of ISBN 978-0-596-00492-7 Programming Perl 4th Edition.
my $pattern = $_[0];
my $string = $_[1];
my $limit = $_[2];
# if $pattern is also omitted or is the literal space, " "
if (not defined $pattern) {
$pattern = ' ';
}
# if $string is omitted, the function splits the $_ string
if (not defined $string) {
if (defined $_) {
$string = $_;
}
else {
$string = '';
}
}
my @split = ();
# when string is empty
if ($string eq '') {
# resulting list value in list context
if (wantarray) {
return @split;
}
# count of substrings in scalar context
else {
carp "Use of implicit split to \@_ is deprecated" if $^W;
@_ = @split;
return scalar @_;
}
}
# split's first argument is more consistently interpreted
#
# After some changes earlier in v5.17, split's behavior has been simplified:
# if the PATTERN argument evaluates to a string containing one space, it is
# treated the way that a literal string containing one space once was.
# http://search.cpan.org/dist/perl-5.18.0/pod/perldelta.pod#split's_first_argument_is_more_consistently_interpreted
lib/Ebig5.pm view on Meta::CPAN
for (my $i=0; $i <= $#char; $i++) {
next if not defined $char[$i];
# open character class [...]
if ($char[$i] eq '[') {
my $left = $i;
# [] make die "unmatched [] in regexp ...\n"
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
croak "Unmatched [] in regexp";
}
if ($char[$i] eq ']') {
my $right = $i;
my @charlist = charlist_qr(@char[$left+1..$right-1], 'i');
# escape character
for my $char (@charlist) {
if (0) {
}
# do not use quotemeta here
elsif ($char =~ /\A ([\x80-\xFF].*) ($metachar) \z/oxms) {
$char = $1 . '\\' . $2;
}
elsif ($char =~ /\A [.|)] \z/oxms) {
$char = '\\' . $char;
}
}
# [...]
splice @char, $left, $right-$left+1, '(?:' . join('|', @charlist) . ')';
$i = $left;
last;
}
}
}
# open character class [^...]
elsif ($char[$i] eq '[^') {
my $left = $i;
# [^] make die "unmatched [] in regexp ...\n"
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
croak "Unmatched [] in regexp";
}
if ($char[$i] eq ']') {
my $right = $i;
my @charlist = charlist_not_qr(@char[$left+1..$right-1], 'i');
# escape character
for my $char (@charlist) {
if (0) {
}
# do not use quotemeta here
elsif ($char =~ /\A ([\x80-\xFF].*) ($metachar) \z/oxms) {
$char = $1 . '\\' . $2;
}
elsif ($char =~ /\A [.|)] \z/oxms) {
$char = '\\' . $char;
}
}
# [^...]
splice @char, $left, $right-$left+1, '(?!' . join('|', @charlist) . ")(?:$your_char)";
$i = $left;
last;
}
}
}
# rewrite classic character class or escape character
elsif (my $char = classic_character_class($char[$i])) {
$char[$i] = $char;
}
# with /i modifier
elsif ($char[$i] =~ /\A [\x00-\xFF] \z/oxms) {
my $uc = Ebig5::uc($char[$i]);
my $fc = Ebig5::fc($char[$i]);
if ($uc ne $fc) {
if (CORE::length($fc) == 1) {
$char[$i] = '[' . $uc . $fc . ']';
}
else {
$char[$i] = '(?:' . $uc . '|' . $fc . ')';
}
}
}
}
# characterize
for (my $i=0; $i <= $#char; $i++) {
next if not defined $char[$i];
if (0) {
}
# escape last octet of multiple-octet
elsif ($char[$i] =~ /\A ([\x80-\xFF].*) ($metachar) \z/oxms) {
$char[$i] = $1 . '\\' . $2;
}
# quote character before ? + * {
elsif (($i >= 1) and ($char[$i] =~ /\A [\?\+\*\{] \z/oxms)) {
if ($char[$i-1] !~ /\A [\x00-\xFF] \z/oxms) {
$char[$i-1] = '(?:' . $char[$i-1] . ')';
lib/Ebig5.pm view on Meta::CPAN
# \p{PROPERTY} --> p\{PROPERTY}
# \P{PROPERTY} --> P\{PROPERTY}
elsif ($char[$i] =~ /\A \\ ([bBNpP]) ( \{ ([^\x81-\xFE0-9\}][^\x81-\xFE\}]*) \} ) \z/oxms) {
$char[$i] = $1 . '\\' . $2;
}
# \p, \P, \X --> p, P, X
elsif ($char[$i] =~ /\A \\ ( [pPX] ) \z/oxms) {
$char[$i] = $1;
}
elsif ($char[$i] =~ /\A \\ ([0-7]{2,3}) \z/oxms) {
$char[$i] = CORE::chr oct $1;
}
elsif ($char[$i] =~ /\A \\x ([0-9A-Fa-f]{1,2}) \z/oxms) {
$char[$i] = CORE::chr hex $1;
}
elsif ($char[$i] =~ /\A \\c ([\x40-\x5F]) \z/oxms) {
$char[$i] = CORE::chr(CORE::ord($1) & 0x1F);
}
elsif ($char[$i] =~ /\A (\\ [0nrtfbaedswDSWHVhvR]) \z/oxms) {
$char[$i] = {
'\0' => "\0",
'\n' => "\n",
'\r' => "\r",
'\t' => "\t",
'\f' => "\f",
'\b' => "\x08", # \b means backspace in character class
'\a' => "\a",
'\e' => "\e",
'\d' => '[0-9]',
# Vertical tabs are now whitespace
# \s in a regex now matches a vertical tab in all circumstances.
# http://search.cpan.org/dist/perl-5.18.0/pod/perldelta.pod#Vertical_tabs_are_now_whitespace
# \t \n \v \f \r space
# '\s' => '[\x09\x0A \x0C\x0D\x20]',
# '\s' => '[\x09\x0A\x0B\x0C\x0D\x20]',
'\s' => '\s',
'\w' => '[0-9A-Z_a-z]',
'\D' => '${Ebig5::eD}',
'\S' => '${Ebig5::eS}',
'\W' => '${Ebig5::eW}',
'\H' => '${Ebig5::eH}',
'\V' => '${Ebig5::eV}',
'\h' => '[\x09\x20]',
'\v' => '[\x0A\x0B\x0C\x0D]',
'\R' => '${Ebig5::eR}',
}->{$1};
}
# POSIX-style character classes
elsif ($ignorecase and ($char[$i] =~ /\A ( \[\: \^? (?:lower|upper) :\] ) \z/oxms)) {
$char[$i] = {
'[:lower:]' => '[\x41-\x5A\x61-\x7A]',
'[:upper:]' => '[\x41-\x5A\x61-\x7A]',
'[:^lower:]' => '${Ebig5::not_lower_i}',
'[:^upper:]' => '${Ebig5::not_upper_i}',
}->{$1};
}
elsif ($char[$i] =~ /\A ( \[\: \^? (?:alnum|alpha|ascii|blank|cntrl|digit|graph|lower|print|punct|space|upper|word|xdigit) :\] ) \z/oxms) {
$char[$i] = {
'[:alnum:]' => '[\x30-\x39\x41-\x5A\x61-\x7A]',
'[:alpha:]' => '[\x41-\x5A\x61-\x7A]',
'[:ascii:]' => '[\x00-\x7F]',
'[:blank:]' => '[\x09\x20]',
'[:cntrl:]' => '[\x00-\x1F\x7F]',
'[:digit:]' => '[\x30-\x39]',
'[:graph:]' => '[\x21-\x7F]',
'[:lower:]' => '[\x61-\x7A]',
'[:print:]' => '[\x20-\x7F]',
'[:punct:]' => '[\x21-\x2F\x3A-\x3F\x40\x5B-\x5F\x60\x7B-\x7E]',
# P.174 POSIX-Style Character Classes
# in Chapter 5: Pattern Matching
# of ISBN 0-596-00027-8 Programming Perl Third Edition.
# P.311 11.2.4 Character Classes and other Special Escapes
# in Chapter 11: perlre: Perl regular expressions
# of ISBN-13: 978-1-906966-02-7 The Perl Language Reference Manual (for Perl version 5.12.1)
# P.210 POSIX-Style Character Classes
# in Chapter 5: Pattern Matching
# of ISBN 978-0-596-00492-7 Programming Perl 4th Edition.
'[:space:]' => '[\s\x0B]', # "\s" plus vertical tab ("\cK")
'[:upper:]' => '[\x41-\x5A]',
'[:word:]' => '[\x30-\x39\x41-\x5A\x5F\x61-\x7A]',
'[:xdigit:]' => '[\x30-\x39\x41-\x46\x61-\x66]',
'[:^alnum:]' => '${Ebig5::not_alnum}',
'[:^alpha:]' => '${Ebig5::not_alpha}',
'[:^ascii:]' => '${Ebig5::not_ascii}',
'[:^blank:]' => '${Ebig5::not_blank}',
'[:^cntrl:]' => '${Ebig5::not_cntrl}',
'[:^digit:]' => '${Ebig5::not_digit}',
'[:^graph:]' => '${Ebig5::not_graph}',
'[:^lower:]' => '${Ebig5::not_lower}',
'[:^print:]' => '${Ebig5::not_print}',
'[:^punct:]' => '${Ebig5::not_punct}',
'[:^space:]' => '${Ebig5::not_space}',
'[:^upper:]' => '${Ebig5::not_upper}',
'[:^word:]' => '${Ebig5::not_word}',
'[:^xdigit:]' => '${Ebig5::not_xdigit}',
}->{$1};
}
elsif ($char[$i] =~ /\A \\ ($q_char) \z/oxms) {
$char[$i] = $1;
}
}
# open character list
my @singleoctet = ();
my @multipleoctet = ();
for (my $i=0; $i <= $#char; ) {
# escaped -
if (defined($char[$i+1]) and ($char[$i+1] eq '...')) {
$i += 1;
next;
}
# make range regexp
elsif ($char[$i] eq '...') {
# range error
if (CORE::length($char[$i-1]) > CORE::length($char[$i+1])) {
croak 'Invalid [] range in regexp (length(A) > length(B)) ' . '\x' . unpack('H*',$char[$i-1]) . '-\x' . unpack('H*',$char[$i+1]);
}
elsif (CORE::length($char[$i-1]) == CORE::length($char[$i+1])) {
if ($char[$i-1] gt $char[$i+1]) {
croak 'Invalid [] range in regexp (CORE::ord(A) > CORE::ord(B)) ' . '\x' . unpack('H*',$char[$i-1]) . '-\x' . unpack('H*',$char[$i+1]);
}
}
# make range regexp per length
for my $length (CORE::length($char[$i-1]) .. CORE::length($char[$i+1])) {
my @regexp = ();
# is first and last
if (($length == CORE::length($char[$i-1])) and ($length == CORE::length($char[$i+1]))) {
push @regexp, _range_regexp($length, $char[$i-1], $char[$i+1]);
}
# is first
elsif ($length == CORE::length($char[$i-1])) {
push @regexp, _range_regexp($length, $char[$i-1], "\xFF" x $length);
}
# is inside in first and last
elsif ((CORE::length($char[$i-1]) < $length) and ($length < CORE::length($char[$i+1]))) {
push @regexp, _range_regexp($length, "\x00" x $length, "\xFF" x $length);
}
# is last
elsif ($length == CORE::length($char[$i+1])) {
push @regexp, _range_regexp($length, "\x00" x $length, $char[$i+1]);
}
else {
die __FILE__, ": subroutine make_regexp panic.\n";
}
lib/Ebig5.pm view on Meta::CPAN
my @singleoctet = @$singleoctet;
my @multipleoctet = @$multipleoctet;
# return character list
if (scalar(@singleoctet) >= 1) {
# with /i modifier
if ($modifier =~ m/i/oxms) {
my %singleoctet_ignorecase = ();
for (@singleoctet) {
while (s/ \A \\x(..) - \\x(..) //oxms or s/ \A \\x((..)) //oxms) {
for my $ord (hex($1) .. hex($2)) {
my $char = CORE::chr($ord);
my $uc = Ebig5::uc($char);
my $fc = Ebig5::fc($char);
if ($uc eq $fc) {
$singleoctet_ignorecase{unpack 'C*', $char} = 1;
}
else {
if (CORE::length($fc) == 1) {
$singleoctet_ignorecase{unpack 'C*', $uc} = 1;
$singleoctet_ignorecase{unpack 'C*', $fc} = 1;
}
else {
$singleoctet_ignorecase{unpack 'C*', $uc} = 1;
push @multipleoctet, join '', map {sprintf('\x%02X',$_)} unpack 'C*', $fc;
}
}
}
}
if ($_ ne '') {
$singleoctet_ignorecase{unpack 'C*', $_} = 1;
}
}
my $i = 0;
my @singleoctet_ignorecase = ();
for my $ord (0 .. 255) {
if (exists $singleoctet_ignorecase{$ord}) {
push @{$singleoctet_ignorecase[$i]}, $ord;
}
else {
$i++;
}
}
@singleoctet = ();
for my $range (@singleoctet_ignorecase) {
if (ref $range) {
if (scalar(@{$range}) == 1) {
push @singleoctet, sprintf('\x%02X', @{$range}[0]);
}
elsif (scalar(@{$range}) == 2) {
push @singleoctet, sprintf('\x%02X\x%02X', @{$range}[0], @{$range}[-1]);
}
else {
push @singleoctet, sprintf('\x%02X-\x%02X', @{$range}[0], @{$range}[-1]);
}
}
}
}
my $not_anchor = '';
$not_anchor = '(?![\x81-\xFE])';
push @multipleoctet, join('', $not_anchor, '[', @singleoctet, ']' );
}
if (scalar(@multipleoctet) >= 2) {
return '(?:' . join('|', @multipleoctet) . ')';
}
else {
return $multipleoctet[0];
}
}
#
# Big5 open character list for not qr
#
sub charlist_not_qr {
my $modifier = pop @_;
my @char = @_;
my($singleoctet, $multipleoctet) = _charlist(@char, $modifier);
my @singleoctet = @$singleoctet;
my @multipleoctet = @$multipleoctet;
# with /i modifier
if ($modifier =~ m/i/oxms) {
my %singleoctet_ignorecase = ();
for (@singleoctet) {
while (s/ \A \\x(..) - \\x(..) //oxms or s/ \A \\x((..)) //oxms) {
for my $ord (hex($1) .. hex($2)) {
my $char = CORE::chr($ord);
my $uc = Ebig5::uc($char);
my $fc = Ebig5::fc($char);
if ($uc eq $fc) {
$singleoctet_ignorecase{unpack 'C*', $char} = 1;
}
else {
if (CORE::length($fc) == 1) {
$singleoctet_ignorecase{unpack 'C*', $uc} = 1;
$singleoctet_ignorecase{unpack 'C*', $fc} = 1;
}
else {
$singleoctet_ignorecase{unpack 'C*', $uc} = 1;
push @multipleoctet, join '', map {sprintf('\x%02X',$_)} unpack 'C*', $fc;
}
}
}
}
if ($_ ne '') {
$singleoctet_ignorecase{unpack 'C*', $_} = 1;
}
}
my $i = 0;
my @singleoctet_ignorecase = ();
for my $ord (0 .. 255) {
if (exists $singleoctet_ignorecase{$ord}) {
push @{$singleoctet_ignorecase[$i]}, $ord;
}
else {
$i++;
}
}
@singleoctet = ();
for my $range (@singleoctet_ignorecase) {
if (ref $range) {
if (scalar(@{$range}) == 1) {
push @singleoctet, sprintf('\x%02X', @{$range}[0]);
}
elsif (scalar(@{$range}) == 2) {
push @singleoctet, sprintf('\x%02X\x%02X', @{$range}[0], @{$range}[-1]);
}
else {
push @singleoctet, sprintf('\x%02X-\x%02X', @{$range}[0], @{$range}[-1]);
}
}
}
lib/Ebig5.pm view on Meta::CPAN
elsif (($i+2 <= $#char) and (grep(/\A (?: \\ [0-7]{2,3} | \\x [0-9A-Fa-f]{1,2}) \z/oxms, @char[$i+1..$i+2]) == 2) and (CORE::eval(sprintf '"%s%s%s"', @char[$i..$i+2]) =~ /\A $q_char \z/oxms)) {
$char[$i] .= join '', splice @char, $i+1, 2;
}
elsif (($i+1 <= $#char) and (grep(/\A (?: \\ [0-7]{2,3} | \\x [0-9A-Fa-f]{1,2}) \z/oxms, $char[$i+1 ]) == 1) and (CORE::eval(sprintf '"%s%s"', @char[$i..$i+1]) =~ /\A $q_char \z/oxms)) {
$char[$i] .= join '', splice @char, $i+1, 1;
}
}
# open character class [...]
elsif ($char[$i] eq '[') {
my $left = $i;
# [] make die "Unmatched [] in regexp ...\n"
# (and so on)
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
die __FILE__, ": Unmatched [] in regexp\n";
}
if ($char[$i] eq ']') {
my $right = $i;
# [...]
if (grep(/\A [\$\@]/oxms,@char[$left+1..$right-1]) >= 1) {
splice @char, $left, $right-$left+1, sprintf(q{@{[Ebig5::charlist_qr(%s,'%s')]}}, join(',', map {qq_stuff($delimiter,$end_delimiter,$_)} @char[$left+1..$right-1]), $modifier);
}
else {
splice @char, $left, $right-$left+1, Ebig5::charlist_qr(@char[$left+1..$right-1], $modifier);
}
$i = $left;
last;
}
}
}
# open character class [^...]
elsif ($char[$i] eq '[^') {
my $left = $i;
# [^] make die "Unmatched [] in regexp ...\n"
# (and so on)
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
die __FILE__, ": Unmatched [] in regexp\n";
}
if ($char[$i] eq ']') {
my $right = $i;
# [^...]
if (grep(/\A [\$\@]/oxms,@char[$left+1..$right-1]) >= 1) {
splice @char, $left, $right-$left+1, sprintf(q{@{[Ebig5::charlist_not_qr(%s,'%s')]}}, join(',', map {qq_stuff($delimiter,$end_delimiter,$_)} @char[$left+1..$right-1]), $modifier);
}
else {
splice @char, $left, $right-$left+1, Ebig5::charlist_not_qr(@char[$left+1..$right-1], $modifier);
}
$i = $left;
last;
}
}
}
# rewrite character class or escape character
elsif (my $char = character_class($char[$i],$modifier)) {
$char[$i] = $char;
}
# /i modifier
elsif ($ignorecase and ($char[$i] =~ /\A [\x00-\xFF] \z/oxms) and (Ebig5::uc($char[$i]) ne Ebig5::fc($char[$i]))) {
if (CORE::length(Ebig5::fc($char[$i])) == 1) {
$char[$i] = '[' . Ebig5::uc($char[$i]) . Ebig5::fc($char[$i]) . ']';
}
else {
$char[$i] = '(?:' . Ebig5::uc($char[$i]) . '|' . Ebig5::fc($char[$i]) . ')';
}
}
# \u \l \U \L \F \Q \E
elsif ($char[$i] =~ /\A [<>] \z/oxms) {
if ($right_e < $left_e) {
$char[$i] = '\\' . $char[$i];
}
}
elsif ($char[$i] eq '\u') {
$char[$i] = '@{[Ebig5::ucfirst qq<';
$left_e++;
}
elsif ($char[$i] eq '\l') {
$char[$i] = '@{[Ebig5::lcfirst qq<';
$left_e++;
}
elsif ($char[$i] eq '\U') {
$char[$i] = '@{[Ebig5::uc qq<';
$left_e++;
}
elsif ($char[$i] eq '\L') {
$char[$i] = '@{[Ebig5::lc qq<';
$left_e++;
}
elsif ($char[$i] eq '\F') {
$char[$i] = '@{[Ebig5::fc qq<';
$left_e++;
}
elsif ($char[$i] eq '\Q') {
$char[$i] = '@{[CORE::quotemeta qq<';
$left_e++;
}
elsif ($char[$i] eq '\E') {
if ($right_e < $left_e) {
$char[$i] = '>]}';
$right_e++;
}
else {
$char[$i] = '';
lib/Ebig5.pm view on Meta::CPAN
my $ignorecase = ($modifier =~ /i/oxms) ? 1 : 0;
# split regexp
my @char = $string =~ /\G((?>
[^\x81-\xFE\\\[\$\@\/] |
[\x81-\xFE][\x00-\xFF] |
\[\^ |
\[\: (?>[a-z]+) \:\] |
\[\:\^ (?>[a-z]+) \:\] |
[\$\@\/] |
\\ (?:$q_char) |
(?:$q_char)
))/oxmsg;
# unescape character
for (my $i=0; $i <= $#char; $i++) {
if (0) {
}
# escape last octet of multiple-octet
elsif ($char[$i] =~ /\A ([\x80-\xFF].*) ([\\|\[\{\^]|\Q$delimiter\E|\Q$end_delimiter\E) \z/xms) {
$char[$i] = $1 . '\\' . $2;
}
# open character class [...]
elsif ($char[$i] eq '[') {
my $left = $i;
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
die __FILE__, ": Unmatched [] in regexp\n";
}
if ($char[$i] eq ']') {
my $right = $i;
# [...]
splice @char, $left, $right-$left+1, Ebig5::charlist_qr(@char[$left+1..$right-1], $modifier);
$i = $left;
last;
}
}
}
# open character class [^...]
elsif ($char[$i] eq '[^') {
my $left = $i;
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
die __FILE__, ": Unmatched [] in regexp\n";
}
if ($char[$i] eq ']') {
my $right = $i;
# [^...]
splice @char, $left, $right-$left+1, Ebig5::charlist_not_qr(@char[$left+1..$right-1], $modifier);
$i = $left;
last;
}
}
}
# escape $ @ / and \
elsif ($char[$i] =~ /\A [\$\@\/\\] \z/oxms) {
$char[$i] = '\\' . $char[$i];
}
# rewrite character class or escape character
elsif (my $char = character_class($char[$i],$modifier)) {
$char[$i] = $char;
}
# /i modifier
elsif ($ignorecase and ($char[$i] =~ /\A [\x00-\xFF] \z/oxms) and (Ebig5::uc($char[$i]) ne Ebig5::fc($char[$i]))) {
if (CORE::length(Ebig5::fc($char[$i])) == 1) {
$char[$i] = '[' . Ebig5::uc($char[$i]) . Ebig5::fc($char[$i]) . ']';
}
else {
$char[$i] = '(?:' . Ebig5::uc($char[$i]) . '|' . Ebig5::fc($char[$i]) . ')';
}
}
# quote character before ? + * {
elsif (($i >= 1) and ($char[$i] =~ /\A [\?\+\*\{] \z/oxms)) {
if ($char[$i-1] =~ /\A [\x00-\xFF] \z/oxms) {
}
else {
$char[$i-1] = '(?:' . $char[$i-1] . ')';
}
}
}
$delimiter = '/';
$end_delimiter = '/';
$modifier =~ tr/i//d;
return join '', $ope, $delimiter, $anchor, '(?:', @char, ')', $matched, $end_delimiter, $modifier;
}
#
# escape regexp (m''b, qr''b)
#
sub e_qr_qb {
my($ope,$delimiter,$end_delimiter,$string,$modifier) = @_;
# split regexp
my @char = $string =~ /\G ((?>[^\\]|\\\\|[\x00-\xFF])) /oxmsg;
# unescape character
for (my $i=0; $i <= $#char; $i++) {
if (0) {
}
# remain \\
elsif ($char[$i] eq '\\\\') {
lib/Ebig5.pm view on Meta::CPAN
# escape last octet of multiple-octet
elsif ($char[$i] =~ /\A \\? ([\x80-\xFF].*) ($metachar|\Q$delimiter\E|\Q$end_delimiter\E) \z/xms) {
$char[$i] = $1 . '\\' . $2;
}
# join separated multiple-octet
elsif ($char[$i] =~ /\A (?: \\ [0-7]{2,3} | \\x [0-9A-Fa-f]{1,2}) \z/oxms) {
if ( ($i+3 <= $#char) and (grep(/\A (?: \\ [0-7]{2,3} | \\x [0-9A-Fa-f]{1,2}) \z/oxms, @char[$i+1..$i+3]) == 3) and (CORE::eval(sprintf '"%s%s%s%s"', @char[$i..$i+3]) =~ /\A $q_char \z/oxms)) {
$char[$i] .= join '', splice @char, $i+1, 3;
}
elsif (($i+2 <= $#char) and (grep(/\A (?: \\ [0-7]{2,3} | \\x [0-9A-Fa-f]{1,2}) \z/oxms, @char[$i+1..$i+2]) == 2) and (CORE::eval(sprintf '"%s%s%s"', @char[$i..$i+2]) =~ /\A $q_char \z/oxms)) {
$char[$i] .= join '', splice @char, $i+1, 2;
}
elsif (($i+1 <= $#char) and (grep(/\A (?: \\ [0-7]{2,3} | \\x [0-9A-Fa-f]{1,2}) \z/oxms, $char[$i+1 ]) == 1) and (CORE::eval(sprintf '"%s%s"', @char[$i..$i+1]) =~ /\A $q_char \z/oxms)) {
$char[$i] .= join '', splice @char, $i+1, 1;
}
}
# open character class [...]
elsif ($char[$i] eq '[') {
my $left = $i;
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
die __FILE__, ": Unmatched [] in regexp\n";
}
if ($char[$i] eq ']') {
my $right = $i;
# [...]
if (grep(/\A [\$\@]/oxms,@char[$left+1..$right-1]) >= 1) {
splice @char, $left, $right-$left+1, sprintf(q{@{[Ebig5::charlist_qr(%s,'%s')]}}, join(',', map {qq_stuff($delimiter,$end_delimiter,$_)} @char[$left+1..$right-1]), $modifier);
}
else {
splice @char, $left, $right-$left+1, Ebig5::charlist_qr(@char[$left+1..$right-1], $modifier);
}
$i = $left;
last;
}
}
}
# open character class [^...]
elsif ($char[$i] eq '[^') {
my $left = $i;
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
die __FILE__, ": Unmatched [] in regexp\n";
}
if ($char[$i] eq ']') {
my $right = $i;
# [^...]
if (grep(/\A [\$\@]/oxms,@char[$left+1..$right-1]) >= 1) {
splice @char, $left, $right-$left+1, sprintf(q{@{[Ebig5::charlist_not_qr(%s,'%s')]}}, join(',', map {qq_stuff($delimiter,$end_delimiter,$_)} @char[$left+1..$right-1]), $modifier);
}
else {
splice @char, $left, $right-$left+1, Ebig5::charlist_not_qr(@char[$left+1..$right-1], $modifier);
}
$i = $left;
last;
}
}
}
# rewrite character class or escape character
elsif (my $char = character_class($char[$i],$modifier)) {
$char[$i] = $char;
}
# /i modifier
elsif ($ignorecase and ($char[$i] =~ /\A [\x00-\xFF] \z/oxms) and (Ebig5::uc($char[$i]) ne Ebig5::fc($char[$i]))) {
if (CORE::length(Ebig5::fc($char[$i])) == 1) {
$char[$i] = '[' . Ebig5::uc($char[$i]) . Ebig5::fc($char[$i]) . ']';
}
else {
$char[$i] = '(?:' . Ebig5::uc($char[$i]) . '|' . Ebig5::fc($char[$i]) . ')';
}
}
# \u \l \U \L \F \Q \E
elsif ($char[$i] =~ /\A [<>] \z/oxms) {
if ($right_e < $left_e) {
$char[$i] = '\\' . $char[$i];
}
}
elsif ($char[$i] eq '\u') {
$char[$i] = '@{[Ebig5::ucfirst qq<';
$left_e++;
}
elsif ($char[$i] eq '\l') {
$char[$i] = '@{[Ebig5::lcfirst qq<';
$left_e++;
}
elsif ($char[$i] eq '\U') {
$char[$i] = '@{[Ebig5::uc qq<';
$left_e++;
}
elsif ($char[$i] eq '\L') {
$char[$i] = '@{[Ebig5::lc qq<';
$left_e++;
}
elsif ($char[$i] eq '\F') {
$char[$i] = '@{[Ebig5::fc qq<';
$left_e++;
}
elsif ($char[$i] eq '\Q') {
$char[$i] = '@{[CORE::quotemeta qq<';
$left_e++;
}
elsif ($char[$i] eq '\E') {
if ($right_e < $left_e) {
$char[$i] = '>]}';
$right_e++;
}
else {
$char[$i] = '';
lib/Ebig5.pm view on Meta::CPAN
my $ignorecase = ($modifier =~ /i/oxms) ? 1 : 0;
# split regexp
my @char = $string =~ /\G((?>
[^\x81-\xFE\\\[\$\@\/] |
[\x81-\xFE][\x00-\xFF] |
\[\^ |
\[\: (?>[a-z]+) \:\] |
\[\:\^ (?>[a-z]+) \:\] |
[\$\@\/] |
\\ (?:$q_char) |
(?:$q_char)
))/oxmsg;
# unescape character
for (my $i=0; $i <= $#char; $i++) {
if (0) {
}
# escape last octet of multiple-octet
elsif ($char[$i] =~ /\A ([\x80-\xFF].*) ([\\|\[\{\^]|\Q$delimiter\E|\Q$end_delimiter\E) \z/xms) {
$char[$i] = $1 . '\\' . $2;
}
# open character class [...]
elsif ($char[$i] eq '[') {
my $left = $i;
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
die __FILE__, ": Unmatched [] in regexp\n";
}
if ($char[$i] eq ']') {
my $right = $i;
# [...]
splice @char, $left, $right-$left+1, Ebig5::charlist_qr(@char[$left+1..$right-1], $modifier);
$i = $left;
last;
}
}
}
# open character class [^...]
elsif ($char[$i] eq '[^') {
my $left = $i;
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
die __FILE__, ": Unmatched [] in regexp\n";
}
if ($char[$i] eq ']') {
my $right = $i;
# [^...]
splice @char, $left, $right-$left+1, Ebig5::charlist_not_qr(@char[$left+1..$right-1], $modifier);
$i = $left;
last;
}
}
}
# escape $ @ / and \
elsif ($char[$i] =~ /\A [\$\@\/\\] \z/oxms) {
$char[$i] = '\\' . $char[$i];
}
# rewrite character class or escape character
elsif (my $char = character_class($char[$i],$modifier)) {
$char[$i] = $char;
}
# /i modifier
elsif ($ignorecase and ($char[$i] =~ /\A [\x00-\xFF] \z/oxms) and (Ebig5::uc($char[$i]) ne Ebig5::fc($char[$i]))) {
if (CORE::length(Ebig5::fc($char[$i])) == 1) {
$char[$i] = '[' . Ebig5::uc($char[$i]) . Ebig5::fc($char[$i]) . ']';
}
else {
$char[$i] = '(?:' . Ebig5::uc($char[$i]) . '|' . Ebig5::fc($char[$i]) . ')';
}
}
# quote character before ? + * {
elsif (($i >= 1) and ($char[$i] =~ /\A [\?\+\*\{] \z/oxms)) {
if ($char[$i-1] =~ /\A [\x00-\xFF] \z/oxms) {
}
else {
$char[$i-1] = '(?:' . $char[$i-1] . ')';
}
}
}
$modifier =~ tr/i//d;
$delimiter = '/';
$end_delimiter = '/';
my $prematch = '';
$prematch = "($anchor)";
return join '', $ope, $delimiter, $prematch, '(?:', @char, ')', $matched, $end_delimiter, $modifier;
}
#
# escape regexp (s'here''b)
#
sub e_s1_qb {
my($ope,$delimiter,$end_delimiter,$string,$modifier) = @_;
# split regexp
my @char = $string =~ /\G (?>[^\\]|\\\\|[\x00-\xFF]) /oxmsg;
# unescape character
for (my $i=0; $i <= $#char; $i++) {
if (0) {
}
# remain \\
lib/Ebig5.pm view on Meta::CPAN
# escape last octet of multiple-octet
elsif ($char[$i] =~ /\A \\? ([\x80-\xFF].*) ($metachar|\Q$delimiter\E|\Q$end_delimiter\E) \z/xms) {
$char[$i] = $1 . '\\' . $2;
}
# join separated multiple-octet
elsif ($char[$i] =~ /\A (?: \\ [0-7]{2,3} | \\x [0-9A-Fa-f]{1,2}) \z/oxms) {
if ( ($i+3 <= $#char) and (grep(/\A (?: \\ [0-7]{2,3} | \\x [0-9A-Fa-f]{1,2}) \z/oxms, @char[$i+1..$i+3]) == 3) and (CORE::eval(sprintf '"%s%s%s%s"', @char[$i..$i+3]) =~ /\A $q_char \z/oxms)) {
$char[$i] .= join '', splice @char, $i+1, 3;
}
elsif (($i+2 <= $#char) and (grep(/\A (?: \\ [0-7]{2,3} | \\x [0-9A-Fa-f]{1,2}) \z/oxms, @char[$i+1..$i+2]) == 2) and (CORE::eval(sprintf '"%s%s%s"', @char[$i..$i+2]) =~ /\A $q_char \z/oxms)) {
$char[$i] .= join '', splice @char, $i+1, 2;
}
elsif (($i+1 <= $#char) and (grep(/\A (?: \\ [0-7]{2,3} | \\x [0-9A-Fa-f]{1,2}) \z/oxms, $char[$i+1 ]) == 1) and (CORE::eval(sprintf '"%s%s"', @char[$i..$i+1]) =~ /\A $q_char \z/oxms)) {
$char[$i] .= join '', splice @char, $i+1, 1;
}
}
# open character class [...]
elsif ($char[$i] eq '[') {
my $left = $i;
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
die __FILE__, ": Unmatched [] in regexp\n";
}
if ($char[$i] eq ']') {
my $right = $i;
# [...]
if (grep(/\A [\$\@]/oxms,@char[$left+1..$right-1]) >= 1) {
splice @char, $left, $right-$left+1, sprintf(q{@{[Ebig5::charlist_qr(%s,'%s')]}}, join(',', map {qq_stuff($delimiter,$end_delimiter,$_)} @char[$left+1..$right-1]), $modifier);
}
else {
splice @char, $left, $right-$left+1, Ebig5::charlist_qr(@char[$left+1..$right-1], $modifier);
}
$i = $left;
last;
}
}
}
# open character class [^...]
elsif ($char[$i] eq '[^') {
my $left = $i;
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
die __FILE__, ": Unmatched [] in regexp\n";
}
if ($char[$i] eq ']') {
my $right = $i;
# [^...]
if (grep(/\A [\$\@]/oxms,@char[$left+1..$right-1]) >= 1) {
splice @char, $left, $right-$left+1, sprintf(q{@{[Ebig5::charlist_not_qr(%s,'%s')]}}, join(',', map {qq_stuff($delimiter,$end_delimiter,$_)} @char[$left+1..$right-1]), $modifier);
}
else {
splice @char, $left, $right-$left+1, Ebig5::charlist_not_qr(@char[$left+1..$right-1], $modifier);
}
$i = $left;
last;
}
}
}
# rewrite character class or escape character
elsif (my $char = character_class($char[$i],$modifier)) {
$char[$i] = $char;
}
# P.794 29.2.161. split
# in Chapter 29: Functions
# of ISBN 0-596-00027-8 Programming Perl Third Edition.
# P.951 split
# in Chapter 27: Functions
# of ISBN 978-0-596-00492-7 Programming Perl 4th Edition.
# said "The //m modifier is assumed when you split on the pattern /^/",
# but perl5.008 is not so. Therefore, this software adds //m.
# (and so on)
# split(m/^/) --> split(m/^/m)
elsif (($char[$i] eq '^') and ($modifier !~ /m/oxms)) {
$modifier .= 'm';
}
# /i modifier
elsif ($ignorecase and ($char[$i] =~ /\A [\x00-\xFF] \z/oxms) and (Ebig5::uc($char[$i]) ne Ebig5::fc($char[$i]))) {
if (CORE::length(Ebig5::fc($char[$i])) == 1) {
$char[$i] = '[' . Ebig5::uc($char[$i]) . Ebig5::fc($char[$i]) . ']';
}
else {
$char[$i] = '(?:' . Ebig5::uc($char[$i]) . '|' . Ebig5::fc($char[$i]) . ')';
}
}
# \u \l \U \L \F \Q \E
elsif ($char[$i] =~ /\A ([<>]) \z/oxms) {
if ($right_e < $left_e) {
$char[$i] = '\\' . $char[$i];
}
}
elsif ($char[$i] eq '\u') {
$char[$i] = '@{[Ebig5::ucfirst qq<';
$left_e++;
}
elsif ($char[$i] eq '\l') {
$char[$i] = '@{[Ebig5::lcfirst qq<';
$left_e++;
}
elsif ($char[$i] eq '\U') {
$char[$i] = '@{[Ebig5::uc qq<';
$left_e++;
}
elsif ($char[$i] eq '\L') {
$char[$i] = '@{[Ebig5::lc qq<';
lib/Ebig5.pm view on Meta::CPAN
my $ignorecase = ($modifier =~ /i/oxms) ? 1 : 0;
# split regexp
my @char = $string =~ /\G((?>
[^\x81-\xFE\\\[] |
[\x81-\xFE][\x00-\xFF] |
\[\^ |
\[\: (?>[a-z]+) \:\] |
\[\:\^ (?>[a-z]+) \:\] |
\\ (?:$q_char) |
(?:$q_char)
))/oxmsg;
# unescape character
for (my $i=0; $i <= $#char; $i++) {
if (0) {
}
# escape last octet of multiple-octet
elsif ($char[$i] =~ /\A ([\x80-\xFF].*) ([\\|\[\{\^]|\Q$delimiter\E|\Q$end_delimiter\E) \z/xms) {
$char[$i] = $1 . '\\' . $2;
}
# open character class [...]
elsif ($char[$i] eq '[') {
my $left = $i;
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
die __FILE__, ": Unmatched [] in regexp\n";
}
if ($char[$i] eq ']') {
my $right = $i;
# [...]
splice @char, $left, $right-$left+1, Ebig5::charlist_qr(@char[$left+1..$right-1], $modifier);
$i = $left;
last;
}
}
}
# open character class [^...]
elsif ($char[$i] eq '[^') {
my $left = $i;
if ($char[$i+1] eq ']') {
$i++;
}
while (1) {
if (++$i > $#char) {
die __FILE__, ": Unmatched [] in regexp\n";
}
if ($char[$i] eq ']') {
my $right = $i;
# [^...]
splice @char, $left, $right-$left+1, Ebig5::charlist_not_qr(@char[$left+1..$right-1], $modifier);
$i = $left;
last;
}
}
}
# rewrite character class or escape character
elsif (my $char = character_class($char[$i],$modifier)) {
$char[$i] = $char;
}
# split(m/^/) --> split(m/^/m)
elsif (($char[$i] eq '^') and ($modifier !~ /m/oxms)) {
$modifier .= 'm';
}
# /i modifier
elsif ($ignorecase and ($char[$i] =~ /\A [\x00-\xFF] \z/oxms) and (Ebig5::uc($char[$i]) ne Ebig5::fc($char[$i]))) {
if (CORE::length(Ebig5::fc($char[$i])) == 1) {
$char[$i] = '[' . Ebig5::uc($char[$i]) . Ebig5::fc($char[$i]) . ']';
}
else {
$char[$i] = '(?:' . Ebig5::uc($char[$i]) . '|' . Ebig5::fc($char[$i]) . ')';
}
}
# quote character before ? + * {
elsif (($i >= 1) and ($char[$i] =~ /\A [\?\+\*\{] \z/oxms)) {
if ($char[$i-1] =~ /\A [\x00-\xFF] \z/oxms) {
}
else {
$char[$i-1] = '(?:' . $char[$i-1] . ')';
}
}
}
$modifier =~ tr/i//d;
return join '', 'Ebig5::split', $ope, $delimiter, @char, $end_delimiter, $modifier;
}
#
# escape use without import
#
sub e_use_noimport {
my($module) = @_;
my $expr = _pathof($module);
my $fh = gensym();
for my $realfilename (_realfilename($expr)) {
if (Ebig5::_open_r($fh, $realfilename)) {
local $/ = undef; # slurp mode
my $script = <$fh>;
close($fh) or die "Can't close file: $realfilename: $!";
if ($script =~ /^ (?>\s*) use (?>\s+) Big5 (?>\s*) ([^\x81-\xFE;]*) ; (?>\s*) \n? $/oxms) {
return qq<BEGIN { Ebig5::require '$expr'; }>;
}
lib/Ebig5.pm view on Meta::CPAN
}
=item * Lower case string
$lc = Ebig5::lc($string);
$lc = Ebig5::lc_;
This subroutine returns a lowercased version of Big5 $string (or $_, if
$string is omitted). This is the internal subroutine implementing the \L escape
in double-quoted strings.
You can use the Ebig5::fc subroutine for case-insensitive comparisons via Big5
software.
=item * Lower case first character of string
$lcfirst = Ebig5::lcfirst($string);
$lcfirst = Ebig5::lcfirst_;
This subroutine returns a version of Big5 $string with the first character
lowercased (or $_, if $string is omitted). This is the internal subroutine
implementing the \l escape in double-quoted strings.
=item * Upper case string
$uc = Ebig5::uc($string);
$uc = Ebig5::uc_;
This subroutine returns an uppercased version of Big5 $string (or $_, if
$string is omitted). This is the internal subroutine implementing the \U escape
in interpolated strings. For titlecase, use Ebig5::ucfirst instead.
You can use the Ebig5::fc subroutine for case-insensitive comparisons via Big5
software.
=item * Upper case first character of string
$ucfirst = Ebig5::ucfirst($string);
$ucfirst = Ebig5::ucfirst_;
This subroutine returns a version of Big5 $string with the first character
titlecased and other characters left alone (or $_, if $string is omitted).
Titlecase is "Camel" for an initial capital that has (or expects to have)
lowercase characters following it, not uppercase ones. Exsamples are the first
letter of a sentence, of a person's name, of a newspaper headline, or of most
words in a title. Characters with no titlecase mapping return the uppercase
mapping instead. This is the internal subroutine implementing the \u escape in
double-quoted strings.
To capitalize a string by mapping its first character to titlecase and the rest
to lowercase, use:
$titlecase = Ebig5::ucfirst(substr($word,0,1)) . Ebig5::lc(substr($word,1));
or
$string =~ s/(\w)((?>\w*))/\u$1\L$2/g;
Do not use:
$do_not_use = Ebig5::ucfirst(Ebig5::lc($word));
or "\u\L$word", because that can produce a different and incorrect answer with
certain characters. The titlecase of something that's been lowercased doesn't
always produce the same thing titlecasing the original produces.
Because titlecasing only makes sense at the start of a string that's followed
by lowercase characters, we can't think of any reason you might want to titlecase
every character in a string.
See also P.287 A Case of Mistaken Identity
in Chapter 6: Unicode
of ISBN 978-0-596-00492-7 Programming Perl 4th Edition.
=item * Fold case string
P.860 fc
in Chapter 27: Functions
of ISBN 978-0-596-00492-7 Programming Perl 4th Edition.
$fc = Ebig5::fc($string);
$fc = Ebig5::fc_;
New to Big5 software, this subroutine returns the full Unicode-like casefold of
Big5 $string (or $_, if omitted). This is the internal subroutine implementing
the \F escape in double-quoted strings.
Just as title-case is based on uppercase but different, foldcase is based on
lowercase but different. In ASCII there is a one-to-one mapping between only
two cases, but in other encoding there is a one-to-many mapping and between three
cases. Because that's too many combinations to check manually each time, a fourth
casemap called foldcase was invented as a common intermediary for the other three.
It is not a case itself, but it is a casemap.
To compare whether two strings are the same without regard to case, do this:
Ebig5::fc($a) eq Ebig5::fc($b)
The reliable way to compare string case-insensitively was with the /i pattern
modifier, because Big5 software has always used casefolding semantics for
case-insensitive pattern matches. Knowing this, you can emulate equality
comparisons like this:
sub fc_eq ($$) {
my($a,$b) = @_;
return $a =~ /\A\Q$b\E\z/i;
}
=item * Make ignore case string
@ignorecase = Ebig5::ignorecase(@string);
This subroutine is internal use to m/ /i, s/ / /i, split / /i, and qr/ /i.
=item * Make capture number
$capturenumber = Ebig5::capture($string);
This subroutine is internal use to m/ /, s/ / /, split / /, and qr/ /.
=item * Make character
( run in 1.534 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )