Char-UTF2
view release on metacpan or search on metacpan
lib/Eutf2.pm view on Meta::CPAN
# instead of Carp.pm
# Forward declarations of this package's own carp/croak/cluck/confess.
# Declaring the names up front lets the parser recognise them as subroutines
# before their bodies are seen, so they can be called without parentheses.
# NOTE(review): the definitions are not visible in this part of the file --
# presumably they appear further down; confirm.
sub carp;
sub croak;
sub cluck;
sub confess;
# 6.18. Matching Multiple-Byte Characters
# in Chapter 6. Pattern Matching
# of ISBN 978-1-56592-243-3 Perl Cookbook.
# (and so on)
# regexp of character
my $your_char = q{(?:[\xC2-\xDF]|[\xE0-\xE0][\xA0-\xBF]|[\xE1-\xEC][\x80-\xBF]|[\xED-\xED][\x80-\x9F]|[\xEE-\xEF][\x80-\xBF]|[\xF0-\xF0][\x90-\xBF][\x80-\xBF]|[\xF1-\xF3][\x80-\xBF][\x80-\xBF]|[\xF4-\xF4][\x80-\x8F][\x80-\xBF])[\x80-\xBF]|[\x00-\x7F\...
# $qq_char matches one of:
#   - a backslash, the letter "c", and one octet in \x40-\x5F, or
#   - a single $your_char, optionally preceded by one backslash.
# It is declared with "use vars", so it is a package variable, not a lexical.
# NOTE(review): the "qq_" prefix suggests it is meant for double-quote-like
# (escape-aware) text -- confirm against its users.
use vars qw($qq_char); $qq_char = qr/\\c[\x40-\x5F]|\\?(?:$your_char)/oxms;
# $q_char matches exactly one $your_char with no escape handling.
# NOTE(review): the "q_" prefix suggests single-quote-like text -- confirm.
use vars qw($q_char); $q_char = qr/$your_char/oxms;
#
# UTF-8 character range per length
#
# Starts out empty and is filled in further down when the package name ends
# in "Eutf2". There, each key is a character length in octets (1 to 4) and each
# value is a flat list of array references. Every consecutive run of <length>
# references lists, position by position, the octet values allowed for one
# lead-octet range.
# NOTE(review): the "_tr" suffix suggests the table serves tr///-style range
# handling -- confirm against its users.
my %range_tr = ();
#
# UTF-8 case conversion
#
# Default ASCII-only case tables, built with hash slices over letter ranges.
# No helper variables are introduced, so the file-level namespace is unchanged.

# %lc: upper-case ASCII letter => lower-case ASCII letter
my %lc = ();
@lc{ 'A' .. 'Z' } = ( 'a' .. 'z' );

# %uc: lower-case ASCII letter => upper-case ASCII letter
my %uc = ();
@uc{ 'a' .. 'z' } = ( 'A' .. 'Z' );

# %fc: default case folding, same mapping as %lc. It is replaced by a larger
# table further down when the package name ends in "Eutf2".
my %fc = ();
@fc{ 'A' .. 'Z' } = ( 'a' .. 'z' );
if (0) {
}
elsif (__PACKAGE__ =~ / \b Eutf2 \z/oxms) {
%range_tr = (
1 => [ [0x00..0x7F],
[0xF5..0xFF], # malformed octet
],
2 => [ [0xC2..0xDF],[0x80..0xBF],
],
3 => [ [0xE0..0xE0],[0xA0..0xBF],[0x80..0xBF],
[0xE1..0xEC],[0x80..0xBF],[0x80..0xBF],
[0xED..0xED],[0x80..0x9F],[0x80..0xBF],
[0xEE..0xEF],[0x80..0xBF],[0x80..0xBF],
],
4 => [ [0xF0..0xF0],[0x90..0xBF],[0x80..0xBF],[0x80..0xBF],
[0xF1..0xF3],[0x80..0xBF],[0x80..0xBF],[0x80..0xBF],
[0xF4..0xF4],[0x80..0x8F],[0x80..0xBF],[0x80..0xBF],
],
);
# CaseFolding-12.0.0.txt
# Date: 2019-01-22, 08:18:22 GMT
# (c) 2019 Unicode(R), Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see http://www.unicode.org/reports/tr44/
# you can use "make_CaseFolding.pl" to update this hash
%fc = (
"\x41" => "\x61", # LATIN CAPITAL LETTER A
"\x42" => "\x62", # LATIN CAPITAL LETTER B
"\x43" => "\x63", # LATIN CAPITAL LETTER C
"\x44" => "\x64", # LATIN CAPITAL LETTER D
"\x45" => "\x65", # LATIN CAPITAL LETTER E
"\x46" => "\x66", # LATIN CAPITAL LETTER F
"\x47" => "\x67", # LATIN CAPITAL LETTER G
"\x48" => "\x68", # LATIN CAPITAL LETTER H
"\x49" => "\x69", # LATIN CAPITAL LETTER I
"\x4A" => "\x6A", # LATIN CAPITAL LETTER J
"\x4B" => "\x6B", # LATIN CAPITAL LETTER K
"\x4C" => "\x6C", # LATIN CAPITAL LETTER L
"\x4D" => "\x6D", # LATIN CAPITAL LETTER M
"\x4E" => "\x6E", # LATIN CAPITAL LETTER N
"\x4F" => "\x6F", # LATIN CAPITAL LETTER O
"\x50" => "\x70", # LATIN CAPITAL LETTER P
"\x51" => "\x71", # LATIN CAPITAL LETTER Q
"\x52" => "\x72", # LATIN CAPITAL LETTER R
"\x53" => "\x73", # LATIN CAPITAL LETTER S
"\x54" => "\x74", # LATIN CAPITAL LETTER T
"\x55" => "\x75", # LATIN CAPITAL LETTER U
"\x56" => "\x76", # LATIN CAPITAL LETTER V
"\x57" => "\x77", # LATIN CAPITAL LETTER W
"\x58" => "\x78", # LATIN CAPITAL LETTER X
"\x59" => "\x79", # LATIN CAPITAL LETTER Y
"\x5A" => "\x7A", # LATIN CAPITAL LETTER Z
"\xC2\xB5" => "\xCE\xBC", # MICRO SIGN
"\xC3\x80" => "\xC3\xA0", # LATIN CAPITAL LETTER A WITH GRAVE
"\xC3\x81" => "\xC3\xA1", # LATIN CAPITAL LETTER A WITH ACUTE
"\xC3\x82" => "\xC3\xA2", # LATIN CAPITAL LETTER A WITH CIRCUMFLEX
"\xC3\x83" => "\xC3\xA3", # LATIN CAPITAL LETTER A WITH TILDE
"\xC3\x84" => "\xC3\xA4", # LATIN CAPITAL LETTER A WITH DIAERESIS
"\xC3\x85" => "\xC3\xA5", # LATIN CAPITAL LETTER A WITH RING ABOVE
"\xC3\x86" => "\xC3\xA6", # LATIN CAPITAL LETTER AE
"\xC3\x87" => "\xC3\xA7", # LATIN CAPITAL LETTER C WITH CEDILLA
"\xC3\x88" => "\xC3\xA8", # LATIN CAPITAL LETTER E WITH GRAVE
"\xC3\x89" => "\xC3\xA9", # LATIN CAPITAL LETTER E WITH ACUTE
"\xC3\x8A" => "\xC3\xAA", # LATIN CAPITAL LETTER E WITH CIRCUMFLEX
"\xC3\x8B" => "\xC3\xAB", # LATIN CAPITAL LETTER E WITH DIAERESIS
"\xC3\x8C" => "\xC3\xAC", # LATIN CAPITAL LETTER I WITH GRAVE
"\xC3\x8D" => "\xC3\xAD", # LATIN CAPITAL LETTER I WITH ACUTE
"\xC3\x8E" => "\xC3\xAE", # LATIN CAPITAL LETTER I WITH CIRCUMFLEX
"\xC3\x8F" => "\xC3\xAF", # LATIN CAPITAL LETTER I WITH DIAERESIS
"\xC3\x90" => "\xC3\xB0", # LATIN CAPITAL LETTER ETH
"\xC3\x91" => "\xC3\xB1", # LATIN CAPITAL LETTER N WITH TILDE
"\xC3\x92" => "\xC3\xB2", # LATIN CAPITAL LETTER O WITH GRAVE
"\xC3\x93" => "\xC3\xB3", # LATIN CAPITAL LETTER O WITH ACUTE
"\xC3\x94" => "\xC3\xB4", # LATIN CAPITAL LETTER O WITH CIRCUMFLEX
"\xC3\x95" => "\xC3\xB5", # LATIN CAPITAL LETTER O WITH TILDE
"\xC3\x96" => "\xC3\xB6", # LATIN CAPITAL LETTER O WITH DIAERESIS
"\xC3\x98" => "\xC3\xB8", # LATIN CAPITAL LETTER O WITH STROKE
"\xC3\x99" => "\xC3\xB9", # LATIN CAPITAL LETTER U WITH GRAVE
"\xC3\x9A" => "\xC3\xBA", # LATIN CAPITAL LETTER U WITH ACUTE
"\xC3\x9B" => "\xC3\xBB", # LATIN CAPITAL LETTER U WITH CIRCUMFLEX
"\xC3\x9C" => "\xC3\xBC", # LATIN CAPITAL LETTER U WITH DIAERESIS
"\xC3\x9D" => "\xC3\xBD", # LATIN CAPITAL LETTER Y WITH ACUTE
( run in 1.273 second using v1.01-cache-2.11-cpan-39bf76dae61 )