Char-UTF2

 view release on metacpan or  search on metacpan

lib/Eutf2.pm  view on Meta::CPAN


# Stand-ins for Carp.pm (this module does not load Carp).
# These are forward declarations only: they make carp, croak, cluck and
# confess known as subroutine names from here on, so later code can call
# them list-operator style, without parentheses. Their bodies are not part
# of this section -- presumably defined further down the file (confirm).
sub carp;
sub croak;
sub cluck;
sub confess;

# 6.18. Matching Multiple-Byte Characters
# in Chapter 6. Pattern Matching
# of ISBN 978-1-56592-243-3 Perl Cookbook.
# (and so on)

# regexp of character
my $your_char = q{(?:[\xC2-\xDF]|[\xE0-\xE0][\xA0-\xBF]|[\xE1-\xEC][\x80-\xBF]|[\xED-\xED][\x80-\x9F]|[\xEE-\xEF][\x80-\xBF]|[\xF0-\xF0][\x90-\xBF][\x80-\xBF]|[\xF1-\xF3][\x80-\xBF][\x80-\xBF]|[\xF4-\xF4][\x80-\x8F][\x80-\xBF])[\x80-\xBF]|[\x00-\x7F\...
# Both patterns are package globals (declared through "use vars") built
# from $your_char; /o compiles the interpolated $your_char in once.
#
# $qq_char matches one of:
#   - a backslash, the letter "c", and one octet in \x40-\x5F, or
#   - one $your_char character, optionally preceded by a backslash.
# NOTE(review): the name suggests use on qq-like (interpolating) text -- confirm.
use vars qw($qq_char); $qq_char = qr/\\c[\x40-\x5F]|\\?(?:$your_char)/oxms;
# $q_char matches exactly one $your_char character, with no escape handling.
use vars qw($q_char);  $q_char  = qr/$your_char/oxms;

#
# UTF-8 octet ranges, grouped by character length
#
# Declared empty here; the encoding-specific branch further down fills it.
# There, each key is a character length in octets and each value is a flat
# list of array refs of octet values, one array ref per octet position.
#
my %range_tr;

#
# UTF-8 case conversion
#
# Default tables, covering the 26 ASCII letters only:
#   %lc  upper case letter => lower case letter  (A => a ... Z => z)
#   %uc  lower case letter => upper case letter  (a => A ... z => Z)
#   %fc  upper case letter => case-folded letter (same pairs as %lc)
#
my %lc = ();
@lc{'A' .. 'Z'} = ('a' .. 'z');

# %lc is one-to-one, so swapping its keys and values gives the a-z => A-Z table.
my %uc = reverse %lc;

# An independent copy: later changes to %fc do not touch %lc.
my %fc = %lc;

if (0) {
}

elsif (__PACKAGE__ =~ / \b Eutf2 \z/oxms) {
    # Octet ranges keyed by character length in octets (1..4).
    # Each value is a flat list of array refs of octet values: every
    # consecutive group of <key> array refs describes one form, giving the
    # octet values for positions 1..<key>. The multi-octet rows correspond
    # to the well-formed UTF-8 byte sequences of RFC 3629; the code point
    # range each row encodes is noted alongside it.
    %range_tr = (
        1 => [ [0x00..0x7F],                                         # U+0000..U+007F
               [0xF5..0xFF], # malformed octet (never occurs in UTF-8), listed as a 1-octet form
             ],
        2 => [ [0xC2..0xDF],[0x80..0xBF],                            # U+0080..U+07FF
             ],
        3 => [ [0xE0..0xE0],[0xA0..0xBF],[0x80..0xBF],               # U+0800..U+0FFF
               [0xE1..0xEC],[0x80..0xBF],[0x80..0xBF],               # U+1000..U+CFFF
               [0xED..0xED],[0x80..0x9F],[0x80..0xBF],               # U+D000..U+D7FF (stops before the surrogates)
               [0xEE..0xEF],[0x80..0xBF],[0x80..0xBF],               # U+E000..U+FFFF
             ],
        4 => [ [0xF0..0xF0],[0x90..0xBF],[0x80..0xBF],[0x80..0xBF],  # U+10000..U+3FFFF
               [0xF1..0xF3],[0x80..0xBF],[0x80..0xBF],[0x80..0xBF],  # U+40000..U+FFFFF
               [0xF4..0xF4],[0x80..0x8F],[0x80..0xBF],[0x80..0xBF],  # U+100000..U+10FFFF
             ],
    );

    # CaseFolding-12.0.0.txt
    # Date: 2019-01-22, 08:18:22 GMT
    # (c) 2019 Unicode(R), Inc.
    # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
    # For terms of use, see http://www.unicode.org/terms_of_use.html
    #
    # Unicode Character Database
    #   For documentation, see http://www.unicode.org/reports/tr44/

    # you can use "make_CaseFolding.pl" to update this hash

    %fc = (
        "\x41"             => "\x61",                     # LATIN CAPITAL LETTER A
        "\x42"             => "\x62",                     # LATIN CAPITAL LETTER B
        "\x43"             => "\x63",                     # LATIN CAPITAL LETTER C
        "\x44"             => "\x64",                     # LATIN CAPITAL LETTER D
        "\x45"             => "\x65",                     # LATIN CAPITAL LETTER E
        "\x46"             => "\x66",                     # LATIN CAPITAL LETTER F
        "\x47"             => "\x67",                     # LATIN CAPITAL LETTER G
        "\x48"             => "\x68",                     # LATIN CAPITAL LETTER H
        "\x49"             => "\x69",                     # LATIN CAPITAL LETTER I
        "\x4A"             => "\x6A",                     # LATIN CAPITAL LETTER J
        "\x4B"             => "\x6B",                     # LATIN CAPITAL LETTER K
        "\x4C"             => "\x6C",                     # LATIN CAPITAL LETTER L
        "\x4D"             => "\x6D",                     # LATIN CAPITAL LETTER M
        "\x4E"             => "\x6E",                     # LATIN CAPITAL LETTER N
        "\x4F"             => "\x6F",                     # LATIN CAPITAL LETTER O
        "\x50"             => "\x70",                     # LATIN CAPITAL LETTER P
        "\x51"             => "\x71",                     # LATIN CAPITAL LETTER Q
        "\x52"             => "\x72",                     # LATIN CAPITAL LETTER R
        "\x53"             => "\x73",                     # LATIN CAPITAL LETTER S
        "\x54"             => "\x74",                     # LATIN CAPITAL LETTER T
        "\x55"             => "\x75",                     # LATIN CAPITAL LETTER U
        "\x56"             => "\x76",                     # LATIN CAPITAL LETTER V
        "\x57"             => "\x77",                     # LATIN CAPITAL LETTER W
        "\x58"             => "\x78",                     # LATIN CAPITAL LETTER X
        "\x59"             => "\x79",                     # LATIN CAPITAL LETTER Y
        "\x5A"             => "\x7A",                     # LATIN CAPITAL LETTER Z
        "\xC2\xB5"         => "\xCE\xBC",                 # MICRO SIGN
        "\xC3\x80"         => "\xC3\xA0",                 # LATIN CAPITAL LETTER A WITH GRAVE
        "\xC3\x81"         => "\xC3\xA1",                 # LATIN CAPITAL LETTER A WITH ACUTE
        "\xC3\x82"         => "\xC3\xA2",                 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX
        "\xC3\x83"         => "\xC3\xA3",                 # LATIN CAPITAL LETTER A WITH TILDE
        "\xC3\x84"         => "\xC3\xA4",                 # LATIN CAPITAL LETTER A WITH DIAERESIS
        "\xC3\x85"         => "\xC3\xA5",                 # LATIN CAPITAL LETTER A WITH RING ABOVE
        "\xC3\x86"         => "\xC3\xA6",                 # LATIN CAPITAL LETTER AE
        "\xC3\x87"         => "\xC3\xA7",                 # LATIN CAPITAL LETTER C WITH CEDILLA
        "\xC3\x88"         => "\xC3\xA8",                 # LATIN CAPITAL LETTER E WITH GRAVE
        "\xC3\x89"         => "\xC3\xA9",                 # LATIN CAPITAL LETTER E WITH ACUTE
        "\xC3\x8A"         => "\xC3\xAA",                 # LATIN CAPITAL LETTER E WITH CIRCUMFLEX
        "\xC3\x8B"         => "\xC3\xAB",                 # LATIN CAPITAL LETTER E WITH DIAERESIS
        "\xC3\x8C"         => "\xC3\xAC",                 # LATIN CAPITAL LETTER I WITH GRAVE
        "\xC3\x8D"         => "\xC3\xAD",                 # LATIN CAPITAL LETTER I WITH ACUTE
        "\xC3\x8E"         => "\xC3\xAE",                 # LATIN CAPITAL LETTER I WITH CIRCUMFLEX
        "\xC3\x8F"         => "\xC3\xAF",                 # LATIN CAPITAL LETTER I WITH DIAERESIS
        "\xC3\x90"         => "\xC3\xB0",                 # LATIN CAPITAL LETTER ETH
        "\xC3\x91"         => "\xC3\xB1",                 # LATIN CAPITAL LETTER N WITH TILDE
        "\xC3\x92"         => "\xC3\xB2",                 # LATIN CAPITAL LETTER O WITH GRAVE
        "\xC3\x93"         => "\xC3\xB3",                 # LATIN CAPITAL LETTER O WITH ACUTE
        "\xC3\x94"         => "\xC3\xB4",                 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX
        "\xC3\x95"         => "\xC3\xB5",                 # LATIN CAPITAL LETTER O WITH TILDE
        "\xC3\x96"         => "\xC3\xB6",                 # LATIN CAPITAL LETTER O WITH DIAERESIS
        "\xC3\x98"         => "\xC3\xB8",                 # LATIN CAPITAL LETTER O WITH STROKE
        "\xC3\x99"         => "\xC3\xB9",                 # LATIN CAPITAL LETTER U WITH GRAVE
        "\xC3\x9A"         => "\xC3\xBA",                 # LATIN CAPITAL LETTER U WITH ACUTE
        "\xC3\x9B"         => "\xC3\xBB",                 # LATIN CAPITAL LETTER U WITH CIRCUMFLEX
        "\xC3\x9C"         => "\xC3\xBC",                 # LATIN CAPITAL LETTER U WITH DIAERESIS
        "\xC3\x9D"         => "\xC3\xBD",                 # LATIN CAPITAL LETTER Y WITH ACUTE



( run in 1.273 second using v1.01-cache-2.11-cpan-39bf76dae61 )