Unicode-Stringprep
view release on metacpan or search on metacpan
lib/Unicode/Stringprep.pm view on Meta::CPAN
my @collect = ();
sub _set_tables {
my $set = shift;
while(@_) {
my $data = shift;
if(ref($data) eq 'HASH') { _set_tables($set, %{$data}); }
elsif(ref($data) eq 'ARRAY') { _set_tables($set, @{$data}); }
elsif(defined $data){ push @{$set}, [$data,shift || $data] };
}
}
_set_tables(\@collect,@_);
# NB: This destroys @collect as it modifies the anonymous ARRAYs
# referenced in @collect.
# This is harmless as it only modifies ARRAYs after they've been
# inspected.
my @set = ();
foreach my $d (sort { $a->[0]<=>$b->[0] } @collect) {
if(!@set || $set[$#set]->[1]+1 < $d->[0]) {
push @set, $d;
} elsif($set[$#set]->[1] < $d->[1]) {
$set[$#set]->[1] = $d->[1];
}
}
return undef if !@set;
return '['.join('', map {
$_->[0] >= $_->[1]
? sprintf("\\x{%X}", $_->[0])
: sprintf("\\x{%X}-\\x{%X}", @{$_}[0,1])
} @set ).']';
}
## specific functions for individual stringprep steps
##
sub _compile_normalization {
my $unicode_normalization = uc shift;
$unicode_normalization =~ s/^NF//;
return '$string = _NFKC_3_2($string)' if $unicode_normalization eq 'KC';
return undef if !$unicode_normalization;
croak 'Unsupported Unicode normalization (NF)'.$unicode_normalization.'.';
}
my $is_Unassigned = _compile_set(@Unicode::Stringprep::Unassigned::A1);
sub _NFKC_3_2 {
my $string = shift;
## pre-map characters corrected in Corrigendum #4
##
no warnings 'utf8';
$string =~ tr/\x{2F868}\x{2F874}\x{2F91F}\x{2F95F}\x{2F9BF}/\x{2136A}\x{5F33}\x{43AB}\x{7AAE}\x{4D57}/;
## only normalize runs of assigned characters
##
my @s = split m/($is_Unassigned+)/o, $string;
for( my $i = 0; $i <= $#s ; $i+=2 ) { # skips delimiters == is_Unassigned
no warnings 'utf8';
$s[$i] = Unicode::Normalize::NFKC($s[$i]);
}
return join '', @s;
}
sub _check_unassigned {
if( shift =~ m/($is_Unassigned)/os ) {
die sprintf("unassigned character U+%04X",ord($1));
}
}
sub _compile_prohibited {
my $prohibited = _compile_set(@_);
if($prohibited) {
return
'if($string =~ m/('.$prohibited.')/os) {'.
'die sprintf("prohibited character U+%04X",ord($1))'.
'}';
}
}
my $is_RandAL = _compile_set(@Unicode::Stringprep::BiDi::D1);
my $is_L = _compile_set(@Unicode::Stringprep::BiDi::D2);
sub _check_bidi {
my $string = shift;
if($string =~ m/$is_RandAL/os) {
if($string =~ m/$is_L/os) {
die "string contains both RandALCat and LCat characters"
} elsif($string !~ m/^(?:$is_RandAL)/os) {
die "string contains RandALCat character but does not start with one"
} elsif($string !~ m/(?:$is_RandAL)$/os) {
die "string contains RandALCat character but does not end with one"
}
}
}
my $is_Combining = _compile_set( 0x0300,0x0314, 0x0316,0x0319, 0x031C,0x0320,
0x0321,0x0322, 0x0323,0x0326, 0x0327,0x0328, 0x0329,0x0333, 0x0334,0x0338,
0x0339,0x033C, 0x033D,0x0344, 0x0347,0x0349, 0x034A,0x034C, 0x034D,0x034E,
0x0360,0x0361, 0x0363,0x036F, 0x0483,0x0486, 0x0592,0x0595, 0x0597,0x0599,
0x059C,0x05A1, 0x05A3,0x05A7, 0x05A8,0x05A9, 0x05AB,0x05AC, 0x0653,0x0654,
0x06D6,0x06DC, 0x06DF,0x06E2, 0x06E7,0x06E8, 0x06EB,0x06EC, 0x0732,0x0733,
0x0735,0x0736, 0x0737,0x0739, 0x073B,0x073C, 0x073F,0x0741, 0x0749,0x074A,
0x0953,0x0954, 0x0E38,0x0E39, 0x0E48,0x0E4B, 0x0EB8,0x0EB9, 0x0EC8,0x0ECB,
0x0F18,0x0F19, 0x0F7A,0x0F7D, 0x0F82,0x0F83, 0x0F86,0x0F87, 0x20D0,0x20D1,
0x20D2,0x20D3, 0x20D4,0x20D7, 0x20D8,0x20DA, 0x20DB,0x20DC, 0x20E5,0x20E6,
0x302E,0x302F, 0x3099,0x309A, 0xFE20,0xFE23,
0x1D165,0x1D166, 0x1D167,0x1D169, 0x1D16E,0x1D172, 0x1D17B,0x1D182,
0x1D185,0x1D189, 0x1D18A,0x1D18B, 0x1D1AA,0x1D1AD,
map { ($_,$_) } 0x0315, 0x031A, 0x031B, 0x0345, 0x0346, 0x0362, 0x0591,
0x0596, 0x059A, 0x059B, 0x05AA, 0x05AD, 0x05AE, 0x05AF, 0x05B0, 0x05B1,
0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, 0x05B8, 0x05B9, 0x05BB,
0x05BC, 0x05BD, 0x05BF, 0x05C1, 0x05C2, 0x05C4, 0x064B, 0x064C, 0x064D,
0x064E, 0x064F, 0x0650, 0x0651, 0x0652, 0x0655, 0x0670, 0x06E3, 0x06E4,
( run in 1.801 second using v1.01-cache-2.11-cpan-71847e10f99 )