Unicode-Stringprep

 view release on metacpan or  search on metacpan

lib/Unicode/Stringprep.pm  view on Meta::CPAN

  my @collect = ();
  sub _set_tables {
    my $set = shift;
    while(@_) {
      my $data = shift;
      if(ref($data) eq 'HASH') { _set_tables($set, %{$data}); }
      elsif(ref($data) eq 'ARRAY') { _set_tables($set, @{$data}); }
      elsif(defined $data){ push @{$set}, [$data,shift || $data] };
    }
  }
  _set_tables(\@collect,@_);

  # NB: This destroys @collect as it modifies the anonymous ARRAYs
  # referenced in @collect.
  # This is harmless as it only modifies ARRAYs after they've been
  # inspected.

  my @set = ();
  foreach my $d (sort { $a->[0]<=>$b->[0] } @collect) {
    if(!@set || $set[$#set]->[1]+1 < $d->[0]) {
      push @set, $d;
    } elsif($set[$#set]->[1] < $d->[1]) {
      $set[$#set]->[1] = $d->[1];
    }
  }

  return undef if !@set;

  return '['.join('', map {
    $_->[0] >= $_->[1]
        ? sprintf("\\x{%X}", $_->[0])
        : sprintf("\\x{%X}-\\x{%X}", @{$_}[0,1])
    } @set ).']';
}

## specific functions for individual stringprep steps
##

sub _compile_normalization {
  my $unicode_normalization = uc shift;
  $unicode_normalization =~ s/^NF//;

  return '$string = _NFKC_3_2($string)' if $unicode_normalization eq 'KC';
  return undef if !$unicode_normalization;

  croak 'Unsupported Unicode normalization (NF)'.$unicode_normalization.'.';
}

my $is_Unassigned = _compile_set(@Unicode::Stringprep::Unassigned::A1);

sub _NFKC_3_2 {
  my $string = shift;

  ## pre-map characters corrected in Corrigendum #4
  ##
  no warnings 'utf8';
  $string =~ tr/\x{2F868}\x{2F874}\x{2F91F}\x{2F95F}\x{2F9BF}/\x{2136A}\x{5F33}\x{43AB}\x{7AAE}\x{4D57}/;

  ## only normalize runs of assigned characters
  ##
  my @s = split m/($is_Unassigned+)/o, $string;

  for( my $i = 0; $i <= $#s ; $i+=2 ) { # skips delimiters == is_Unassigned
    no warnings 'utf8';
    $s[$i] = Unicode::Normalize::NFKC($s[$i]);
  }
  return join '', @s;
}

sub _check_unassigned {
  if( shift =~ m/($is_Unassigned)/os ) {
    die sprintf("unassigned character U+%04X",ord($1));
  }
}

sub _compile_prohibited {
  my $prohibited = _compile_set(@_);

  if($prohibited) {
    return 
      'if($string =~ m/('.$prohibited.')/os) {'.
          'die sprintf("prohibited character U+%04X",ord($1))'.
      '}';
  }
}

my $is_RandAL = _compile_set(@Unicode::Stringprep::BiDi::D1);
my $is_L = _compile_set(@Unicode::Stringprep::BiDi::D2);

sub _check_bidi {
  my $string = shift;

  if($string =~ m/$is_RandAL/os) {
    if($string =~ m/$is_L/os) {
      die "string contains both RandALCat and LCat characters"
    } elsif($string !~ m/^(?:$is_RandAL)/os) {
      die "string contains RandALCat character but does not start with one"
    } elsif($string !~ m/(?:$is_RandAL)$/os) {
      die "string contains RandALCat character but does not end with one"
    }
  }
}

my $is_Combining = _compile_set(  0x0300,0x0314, 0x0316,0x0319, 0x031C,0x0320,
    0x0321,0x0322, 0x0323,0x0326, 0x0327,0x0328, 0x0329,0x0333, 0x0334,0x0338,
    0x0339,0x033C, 0x033D,0x0344, 0x0347,0x0349, 0x034A,0x034C, 0x034D,0x034E,
    0x0360,0x0361, 0x0363,0x036F, 0x0483,0x0486, 0x0592,0x0595, 0x0597,0x0599,
    0x059C,0x05A1, 0x05A3,0x05A7, 0x05A8,0x05A9, 0x05AB,0x05AC, 0x0653,0x0654,
    0x06D6,0x06DC, 0x06DF,0x06E2, 0x06E7,0x06E8, 0x06EB,0x06EC, 0x0732,0x0733,
    0x0735,0x0736, 0x0737,0x0739, 0x073B,0x073C, 0x073F,0x0741, 0x0749,0x074A,
    0x0953,0x0954, 0x0E38,0x0E39, 0x0E48,0x0E4B, 0x0EB8,0x0EB9, 0x0EC8,0x0ECB,
    0x0F18,0x0F19, 0x0F7A,0x0F7D, 0x0F82,0x0F83, 0x0F86,0x0F87, 0x20D0,0x20D1,
    0x20D2,0x20D3, 0x20D4,0x20D7, 0x20D8,0x20DA, 0x20DB,0x20DC, 0x20E5,0x20E6,
    0x302E,0x302F, 0x3099,0x309A, 0xFE20,0xFE23,
    0x1D165,0x1D166, 0x1D167,0x1D169, 0x1D16E,0x1D172, 0x1D17B,0x1D182,
    0x1D185,0x1D189, 0x1D18A,0x1D18B, 0x1D1AA,0x1D1AD, 
    map { ($_,$_) } 0x0315, 0x031A, 0x031B, 0x0345, 0x0346, 0x0362, 0x0591,
    0x0596, 0x059A, 0x059B, 0x05AA, 0x05AD, 0x05AE, 0x05AF, 0x05B0, 0x05B1,
    0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, 0x05B8, 0x05B9, 0x05BB,
    0x05BC, 0x05BD, 0x05BF, 0x05C1, 0x05C2, 0x05C4, 0x064B, 0x064C, 0x064D,
    0x064E, 0x064F, 0x0650, 0x0651, 0x0652, 0x0655, 0x0670, 0x06E3, 0x06E4,



( run in 1.801 second using v1.01-cache-2.11-cpan-71847e10f99 )