Unicode-Tussle
view release on metacpan or search on metacpan
script/hantest view on Meta::CPAN
push @namelist, $_;
}
}
printf "Korean(int) %s is %s.\n", $hangul_string, lc join("-", @namelist);
}
banner_paragraph(Japanese => $japanese);
if (SHOW_UNIHAN) {
char_inform for $japanese =~ /[^\0-\x7f]/g;
print "\n";
}
for my $cjk_string ($japanese =~ /\p{Han}+/g) {
printf "CJK %s is %s in Japanese.\n", $cjk_string, said(JapaneseOn => $cjk_string);
}
print "\n";
for my $kanji_string ($japanese =~ /[\p{Han}\p{Kana}\p{InKatakana}\p{InHiragana}]+/g) {
printf "Japanese %s is %s.\n", $kanji_string, romaji($kanji_string);
}
exit;
sub entrapment() {
$SIG{__DIE__} = sub {
croak "\n$0: death trap caught exception: @_" unless $^S;
};
$SIG{__WARN__} = sub {
# dunno why, but carp and cluck do the same thing here
my $kvetch = "\n$0: warn trap caught warning: @_";
warn $kvetch;
die "fatal(ized) warning";
};
}
sub banner {
say "\n====@_====\n";
}
sub tabbed_sizing {
my ($self, $cols, $pre, $spc, $str) = @_;
my $spcstr = $spc.$str;
while ($spcstr =~ s/^( *)(\t+)//) {
$cols += length($1);
$cols += length($2) * 8 - $cols % 8;
}
$cols += $self->strsize(0, '', '', $spcstr);
return $cols;
};
sub banner_paragraph($$) {
my ($name, $text) = @_;
banner(uc $name);
wrap_paragraph($text);
}
UNITCHECK {
### Public Configuration Attributes (unused variable!!)
state $LB_default_config = {
BreakIndent => 'YES',
CharactersMax => 998,
ColumnsMin => 0,
ColumnsMax => 76,
ComplexBreaking => 'YES',
Context => 'NONEASTASIAN',
Format => "SIMPLE",
HangulAsAL => 'NO',
LegacyCM => 'YES',
Newline => "\n",
SizingMethod => 'UAX11',
TailorEA => [],
TailorLB => [],
UrgentBreaking => undef,
UserBreaking => [],
};
state $formatter = new Unicode::LineBreak (
# makes for fewer linebreaks on this dataset:
Context => "NONEASTASIAN", # EASTASIAN, NONEATSIAN
ColumnsMax => 72,
Format => "SIMPLE", # SIMPLE, NEWLINE, TRIM
HangulAsAL => "YES",
SizingMethod => \&tabbed_sizing, # for tab handling
TailorLB => [
ord("\t") => LB_SP,
LEFT_QUOTES() => LB_OP,
RIGHT_QUOTES() => LB_CL,
],
);
sub wrap_line($) {
my($text) = @_;
$formatter->config(Newline => ("\n" . " " x 4));
say $formatter->break($text);
}
sub wrap_paragraph($) {
my ($text) = @_;
$formatter->config(Newline => "\n");
for (split /\R{2,}/, $text) {
s/(?:(?![\N{NO-BREAK SPACE}\t])\p{White_Space})+/ /g;
s/^\s+//;
s/\s+$//;
say $formatter->break($_), "\n";
}
}
} # end UNITCHECK
UNITCHECK {
state $uh = new Unicode::Unihan;
sub char_inform(_) {
state $seen = { };
my $string = shift;
for my $char ( split //, $string ) {
# next if $seen->{$char}++;
my $ci = charinfo(ord $char);
my $name = $ci->{name};
my $script = $ci->{script};
my $cat = $ci->{category};
my $gcs = Unicode::GCString->new($char);
my $columns = $gcs->columns();
#next unless $columns == 2;
printf " %s%s U+%04X %2s", $char, " " x (2 - $columns), ord($char), $cat;
printf " %-6s %s\n", $script, $name;
for my $lang (@langs) {
my @data = $uh->$lang($char);
next unless @data && $data[0];
# dumb thing doesn't have the utf8 flag on
printf " %-12s %s\n", $lang, join(", ", map { utf8 } @data);
}
}
}
sub said($$) {
my ($lang, $string) = @_;
my @retlist = ();
for my $char ( split //, $string ) {
my @data = $uh->$lang($char);
next unless @data && $data[0];
my $best = lc utf8($data[0]);
if ($best =~ /\d/) {
$best = apply_tones($lang, $best);
}
for ($best) {
s/\h.*//;
}
push @retlist, $best;
}
return join(" ", @retlist);
}
} # end UNITCHECK
sub apply_tones($$) {
my ($lang, $string) = @_;
return $string unless $string =~ / \d \b /x;
state $mandarin_tones = {
# don't use COMBINING TONE MARKs because they don't evaporate when NFC'd
1 => "\N{COMBINING MACRON}", # 1 is macron é qÄ«ng qing1
2 => "\N{COMBINING ACUTE ACCENT}", # 2 is acute è lán lan2
3 => "\N{COMBINING CARON}", # 3 is caron 满 mÇn man3
4 => "\N{COMBINING GRAVE ACCENT}", # 4 is grave ç¶ lÇ lü4
5 => "", # tone 5 doesn't transliterate
};
state $cantonese_supers = {
1 => "\N{SUPERSCRIPT ONE}",
2 => "\N{SUPERSCRIPT TWO}",
3 => "\N{SUPERSCRIPT THREE}",
4 => "\N{SUPERSCRIPT FOUR}",
5 => "\N{SUPERSCRIPT FIVE}",
6 => "\N{SUPERSCRIPT SIX}",
7 => "\N{SUPERSCRIPT SEVEN}",
8 => "\N{SUPERSCRIPT EIGHT}",
9 => "\N{SUPERSCRIPT NINE}",
};
state $cantonese_tones = {
1 => "\N{MODIFIER LETTER EXTRA-HIGH TONE BAR}", # ˥
2 => "\N{MODIFIER LETTER MID TONE BAR}\N{MODIFIER LETTER EXTRA-HIGH TONE BAR}", # ˧˥
3 => "\N{MODIFIER LETTER MID TONE BAR}", # ˧
4 => "\N{MODIFIER LETTER LOW TONE BAR}\N{MODIFIER LETTER EXTRA-LOW TONE BAR}", # ˨˩
5 => "\N{MODIFIER LETTER EXTRA-LOW TONE BAR}\N{MODIFIER LETTER MID TONE BAR}", # ˩˧
6 => "\N{MODIFIER LETTER LOW TONE BAR}", # ˨
7 => "\N{MODIFIER LETTER EXTRA-HIGH TONE BAR}", # ˥
8 => "\N{MODIFIER LETTER MID TONE BAR}", # ˧
9 => "\N{MODIFIER LETTER LOW TONE BAR}", # ˨
};
my $tones = undef;
### Something is broken with given() here
### given ($lang) {
### when ("Mandarin") { $tones = $mandarin_tones }
### when ("Cantonese") { $tones = $cantonese_tones }
### default { die "unexpected language" }
### }
if ($lang eq "Cantonese") {
my ($tones, $supers) = ($string, $string);
$tones =~ s/(\d)\b/$cantonese_tones->{$1}/g;
$supers =~ s/(\d)\b/$cantonese_supers->{$1}/g;
return "$supers/$tones";
}
if ($lang ne "Mandarin") {
die "unknown tone language $lang";
}
( run in 3.142 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )