Unicode-Tussle

 view release on metacpan or  search on metacpan

script/hantest  view on Meta::CPAN

            push @namelist, $_;
        }
    }
    printf "Korean(int) %s is %s.\n", $hangul_string, lc join("-", @namelist);
}

banner_paragraph(Japanese => $japanese);

if (SHOW_UNIHAN) {
    char_inform for $japanese =~ /[^\0-\x7f]/g;
    print "\n";
}

for my $cjk_string ($japanese =~ /\p{Han}+/g) {
    printf "CJK %s is %s in Japanese.\n", $cjk_string, said(JapaneseOn => $cjk_string);
}
print "\n";

for my $kanji_string ($japanese =~ /[\p{Han}\p{Kana}\p{InKatakana}\p{InHiragana}]+/g) {
    printf "Japanese %s is %s.\n", $kanji_string, romaji($kanji_string);
}


exit;

sub entrapment() {
    $SIG{__DIE__} = sub {
        croak "\n$0: death trap caught exception: @_" unless $^S;
    };
    $SIG{__WARN__} = sub {
        # dunno why, but carp and cluck do the same thing here
        my $kvetch = "\n$0: warn trap caught warning: @_";
        warn $kvetch;
        die "fatal(ized) warning";
    };
}

sub banner {
    say "\n====@_====\n";
}


sub tabbed_sizing {
    my ($self, $cols, $pre, $spc, $str) = @_;
    my $spcstr = $spc.$str;
    while ($spcstr =~ s/^( *)(\t+)//) {
        $cols += length($1);
        $cols += length($2) * 8 - $cols % 8;
    }
    $cols += $self->strsize(0, '', '', $spcstr);
    return $cols;
};

sub banner_paragraph($$) {
    my ($name, $text) = @_;
    banner(uc $name);
    wrap_paragraph($text);
}


UNITCHECK {

### Public Configuration Attributes (unused variable!!)
state $LB_default_config = {
    BreakIndent => 'YES',
    CharactersMax => 998,
    ColumnsMin => 0,
    ColumnsMax => 76,
    ComplexBreaking => 'YES',
    Context => 'NONEASTASIAN',
    Format => "SIMPLE",
    HangulAsAL => 'NO',
    LegacyCM => 'YES',
    Newline => "\n",
    SizingMethod => 'UAX11',
    TailorEA => [],
    TailorLB => [],
    UrgentBreaking => undef,
    UserBreaking => [],
};

state $formatter = new Unicode::LineBreak (
# makes for fewer linebreaks on this dataset:
    Context => "NONEASTASIAN",      # EASTASIAN, NONEATSIAN
    ColumnsMax => 72,
    Format => "SIMPLE",             # SIMPLE, NEWLINE, TRIM
    HangulAsAL => "YES",
    SizingMethod    => \&tabbed_sizing,  # for tab handling
    TailorLB => [
        ord("\t") => LB_SP,
        LEFT_QUOTES()  => LB_OP,
        RIGHT_QUOTES() => LB_CL,
    ],
);

sub wrap_line($) {
    my($text) = @_;
    $formatter->config(Newline => ("\n" . " " x 4));
    say $formatter->break($text);
}

sub wrap_paragraph($) {
    my ($text) = @_;
    $formatter->config(Newline => "\n");

    for (split /\R{2,}/, $text) {
        s/(?:(?![\N{NO-BREAK SPACE}\t])\p{White_Space})+/ /g;
        s/^\s+//;
        s/\s+$//;
        say $formatter->break($_), "\n";
    }

}

} # end UNITCHECK

UNITCHECK {

state $uh = new Unicode::Unihan;

sub char_inform(_) {

    state $seen = { };

    my $string = shift;
    for my $char ( split //, $string ) {
        # next if $seen->{$char}++;
        my $ci = charinfo(ord $char);
        my $name   = $ci->{name};
        my $script = $ci->{script};
        my $cat = $ci->{category};

        my $gcs = Unicode::GCString->new($char);
        my $columns = $gcs->columns();
        #next unless $columns == 2;

        printf " %s%s U+%04X %2s", $char, " " x (2 - $columns), ord($char), $cat;
        printf " %-6s %s\n", $script, $name;

        for my $lang (@langs) {
            my @data = $uh->$lang($char);
            next unless @data && $data[0];
            # dumb thing doesn't have the utf8 flag on
            printf "  %-12s %s\n", $lang, join(", ", map { utf8 } @data);
        }
    }
}


sub said($$) {
    my ($lang, $string) = @_;
    my @retlist = ();
    for my $char ( split //, $string ) {
        my @data = $uh->$lang($char);
        next unless @data && $data[0];
        my $best = lc utf8($data[0]);
        if ($best =~ /\d/) {
            $best = apply_tones($lang, $best);
        }
        for ($best) {
            s/\h.*//;
        }
        push @retlist, $best;
    }
    return join(" ", @retlist);
}

}  # end UNITCHECK


sub apply_tones($$) {
    my ($lang, $string) = @_;

    return $string unless $string =~ / \d \b /x;

    state $mandarin_tones = {
    # don't use COMBINING TONE MARKs because they don't evaporate when NFC'd
        1 => "\N{COMBINING MACRON}",            # 1 is macron 青 qīng qing1
        2 => "\N{COMBINING ACUTE ACCENT}",      # 2 is acute  藍 lán  lan2
        3 => "\N{COMBINING CARON}",             # 3 is caron  满 mǎn  man3
        4 => "\N{COMBINING GRAVE ACCENT}",      # 4 is grave  綠 lǜ   lü4
        5 => "",   # tone 5 doesn't transliterate
    };

    state $cantonese_supers = {
        1 => "\N{SUPERSCRIPT ONE}",
        2 => "\N{SUPERSCRIPT TWO}",
        3 => "\N{SUPERSCRIPT THREE}",
        4 => "\N{SUPERSCRIPT FOUR}",
        5 => "\N{SUPERSCRIPT FIVE}",
        6 => "\N{SUPERSCRIPT SIX}",
        7 => "\N{SUPERSCRIPT SEVEN}",
        8 => "\N{SUPERSCRIPT EIGHT}",
        9 => "\N{SUPERSCRIPT NINE}",
    };

    state $cantonese_tones = {
        1 => "\N{MODIFIER LETTER EXTRA-HIGH TONE BAR}",                                  # ˥
        2 => "\N{MODIFIER LETTER MID TONE BAR}\N{MODIFIER LETTER EXTRA-HIGH TONE BAR}",  # ˧˥
        3 => "\N{MODIFIER LETTER MID TONE BAR}",                                         # ˧
        4 => "\N{MODIFIER LETTER LOW TONE BAR}\N{MODIFIER LETTER EXTRA-LOW TONE BAR}",   # ˨˩
        5 => "\N{MODIFIER LETTER EXTRA-LOW TONE BAR}\N{MODIFIER LETTER MID TONE BAR}",   # ˩˧
        6 => "\N{MODIFIER LETTER LOW TONE BAR}",                                         # ˨
        7 => "\N{MODIFIER LETTER EXTRA-HIGH TONE BAR}",                                  # ˥
        8 => "\N{MODIFIER LETTER MID TONE BAR}",                                         # ˧
        9 => "\N{MODIFIER LETTER LOW TONE BAR}",                                         # ˨
    };

    my $tones = undef;

### Something is broken with given() here
###    given ($lang) {
###        when ("Mandarin")  { $tones = $mandarin_tones  }
###        when ("Cantonese") { $tones = $cantonese_tones }
###        default            { die "unexpected language" }
###    }

    if ($lang eq "Cantonese") {
        my ($tones, $supers) = ($string, $string);
        $tones  =~ s/(\d)\b/$cantonese_tones->{$1}/g;
        $supers =~ s/(\d)\b/$cantonese_supers->{$1}/g;
        return "$supers/$tones";
    }

    if ($lang ne "Mandarin") {
        die "unknown tone language $lang";
    }



( run in 3.142 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )