unicode results from the CPAN

unicode

Aozora2Epub

view release on metacpan or search on metacpan

lib/Aozora2Epub/XHTML.pm view on Meta::CPAN

    # Katakana "fu" with a handakuten (semi-voiced mark): on Kindle it looks
    # like two characters. Can something be done about this?
    return if $men == 1 && $ku == 6 && $ten == 88;

    # The Kindle font's glyphs for these characters are broken.
    return if $men == 1 && $ku == 90 && $ten == 61;
    return if $men == 2 && $ku == 15 && $ten == 73;
    return jisx0213_to_utf8($men, $ku, $ten);
}

# Unicode code points whose glyphs are broken in the Kindle font.
# Keys are the numeric code points; every value is simply 1 (set membership).
our %kindle_broken_font_unicode = (
    0x2152 => 1,    # VULGAR FRACTION ONE TENTH
    0x2189 => 1,    # VULGAR FRACTION ZERO THIRDS
    0x26bd => 1,    # SOCCER BALL
    0x26be => 1,    # BASEBALL
    0x3244 => 1,    # CIRCLED IDEOGRAPH QUESTION
);

# Code points above U+FFFF that are still allowed through: everything else in
# that range is rejected by kindle_unicode_hex2chr. Keys are the numeric code
# points; every value is simply 1 (set membership).
our %kindle_ok_font_over0xffff;
$kindle_ok_font_over0xffff{$_} = 1 for (
    0x20d58, 0x20e97, 0x20ed7, 0x210e4, 0x2124f, 0x2296b,
    0x22d07, 0x22e42, 0x22feb, 0x233fe, 0x23cbe, 0x249ad,
    0x24e04, 0x24ff2, 0x2546e, 0x2567f, 0x259cc, 0x2688a,
    0x279b4, 0x280e9, 0x28e17, 0x29170, 0x2a2b2,
);

# Convert a hexadecimal code point string (e.g. "62BE", without the "U+"
# prefix) into the corresponding character.
#
# Returns nothing (undef in scalar context, empty list in list context) when:
#   * the argument is not a non-empty string of hexadecimal digits,
#   * the code point is listed in %kindle_broken_font_unicode, or
#   * the code point is above U+FFFF and is not listed in
#     %kindle_ok_font_over0xffff.
sub kindle_unicode_hex2chr {
    my $unicode_hex = shift;

    # hex() merely warns on malformed input and decodes whatever valid prefix
    # it finds (possibly 0, i.e. a NUL character), so reject anything that is
    # not purely hexadecimal before converting.
    return unless defined $unicode_hex && $unicode_hex =~ /\A[0-9A-Fa-f]+\z/;

    my $unicode = hex($unicode_hex);

    return if $kindle_broken_font_unicode{$unicode};

    # The Kindle font has almost no glyphs available in this range.
    return if $unicode > 0xffff && !$kindle_ok_font_over0xffff{$unicode};

    return chr($unicode);
}

# Resolve a single gaiji reference to a character.
#
# Parameters:
#   $unicode        - hexadecimal code point string; when present it takes
#                     precedence over the JIS coordinates
#   $men, $ku, $ten - JIS X 0213 coordinates, numified and passed to
#                     kindle_jis2chr when no $unicode is given
#
# Returns the character, or nothing (undef / empty list) when the helper
# declines to produce one.
sub _conv_gaiji_title_author {
    my ($unicode, $men, $ku, $ten) = @_;

    # Test definedness and length rather than truthiness, so that neither the
    # hex string "0" nor the resulting character "0" is mistaken for
    # "no value".
    my $ch = (defined($unicode) && length($unicode))
        ? kindle_unicode_hex2chr($unicode)
        : kindle_jis2chr(0+$men, 0+$ku, 0+$ten);

    return $ch if defined($ch) && length($ch);
    return;
}

sub conv_gaiji_title_author {
    my $s = shift;

lib/Aozora2Epub/XHTML.pm view on Meta::CPAN

            $style =~ s/margin-right/margin-bottom/sg;
            $div->attr('style', $style);
        })
        ->process('span.notes', sub {
            my $span = shift;
            my $note = $span->as_text;
            return unless $note =~ m{ï¼»ï¼ƒ[^\ï¼½]+?ã€([^\ï¼½]+)ï¼½};
            my $desc = $1;
            my $ch = do {
                if ($desc =~ /U\+([A-fa-f0-9]+)/) {
                    kindle_unicode_hex2chr($1);
                } elsif ($desc =~ /ç¬¬\dæ°´æº–(\d)-(\d+)-(\d+)/) {
                    kindle_jis2chr(0+$1, 0+$2, 0+$3);
                }
            };
            return unless $ch;

            # find nearest â€» and replace it to $ch
            my $left = $span->left;
            unless ($left->isa('HTML::Element')) {
                if ($left =~ s/â€»$/$ch/) {

t/gaiji-replace.t view on Meta::CPAN

use utf8;
use Test::More;
use Test::Base;
use Aozora2Epub;
use Aozora2Epub::Gensym;
use lib qw/./;
use t::Util;

# The run { } callback below makes exactly one assertion per data block.
# Multiplying by 1 evaluates `blocks` in scalar context (presumably yielding
# the number of blocks in the __DATA__ section -- confirm against Test::Base).
plan tests => 1 * blocks;

# Filter for the "expected" sections: expand every literal "\x{HEX}" escape in
# the given text into the character with that code point. All other text is
# returned unchanged.
sub eval_unicode_notation {
    my ($text) = @_;
    $text =~ s/\\x\{([0-9a-fA-F]+)\}/chr hex $1/ge;
    return $text;
}

# Per-section filters: strip the trailing newline from both sections, and
# additionally expand "\x{HEX}" escapes in the expected output.
filters({
    html     => 'chomp',
    expected => [qw(chomp eval_unicode_notation)],
});

# For every data block: reset the gensym counter, convert the block's HTML
# without fetching assets, concatenate the HTML of all resulting files and
# compare it with the block's expected output.
run {
    my ($case) = @_;
    Aozora2Epub::Gensym->reset_counter;

    my $doc   = Aozora2Epub->new($case->html, no_fetch_assets => 1);
    my @pages = map { $_->as_html } @{ $doc->files };
    my $html  = join('', @pages);
    is_deeply($html, $case->expected, $case->name);
};

__DATA__

=== simple unicode
--- html
※<span class="notes">［＃「てへん＋去」、U+62BE、369-2］</span>
--- expected
\x{62be}

=== non gaiji note
--- html
あああ<span class="notes">［＃ あああはママ］</span>
--- expected
あああ<span class="notes">［＃ あああはママ］</span>

t/gaiji-replace.t view on Meta::CPAN

<img src="../../../gaiji/2-15/2-15-73.png" />
--- expected
<img src="../gaiji/2-15/2-15-73.png" />

=== kindle font broken jis 3
--- html
<img src="../../../gaiji/1-06/1-06-88.png" />
--- expected
<img src="../gaiji/1-06/1-06-88.png" />

=== kindle font broken unicode
--- html
※<span class="notes">［＃「あああ」、U+2152、369-2］</span>
--- expected
※<span class="notes">［＃「あああ」、U+2152、369-2］</span>

=== kindle font broken unicode 2
--- html
※<span class="notes">［＃「あああ」、U+2189、369-2］</span>
--- expected
※<span class="notes">［＃「あああ」、U+2189、369-2］</span>

=== kindle font broken unicode 3
--- html
※<span class="notes">［＃「あああ」、U+26BD、369-2］</span>
--- expected
※<span class="notes">［＃「あああ」、U+26BD、369-2］</span>

=== kindle font broken unicode 4
--- html
※<span class="notes">［＃「あああ」、U+26BE、369-2］</span>
--- expected
※<span class="notes">［＃「あああ」、U+26BE、369-2］</span>

=== kindle font broken unicode 5
--- html
※<span class="notes">［＃「あああ」、U+3244、369-2］</span>
--- expected
※<span class="notes">［＃「あああ」、U+3244、369-2］</span>

=== kindle font broken unicode over 0xffff
--- html
※<span class="notes">［＃「あああ」、U+1F130、369-2］</span>
--- expected
※<span class="notes">［＃「あああ」、U+1F130、369-2］</span>

=== kindle font broken unicode over 0xffff but ok
--- html
※<span class="notes">［＃「あああ」、U+2a2b2、369-2］</span>
--- expected
\x{2a2b2}

t/gaiji-title-author.t view on Meta::CPAN

use warnings;
use utf8;
use Test::More;
use Test::Base;
use Aozora2Epub::XHTML;
use lib qw/./;
use t::Util;

# The run { } callback below makes exactly one assertion per data block.
# Multiplying by 1 evaluates `blocks` in scalar context (presumably yielding
# the number of blocks in the __DATA__ section -- confirm against Test::Base).
plan tests => 1 * blocks;

# Filter for the "expected" sections: replace each literal "\x{HEX}" escape in
# the given text with the character it denotes and return the result.
sub eval_unicode_notation {
    my ($text) = @_;
    $text =~ s/\\x\{([0-9a-fA-F]+)\}/chr hex $1/ge;
    return $text;
}

# Per-section filters: strip the trailing newline from both sections, and
# additionally expand "\x{HEX}" escapes in the expected output.
filters({
    input    => 'chomp',
    expected => [qw(chomp eval_unicode_notation)],
});

# For every data block: convert the gaiji annotations in the block's input
# string and compare the result with the block's expected output.
run {
    my ($case) = @_;

    my $converted = Aozora2Epub::XHTML::conv_gaiji_title_author($case->input);
    is($converted, $case->expected, $case->name);
};

__DATA__

t/gaiji-title-author.t view on Meta::CPAN

æŒã€€â€»ï¼»ï¼ƒãƒãƒ¼ãƒžæ•°å—1ã€1-13-21ï¼½ãƒ»â€»ï¼»ï¼ƒãƒãƒ¼ãƒžæ•°å—2ã€1-13-22ï¼½
--- expected
æŒã€€\x{2160}ãƒ»\x{2161}

=== not kome
--- input
（２［＃「２」はローマ数字、1-13-22］）
--- expected
（\x{2161}）

=== unicode
--- input
たま※［＃「ころもへん＋攀」、U+897B］
--- expected
たま\x{897b}

=== unicode bad font
--- input
å¤±â€»ï¼»ï¼ƒã€ŒäººãŒã—ã‚‰ï¼äºŒï¼å¿ƒã€ã€U+2B779ã€è¡¨ç´™ï¼½è¡“è¬›ç¾©
--- expected
å¤±â€»ï¼»ï¼ƒã€ŒäººãŒã—ã‚‰ï¼äºŒï¼å¿ƒã€ã€U+2B779ã€è¡¨ç´™ï¼½è¡“è¬›ç¾©
--- note
2b779ã¯kindleã ã¨è±†è…ã«ãªã‚‹

=== no chuuki
--- input
あいうえお

( run in 0.323 second using v1.01-cache-2.11-cpan-88abd93f124 )