Aozora2Epub

 view release on metacpan or  search on metacpan

lib/Aozora2Epub/XHTML.pm  view on Meta::CPAN

package Aozora2Epub::XHTML;
use strict;
use warnings;
use utf8;
use Aozora2Epub::CachedGet qw/http_get/;
use Aozora2Epub::XHTML::Tree;
use Aozora2Epub::Gensym;
use Aozora2Epub::File;
use HTML::Element;
use Encode::JISX0213;
use Encode qw/decode/;
use base qw(Class::Accessor);
__PACKAGE__->mk_accessors(qw/title subtitle author
                             contents
                             bib_info notation_notes gaiji fig/);

our $VERSION = "0.05";

sub jisx0213_to_utf8 {
    my ($men, $ku, $ten) = @_;
    $ku += 0xa0;
    $ten += 0xa0;
    my $euc = join('', ($men == 2 ? chr(0x8f) : ()),
                   chr($ku), chr($ten));
    my $utf8 = decode('euc-jp-2004', $euc);
    return $utf8;
}

sub kindle_jis2chr {
    my ($men, $ku, $ten) = @_;

    # 半濁点付きカタカナ フ kindleだと2文字に見えるのなんとかならんか?
    return if $men == 1 && $ku == 6 && $ten == 88;

    # kindle font of these characters are broken.
    return if $men == 1 && $ku == 90 && $ten == 61;
    return if $men == 2 && $ku == 15 && $ten == 73;
    return jisx0213_to_utf8($men, $ku, $ten);
}

# kindle font of these characters are broken.
our %kindle_broken_font_unicode = map { $_ => 1 } (
    0x2152,
    0x2189,
    0x26bd,
    0x26be,
    0x3244,
);

our %kindle_ok_font_over0xffff = map { $_ => 1 } (
    0x20d58, 0x20e97, 0x20ed7, 0x210e4, 0x2124f, 0x2296b,
    0x22d07, 0x22e42, 0x22feb, 0x233fe, 0x23cbe, 0x249ad,
    0x24e04, 0x24ff2, 0x2546e, 0x2567f, 0x259cc, 0x2688a,
    0x279b4, 0x280e9, 0x28e17, 0x29170, 0x2a2b2,
);

sub kindle_unicode_hex2chr {
    my $unicode_hex = shift;
    my $unicode = hex($unicode_hex);

    return if $kindle_broken_font_unicode{$unicode};

    # kindle font is almost not avaliable in this range.
    return if $unicode > 0xffff && !$kindle_ok_font_over0xffff{$unicode};

    return chr($unicode);
}

sub _conv_gaiji_title_author {
    my ($unicode, $men, $ku, $ten) = @_;
    if ($unicode) {
        my $ch = kindle_unicode_hex2chr($unicode);
        return $ch if $ch;
        return;
    }
    my $ch = kindle_jis2chr(0+$men, 0+$ku, 0+$ten);
    return $ch if $ch;
    return;
}

sub conv_gaiji_title_author {
    my $s = shift;
    return $s unless $s;
    $s =~ s{(.[#[^、]]*、(U\+([A-Fa-f0-9]+)|.*?(\d)-(\d+)-(\d+)).*?])}
           {
               my $all = $1;
               my $ch = _conv_gaiji_title_author($3, $4, $5, $6);
               $ch ? $ch : $all;
           }esg;
    return $s
}

sub new {
    my ($class, $url) = @_;
    my $base = $url;
    $base =~ s{[^/]+\.html$}{}s;
    return $class->new_from_string(http_get($url), $base);
}

sub new_from_string {
    my ($class, $html) = @_;
    my $self = bless { raw_content => $html }, $class;
    $self->process_doc();
    return $self;
}

sub _process_header {
    my $h = shift;

    # <hx><a id="xxx">ttt</a></hx> to <hx id="xxx">ttt</hx>
    # where hx is h1 h2 h3, h4, h5, etc
    my $anchor = $h->find_by_tag_name('a');
    if ($anchor) {



( run in 1.547 second using v1.01-cache-2.11-cpan-39bf76dae61 )