Aozora2Epub

 view release on metacpan or  search on metacpan

lib/Aozora2Epub/XHTML.pm  view on Meta::CPAN

    return;
}

# Rewrite note-style character descriptions inside a title/author string.
#
#   $text - the string to convert; any false value (undef, '', '0') is
#           handed back untouched.
#
# Every match of the pattern below is passed, via capture groups 3 to 6,
# to _conv_gaiji_title_author().  When that helper returns a true value
# the match is replaced by it; otherwise the matched text is kept as is.
# Returns the resulting string.
sub conv_gaiji_title_author {
    my ($text) = @_;
    return $text unless $text;

    $text =~ s{(.[#[^、]]*、(U\+([A-Fa-f0-9]+)|.*?(\d)-(\d+)-(\d+)).*?])}
              {
                  # keep the whole match so it can be restored when the
                  # helper cannot produce a replacement
                  my $matched = $1;
                  _conv_gaiji_title_author($3, $4, $5, $6) || $matched;
              }esg;

    return $text;
}

# Constructor: fetch the document at $url and build an object from it.
#
#   $class - class to instantiate
#   $url   - address handed to http_get()
#
# The URL with a trailing "<name>.html" component stripped is passed to
# new_from_string() as an extra argument after the fetched content.
sub new {
    my $class = shift;
    my ($url) = @_;

    (my $base = $url) =~ s{[^/]+\.html$}{}s;

    return $class->new_from_string(http_get($url), $base);
}

# Constructor: build an object from HTML text that is already in memory.
#
#   $class - class to instantiate
#   $html  - document text, stored under the 'raw_content' key
#
# Arguments after $html are not used here.  process_doc() is invoked on
# the new object before it is returned.
sub new_from_string {
    my $class = shift;
    my ($html) = @_;

    my %state = (raw_content => $html);
    my $self  = bless \%state, $class;
    $self->process_doc;

    return $self;
}

# Normalise one heading element in place (registered for h1 .. h5).
#
#   $h - the heading element; it must provide the HTML::Element methods
#        used below (find_by_tag_name, attr, parent, ...).
#
# Steps:
#   1. <hx><a id="xxx">ttt</a></hx> becomes <hx id="xxx">ttt</hx>.
#   2. A heading that still has no id receives one from gensym().
#   3. <div class="jisage_N" style="margin-left: nn"><hx> becomes
#      <hx style="text-indent: nn">: the wrapping div is replaced by
#      the heading itself.
sub _process_header {
    my $h = shift;

    # <hx><a id="xxx">ttt</a></hx> to <hx id="xxx">ttt</hx>
    # where hx is h1 h2 h3, h4, h5, etc
    my $anchor = $h->find_by_tag_name('a');
    if ($anchor) {
        my $id = $anchor->attr('id');
        # attr() with an undef value removes the attribute, which would
        # throw away an id the heading already has; only copy a real id.
        $h->attr('id', $id) if defined $id;
        $anchor->replace_with($anchor->content_list);
    }
    $h->attr('id') or $h->attr('id', gensym());

    # <div class="jisage_*" style="margin-left: nn"><hx> to <hx style="text-indent: nn">
    # where hx is h3, h4, h5, etc
    my $parent = $h->parent;
    if ($parent && $parent->isa('HTML::Element')
        # tag() with an argument is a setter (it would rename the parent
        # to "div" and always be true), so compare the current tag instead.
        && $parent->tag eq 'div'
        && $parent->attr('class')
        && $parent->attr('class') =~ m{jisage_\d+}) {
        my $indent = $parent->attr('style');
        $indent = '' unless defined $indent;   # div without a style attribute
        $indent =~ s{margin-left:}{text-indent:};

        my $own_style = $h->attr('style');
        $indent .= " " . $own_style if $own_style;

        # do not create an empty style attribute when there is nothing to carry over
        $h->attr('style', $indent) if length $indent;
        $parent->replace_with($h);
    }
}

# Rewrite one <img> element in place.
#
#   $img - the image element
#
# Images whose src ends in "/gaiji/D-DD/D-DD-DD.png":
#   * if kindle_jis2chr() yields a true value for the three numbers in the
#     file name, the element is replaced by that value and nothing is
#     returned;
#   * otherwise src becomes "../gaiji/..." and the original src is returned.
# Any other image: src becomes "../images/<src>".  When the image is
# directly followed by <br> and then <span class="...caption...">, the br
# is dropped and image plus caption are wrapped as <figure>, the span
# being retagged <figcaption>.  The original src is returned.
sub _process_img {
    my ($img) = @_;

    my $src = $img->attr('src');

    if ($src =~ m{/(gaiji/\d-\d+/(\d)-(\d\d)-(\d\d)\.png)$}) {
        my ($rel_path, $n1, $n2, $n3) = ($1, $2, $3, $4);
        my $ch = kindle_jis2chr(0+$n1, 0+$n2, 0+$n3);
        if ($ch) {
            $img->replace_with($ch);
            return;
        }
        $img->attr('src', "../$rel_path");
        return $src;
    }

    # normal image
    $img->attr('src', "../images/$src");

    # a caption is only recognised as: <img> <br> <span class="caption">
    my $br = $img->right;
    my $followed_by_br = $br && $br->isa('HTML::Element') && $br->tag eq 'br';
    return $src unless $followed_by_br;

    my $caption = $br->right;
    if ($caption
        && $caption->isa('HTML::Element')
        && $caption->tag eq 'span'
        && $caption->attr('class') =~ /caption/) {
        $br->detach;
        $caption->detach;
        $caption->tag('figcaption');
        $img->replace_with(['figure', $img, $caption]);
    }
    return $src;
}

# Tell whether a content-list item is blank filler.
#
#   $elem - an HTML::Element object or a plain text string
#
# Returns true for a <br> element and for a string consisting only of
# whitespace; false for everything else, including undef and ''.
sub _is_empty {
    my $elem = shift;

    # Only call methods on references: ->isa on a plain string treats the
    # text as a class name and dies for an empty string or undef.
    if (ref($elem) && $elem->isa('HTML::Element')) {
        return $elem->tag eq 'br';
    }
    return unless defined $elem;
    return $elem =~ /^\s+$/s;
}

# Serialise a list of content items into one HTML string.
#
#   @_ - HTML::Element objects and/or plain text strings
#
# Elements are rendered with as_HTML('<>&', undef, {}).  Strings are
# appended after removing one leading and one trailing space.  Undefined
# items are skipped.  Returns '' for an empty list.
sub _list_as_html {
    # work on a copy: the loop variable aliases the array elements and the
    # substitutions below must not modify the caller's values
    my @c = @_;

    my $res = '';
    for my $c (@c) {
        next unless defined $c;
        # Only call methods on references: ->isa on a plain string treats
        # the text as a class name and dies for an empty string.
        if (ref($c) && $c->isa('HTML::Element')) {
            $res .= $c->as_HTML('<>&', undef, {});
            next;
        }
        $c =~ s/^ //;
        $c =~ s/ $//;
        $res .= $c;
    }
    return $res;
}

# Turn a bibliographic-information <div> into an HTML string.
#
#   $div - the element to process
#
# All <hr> elements found through find_by_tag_name() are detached, then
# items for which _is_empty() is true are trimmed from both ends of the
# content list, and the remainder is serialised with _list_as_html().
sub _process_bibinfo {
    my ($div) = @_;

    for my $hr ($div->find_by_tag_name('hr')) {
        $hr->detach;
    }

    my @items = $div->content_list;
    shift @items while @items && _is_empty($items[0]);
    pop @items   while @items && _is_empty($items[-1]);

    return _list_as_html(@items);
}

lib/Aozora2Epub/XHTML.pm  view on Meta::CPAN

            my $orig_src = _process_img($img);
            $orig_src and push @images, $orig_src;
        })
        ->process('//div[contains(@style, "width")]', => sub {
            my $div = shift;
            my $style = $div->attr('style');
            $style =~ s/(?<![-\w])width:/height:/sg;
            $div->attr('style', $style);
        })
        ->process('h1', \&_process_header)
        ->process('h2', \&_process_header)
        ->process('h3', \&_process_header)
        ->process('h4', \&_process_header)
        ->process('h5', \&_process_header)
        ->process('//div[contains(@style, "margin")]', => sub {
            my $div = shift;
            my $style = $div->attr('style');
            $style =~ s/margin-left/margin-top/sg;
            $style =~ s/margin-right/margin-bottom/sg;
            $div->attr('style', $style);
        })
        ->process('span.notes', sub {
            my $span = shift;
            my $note = $span->as_text;
            return unless $note =~ m{[#[^\]]+?、([^\]]+)]};
            my $desc = $1;
            my $ch = do {
                if ($desc =~ /U\+([A-fa-f0-9]+)/) {
                    kindle_unicode_hex2chr($1);
                } elsif ($desc =~ /第\d水準(\d)-(\d+)-(\d+)/) {
                    kindle_jis2chr(0+$1, 0+$2, 0+$3);
                }
            };
            return unless $ch;

            # find nearest ※ and replace it to $ch
            my $left = $span->left;
            unless ($left->isa('HTML::Element')) {
                if ($left =~ s/※$/$ch/) {
                    $span->parent->splice_content($span->pindex - 1, 2, $left);
                }
                return;
            }
            if ($left->tag eq 'ruby') {
                my $rb = $left->find_by_tag_name('rb');
                my $s = $rb->as_text;
                if ($s =~ s/※/$ch/) {
                    $rb->replace_with(HTML::Element->new_from_lol([rb => $s]));
                    $span->delete;
                }
                return;
            }
        })
        ->as_list;

    # 先頭の<br/>の連続は削除
    while ($contents[0] && _is_empty($contents[0])) { shift @contents; };

    my (@gaiji, @fig);
    for my $path (@images) {
        if ($path =~ m{gaiji/(.+\.png)$}) {
            push @gaiji, $1;
        } else {
            push @fig, $path;
        }
    }
    $self->title(conv_gaiji_title_author($title));
    $self->subtitle(conv_gaiji_title_author($subtitle));
    $self->author(conv_gaiji_title_author($author));
    $self->contents(\@contents);
    $self->bib_info($bib_info || '');
    $self->notation_notes($notation_notes || '');
    $self->gaiji(\@gaiji);
    $self->fig(\@fig);
}

# Tell whether an item is a note element: a <span> whose class attribute
# contains "notes".
#
#   $elem - an HTML::Element object or a plain text string
#
# Returns a true value only for such a span; plain strings (including '')
# and undef yield a false value.
sub _is_chuuki {
    my $elem = shift;
    # ref() first: ->isa on a plain string treats the text as a class name
    # and dies for an empty string or undef.
    return ref($elem)
           && $elem->isa('HTML::Element')
           && $elem->tag eq 'span'
           && $elem->attr('class') && $elem->attr('class') =~ /notes/;
}

# True when the item is a note element (see _is_chuuki) whose text
# contains "#改丁" or "#改ページ".
sub _is_pagebreak {
    my ($node) = @_;

    my $is_note = _is_chuuki($node);
    return $is_note && $node->as_text =~ /#改丁|#改ページ/;
}

# True when the item is a note element (see _is_chuuki) whose text
# contains "#ページの左右中央".
sub _is_center_chuuki {
    my ($node) = @_;

    my $is_note = _is_chuuki($node);
    return $is_note && $node->as_text =~ /#ページの左右中央/;
}

sub split {
    my $self = shift;

    # ファイルを分割
    # <br/>*<h[123]>* / [#改ページ] / [#改丁]
    my @cur;
    my @files;
    my @contents = @{$self->contents};
    while (my $c = shift @contents) {
        unless ($c->isa('HTML::Element')) {
            push @cur, $c;
            next;
        }
        if (_is_pagebreak($c)) {
            push @files, [@cur] if @cur;
            @cur = ();
            next;
        }
        if ($c->tag =~ m{h[123]}) { # ファイルを区切る
            # 直前の<br/>あるいは空白文字は新しいファイルにいれる
            my @newcur;
            my $last_elem = pop @cur;
            while ($last_elem
                   && (_is_empty($last_elem)
                       || _is_center_chuuki($last_elem))) {
                push @newcur, $last_elem unless _is_center_chuuki($last_elem);
                $last_elem = pop @cur;
            }



( run in 0.562 second using v1.01-cache-2.11-cpan-df04353d9ac )