Aozora2Epub

 view release on metacpan or  search on metacpan

lib/Aozora2Epub.pm  view on Meta::CPAN

package Aozora2Epub;
use utf8;
use strict;
use warnings;
use Aozora2Epub::Gensym;
use Aozora2Epub::CachedGet qw/http_get/;
use Aozora2Epub::Epub;
use Aozora2Epub::XHTML;
use Path::Tiny;
use URI;
use HTML::Escape qw/escape_html/;

use base qw(Class::Accessor);
__PACKAGE__->mk_accessors(qw/files title author epub bib_info notation_notes/);

our $VERSION = '0.05';

our $AOZORA_GAIJI_URL = 'https://www.aozora.gr.jp/gaiji/';
our $AOZORA_CARDS_URL = 'https://www.aozora.gr.jp/cards';

sub _base_url {
    my $base = shift;
    $base =~ s{[^/]+\.html$}{}s;
    return $base;
}

sub _get_file {
    my $url_or_path = "" . shift; # force to string.

    if ($url_or_path =~ m{^https?://}) {
        return http_get($url_or_path);
    }
    if ($url_or_path =~ m{\.html$}) {
        return path($url_or_path)->slurp_utf8;
    }
    return path($url_or_path)->slurp_raw;
}

sub _get_content {
    my $xhtml = shift;
    if ($xhtml =~ m{/card\d+\.html$}) { # 図書カード
        unless ($xhtml =~ m{^https?://}) { # $xhtml shuld be \d+/card\d+.html
            $xhtml = "$AOZORA_CARDS_URL/$xhtml";
        }
        my $text = _get_file($xhtml);
        my $tree = Aozora2Epub::XHTML::Tree->new($text);
        my $xhtml_url;
        $tree->process('//a[text()="いますぐXHTML版で読む"]' => sub {
            $xhtml_url = shift->attr('href');
        });
        my $xhtml_uri = URI->new($xhtml_url)->abs(URI->new($xhtml));
        return _get_content($xhtml_uri->as_string);
    }
    if ($xhtml =~ m{/files/\d+_\d+\.html$}) { # XHTML
        unless ($xhtml =~ m{^https?://}) { # $xhtml shuld be \d+/files/xxx_xxx.html
            $xhtml = "$AOZORA_CARDS_URL/$xhtml";
        }
        my $text = _get_file($xhtml);
        return ($text, _base_url($xhtml));
    }
    # XHTML string
    return (qq{<div class="main_text">$xhtml</div>}, undef);
}

sub new {
    my ($class, $content, %options) = @_;
    my $self =  bless {
        files => [],
        epub => Aozora2Epub::Epub->new,
        title => undef,
        author => undef,

lib/Aozora2Epub.pm  view on Meta::CPAN


sub _build_elemlist_from_xhtml {
    my $xhtml = shift;
    my $tr = Aozora2Epub::XHTML->new_from_string(qq{<div class="main_text">$xhtml</div>});;
    return @{$tr->contents};
}

sub append {
    my ($self, $xhtml_like, %options) = @_;

    my ($xhtml, $base_url) = _get_content($xhtml_like);
    my $doc = Aozora2Epub::XHTML->new_from_string($xhtml);

    unless ($options{no_fetch_assets}) {
        for my $path (@{$doc->gaiji}) {
            my $png = _get_file(_cat_url($AOZORA_GAIJI_URL, $path));
            $self->epub->add_gaiji($png, $path);
        }
        for my $path (@{$doc->fig}) {
            my $png = _get_file(_cat_url($base_url, $path));
            $self->epub->add_image($png, $path);
        }
    }
    my @files = $doc->split;
    my $part_title;
    if (defined $options{title_html}) {
        $files[0]->insert_content(_build_elemlist_from_xhtml($options{title_html}));
    } else {
        unless (defined $options{title}) {
            if ($options{use_subtitle}) {
                $part_title = $doc->subtitle;
            }
            $part_title ||= $doc->title;
        } elsif ($options{title} eq '') {
            $part_title = undef;
        } else {
            $part_title = $options{title};
        }
        if ($files[0] && $part_title) {
            my $title_level = $options{title_level} || 2;
            my $tag = "h$title_level";
            my $header_elem = HTML::Element->new_from_lol([ $tag, { id => gensym },
                                                            $part_title ]);
            $files[0]->insert_content($header_elem);
        }
    }
    push @{$self->files}, @files;
    $self->title or $self->title($doc->title);
    $self->author or $self->author($doc->author);
    $self->add_bib_info($part_title, $doc->bib_info);
    $self->add_notation_notes($part_title, $doc->notation_notes);
}

sub add_bib_info {
    my ($self, $part_title, $bib_info) = @_;

    $self->bib_info(join('',
                         $self->bib_info,
                         "<br/>",
                         ($part_title
                          ? (q{<h5 class="bib">}, escape_html($part_title), "</h5>")
                          : ()),
                         $bib_info));
}

sub add_notation_notes {
    my ($self, $part_title, $notes) = @_;

    $self->notation_notes(join('',
                               $self->notation_notes,
                               "<br/>",
                               ($part_title
                                ? (q{<h5 class="n-notes">}, escape_html($part_title), "</h5>")
                                : ()),
                               $notes));
}

sub _make_content_iterator {
    my $files = shift;

    my @files = @$files;
    my $file = shift @files;
    my @content = @{$file->content};
    my $last;

    return (
        sub { # get next element
            if ($last) {
                my $x = $last;
                undef $last;
                return $x;
            }
            my $elem = shift @content;
            unless ($elem) {
                $file = shift @files;
                return unless $file;
                @content = @{$file->content};
                $elem = shift @content;
            }
            return { elem=>$elem, file=>$file->name };
        },
        sub { $last  = shift; } # putback
    );
}

sub _toc {
    my ($level, $next, $putback) = @_;

    my @cur;
    while (my $c = $next->()) {
        my $e = $c->{elem};
        next unless $e->isa('HTML::Element');
        my $tag = $e->tag;
        my ($lev) = ($tag =~ m{h(\d)});
        next unless $lev;
        if ($lev > $level) {
            $putback->($c);
            my $children = _toc($lev, $next, $putback);
            if ($cur[-1] && $cur[-1]->{level} < $lev) {
                $cur[-1]->{children} = $children;
            } else {
                push @cur, @{$children};
            }
            next;
        }
        if ($lev < $level) {
            $putback->($c);
            return \@cur;
        }
        push @cur, {
            name => gensym,
            level => $lev,
            id => $e->attr('id'),



( run in 1.174 second using v1.01-cache-2.11-cpan-59e3e3084b8 )