Aozora2Epub

 view release on metacpan or  search on metacpan

lib/Aozora2Epub.pm  view on Meta::CPAN

package Aozora2Epub;
use utf8;
use strict;
use warnings;
use Aozora2Epub::Gensym;
use Aozora2Epub::CachedGet qw/http_get/;
use Aozora2Epub::Epub;
use Aozora2Epub::XHTML;
use Path::Tiny;
use URI;
use HTML::Escape qw/escape_html/;

# Inherit from Class::Accessor and let it generate one accessor method per
# book-level field stored in the object hash.
use base qw(Class::Accessor);
__PACKAGE__->mk_accessors(qw/files title author epub bib_info notation_notes/);

our $VERSION = '0.05';

# Base URL against which gaiji image paths found in a document are resolved.
our $AOZORA_GAIJI_URL = 'https://www.aozora.gr.jp/gaiji/';
# Prefix prepended to card / XHTML specifiers that are not absolute
# http(s) URLs (note: no trailing slash; callers add the "/").
our $AOZORA_CARDS_URL = 'https://www.aozora.gr.jp/cards';

# Return the "directory" part of a URL: the given string with a trailing
# "<name>.html" component removed. Strings that do not end in ".html" are
# returned unchanged.
sub _base_url {
    my ($url) = @_;
    (my $dir = $url) =~ s{[^/]+\.html$}{}s;
    return $dir;
}

# Read the content behind a URL or a local path.
#
#   * http(s) URLs are fetched with http_get;
#   * local paths ending in ".html" are read as UTF-8 text;
#   * every other local path is read as raw bytes.
#
# The argument may be an object (e.g. a URI or Path::Tiny instance); it is
# stringified before being inspected.
sub _get_file {
    my ($source) = @_;
    my $location = "" . $source; # force to string.

    return http_get($location) if $location =~ m{^https?://};

    my $file = path($location);
    return $location =~ m{\.html$}
        ? $file->slurp_utf8
        : $file->slurp_raw;
}

# Resolve a book specifier into XHTML text plus the base URL of its assets.
#
# $xhtml may be one of:
#   * a library-card page (".../cardNNN.html"): the card is fetched, the
#     href of its "read the XHTML version now" anchor is resolved against
#     the card URL, and that target is processed recursively;
#   * an XHTML file (".../files/NNN_NNN.html"): the file is fetched;
#   * anything else: treated as an XHTML fragment and wrapped in a
#     <div class="main_text"> element.
# For the first two forms a specifier that is not an absolute http(s) URL
# gets "$AOZORA_CARDS_URL/" prepended.
#
# Returns the list ($xhtml_text, $base_url); $base_url is undef for a
# fragment.
# Dies when a card page contains no usable link to the XHTML version.
sub _get_content {
    my $xhtml = shift;
    if ($xhtml =~ m{/card\d+\.html$}) { # library card page
        unless ($xhtml =~ m{^https?://}) { # $xhtml should be \d+/card\d+.html
            $xhtml = "$AOZORA_CARDS_URL/$xhtml";
        }
        my $text = _get_file($xhtml);
        my $tree = Aozora2Epub::XHTML::Tree->new($text);
        my $xhtml_url;
        # The anchor text is the card page's "read the XHTML version now" link.
        $tree->process('//a[text()="いますぐXHTML版で読む"]' => sub {
            $xhtml_url = shift->attr('href');
        });
        # An undefined or empty reference resolves to the card URL itself,
        # which would match the card pattern again and recurse forever.
        unless (defined $xhtml_url && length $xhtml_url) {
            die "No link to the XHTML version found in $xhtml";
        }
        my $xhtml_uri = URI->new($xhtml_url)->abs(URI->new($xhtml));
        return _get_content($xhtml_uri->as_string);
    }
    if ($xhtml =~ m{/files/\d+_\d+\.html$}) { # XHTML
        unless ($xhtml =~ m{^https?://}) { # $xhtml should be \d+/files/xxx_xxx.html
            $xhtml = "$AOZORA_CARDS_URL/$xhtml";
        }
        my $text = _get_file($xhtml);
        return ($text, _base_url($xhtml));
    }
    # XHTML string
    return (qq{<div class="main_text">$xhtml</div>}, undef);
}

# Constructor.
#
#   $class   - class to bless into
#   $content - optional book specifier or XHTML string; when true it is
#              appended immediately
#   %options - passed through to append(); a title => '' pair is added
#              after them, so any caller-supplied title is overridden
#
# Returns the new object.
sub new {
    my $class   = shift;
    my $content = shift;
    my %options = @_;

    my %fields = (
        files          => [],
        epub           => Aozora2Epub::Epub->new,
        title          => undef,
        author         => undef,
        bib_info       => '',
        notation_notes => '',
    );
    my $self = bless \%fields, $class;

    if ($content) {
        $self->append($content, %options, title => '');
    }
    return $self;
}

# Join $path onto $base.
#
# When $base is an http(s) URL, $path is resolved against it and a URI
# object is returned; otherwise the two are combined with path().
#
# NOTE(review): append() passes the base URL from _get_content(), which is
# undef for XHTML-string input; the pattern match below would then warn and
# path() is presumably unhappy with an undefined part - confirm.
sub _cat_url {
    my ($base, $path) = @_;
    if ($base =~ m{^https?://}) {
        return URI->new($path)->abs(URI->new($base));
    }
    return path($base, $path);
}

# Parse an XHTML fragment and return its elements as a list.
#
# The fragment is wrapped in a <div class="main_text"> element, handed to
# Aozora2Epub::XHTML->new_from_string, and the array referenced by the
# resulting object's contents() is returned (flattened).
sub _build_elemlist_from_xhtml {
    my ($fragment) = @_;
    my $wrapped = qq{<div class="main_text">$fragment</div>};
    my $doc = Aozora2Epub::XHTML->new_from_string($wrapped);
    return @{ $doc->contents };
}

sub append {
    my ($self, $xhtml_like, %options) = @_;

    my ($xhtml, $base_url) = _get_content($xhtml_like);
    my $doc = Aozora2Epub::XHTML->new_from_string($xhtml);

    unless ($options{no_fetch_assets}) {
        for my $path (@{$doc->gaiji}) {
            my $png = _get_file(_cat_url($AOZORA_GAIJI_URL, $path));
            $self->epub->add_gaiji($png, $path);
        }
        for my $path (@{$doc->fig}) {
            my $png = _get_file(_cat_url($base_url, $path));
            $self->epub->add_image($png, $path);
        }
    }
    my @files = $doc->split;
    my $part_title;
    if (defined $options{title_html}) {
        $files[0]->insert_content(_build_elemlist_from_xhtml($options{title_html}));
    } else {
        unless (defined $options{title}) {
            if ($options{use_subtitle}) {
                $part_title = $doc->subtitle;
            }
            $part_title ||= $doc->title;
        } elsif ($options{title} eq '') {
            $part_title = undef;
        } else {

lib/Aozora2Epub.pm  view on Meta::CPAN

            } else {
                push @cur, @{$children};
            }
            next;
        }
        if ($lev < $level) {
            $putback->($c);
            return \@cur;
        }
        push @cur, {
            name => gensym,
            level => $lev,
            id => $e->attr('id'),
            title => $e->as_text,
            file => $c->{file},
        };
    }
    return \@cur;
}

# Build the table of contents from the object's files.
#
# Obtains a (next, putback) pair from _make_content_iterator over
# $self->{files} and passes it to _toc with a starting level of 1;
# returns whatever _toc returns.
sub _make_toc {
    my ($self) = @_;
    my ($fetch, $unfetch) = _make_content_iterator($self->{files});
    return _toc(1, $fetch, $unfetch);
}

# Combined getter/setter for the table of contents.
#
# With a true $toc argument, stores it and returns it.
# Without one, returns the stored table of contents, building it with
# _make_toc on first use and remembering the result.
sub toc {
    my ($self, $toc) = @_;

    if ($toc) {
        return $self->{toc} = $toc;
    }

    $self->{toc} = $self->_make_toc unless $self->{toc};
    return $self->{toc};
}

# Build the EPUB and write it out.
#
# Options:
#   output - file name to save to; when false, the book title followed by
#            ".epub" is used
#   cover  - when true, passed to the Epub object's set_cover before
#            building
#
# Returns whatever the Epub object's save method returns.
sub to_epub {
    my $self    = shift;
    my %options = @_;

    my $epub_filename = $options{output} || ($self->title . ".epub");

    $self->epub->set_cover($options{cover}) if $options{cover};
    $self->epub->build_from_doc($self);

    return $self->epub->save($epub_filename);
}

# Return the HTML of the whole book: the as_html output of every entry in
# $self->files, concatenated in order with no separator.
sub as_html {
    my ($self) = @_;

    my @chunks;
    for my $file (@{ $self->files }) {
        push @chunks, $file->as_html;
    }
    return join('', @chunks);
}
1;
__END__

=encoding utf-8

=head1 NAME

Aozora2Epub - Convert Aozora Bunko XHTML to EPUB

=head1 SYNOPSIS

  use Aozora2Epub;

  my $book = Aozora2Epub->new("https://www.aozora.gr.jp/cards/000262/files/48074_40209.html");
  $book->to_epub;

  # 合本の作成
  $book = Aozora2Epub->new();
  $book->append("000879/card179.html"); # 藪の中
  $book->append("000879/card127.html"); # 羅生門
  $book->title('芥川竜之介作品集');
  $book->to_epub;


=head1 DESCRIPTION

Aozora2Epub は青空文庫のXHTML形式の本をEPUBに変換するモジュールです。

簡単に合本を生成するためのインタフェースも提供しています。

=head1 METHODS

=head2 new

  my $book = Aozora2Epub->new($book_url);
  my $book = Aozora2Epub->new($xhtml_string);
  my $book = Aozora2Epub->new(); # 空のドキュメントを作る

C<$book_url>で指定した青空文庫の本を読み込みます。
あるいは、文字列として指定された整形式のXHTMLを本の内容として読み込みます。

本は以下のいずれかの形式で指定します。
いずれも、URL先頭の C<https://www.aozora.gr.jp/cards/>の部分を省略することが可能です。

=over 4

=item 図書カードのURL

青空文庫の図書カードのURLです。以下に例を示します。

  https://www.aozora.gr.jp/cards/001569/card59761.html
  
  001569/card59761.html # URLの先頭部分を省略

=item XHTMLのURL

青空文庫のXHTMLファイルのURLです。以下に例を示します。

  https://www.aozora.gr.jp/cards/001569/files/59761_74795.html
  
  001569/files/59761_74795.html # URLの先頭部分を省略

=back

=head2 append

  $book->append($book_url); # 追加する本のタイトルを章タイトルとして使用
  $book->append($book_url, use_subtitle=>1); # 追加する本のサブタイトルを章タイトルとして使用



( run in 2.064 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )