Aozora2Epub
view release on metacpan or search on metacpan
lib/Aozora2Epub.pm view on Meta::CPAN
package Aozora2Epub;
use utf8;
use strict;
use warnings;
use Aozora2Epub::Gensym;
use Aozora2Epub::CachedGet qw/http_get/;
use Aozora2Epub::Epub;
use Aozora2Epub::XHTML;
use Path::Tiny;
use URI;
use HTML::Escape qw/escape_html/;
use base qw(Class::Accessor);
__PACKAGE__->mk_accessors(qw/files title author epub bib_info notation_notes/);
our $VERSION = '0.05';
our $AOZORA_GAIJI_URL = 'https://www.aozora.gr.jp/gaiji/';
our $AOZORA_CARDS_URL = 'https://www.aozora.gr.jp/cards';
sub _base_url {
my $base = shift;
$base =~ s{[^/]+\.html$}{}s;
return $base;
}
sub _get_file {
my $url_or_path = "" . shift; # force to string.
if ($url_or_path =~ m{^https?://}) {
return http_get($url_or_path);
}
if ($url_or_path =~ m{\.html$}) {
return path($url_or_path)->slurp_utf8;
}
return path($url_or_path)->slurp_raw;
}
sub _get_content {
my $xhtml = shift;
if ($xhtml =~ m{/card\d+\.html$}) { # 峿¸ã«ã¼ã
unless ($xhtml =~ m{^https?://}) { # $xhtml shuld be \d+/card\d+.html
$xhtml = "$AOZORA_CARDS_URL/$xhtml";
}
my $text = _get_file($xhtml);
my $tree = Aozora2Epub::XHTML::Tree->new($text);
my $xhtml_url;
$tree->process('//a[text()="ãã¾ããXHTMLçã§èªã"]' => sub {
$xhtml_url = shift->attr('href');
});
my $xhtml_uri = URI->new($xhtml_url)->abs(URI->new($xhtml));
return _get_content($xhtml_uri->as_string);
}
if ($xhtml =~ m{/files/\d+_\d+\.html$}) { # XHTML
unless ($xhtml =~ m{^https?://}) { # $xhtml shuld be \d+/files/xxx_xxx.html
$xhtml = "$AOZORA_CARDS_URL/$xhtml";
}
my $text = _get_file($xhtml);
return ($text, _base_url($xhtml));
}
# XHTML string
return (qq{<div class="main_text">$xhtml</div>}, undef);
}
sub new {
my ($class, $content, %options) = @_;
my $self = bless {
files => [],
epub => Aozora2Epub::Epub->new,
title => undef,
author => undef,
lib/Aozora2Epub.pm view on Meta::CPAN
sub _build_elemlist_from_xhtml {
my $xhtml = shift;
my $tr = Aozora2Epub::XHTML->new_from_string(qq{<div class="main_text">$xhtml</div>});;
return @{$tr->contents};
}
sub append {
my ($self, $xhtml_like, %options) = @_;
my ($xhtml, $base_url) = _get_content($xhtml_like);
my $doc = Aozora2Epub::XHTML->new_from_string($xhtml);
unless ($options{no_fetch_assets}) {
for my $path (@{$doc->gaiji}) {
my $png = _get_file(_cat_url($AOZORA_GAIJI_URL, $path));
$self->epub->add_gaiji($png, $path);
}
for my $path (@{$doc->fig}) {
my $png = _get_file(_cat_url($base_url, $path));
$self->epub->add_image($png, $path);
}
}
my @files = $doc->split;
my $part_title;
if (defined $options{title_html}) {
$files[0]->insert_content(_build_elemlist_from_xhtml($options{title_html}));
} else {
unless (defined $options{title}) {
if ($options{use_subtitle}) {
$part_title = $doc->subtitle;
}
$part_title ||= $doc->title;
} elsif ($options{title} eq '') {
$part_title = undef;
} else {
$part_title = $options{title};
}
if ($files[0] && $part_title) {
my $title_level = $options{title_level} || 2;
my $tag = "h$title_level";
my $header_elem = HTML::Element->new_from_lol([ $tag, { id => gensym },
$part_title ]);
$files[0]->insert_content($header_elem);
}
}
push @{$self->files}, @files;
$self->title or $self->title($doc->title);
$self->author or $self->author($doc->author);
$self->add_bib_info($part_title, $doc->bib_info);
$self->add_notation_notes($part_title, $doc->notation_notes);
}
sub add_bib_info {
my ($self, $part_title, $bib_info) = @_;
$self->bib_info(join('',
$self->bib_info,
"<br/>",
($part_title
? (q{<h5 class="bib">}, escape_html($part_title), "</h5>")
: ()),
$bib_info));
}
sub add_notation_notes {
my ($self, $part_title, $notes) = @_;
$self->notation_notes(join('',
$self->notation_notes,
"<br/>",
($part_title
? (q{<h5 class="n-notes">}, escape_html($part_title), "</h5>")
: ()),
$notes));
}
sub _make_content_iterator {
my $files = shift;
my @files = @$files;
my $file = shift @files;
my @content = @{$file->content};
my $last;
return (
sub { # get next element
if ($last) {
my $x = $last;
undef $last;
return $x;
}
my $elem = shift @content;
unless ($elem) {
$file = shift @files;
return unless $file;
@content = @{$file->content};
$elem = shift @content;
}
return { elem=>$elem, file=>$file->name };
},
sub { $last = shift; } # putback
);
}
sub _toc {
my ($level, $next, $putback) = @_;
my @cur;
while (my $c = $next->()) {
my $e = $c->{elem};
next unless $e->isa('HTML::Element');
my $tag = $e->tag;
my ($lev) = ($tag =~ m{h(\d)});
next unless $lev;
if ($lev > $level) {
$putback->($c);
my $children = _toc($lev, $next, $putback);
if ($cur[-1] && $cur[-1]->{level} < $lev) {
$cur[-1]->{children} = $children;
} else {
push @cur, @{$children};
}
next;
}
if ($lev < $level) {
$putback->($c);
return \@cur;
}
push @cur, {
name => gensym,
level => $lev,
id => $e->attr('id'),
( run in 1.174 second using v1.01-cache-2.11-cpan-59e3e3084b8 )