App-Zapzi
view release on metacpan or search on metacpan
lib/App/Zapzi/Transformers/HTML.pm view on Meta::CPAN
package App::Zapzi::Transformers::HTML;
# ABSTRACT: process HTML without doing readability transforms
use utf8;
use strict;
use warnings;
our $VERSION = '0.017'; # VERSION
use Carp;
use Encode;
use HTML::Element;
use HTML::Entities ();
use Moo;
with 'App::Zapzi::Roles::Transformer';
# Short identifying name of this transformer.
#
# Takes no arguments (any passed are ignored) and always returns the
# string 'HTML'.
sub name
{
    my $transformer_name = 'HTML';
    return $transformer_name;
}
# Report whether this transformer should be picked for some content.
#
# Always returns 0: by default HTMLExtractMain will handle HTML, not
# this class.
sub handles
{
    my $can_handle = 0;
    return $can_handle;
}
# Turn raw HTML into tidied HTML and hand it to _set_readable_text.
#
# Arguments:
#   $input - optional raw (still encoded) HTML. When undefined, the text
#            of the fetched article ($self->input->text) is used. Derived
#            classes that transform text into HTML pass their result in
#            here and then let this method do the rest.
#
# Side effects: calls _extract_title on the decoded HTML before the body
# is processed.
#
# Returns 1 on success, or nothing if _extract_html did not produce a
# tree.
sub transform
{
    my $self = shift;

    # Use the passed in text if explicitly set, else get it from the
    # fetched article object.
    my ($input) = @_;
    $input //= $self->input->text;

    # Decode as UTF-8 unless the content type names a charset Encode
    # knows. The parameter name is matched case-insensitively and the
    # value may be quoted, eg 'text/html; Charset="ISO-8859-1"'.
    my $encoding = 'utf8';
    my $content_type = $self->input->content_type // '';
    if ($content_type =~ m/charset\s*=\s*["']?([\w-]+)/i)
    {
        my $charset = $1;
        if (Encode::find_encoding($charset))
        {
            $encoding = $charset;
        }
        else
        {
            # Encode::decode croaks on an unknown encoding name, so warn
            # and keep the default rather than abort the transform.
            Carp::carp("Unknown charset '$charset', assuming $encoding");
        }
    }
    my $raw_html = Encode::decode($encoding, $input);

    $self->_extract_title($raw_html);
    my $tree = $self->_extract_html($raw_html);
    return unless $tree;

    # Delete some elements we don't need
    for my $element ($tree->find_by_tag_name(
                         qw{img script noscript object iframe}))
    {
        $element->delete;
    }

    # Set up options to extract the HTML from the tree
    my $entities_to_encode = '<>&\'"';
    my $indent = ' ' x 4;
    my $optional_end_tags = {};
    my $text = $tree->as_HTML($entities_to_encode, $indent,
                              $optional_end_tags);

    # Strip the enclosing body tags, including an opening tag that
    # carries attributes. '>' is in $entities_to_encode, so it cannot
    # appear unescaped inside an attribute value.
    $text =~ s{</?body\b[^>]*>}{}g;

    $self->_set_readable_text($text);
    return 1;
}
# Work out the article title and pass it to _set_title.
#
# Arguments:
#   $raw_html - decoded HTML text to search for a <title> element.
#
# The title is the first content item of the first <title> element, with
# surrounding whitespace stripped and HTML entities decoded. If the HTML
# cannot be parsed, or has no non-empty title, the article's source
# ($self->input->source, the URL/filename) is used instead.
#
# Returns whatever _set_title returns.
sub _extract_title
{
    my $self = shift;
    my ($raw_html) = @_;
    my $title;

    # Try finding the <title> tag first. This file only loads
    # HTML::Element explicitly, so require HTML::TreeBuilder here rather
    # than rely on another module having loaded it. A load or parse
    # failure is not fatal: we fall back to the source below.
    my $tree = eval
    {
        require HTML::TreeBuilder;
        HTML::TreeBuilder->new_from_content($raw_html);
    };
    if ($tree)
    {
        my $tag = $tree->find_by_tag_name('title');
        my $content;
        $content = ($tag->content_list)[0] if $tag;

        # Only plain text is usable as a title; skip a nested element.
        # Test with defined/length so that a title of "0" is kept.
        if (defined $content && ! ref $content)
        {
            # Strip surrounding whitespace and decode HTML entities.
            # NOTE(review): HTML::TreeBuilder normally expands entities
            # while parsing, so this may be a second decode - confirm.
            $content =~ s/^\s+|\s+$//g;
            $title = HTML::Entities::decode($content) if length $content;
        }

        # The tree was only needed for the title; release it explicitly
        # as HTML::Tree structures are circular unless weak references
        # are enabled.
        $tree->delete;
    }

    # Use the URL/filename if no title could be found or parsed from
    # the HTML
    if (! defined $title || ! length $title)
    {
        $title = $self->input->source;
    }

    return $self->_set_title($title);
}
sub _extract_html
{
( run in 0.773 second using v1.01-cache-2.11-cpan-39bf76dae61 )