NewsExtractor

 view release on metacpan or  search on metacpan

lib/NewsExtractor/SiteSpecificExtractor/news_pts_org_tw.pm  view on Meta::CPAN

# Site-specific extractor; judging by the package name it targets
# news.pts.org.tw -- TODO confirm against the extractor dispatch code.
#
# It builds on the generic JSON-LD extractor and overrides three pieces:
# the journalist lookup, the dateline (reformatted with a '+08:00' offset),
# and the content-text builder (prefers the <article class="post-article">
# element when the page has one).
package NewsExtractor::SiteSpecificExtractor::news_pts_org_tw;
use utf8;
use Moo;    # Moo also turns on strict and warnings for this package.

# Base class supplying the JSON-LD based accessors (e.g. schema_ld, dateline).
extends 'NewsExtractor::JSONLDExtractor';
# Role composed for content-text extraction; the _build_content_text builder
# wrapped below presumably originates here -- verify against the role.
with 'NewsExtractor::Role::ContentTextExtractor';

use HTML::ExtractContent;
use Importer 'NewsExtractor::TextUtil' => ('html2text', 'reformat_dateline');
use Importer 'Ref::Util' => ('is_hashref');

# Return the journalist (author) recorded in the page's JSON-LD metadata.
#
# The "author" entry of $self->schema_ld is interpreted as follows:
#   * hash reference  -> its "name" value, or undef when there is no "name"
#   * anything else   -> returned unchanged (this includes undef)
#
# Returns: the author name, or undef when none can be determined.
sub journalist {
    my ($self) = @_;
    my $author = $self->schema_ld->{author};

    # An author object without a "name" carries no usable byline. Yield
    # undef in that case rather than leaking the raw hash reference to
    # callers that expect a plain string.
    return $author->{name} if is_hashref($author);

    return $author;
}

# Wrap the inherited dateline accessor: call the wrapped method with the
# original arguments, then hand its result to reformat_dateline together
# with the '+08:00' offset string.
# NOTE(review): presumably this normalises the timestamp to Taiwan local
# time -- confirm against NewsExtractor::TextUtil::reformat_dateline.
around dateline => sub {
    my ($next, @args) = @_;
    my $dateline = $next->(@args);
    return reformat_dateline($dateline, '+08:00');
};

# Wrap the content-text builder. When the page DOM contains an element
# matching 'article.post-article', its markup is run through
# HTML::ExtractContent and the resulting HTML is converted with html2text.
# Otherwise the wrapped builder is invoked with $self alone.
around '_build_content_text', sub {
    my ($next, $self) = @_;

    my $article = $self->dom->at('article.post-article');

    # No dedicated article element: defer to the wrapped builder.
    return $next->($self) unless $article;

    # The element is stringified to hand its markup to the extractor.
    my $extracted = HTML::ExtractContent->new->extract("$article");
    my $markup    = $extracted->as_html;
    my $content   = html2text($markup);
    return $content;
};

1;



( run in 1.233 second using v1.01-cache-2.11-cpan-39bf76dae61 )