BusyBird-Input-Feed

 view release on metacpan or  search on metacpan

lib/BusyBird/Input/Feed.pm  view on Meta::CPAN

package BusyBird::Input::Feed;
use strict;
use warnings;
use XML::FeedPP;
use DateTime::Format::ISO8601;
use BusyBird::DateTime::Format;
use DateTime;
use Try::Tiny;
use Carp;
use WWW::Favicon ();
use LWP::UserAgent;
use URI;

our $VERSION = "0.07";

our @CARP_NOT = qw(Try::Tiny XML::FeedPP);

sub new {
    my ($class, %args) = @_;
    my $self = bless {
        use_favicon => defined($args{use_favicon}) ? $args{use_favicon} : 1,
        favicon_detector => WWW::Favicon->new,
        user_agent => defined($args{user_agent}) ? $args{user_agent} : do {
            my $ua = LWP::UserAgent->new;
            $ua->env_proxy;
            $ua->timeout(30);
            $ua->agent("BusyBird::Inpu::Feed-$VERSION");  ## some Web sites ban LWP::UserAgent's default UserAgent...
            $ua;
        },
        image_max_num => defined($args{image_max_num}) ? $args{image_max_num} : 3,
    }, $class;

    ## Note that WWW::Favicon#ua accessor method is not documented (as of version 0.03001)
    $self->{favicon_detector}->ua($self->{user_agent});
    
    return $self;
}

sub _get_url_head_and_dir {
    my ($url_raw) = @_;
    return (undef, undef) if not defined $url_raw;
    my $url = URI->new($url_raw);
    my $scheme = $url->scheme;
    my $authority = $url->authority;
    return (undef, undef) if !$scheme || !$authority;
    my $url_head = "$scheme://$authority";
    my $url_dir;
    my $path = $url->path;
    if($path =~ m{^(.*/)}i) {
        $url_dir = $1;
    }else {
        $url_dir = "/";
    }
    return ($url_head, $url_dir);
}

sub _extract_image_urls {
    my ($self, $feed_item) = @_;
    return () if $self->{image_max_num} == 0;
    my $content = $feed_item->description;
    return () if !defined($content);
    my ($url_head, $url_dir) = _get_url_head_and_dir($feed_item->link);
    my @urls = ();
    while(($self->{image_max_num} < 0 || @urls < $self->{image_max_num})
          && $content =~ m{<\s*img\s+[^>]*src\s*=\s*(['"])([^>]+?)\1[^>]*>}ig) {
        my $url = URI->new($2);
        if(!$url->scheme) {
            ## Only "path" segment is in the src attribute.
            next if !defined($url_head) || !defined($url_dir);
            if(substr("$url", 0, 1) eq "/") {
                $url = "$url_head$url";
            }else {
                $url = "$url_head$url_dir$url";
            }
        }
        push @urls, "$url";



( run in 2.095 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )