BusyBird-Input-Feed
view release on metacpan or search on metacpan
lib/BusyBird/Input/Feed.pm view on Meta::CPAN
package BusyBird::Input::Feed;
use strict;
use warnings;
use XML::FeedPP;
use DateTime::Format::ISO8601;
use BusyBird::DateTime::Format;
use DateTime;
use Try::Tiny;
use Carp;
use WWW::Favicon ();
use LWP::UserAgent;
use URI;
our $VERSION = "0.07";
our @CARP_NOT = qw(Try::Tiny XML::FeedPP);
sub new {
my ($class, %args) = @_;
my $self = bless {
use_favicon => defined($args{use_favicon}) ? $args{use_favicon} : 1,
favicon_detector => WWW::Favicon->new,
user_agent => defined($args{user_agent}) ? $args{user_agent} : do {
my $ua = LWP::UserAgent->new;
$ua->env_proxy;
$ua->timeout(30);
$ua->agent("BusyBird::Inpu::Feed-$VERSION"); ## some Web sites ban LWP::UserAgent's default UserAgent...
$ua;
},
image_max_num => defined($args{image_max_num}) ? $args{image_max_num} : 3,
}, $class;
## Note that WWW::Favicon#ua accessor method is not documented (as of version 0.03001)
$self->{favicon_detector}->ua($self->{user_agent});
return $self;
}
sub _get_url_head_and_dir {
my ($url_raw) = @_;
return (undef, undef) if not defined $url_raw;
my $url = URI->new($url_raw);
my $scheme = $url->scheme;
my $authority = $url->authority;
return (undef, undef) if !$scheme || !$authority;
my $url_head = "$scheme://$authority";
my $url_dir;
my $path = $url->path;
if($path =~ m{^(.*/)}i) {
$url_dir = $1;
}else {
$url_dir = "/";
}
return ($url_head, $url_dir);
}
sub _extract_image_urls {
my ($self, $feed_item) = @_;
return () if $self->{image_max_num} == 0;
my $content = $feed_item->description;
return () if !defined($content);
my ($url_head, $url_dir) = _get_url_head_and_dir($feed_item->link);
my @urls = ();
while(($self->{image_max_num} < 0 || @urls < $self->{image_max_num})
&& $content =~ m{<\s*img\s+[^>]*src\s*=\s*(['"])([^>]+?)\1[^>]*>}ig) {
my $url = URI->new($2);
if(!$url->scheme) {
## Only "path" segment is in the src attribute.
next if !defined($url_head) || !defined($url_dir);
if(substr("$url", 0, 1) eq "/") {
$url = "$url_head$url";
}else {
$url = "$url_head$url_dir$url";
}
}
push @urls, "$url";
( run in 2.095 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )