BusyBird-Input-Feed
view release on metacpan or search on metacpan
lib/BusyBird/Input/Feed.pm view on Meta::CPAN
},
image_max_num => defined($args{image_max_num}) ? $args{image_max_num} : 3,
}, $class;
## Note that WWW::Favicon#ua accessor method is not documented (as of version 0.03001)
$self->{favicon_detector}->ua($self->{user_agent});
return $self;
}
sub _get_url_head_and_dir {
my ($url_raw) = @_;
return (undef, undef) if not defined $url_raw;
my $url = URI->new($url_raw);
my $scheme = $url->scheme;
my $authority = $url->authority;
return (undef, undef) if !$scheme || !$authority;
my $url_head = "$scheme://$authority";
my $url_dir;
my $path = $url->path;
if($path =~ m{^(.*/)}i) {
$url_dir = $1;
}else {
$url_dir = "/";
}
return ($url_head, $url_dir);
}
sub _extract_image_urls {
my ($self, $feed_item) = @_;
return () if $self->{image_max_num} == 0;
my $content = $feed_item->description;
return () if !defined($content);
my ($url_head, $url_dir) = _get_url_head_and_dir($feed_item->link);
my @urls = ();
while(($self->{image_max_num} < 0 || @urls < $self->{image_max_num})
&& $content =~ m{<\s*img\s+[^>]*src\s*=\s*(['"])([^>]+?)\1[^>]*>}ig) {
my $url = URI->new($2);
if(!$url->scheme) {
## Only "path" segment is in the src attribute.
next if !defined($url_head) || !defined($url_dir);
if(substr("$url", 0, 1) eq "/") {
$url = "$url_head$url";
}else {
$url = "$url_head$url_dir$url";
}
}
push @urls, "$url";
}
return @urls;
}
sub _get_home_url {
my ($self, $feed, $statuses) = @_;
my $home_url = $feed->link;
if(defined($home_url) && $home_url =~ m{^https?://}i) {
return $home_url;
}
foreach my $status (@$statuses) {
$home_url = $status->{busybird}{status_permalink} if defined($status->{busybird});
return $home_url if defined $home_url;
}
return undef;
}
sub _get_favicon_url {
my ($self, $feed, $statuses) = @_;
return try {
my $home_url = $self->_get_home_url($feed, $statuses);
return undef if not defined $home_url;
my $favicon_url = $self->{favicon_detector}->detect($home_url);
return undef if not defined $favicon_url;
my $res = $self->{user_agent}->get($favicon_url);
return undef if !$res->is_success;
my $type = $res->header('Content-Type');
return undef if defined($type) && $type !~ /^image/i;
return $favicon_url;
};
}
sub _make_timestamp_datetime {
my ($self, $timestamp_str) = @_;
return undef if not defined $timestamp_str;
if($timestamp_str =~ /^\d+$/) {
return DateTime->from_epoch(epoch => $timestamp_str, time_zone => '+0000');
}
my $datetime = try { DateTime::Format::ISO8601->parse_datetime($timestamp_str) };
return $datetime if defined $datetime;
return BusyBird::DateTime::Format->parse_datetime($timestamp_str);
}
sub _make_status_from_item {
my ($self, $feed_title, $feed_item) = @_;
my $created_at_dt = $self->_make_timestamp_datetime($feed_item->pubDate);
my $text = $feed_item->title;
$text = "" if !defined($text);
my $permalink = $feed_item->link;
my $status = {
text => $text,
busybird => { defined($permalink) ? (status_permalink => $permalink) : () },
created_at => ($created_at_dt ? BusyBird::DateTime::Format->format_datetime($created_at_dt) : undef ),
user => { screen_name => $feed_title },
};
my $guid = $feed_item->guid;
my $item_id;
if(defined $guid) {
$item_id = $guid;
$status->{busybird}{original}{id} = $guid;
}else {
$item_id = $feed_item->link;
}
if(defined($created_at_dt) && defined($item_id)) {
$status->{id} = $created_at_dt->epoch . '|' . $item_id;
}elsif(defined($item_id)) {
$status->{id} = $item_id;
}
my @image_urls = $self->_extract_image_urls($feed_item);
if(@image_urls) {
$status->{extended_entities}{media} = [map { +{ media_url => $_, indices => [0,0] } } @image_urls];
}
return $status;
}
sub _make_statuses_from_feed {
my ($self, $feed) = @_;
my $feed_title = $feed->title;
my $statuses = [ map { $self->_make_status_from_item($feed_title, $_) } $feed->get_item ];
return $statuses if !$self->{use_favicon};
my $favicon_url = $self->_get_favicon_url($feed, $statuses);
return $statuses if not defined $favicon_url;
$_->{user}{profile_image_url} = $favicon_url foreach @$statuses;
return $statuses;
}
sub _parse_with_feedpp {
my ($self, $feed_source, $feed_type) = @_;
return $self->_make_statuses_from_feed(XML::FeedPP->new(
$feed_source, -type => $feed_type,
utf8_flag => 1, xml_deref => 1, lwp_useragent => $self->{user_agent},
## FeedPP and TreePP mess up with User-Agent. It's pretty annoying.
user_agent => scalar($self->{user_agent}->agent),
));
}
sub parse_string {
my ($self, $string) = @_;
return $self->_parse_with_feedpp($string, "string");
}
*parse = *parse_string;
sub parse_file {
my ($self, $filename) = @_;
return $self->_parse_with_feedpp($filename, "file");
}
sub parse_url {
my ($self, $url) = @_;
return $self->_parse_with_feedpp($url, "url");
}
*parse_uri = *parse_url;
1;
__END__
=pod
=head1 NAME
BusyBird::Input::Feed - input BusyBird statuses from RSS/Atom feed
=head1 SYNOPSIS
use BusyBird;
use BusyBird::Input::Feed;
my $input = BusyBird::Input::Feed->new;
my $statuses = $input->parse($feed_xml);
timeline("feed")->add($statuses);
$statuses = $input->parse_file("feed.atom");
timeline("feed")->add($statuses);
$statuses = $input->parse_url('https://metacpan.org/feed/recent?f=');
timeline("feed")->add($statuses);
=head1 DESCRIPTION
L<BusyBird::Input::Feed> converts RSS and Atom feeds into L<BusyBird> status objects.
For convenience, an executable script L<busybird_input_feed> is bundled in this distribution.
=head1 CLASS METHODS
=head2 $input = BusyBird::Input::Feed->new(%args)
The constructor.
Fields in C<%args> are:
=over
=item C<use_favicon> => BOOL (optional, default: true)
If true (or omitted or C<undef>), it tries to use the favicon of the Web site providing the feed
as the statuses' icons.
If it's defined and false, it won't use favicon.
=item C<user_agent> => L<LWP::UserAgent> object (optional)
L<LWP::UserAgent> object for fetching documents.
=item C<image_max_num> => INT (optional, default: 3)
The maximum number of image URLs extracted from the feed item.
If set to 0, it extracts no images. If set to a negative value, it extracts all image URLs from the feed item.
The extracted image URLs are stored as Twitter Entities in the status's C<extended_entities> field,
so that L<BusyBird> will render them.
See L<BusyBird::Manual::Status/extended_entities.media> for detail.
=back
=head1 OBJECT METHODS
=head2 $statuses = $input->parse($feed_xml_string)
=head2 $statuses = $input->parse_string($feed_xml_string)
Convert the given C<$feed_xml_string> into L<BusyBird> C<$statuses>.
C<parse()> method is an alias for C<parse_string()>.
C<$feed_xml_string> is the XML data to be parsed.
It must be a string encoded in UTF-8.
Return value C<$statuses> is an array-ref of L<BusyBird> status objects.
If C<$feed_xml_string> is invalid, it croaks.
=head2 $statuses = $input->parse_file($feed_xml_filename)
Same as C<parse_string()> except C<parse_file()> reads the file named C<$feed_xml_filename> and converts its content.
=head2 $statuses = $input->parse_url($feed_xml_url)
=head2 $statuses = $input->parse_uri($feed_xml_url)
Same as C<parse_string()> except C<parse_url()> downloads the feed XML from C<$feed_xml_url> and converts its content.
( run in 1.697 second using v1.01-cache-2.11-cpan-39bf76dae61 )