BusyBird-Input-Feed
view release on metacpan or search on metacpan
- Fix favicon detector. It now ignores the feed's "link" tag
if it doesn't look like an HTTP or HTTPS URL.
0.05 2014-09-15
- Fix documentation. No changes to the code.
0.04 2014-09-15
[ENHANCEMENT]
- Now it extracts image URLs from feed items.
"image_max_num" option controls this behavior.
- busybird_input_feed executable now has --level option.
0.03 2014-08-17
[PACKAGING]
- migrate to Module::Build::Prereqs::FromCPANfile.
0.02 2014-07-28
[BUG FIX]
- Fix -p option of busybird_input_feed.
It did not take the argument, silly.
0.01 2014-07-28
First version, released on an unsuspecting world.
bin/busybird_input_feed
Build.PL
Changes
cpanfile
eg/parallel.pl
lib/BusyBird/Input/Feed.pm
lib/BusyBird/Input/Feed/Run.pm
MANIFEST This list of files
README
t/00-load.t
t/images.t
bin/busybird_input_feed view on Meta::CPAN
post_url => $post_url,
level => $level,
);
__END__
=pod
=head1 NAME
busybird_input_feed - command-line tool to import RSS/Atom feeds into BusyBird
=head1 SYNOPSIS
$ busybird_input_feed [URL] [OPTIONS]
## Download a feed and output JSON statuses to STDOUT
$ busybird_input_feed 'http://example.com/feed.rss'
## Input a feed file via STDIN and output JSON statuses to STDOUT
$ busybird_input_feed < feed.rss
## Download a feed and post statuses to the BusyBird URL
$ busybird_input_feed 'http://example.com/feed.rss' -p 'http://localhost:5000/timelines/home/statuses.json'
=head1 DESCRIPTION
This script imports an RSS/Atom feed, converts it into L<BusyBird> statuses and outputs the statuses in JSON format.
By default, it reads STDIN for a feed and writes statuses to STDOUT.
If C<URL> argument is set, the feed is downloaded from that URL.
=head1 OPTIONS
lib/BusyBird/Input/Feed.pm view on Meta::CPAN
}
## Choose a URL that represents the home site of the feed.
##
## Arguments:
##   $feed     - feed object; only its link() method is used here.
##   $statuses - array-ref of status hash-refs built from the feed items.
##
## Returns the feed's link if it starts with http:// or https:// (case-insensitive).
## Otherwise returns the first defined busybird.status_permalink found in $statuses.
## Returns undef if neither is available.
sub _get_home_url {
    my ($self, $feed, $statuses) = @_;
    my $feed_link = $feed->link;
    if(defined($feed_link) && $feed_link =~ m{^https?://}i) {
        return $feed_link;
    }
    foreach my $status (@$statuses) {
        ## Use a loop-local variable so that a rejected (non-HTTP) feed link
        ## can never be returned for a status without a "busybird" field.
        next if !defined($status->{busybird});
        my $permalink = $status->{busybird}{status_permalink};
        return $permalink if defined $permalink;
    }
    ## Explicit undef is kept so the return value is the same as before in any context.
    return undef;
}
sub _get_favicon_url {
my ($self, $feed, $statuses) = @_;
return try {
my $home_url = $self->_get_home_url($feed, $statuses);
return undef if not defined $home_url;
lib/BusyBird/Input/Feed.pm view on Meta::CPAN
}
sub _make_status_from_item {
my ($self, $feed_title, $feed_item) = @_;
my $created_at_dt = $self->_make_timestamp_datetime($feed_item->pubDate);
my $text = $feed_item->title;
$text = "" if !defined($text);
my $permalink = $feed_item->link;
my $status = {
text => $text,
busybird => { defined($permalink) ? (status_permalink => $permalink) : () },
created_at => ($created_at_dt ? BusyBird::DateTime::Format->format_datetime($created_at_dt) : undef ),
user => { screen_name => $feed_title },
};
my $guid = $feed_item->guid;
my $item_id;
if(defined $guid) {
$item_id = $guid;
$status->{busybird}{original}{id} = $guid;
}else {
$item_id = $feed_item->link;
}
if(defined($created_at_dt) && defined($item_id)) {
$status->{id} = $created_at_dt->epoch . '|' . $item_id;
}elsif(defined($item_id)) {
$status->{id} = $item_id;
}
my @image_urls = $self->_extract_image_urls($feed_item);
if(@image_urls) {
lib/BusyBird/Input/Feed.pm view on Meta::CPAN
$statuses = $input->parse_file("feed.atom");
timeline("feed")->add($statuses);
$statuses = $input->parse_url('https://metacpan.org/feed/recent?f=');
timeline("feed")->add($statuses);
=head1 DESCRIPTION
L<BusyBird::Input::Feed> converts RSS and Atom feeds into L<BusyBird> status objects.
For convenience, an executable script L<busybird_input_feed> is bundled in this distribution.
=head1 CLASS METHODS
=head2 $input = BusyBird::Input::Feed->new(%args)
The constructor.
Fields in C<%args> are:
=over
lib/BusyBird/Input/Feed/Run.pm view on Meta::CPAN
my $post_url = $opts{post_url};
my $user_agent = $opts{user_agent};
my $level = $opts{level};
my $input = BusyBird::Input::Feed->new(
defined($user_agent) ? (user_agent => $user_agent) : ()
);
my $json = JSON->new->utf8->ascii;
my $statuses = _parse_feed($input, $download_url);
if(defined($level)) {
foreach my $s (@$statuses) {
$s->{busybird}{level} = $level;
}
}
my $statuses_json = $json->encode($statuses) . "\n";
_post_statuses(\$statuses_json, $post_url, $user_agent);
}
sub _parse_feed {
my ($input, $download_url) = @_;
if(defined($download_url)) {
return $input->parse_url($download_url);
t/missing.t view on Meta::CPAN
use Test::More;
use Test::Warnings;
use BusyBird::Input::Feed;
use File::Spec;
## Parse a sample Atom feed in which some entries lack fields, and check
## how the missing fields show up in the resulting statuses.
my $sample_path = File::Spec->catfile(".", "t", "samples", "missing_fields.atom");
my $input = BusyBird::Input::Feed->new(use_favicon => 0);
my $got = $input->parse_file($sample_path);

## First status: no title in the feed entry.
is $got->[0]{text}, "", "title is missing. text should be an empty string.";

## Second status: no link in the feed entry.
ok !exists $got->[1]{busybird}{status_permalink}, "link is missing. status_permalink should not even exist.";

done_testing;
t/samples.t view on Meta::CPAN
## present in the expected statuses are checked.
my @testcases = (
{ filename => 'rtcpan.rdf',
exp_num => 15,
exp_partial => [
## If <guid> is not present, use <link> for item_id.
## "id" field is (timestamp | item_id)
sh(id => '1363869367|https://rt.cpan.org/Ticket/Display.html?id=84118',
text => 'I really beg you to take back the exception catching feature in Future 0.11',
busybird => sh( status_permalink => 'https://rt.cpan.org/Ticket/Display.html?id=84118' ),
created_at => 'Thu Mar 21 12:36:07 +0000 2013',
user => sh( screen_name => q{rt.cpan.org: Search Queue = 'future'} )),
sh( id => '1364188145|https://rt.cpan.org/Ticket/Display.html?id=84187',
text => 'needs_all() throws an exception when immediate failed subfutures are given',
busybird => sh( status_permalink => 'https://rt.cpan.org/Ticket/Display.html?id=84187' ),
created_at => 'Mon Mar 25 05:09:05 +0000 2013',
user => sh( screen_name => q{rt.cpan.org: Search Queue = 'future'} )),
sh( id => '1364188230|https://rt.cpan.org/Ticket/Display.html?id=84188',
text => 'Error message is not user-friendly for followed_by(), and_then(), or_else() and repeat()',
busybird => sh( status_permalink => 'https://rt.cpan.org/Ticket/Display.html?id=84188' ),
created_at => 'Mon Mar 25 05:10:30 +0000 2013',
user => sh( screen_name => q{rt.cpan.org: Search Queue = 'future'} )),
sh( id => '1364188340|https://rt.cpan.org/Ticket/Display.html?id=84189',
text => 'Behavior of repeat {...} foreach => [] may be counter-intuitive',
busybird => sh( status_permalink => 'https://rt.cpan.org/Ticket/Display.html?id=84189' ),
created_at => 'Mon Mar 25 05:12:20 +0000 2013',
user => sh( screen_name => q{rt.cpan.org: Search Queue = 'future'}))
]},
{ filename => 'slashdot.rss',
exp_num => 25,
exp_partial => [
## use <guid> for item_id. In this case, busybird.original.id should maintain the <guid>
sh( id => '1404616500|http://slashdot.feedsportal.com/c/35028/f/647410/s/3c35f940/sc/38/l/0Lhardware0Bslashdot0Borg0Cstory0C140C0A70C0A60C0A0A392340Cby0E20A450Ethe0Etop0Especies0Ewill0Eno0Elonger0Ebe0Ehumans0Eand0Ethat0Ecould0Ebe0Ea0Eproble...
text => q{By 2045 'The Top Species Will No Longer Be Humans,' and That Could Be a Problem},
busybird => sh( status_permalink => 'http://rss.slashdot.org/~r/Slashdot/slashdot/~3/HdnfMBYoOr4/story01.htm',
original => sh( id => 'http://slashdot.feedsportal.com/c/35028/f/647410/s/3c35f940/sc/38/l/0Lhardware0Bslashdot0Borg0Cstory0C140C0A70C0A60C0A0A392340Cby0E20A450Ethe0Etop0Especies0Ewill0Eno0Elonger0Ebe0Ehumans0Eand0Ethat0...
created_at => 'Sun Jul 06 03:15:00 +0000 2014',
user => sh( screen_name => 'Slashdot' ),
## extract <img>s from HTML content. Up to 3 images by default.
extended_entities => sh(media => [
sh(media_url => 'http://a.fsdn.com/sd/twitter_icon_large.png'),
sh(media_url => 'http://a.fsdn.com/sd/facebook_icon_large.png'),
sh(media_url => 'http://www.gstatic.com/images/icons/gplus-16.png'),
])),
sh( id => '1404606780|http://slashdot.feedsportal.com/c/35028/f/647410/s/3c35c953/sc/32/l/0Lscience0Bslashdot0Borg0Cstory0C140C0A70C0A60C0A0A42540Ctwo0Eearth0Elike0Eexoplanets0Edont0Eactually0Eexist0Dutm0Isource0Frss10B0Amainlinkanon0Gutm0I...
text => q{Two Earth-Like Exoplanets Don't Actually Exist},
busybird => sh( status_permalink => 'http://rss.slashdot.org/~r/Slashdot/slashdot/~3/NcsdVQtQOQQ/story01.htm',
original => sh( id => 'http://slashdot.feedsportal.com/c/35028/f/647410/s/3c35c953/sc/32/l/0Lscience0Bslashdot0Borg0Cstory0C140C0A70C0A60C0A0A42540Ctwo0Eearth0Elike0Eexoplanets0Edont0Eactually0Eexist0Dutm0Isource0Frss10B...
created_at => 'Sun Jul 06 00:33:00 +0000 2014',
user => sh( screen_name => 'Slashdot' ),
extended_entities => sh(media => [
sh(media_url => 'http://a.fsdn.com/sd/twitter_icon_large.png'),
sh(media_url => 'http://a.fsdn.com/sd/facebook_icon_large.png'),
sh(media_url => 'http://www.gstatic.com/images/icons/gplus-16.png'),
])),
]},
{ filename => 'stackoverflow.atom',
exp_num => 30,
exp_partial => [
sh( id => '1404624785|http://stackoverflow.com/q/24593005',
text => 'How to write Unit Test for IValidatableObject Model',
busybird => sh( status_permalink => 'http://stackoverflow.com/questions/24593005/how-to-write-unit-test-for-ivalidatableobject-model',
original => sh( id => 'http://stackoverflow.com/q/24593005' )),
## use <updated> date
created_at => 'Sun Jul 06 05:33:05 +0000 2014',
user => sh( screen_name => 'Recent Questions - Stack Overflow' )),
sh( id => '1404624716|http://stackoverflow.com/q/24593002',
text => 'hide softkeyboard when it is called from menuitem',
busybird => sh( status_permalink => 'http://stackoverflow.com/questions/24593002/hide-softkeyboard-when-it-is-called-from-menuitem',
original => sh( id => 'http://stackoverflow.com/q/24593002' )),
created_at => 'Sun Jul 06 05:31:56 +0000 2014',
user => sh( screen_name => 'Recent Questions - Stack Overflow' )),
]},
{ filename => 'googlejp.atom',
exp_num => 25,
exp_partial => [
sh( id => '1404701402|tag:blogger.com,1999:blog-20042392.post-2515664455683743324',
## status text should be decoded.
text => 'ãããããããã¡ãããã©ããã§ãããªããã©ããªé£éããã§ãã¯ãã¾ãããã',
## if there are multiple <link>s, use rel="alternate".
busybird => sh( status_permalink => 'http://feedproxy.google.com/~r/GoogleJapanBlog/~3/RP_M-WXr_6I/blog-post.html',
original => sh( id => 'tag:blogger.com,1999:blog-20042392.post-2515664455683743324' )),
## <updated> is used instead of <published>
created_at => 'Mon Jul 07 11:50:02 +0900 2014',
user => sh( screen_name => 'Google Japan Blog' ),
extended_entities => sh( media => [
sh(media_url => 'http://1.bp.blogspot.com/-eYSw5ZyZ7Ec/U7YgVYLF3TI/AAAAAAAAM_8/FPpTqUyesk0/s450/gochiphototop1.png'),
sh(media_url => 'http://1.bp.blogspot.com/-bp_kUa_Z8uQ/U7Yip34vN-I/AAAAAAAANAU/ktJQhMvf3BQ/s500/gochiprofile.png'),
sh(media_url => 'http://4.bp.blogspot.com/-pJkRMfPc2m4/U7Yi-Vm4pvI/AAAAAAAANAc/EbXv8oPCyBM/s100/genre_0011.png'),
] )),
sh( id => '1403245680|tag:blogger.com,1999:blog-20042392.post-4467811587369881889',
text => 'ææ°ã® Chrome Experiment ã§ããã¯ãããªãã«ãã·ã¥ã¼ããæ¥½ããã!',
busybird => sh( status_permalink => 'http://feedproxy.google.com/~r/GoogleJapanBlog/~3/qztQgCPoisw/chrome-experiment.html',
original => sh( id => 'tag:blogger.com,1999:blog-20042392.post-4467811587369881889' )),
## <published> is used when <updated> is missing
created_at => 'Fri Jun 20 15:28:00 +0900 2014',
user => sh( screen_name => 'Google Japan Blog' ),
extended_entities => sh(media => [
sh(media_url => 'http://feeds.feedburner.com/~r/GoogleJapanBlog/~4/qztQgCPoisw')
])),
]},
{ filename => 'slashdotjp.rdf',
exp_num => 13,
exp_partial => [
sh( id => '1404899040|http://linux.slashdot.jp/story/14/07/09/097242/',
text => 'ãã©ã¯ã«ã»ãªããã¯ã¹ãã½ãããã³ã¯ã»ãã¯ããã¸ã¼ã«è²·åããã',
busybird => sh( status_permalink => 'http://linux.slashdot.jp/story/14/07/09/097242/' ),
created_at => 'Wed Jul 09 09:44:00 +0000 2014',
user => sh( screen_name => 'ã¹ã©ãã·ã¥ãããã»ã¸ã£ãã³' )),
sh( id => '1404896100|http://yro.slashdot.jp/story/14/07/09/0533213/',
text => 'ãã¤ãªãã¶ã¼ããæãããä¸ä¸ç叿°ã®æ°ä½ãã©ã¼ã²ã¼ã ã¯DLCã§CERO Zç¸å½ã«ãªã',
busybird => sh( status_permalink => 'http://yro.slashdot.jp/story/14/07/09/0533213/' ),
created_at => 'Wed Jul 09 08:55:00 +0000 2014',
user => sh( screen_name => 'ã¹ã©ãã·ã¥ãããã»ã¸ã£ãã³' )),
]},
{ filename => 'pukiwiki_rss09.rss',
exp_num => 15,
exp_partial => [
## both ID and timestamp are missing. item_id is <link>. timestamp is just missing.
sh( id => 'http://debugitos.main.jp/index.php?Ubuntu%2FTrusty%A5%A4%A5%F3%A5%B9%A5%C8%A1%BC%A5%EB%A5%E1%A5%E2',
text => 'Ubuntu/Trustyã¤ã³ã¹ãã¼ã«ã¡ã¢',
busybird => sh( status_permalink => 'http://debugitos.main.jp/index.php?Ubuntu%2FTrusty%A5%A4%A5%F3%A5%B9%A5%C8%A1%BC%A5%EB%A5%E1%A5%E2' ),
created_at => undef,
user => sh( screen_name => q{DebugIto's} )),
]},
{ filename => 'nick.rss',
exp_num => 20,
exp_partial => [
sh( id => '1405617373|http://www.nickandmore.com/?p=24392',
## decode XML Entities (like ’)
text => q{Disney XDâs âThe 7Dâ Launches With Solid Ratings, App Hits 1.3M+ Downloads},
busybird => sh( status_permalink => 'http://www.nickandmore.com/2014/07/17/disney-xds-the-7d-launches-with-solid-ratings-app-hits-1-3m-downloads/',
original => sh( id => 'http://www.nickandmore.com/?p=24392' )),
created_at => 'Thu Jul 17 17:16:13 +0000 2014',
user => sh( screen_name => 'NICKandMORE' )),
sh( id => '1405613508|http://www.nickandmore.com/?p=24371',
## XML Entities with &
text => q{Disney Television Animation Announces âHaunted Mansionâ Special, Three Pilots & Short-Form Series},
busybird => sh( status_permalink => 'http://www.nickandmore.com/2014/07/17/disney-television-animation-announces-haunted-mansion-special-three-pilots-short-form-series/',
original => sh( id => 'http://www.nickandmore.com/?p=24371' )),
created_at => 'Thu Jul 17 16:11:48 +0000 2014',
user => sh( screen_name => 'NICKandMORE' ))
]},
{ filename => 'turner_press.rss',
exp_num => 10,
exp_partial => [
sh( id => '1410386063|7606 at https://pressroom.turner.com',
text => 'Mike Tyson Mysteries',
busybird => sh( status_permalink => 'https://pressroom.turner.com/us/adult-swim/mike-tyson-mysteries-1',
original => sh( id => '7606 at https://pressroom.turner.com' )),
created_at => 'Wed Sep 10 21:54:23 +0000 2014',
user => sh( screen_name => 'Turner Press Site' )),
sh( id => '1410385988|7605 at https://pressroom.turner.com',
text => 'Mike Tyson Mysteries',
busybird => sh( status_permalink => 'https://pressroom.turner.com/us/adult-swim/mike-tyson-mysteries-0',
original => sh( id => '7605 at https://pressroom.turner.com' )),
created_at => 'Wed Sep 10 21:53:08 +0000 2014',
user => sh( screen_name => 'Turner Press Site' ),
extended_entities => sh(media => [
## In the original feed data, only the path is in the src attr in <img> tag. In this case,
## the permalink's scheme and host should complement the link.
sh(media_url => 'https://pressroom.turner.com/modules/file/icons/image-x-generic.png')
]))
]},
{ filename => 'img_paths.rss',
exp_num => 2,
## test extraction of media_urls from <img> tags with path-only src attributes.
exp_partial => [
sh( id => '1410688800|img_paths:02',
text => 'link ends with non-slash',
busybird => sh( status_permalink => 'http://example.com/foo/bar/buzz.html',
original => sh( id => 'img_paths:02' )),
created_at => 'Sun Sep 14 10:00:00 +0000 2014',
user => sh( screen_name => 'Feed for testing img paths' ),
extended_entities => sh(media => [
sh(media_url => 'http://example.com/foo/bar/relative/path.png'),
sh(media_url => 'http://example.com/absolute/path.png')
])),
sh( id => '1410685200|img_paths:01',
text => 'link ends with slash',
busybird => sh( status_permalink => 'http://example.com/foo/bar/',
original => sh( id => 'img_paths:01' )),
created_at => 'Sun Sep 14 09:00:00 +0000 2014',
user => sh( screen_name => 'Feed for testing img paths' ),
extended_entities => sh(media => [
sh(media_url => 'http://example.com/foo/bar/relative/path.png'),
sh(media_url => 'http://example.com/absolute/path.png')
]))
]},
);
my ($output_json) = @_;
local $Test::Builder::Level = $Test::Builder::Level + 1;
my $got = decode_json($output_json);
is ref($got), "ARRAY", "got ARRAY-ref";
my $num = scalar(@$got);
cmp_ok $num, '>', 0, "got more than one (actually $num) statuses";
foreach my $i (0 .. $#$got) {
my $s = $got->[$i];
ok defined($s->{id}), "status $i: id is defined";
ok defined($s->{text}), "status $i: text is defined";
ok defined($s->{busybird}{status_permalink}), "status $i: busybird.status_permalink is defined";
}
}
## Command line that launches the script under test.
## $^X is the perl binary running this test, so the child process uses the
## same interpreter instead of whichever "perl" comes first in PATH.
my $run_cmd = qq{"$^X" -Ilib $FindBin::RealBin/../bin/busybird_input_feed};

{
    ## Feed is given via STDIN; the JSON statuses are read back from STDOUT.
    note("--- STDIN -> STDOUT");
    my $output = `$run_cmd < '$FindBin::RealBin/../t/samples/stackoverflow.atom'`;
    check_output($output);
}

{
    ## Same as above, but with "-l 5": every status must carry busybird.level == 5.
    note("--- STDIN -> STDOUT (level)");
    my $output = `$run_cmd -l 5 < '$FindBin::RealBin/../t/samples/stackoverflow.atom'`;
    my $got = decode_json($output);
    cmp_ok scalar(@$got), ">", 0, "get at least 1 status";
    foreach my $s (@$got) {
        is $s->{busybird}{level}, 5, "level set to 5";
    }
}

{
    ## Feed is downloaded from the URL given as the argument.
    ## NOTE(review): this hits the live network - confirm it is guarded elsewhere in the file.
    note("--- URL -> STDOUT");
    my $output = `$run_cmd 'http://rss.slashdot.org/Slashdot/slashdot'`;
    check_output($output);
}
{
if(!$ENV{BB_INPUT_FEED_NETWORK_TEST}) {
plan('skip_all', "Set BB_INPUT_FEED_NETWORK_TEST environment to enable the test");
exit;
}
## Shorthand: collect the key-value pairs in the arguments into a hash
## and wrap it with superhashof() (presumably Test::Deep's - confirm the import).
sub sh {
    my %expected_fields = @_;
    return superhashof(\%expected_fields);
}
## Expected shape of a status: these keys must be present, but their values
## are not compared (ignore()).
my $EXP_STATUS = sh(
    user       => sh( screen_name => ignore() ),
    created_at => ignore(),
    busybird   => sh( status_permalink => ignore() ),
    id         => ignore(),
);
sub check_statuses {
my ($label, $got_statuses) = @_;
cmp_ok scalar(@$got_statuses), ">", 0, "$label: loaded at least 1 status";
foreach my $status (@$got_statuses) {
cmp_deeply $status, $EXP_STATUS, "$label: status structure OK";
note("Status: $status->{text}");
( run in 0.320 second using v1.01-cache-2.11-cpan-87723dcf8b7 )