BusyBird-Input-Feed

 view release on metacpan or  search on metacpan

Changes  view on Meta::CPAN

        - Fix favicon detector. It now ignores the feed's "link" tag
          if it doesn't look like an HTTP or HTTPS URL.

0.05    2014-09-15
        - Fix documentation. No changes to the code.

0.04    2014-09-15
        [ENHANCEMENT]
        - Now it extracts image URLs from feed items.
          "image_max_num" option controls this behavior.
        - busybird_input_feed executable now has --level option.

0.03    2014-08-17
        [PACKAGING]
        - migrate to Module::Build::Prereqs::FromCPANfile.

0.02    2014-07-28
        [BUG FIX]
        - Fix -p option of busybird_input_feed.
          It did not take the argument, silly.

0.01    2014-07-28
        First version, released on an unsuspecting world.

MANIFEST  view on Meta::CPAN

bin/busybird_input_feed
Build.PL
Changes
cpanfile
eg/parallel.pl
lib/BusyBird/Input/Feed.pm
lib/BusyBird/Input/Feed/Run.pm
MANIFEST			This list of files
README
t/00-load.t
t/images.t

bin/busybird_input_feed  view on Meta::CPAN

    post_url => $post_url,
    level => $level,
);

__END__

=pod

=head1 NAME

busybird_input_feed - command-line tool to import RSS/Atom feeds into BusyBird

=head1 SYNOPSIS

    $ busybird_input_feed [URL] [OPTIONS]
    
    ## Download a feed and output JSON statuses to STDOUT
    $ busybird_input_feed 'http://example.com/feed.rss'
    
    ## Input a feed file via STDIN and output JSON statuses to STDOUT
    $ busybird_input_feed < feed.rss
    
    ## Download a feed and post statuses to the BusyBird URL
    $ busybird_input_feed 'http://example.com/feed.rss' -p 'http://localhost:5000/timelines/home/statuses.json'

=head1 DESCRIPTION

This script imports an RSS/Atom feed, converts it into L<BusyBird> statuses and outputs the statuses in JSON format.

By default, it reads STDIN for a feed and writes statuses to STDOUT.
If the C<URL> argument is set, the feed is downloaded from that URL.

=head1 OPTIONS

lib/BusyBird/Input/Feed.pm  view on Meta::CPAN

}

## Pick a URL that represents the feed's home page.
##
## Arguments:
##   $feed     - an object responding to ->link.
##   $statuses - array-ref of status hash-refs.
##
## The feed's own link is preferred, but only when it starts with
## "http://" or "https://" (case-insensitive). Otherwise the first
## defined busybird.status_permalink found in $statuses is used.
##
## Returns the chosen URL string, or undef if there is no candidate.
sub _get_home_url {
    my ($self, $feed, $statuses) = @_;
    my $feed_link = $feed->link;
    if(defined($feed_link) && $feed_link =~ m{^https?://}i) {
        return $feed_link;
    }

    foreach my $status (@$statuses) {
        ## Statuses without a "busybird" section have no permalink to offer.
        next if !defined($status->{busybird});

        ## A fresh lexical per iteration: the rejected (non-HTTP) feed link
        ## must never be carried over and returned as the result.
        my $permalink = $status->{busybird}{status_permalink};
        return $permalink if defined $permalink;
    }

    ## Explicit undef (not an empty list), as before.
    return undef;
}

sub _get_favicon_url {
    my ($self, $feed, $statuses) = @_;
    return try {
        my $home_url = $self->_get_home_url($feed, $statuses);
        return undef if not defined $home_url;

lib/BusyBird/Input/Feed.pm  view on Meta::CPAN

}

sub _make_status_from_item {
    my ($self, $feed_title, $feed_item) = @_;
    my $created_at_dt = $self->_make_timestamp_datetime($feed_item->pubDate);
    my $text = $feed_item->title;
    $text = "" if !defined($text);
    my $permalink = $feed_item->link;
    my $status = {
        text => $text,
        busybird => { defined($permalink) ? (status_permalink => $permalink) : () },
        created_at => ($created_at_dt ? BusyBird::DateTime::Format->format_datetime($created_at_dt) : undef ),
        user => { screen_name => $feed_title },
    };
    my $guid = $feed_item->guid;
    my $item_id;
    if(defined $guid) {
        $item_id = $guid;
        $status->{busybird}{original}{id} = $guid;
    }else {
        $item_id = $feed_item->link;
    }
    if(defined($created_at_dt) && defined($item_id)) {
        $status->{id} = $created_at_dt->epoch . '|' . $item_id;
    }elsif(defined($item_id)) {
        $status->{id} = $item_id;
    }
    my @image_urls = $self->_extract_image_urls($feed_item);
    if(@image_urls) {

lib/BusyBird/Input/Feed.pm  view on Meta::CPAN

    $statuses = $input->parse_file("feed.atom");
    timeline("feed")->add($statuses);
    
    $statuses = $input->parse_url('https://metacpan.org/feed/recent?f=');
    timeline("feed")->add($statuses);

=head1 DESCRIPTION

L<BusyBird::Input::Feed> converts RSS and Atom feeds into L<BusyBird> status objects.

For convenience, an executable script L<busybird_input_feed> is bundled in this distribution.

=head1 CLASS METHODS

=head2 $input = BusyBird::Input::Feed->new(%args)

The constructor.

Fields in C<%args> are:

=over

lib/BusyBird/Input/Feed/Run.pm  view on Meta::CPAN

    my $post_url = $opts{post_url};
    my $user_agent = $opts{user_agent};
    my $level = $opts{level};
    my $input = BusyBird::Input::Feed->new(
        defined($user_agent) ? (user_agent => $user_agent) : ()
    );
    my $json = JSON->new->utf8->ascii;
    my $statuses = _parse_feed($input, $download_url);
    if(defined($level)) {
        foreach my $s (@$statuses) {
            $s->{busybird}{level} = $level;
        }
    }
    my $statuses_json = $json->encode($statuses) . "\n";
    _post_statuses(\$statuses_json, $post_url, $user_agent);
}

sub _parse_feed {
    my ($input, $download_url) = @_;
    if(defined($download_url)) {
        return $input->parse_url($download_url);

t/missing.t  view on Meta::CPAN

use Test::More;
use Test::Warnings;
use BusyBird::Input::Feed;
use File::Spec;

## Parse a sample feed and check how absent fields show up in the statuses.
## NOTE(review): "use_favicon => 0" presumably turns favicon detection off
## so the test does not depend on it -- confirm against the constructor docs.
my $sample_path = File::Spec->catfile(".", "t", "samples", "missing_fields.atom");
my $input = BusyBird::Input::Feed->new(use_favicon => 0);
my $got = $input->parse_file($sample_path);

## The first status is expected to have an empty text.
is $got->[0]{text}, "", "title is missing. text should be an empty string.";

## The second status is expected to carry no status_permalink key at all.
my $has_permalink = exists($got->[1]{busybird}{status_permalink});
ok !$has_permalink, "link is missing. status_permalink should not even exist.";

done_testing;

t/samples.t  view on Meta::CPAN

## present in the expected statuses are checked.

my @testcases = (
    { filename => 'rtcpan.rdf',
      exp_num => 15,
      exp_partial => [
          ## If <guid> is not present, use <link> for item_id.
          ## "id" field is (timestamp | item_id)
          sh(id => '1363869367|https://rt.cpan.org/Ticket/Display.html?id=84118',
             text => 'I really beg you to take back the exception catching feature in Future 0.11',
             busybird => sh( status_permalink => 'https://rt.cpan.org/Ticket/Display.html?id=84118' ),
             created_at => 'Thu Mar 21 12:36:07 +0000 2013',
             user => sh( screen_name => q{rt.cpan.org: Search Queue = 'future'} )),
          sh( id => '1364188145|https://rt.cpan.org/Ticket/Display.html?id=84187',
              text => 'needs_all() throws an exception when immediate failed subfutures are given',
              busybird => sh( status_permalink => 'https://rt.cpan.org/Ticket/Display.html?id=84187' ),
              created_at => 'Mon Mar 25 05:09:05 +0000 2013',
              user => sh( screen_name => q{rt.cpan.org: Search Queue = 'future'} )),
          sh( id => '1364188230|https://rt.cpan.org/Ticket/Display.html?id=84188',
              text => 'Error message is not user-friendly for followed_by(), and_then(), or_else() and repeat()',
              busybird => sh( status_permalink => 'https://rt.cpan.org/Ticket/Display.html?id=84188' ),
              created_at => 'Mon Mar 25 05:10:30 +0000 2013',
              user => sh( screen_name => q{rt.cpan.org: Search Queue = 'future'} )),
          sh( id => '1364188340|https://rt.cpan.org/Ticket/Display.html?id=84189',
              text => 'Behavior of repeat {...} foreach => [] may be counter-intuitive',
              busybird => sh( status_permalink => 'https://rt.cpan.org/Ticket/Display.html?id=84189' ),
              created_at => 'Mon Mar 25 05:12:20 +0000 2013',
              user => sh( screen_name => q{rt.cpan.org: Search Queue = 'future'}))
      ]},
    { filename => 'slashdot.rss',
      exp_num => 25,
      exp_partial => [
          ## use <guid> for item_id. In this case, busybird.original.id should maintain the <guid>
          sh( id => '1404616500|http://slashdot.feedsportal.com/c/35028/f/647410/s/3c35f940/sc/38/l/0Lhardware0Bslashdot0Borg0Cstory0C140C0A70C0A60C0A0A392340Cby0E20A450Ethe0Etop0Especies0Ewill0Eno0Elonger0Ebe0Ehumans0Eand0Ethat0Ecould0Ebe0Ea0Eproble...
              text => q{By 2045 'The Top Species Will No Longer Be Humans,' and That Could Be a Problem},
              busybird => sh( status_permalink => 'http://rss.slashdot.org/~r/Slashdot/slashdot/~3/HdnfMBYoOr4/story01.htm',
                              original => sh( id => 'http://slashdot.feedsportal.com/c/35028/f/647410/s/3c35f940/sc/38/l/0Lhardware0Bslashdot0Borg0Cstory0C140C0A70C0A60C0A0A392340Cby0E20A450Ethe0Etop0Especies0Ewill0Eno0Elonger0Ebe0Ehumans0Eand0Ethat0...
              created_at => 'Sun Jul 06 03:15:00 +0000 2014',
              user => sh( screen_name => 'Slashdot' ),

              ## extract <img>s from HTML content. Up to 3 images by default.
              extended_entities => sh(media => [
                  sh(media_url => 'http://a.fsdn.com/sd/twitter_icon_large.png'),
                  sh(media_url => 'http://a.fsdn.com/sd/facebook_icon_large.png'),
                  sh(media_url => 'http://www.gstatic.com/images/icons/gplus-16.png'),
              ])),
          sh( id => '1404606780|http://slashdot.feedsportal.com/c/35028/f/647410/s/3c35c953/sc/32/l/0Lscience0Bslashdot0Borg0Cstory0C140C0A70C0A60C0A0A42540Ctwo0Eearth0Elike0Eexoplanets0Edont0Eactually0Eexist0Dutm0Isource0Frss10B0Amainlinkanon0Gutm0I...
              text => q{Two Earth-Like Exoplanets Don't Actually Exist},
              busybird => sh( status_permalink => 'http://rss.slashdot.org/~r/Slashdot/slashdot/~3/NcsdVQtQOQQ/story01.htm',
                              original => sh( id => 'http://slashdot.feedsportal.com/c/35028/f/647410/s/3c35c953/sc/32/l/0Lscience0Bslashdot0Borg0Cstory0C140C0A70C0A60C0A0A42540Ctwo0Eearth0Elike0Eexoplanets0Edont0Eactually0Eexist0Dutm0Isource0Frss10B...
              created_at => 'Sun Jul 06 00:33:00 +0000 2014',
              user => sh( screen_name => 'Slashdot' ),
              extended_entities => sh(media => [
                  sh(media_url => 'http://a.fsdn.com/sd/twitter_icon_large.png'),
                  sh(media_url => 'http://a.fsdn.com/sd/facebook_icon_large.png'),
                  sh(media_url => 'http://www.gstatic.com/images/icons/gplus-16.png'),
              ])),
      ]},
    { filename => 'stackoverflow.atom',
      exp_num => 30,
      exp_partial => [
          sh( id => '1404624785|http://stackoverflow.com/q/24593005',
              text => 'How to write Unit Test for IValidatableObject Model',
              busybird => sh( status_permalink => 'http://stackoverflow.com/questions/24593005/how-to-write-unit-test-for-ivalidatableobject-model',
                              original => sh( id => 'http://stackoverflow.com/q/24593005' )),
            
              ## use <updated> date
              created_at => 'Sun Jul 06 05:33:05 +0000 2014',
              user => sh( screen_name => 'Recent Questions - Stack Overflow' )),
          sh( id => '1404624716|http://stackoverflow.com/q/24593002',
              text => 'hide softkeyboard when it is called from menuitem',
              busybird => sh( status_permalink => 'http://stackoverflow.com/questions/24593002/hide-softkeyboard-when-it-is-called-from-menuitem',
                              original => sh( id => 'http://stackoverflow.com/q/24593002' )),
              created_at => 'Sun Jul 06 05:31:56 +0000 2014',
              user => sh( screen_name => 'Recent Questions - Stack Overflow' )),
      ]},
    { filename => 'googlejp.atom',
      exp_num => 25,
      exp_partial => [
          sh( id => '1404701402|tag:blogger.com,1999:blog-20042392.post-2515664455683743324',

              ## status text should be decoded.
              text => 'あたらしい「ごちそうフォト」で、あなたがどんな食通かチェックしましょう。',
              
              ## if there are multiple <link>s, use rel="alternate".
              busybird => sh( status_permalink => 'http://feedproxy.google.com/~r/GoogleJapanBlog/~3/RP_M-WXr_6I/blog-post.html',
                              original => sh( id => 'tag:blogger.com,1999:blog-20042392.post-2515664455683743324' )),

              ## <updated> is used instead of <published>
              created_at => 'Mon Jul 07 11:50:02 +0900 2014',
              user => sh( screen_name => 'Google Japan Blog' ),

              extended_entities => sh( media => [
                  sh(media_url => 'http://1.bp.blogspot.com/-eYSw5ZyZ7Ec/U7YgVYLF3TI/AAAAAAAAM_8/FPpTqUyesk0/s450/gochiphototop1.png'),
                  sh(media_url => 'http://1.bp.blogspot.com/-bp_kUa_Z8uQ/U7Yip34vN-I/AAAAAAAANAU/ktJQhMvf3BQ/s500/gochiprofile.png'),
                  sh(media_url => 'http://4.bp.blogspot.com/-pJkRMfPc2m4/U7Yi-Vm4pvI/AAAAAAAANAc/EbXv8oPCyBM/s100/genre_0011.png'),
              ] )),
          
          sh( id => '1403245680|tag:blogger.com,1999:blog-20042392.post-4467811587369881889',
              text => '最新の Chrome Experiment でキック、ドリブル、シュートを楽しもう!',
              busybird => sh( status_permalink => 'http://feedproxy.google.com/~r/GoogleJapanBlog/~3/qztQgCPoisw/chrome-experiment.html',
                              original => sh( id => 'tag:blogger.com,1999:blog-20042392.post-4467811587369881889' )),

              ## <published> is used when <updated> is missing
              created_at => 'Fri Jun 20 15:28:00 +0900 2014',
              user => sh( screen_name => 'Google Japan Blog' ),

              extended_entities => sh(media => [
                  sh(media_url => 'http://feeds.feedburner.com/~r/GoogleJapanBlog/~4/qztQgCPoisw')
              ])),
      ]},
    { filename => 'slashdotjp.rdf',
      exp_num => 13,
      exp_partial => [
          sh( id => '1404899040|http://linux.slashdot.jp/story/14/07/09/097242/',
              text => 'ミラクル・リナックス、ソフトバンク・テクノロジーに買収される',
              busybird => sh( status_permalink => 'http://linux.slashdot.jp/story/14/07/09/097242/' ),
              created_at => 'Wed Jul 09 09:44:00 +0000 2014',
              user => sh( screen_name => 'スラッシュドット・ジャパン' )),
          sh( id => '1404896100|http://yro.slashdot.jp/story/14/07/09/0533213/',
              text => 'バイオハザードを手がけた三上真司氏の新作ホラーゲームはDLCでCERO Z相当になる',
              busybird => sh( status_permalink => 'http://yro.slashdot.jp/story/14/07/09/0533213/' ),
              created_at => 'Wed Jul 09 08:55:00 +0000 2014',
              user => sh( screen_name => 'スラッシュドット・ジャパン' )),
      ]},
    { filename => 'pukiwiki_rss09.rss',
      exp_num => 15,
      exp_partial => [
          ## both ID and timestamp are missing. item_id is <link>. timestamp is just missing.
          sh( id => 'http://debugitos.main.jp/index.php?Ubuntu%2FTrusty%A5%A4%A5%F3%A5%B9%A5%C8%A1%BC%A5%EB%A5%E1%A5%E2',
              text => 'Ubuntu/Trustyインストールメモ',
              busybird => sh( status_permalink => 'http://debugitos.main.jp/index.php?Ubuntu%2FTrusty%A5%A4%A5%F3%A5%B9%A5%C8%A1%BC%A5%EB%A5%E1%A5%E2' ),
              created_at => undef,
              user => sh( screen_name => q{DebugIto's} )),
      ]},
    { filename => 'nick.rss',
      exp_num => 20,
      exp_partial => [
          sh( id => '1405617373|http://www.nickandmore.com/?p=24392',

              ## decode XML Entities (like &#8217;)
              text => q{Disney XD’s “The 7D” Launches With Solid Ratings, App Hits 1.3M+ Downloads},
              busybird => sh( status_permalink => 'http://www.nickandmore.com/2014/07/17/disney-xds-the-7d-launches-with-solid-ratings-app-hits-1-3m-downloads/',
                              original => sh( id => 'http://www.nickandmore.com/?p=24392' )),
              created_at => 'Thu Jul 17 17:16:13 +0000 2014',
              user => sh( screen_name => 'NICKandMORE' )),
          sh( id => '1405613508|http://www.nickandmore.com/?p=24371',

              ## XML Entities with &amp;
              text => q{Disney Television Animation Announces “Haunted Mansion” Special, Three Pilots & Short-Form Series},
              busybird => sh( status_permalink => 'http://www.nickandmore.com/2014/07/17/disney-television-animation-announces-haunted-mansion-special-three-pilots-short-form-series/',
                              original => sh( id => 'http://www.nickandmore.com/?p=24371' )),
              created_at => 'Thu Jul 17 16:11:48 +0000 2014',
              user => sh( screen_name => 'NICKandMORE' ))
      ]},
    { filename => 'turner_press.rss',
      exp_num => 10,
      exp_partial => [
          sh( id => '1410386063|7606 at https://pressroom.turner.com',
              text => 'Mike Tyson Mysteries',
              busybird => sh( status_permalink => 'https://pressroom.turner.com/us/adult-swim/mike-tyson-mysteries-1',
                              original => sh( id => '7606 at https://pressroom.turner.com' )),
              created_at => 'Wed Sep 10 21:54:23 +0000 2014',
              user => sh( screen_name => 'Turner Press Site' )),
          sh( id => '1410385988|7605 at https://pressroom.turner.com',
              text => 'Mike Tyson Mysteries',
              busybird => sh( status_permalink => 'https://pressroom.turner.com/us/adult-swim/mike-tyson-mysteries-0',
                              original => sh( id => '7605 at https://pressroom.turner.com' )),
              created_at => 'Wed Sep 10 21:53:08 +0000 2014',
              user => sh( screen_name => 'Turner Press Site' ),
              extended_entities => sh(media => [

                  ## In the original feed data, only the path is in the src attr in <img> tag. In this case,
                  ## the permalink's scheme and host should complement the link.
                  sh(media_url => 'https://pressroom.turner.com/modules/file/icons/image-x-generic.png')
              ]))
      ]},
    { filename => 'img_paths.rss',
      exp_num => 2,

      ## test extraction of media_urls from <img> tags with path-only src attributes.
      exp_partial => [
          sh( id => '1410688800|img_paths:02',
              text => 'link ends with non-slash',
              busybird => sh( status_permalink => 'http://example.com/foo/bar/buzz.html',
                              original => sh( id => 'img_paths:02' )),
              created_at => 'Sun Sep 14 10:00:00 +0000 2014',
              user => sh( screen_name => 'Feed for testing img paths' ),
              extended_entities => sh(media => [
                  sh(media_url => 'http://example.com/foo/bar/relative/path.png'),
                  sh(media_url => 'http://example.com/absolute/path.png')
              ])),
          sh( id => '1410685200|img_paths:01',
              text => 'link ends with slash',
              busybird => sh( status_permalink => 'http://example.com/foo/bar/',
                              original => sh( id => 'img_paths:01' )),
              created_at => 'Sun Sep 14 09:00:00 +0000 2014',
              user => sh( screen_name => 'Feed for testing img paths' ),
              extended_entities => sh(media => [
                  sh(media_url => 'http://example.com/foo/bar/relative/path.png'),
                  sh(media_url => 'http://example.com/absolute/path.png')
              ]))
      ]},
);

xt/run.t  view on Meta::CPAN

    my ($output_json) = @_;
    local $Test::Builder::Level = $Test::Builder::Level + 1;
    my $got = decode_json($output_json);
    is ref($got), "ARRAY", "got ARRAY-ref";
    my $num = scalar(@$got);
    cmp_ok $num, '>', 0, "got more than one (actually $num) statuses";
    foreach my $i (0 .. $#$got) {
        my $s = $got->[$i];
        ok defined($s->{id}), "status $i: id is defined";
        ok defined($s->{text}), "status $i: text is defined";
        ok defined($s->{busybird}{status_permalink}), "status $i: busybird.status_permalink is defined";
    }
}

## Command line used to launch the script under test.
##
## $^X is used instead of a bare "perl" so that the child process runs on the
## same interpreter as this test (a different "perl" found in PATH may lack
## the required modules). The lib directory is resolved relative to this test
## file, so the test does not depend on the current working directory.
## Paths are single-quoted for the shell, like the sample paths below.
my $run_cmd = sprintf(
    q{'%s' -I'%s' '%s'},
    $^X,
    "$FindBin::RealBin/../lib",
    "$FindBin::RealBin/../bin/busybird_input_feed",
);

{
    ## Feed a sample file through STDIN and inspect the JSON written to STDOUT.
    note("--- STDIN -> STDOUT");
    my $output = `$run_cmd < '$FindBin::RealBin/../t/samples/stackoverflow.atom'`;
    check_output $output;
}

{
    ## With "-l 5", every output status should have busybird.level set to 5.
    note("--- STDIN -> STDOUT (level)");
    my $output = `$run_cmd -l 5 < '$FindBin::RealBin/../t/samples/stackoverflow.atom'`;
    my $got = decode_json($output);
    cmp_ok scalar(@$got), ">", 0, "get at least 1 status";
    foreach my $s (@$got) {
        is $s->{busybird}{level}, 5, "level set to 5";
    }
}

{
    ## Pass a feed URL as the argument; this case needs network access.
    note("--- URL -> STDOUT");
    my $output = `$run_cmd 'http://rss.slashdot.org/Slashdot/slashdot'`;
    check_output $output;
}

{

xt/url.t  view on Meta::CPAN


## These tests are opt-in: the whole file is skipped unless the
## BB_INPUT_FEED_NETWORK_TEST environment variable holds a true value.
unless($ENV{BB_INPUT_FEED_NETWORK_TEST}) {
    plan(skip_all => "Set BB_INPUT_FEED_NETWORK_TEST environment to enable the test");
    exit;
}

## Shorthand: pack the given key/value list into a hash-ref and hand it to
## superhashof() (presumably Test::Deep's partial-hash matcher -- the import
## is outside this excerpt).
sub sh {
    my $partial_hash = { @_ };
    return superhashof($partial_hash);
}

## Expected shape shared by all statuses in this test: the listed keys,
## each matched with ignore() instead of a concrete value.
my $EXP_STATUS = sh(
    id         => ignore(),
    busybird   => sh(status_permalink => ignore()),
    created_at => ignore(),
    user       => sh(screen_name => ignore()),
);

sub check_statuses {
    my ($label, $got_statuses) = @_;
    cmp_ok scalar(@$got_statuses), ">", 0, "$label: loaded at least 1 status";
    foreach my $status (@$got_statuses) {
        cmp_deeply $status, $EXP_STATUS, "$label: status structure OK";
        note("Status: $status->{text}");



( run in 0.369 second using v1.01-cache-2.11-cpan-87723dcf8b7 )