App-jupiter
view release on metacpan or search on metacpan
script/jupiter view on Meta::CPAN
B<jupiter html> I<atom.xml template.xml planet.html template.html feed.opml>
In the above case, Planet Jupiter will write a feed called F<atom.xml> based on
F<template.xml> and a HTML file called F<planet.html> based on F<template.html>,
using the cached entries matching the feeds in F<feed.opml>.
=cut
use DateTime;
use DateTime::Format::Mail;
use DateTime::Format::ISO8601;
use File::Basename;
use File::Slurper qw(read_binary write_binary read_text write_text);
use List::Util qw(uniq min shuffle);
use Modern::Perl;
use Mojo::Log;
use Mojo::JSON qw(decode_json encode_json);
use Mojo::Template;
use Mojo::UserAgent;
use Pod::Simple::Text;
use XML::LibXML;
use Mojo::Util qw(slugify trim xml_escape html_unescape);
use File::ShareDir 'dist_file';
use vars qw($log);
our $log = Mojo::Log->new;
my $xpc = XML::LibXML::XPathContext->new;
$xpc->registerNs('atom', 'http://www.w3.org/2005/Atom');
$xpc->registerNs('html', 'http://www.w3.org/1999/xhtml');
$xpc->registerNs('dc', 'http://purl.org/dc/elements/1.1/');
$xpc->registerNs('itunes', 'http://www.itunes.com/dtds/podcast-1.0.dtd');
my $undefined_date = DateTime->from_epoch( epoch => 0 );
my (%wday, %month, $wday_re, $month_re);
%wday = qw (lun. Mon mar. Tue mer. Wed jeu. Thu ven. Fri sam. Sat dim. Sun);
%month = qw (janv. Jan févr. Feb mars Mar avr. Apr mai May juin Jun
juil. Jul août Aug sept. Sep oct. Oct nov. Nov déc. Dec);
$wday_re = join('|', map { quotemeta } keys %wday) unless $wday_re;
$month_re = join('|', map { quotemeta } keys %month) unless $month_re;
# Our tests don't want to call main
__PACKAGE__->main unless caller;
sub main {
my ($log_level) = grep /^--log=/, @ARGV;
$log->level(substr($log_level, 6)) if $log_level;
my ($command) = grep /^[a-z]+$/, @ARGV;
$command ||= 'help';
if ($command eq 'update') {
update_cache(@ARGV);
} elsif ($command eq 'html') {
make_html(@ARGV);
} else {
my $parser = Pod::Simple::Text->new();
$parser->parse_file($0);
}
}
sub update_cache {
my ($feeds, $files) = read_opml(@_);
make_directories($feeds);
load_feed_metadata($feeds, $files);
my $ua = Mojo::UserAgent->new->with_roles('+Queued')
->max_redirects(3)
->max_active(5);
make_promises($ua, $feeds);
fetch_feeds($feeds);
save_feed_metadata($feeds, $files);
cleanup_cache($feeds);
}
sub make_promises {
my $ua = shift;
my $feeds = shift;
for my $feed (@$feeds) {
my $url = html_unescape $feed->{url}; # undo xml_escape for the request
$ua->on(start => sub {
my ($ua, $tx) = @_;
$tx->req->headers->if_none_match($feed->{etag}) if ($feed->{etag});
$tx->req->headers->if_modified_since($feed->{last_modified}) if ($feed->{last_modified});
});
$feed->{promise} = $ua->get_p($url)
->catch(sub {
$feed->{message} = "@_";
$feed->{code} = 521;
# returning 0 in the case of an error is important
0; })
# sleeping to stop blogger.com from blocking us
->finally(sub { $log->debug($url); sleep 2; });
}
}
sub fetch_feeds {
my $feeds = shift;
$log->info("Fetching feeds...");
Mojo::Promise->all(map { $_->{promise} } @$feeds)->then(sub {
# all returns the values in the same order!
for (my $i = 0; $i < @_; $i++) {
my $feed = $feeds->[$i];
my $value = $_[$i];
my $tx = $value->[0];
# relies on catch returning 0 above
next unless $tx;
$feed->{message} = $tx->result->message;
$feed->{code} = $tx->result->code;
$feed->{last_modified} = $tx->result->headers->last_modified;
$feed->{etag} = $tx->result->headers->etag;
# save raw bytes if this is a success
eval { write_binary($feed->{cache_file}, $tx->result->body) } if $tx->result->is_success;
warn "Unable to write $feed->{cache_file}: $@\n" if $@;
}
})->catch(sub {
warn "Something went wrong: @_";
})->wait;
}
sub load_feed_metadata {
my $feeds = shift;
my $files = shift;
script/jupiter view on Meta::CPAN
for my $feed (@$feeds) {
next unless -r $feed->{cache_file};
my $doc = eval { XML::LibXML->load_xml(recover => 2, location => $feed->{cache_file} )};
if (not $doc) {
$feed->{message} = xml_escape "Parsing error: $@";
$feed->{code} = 422; # unprocessable
next;
}
$feed->{doc} = $doc;
my @nodes = $xpc->findnodes("/rss/channel/item | /atom:feed/atom:entry", $doc);
if (not @nodes) {
$feed->{message} = "Empty feed";
$feed->{code} = 204; # no content
next;
}
# if this is an Atom feed, we need to sort the entries ourselves (older entries at the end)
my @candidates = map {
my $entry = {};
$entry->{element} = $_;
$entry->{id} = id($_);
$entry->{date} = updated($_) || $undefined_date;
$entry;
} @nodes;
@candidates = grep { DateTime->compare($_->{date}, $now) <= 0 } @candidates;
@candidates = unique(sort { DateTime->compare( $b->{date}, $a->{date} ) } @candidates);
@candidates = @candidates[0 .. min($#candidates, $limit - 1)];
# now that we have limited the candidates, let's add more metadata from the feed
for my $entry (@candidates) {
$entry->{feed} = $feed;
# these two are already escaped
$entry->{blog_title} = $feed->{title};
$entry->{blog_url} = $feed->{url};
}
add_age_warning($feed, \@candidates, $date);
push @entries, @candidates;
}
return \@entries;
}
sub add_age_warning {
my $feed = shift;
my $entries = shift;
my $date = shift;
# feed modification date is smaller than the date given
my ($node) = $xpc->findnodes("/rss/channel | /atom:feed", $feed->{doc});
my $feed_date = updated($node);
if ($feed_date and DateTime->compare($feed_date, $date) == -1) {
$feed->{message} = "No feed updates in 90 days";
$feed->{code} = 206; # partial content
return;
} else {
# or no entry found with a modification date equal or bigger than the date given
for my $entry (@$entries) {
return if DateTime->compare($entry->{date}, $date) >= 0;
}
$feed->{message} = "No entry newer than 90 days";
$feed->{code} = 206; # partial content
}
}
sub updated {
my $node = shift;
return unless $node;
my @nodes = $xpc->findnodes('pubDate | atom:published | atom:updated', $node) or return;
my $date = $nodes[0]->textContent;
my $dt = eval { DateTime::Format::Mail->parse_datetime($date) }
|| eval { DateTime::Format::ISO8601->parse_datetime($date) }
|| eval { DateTime::Format::Mail->parse_datetime(french($date)) };
return $dt;
}
sub french {
my $date = shift;
$date =~ s/^($wday_re)/$wday{$1}/;
$date =~ s/\b($month_re)/$month{$1}/;
return $date;
}
sub id {
my $node = shift;
return unless $node;
my $id = $xpc->findvalue('guid | atom:id', $node); # id is mandatory for Atom
$id ||= $node->findvalue('link'); # one of the following three is mandatory for RSS
$id ||= $node->findvalue('title');
$id ||= $node->findvalue('description');
return $id;
}
sub unique {
my %seen;
my @unique;
for my $node (@_) {
next if $seen{$node->{id}};
$seen{$node->{id}} = 1;
push(@unique, $node);
}
return @unique;
}
sub limit {
my $entries = shift;
my $limit = shift;
# we want the most recent entries overall
@$entries = sort { DateTime->compare( $b->{date}, $a->{date} ) } unique(@$entries);
return [@$entries[0 .. min($#$entries, $limit - 1)]];
}
=head2 Writing templates for entries
Entries have the following keys available:
B<title> is the title of the post.
B<link> is the URL to the post on the web (probably a HTML page).
B<blog_title> is the title of the site.
B<blog_link> is the URL for the site on the web (probably a HTML page).
B<blog_url> is the URL for the site's feed (RSS or Atom).
( run in 1.722 second using v1.01-cache-2.11-cpan-437f7b0c052 )