sub update results from the CPAN

App-jupiter

view release on metacpan or search on metacpan

B<jupiter html> I<atom.xml template.xml planet.html template.html feed.opml>

In the above case, Planet Jupiter will write a feed called F<atom.xml> based on
F<template.xml> and a HTML file called F<planet.html> based on F<template.html>,
using the cached entries matching the feeds in F<feed.opml>.

=cut

use DateTime;
use DateTime::Format::Mail;
use DateTime::Format::ISO8601;
use File::Basename;
use File::Slurper qw(read_binary write_binary read_text write_text);
use List::Util qw(uniq min shuffle);
use Modern::Perl;
use Mojo::Log;
use Mojo::JSON qw(decode_json encode_json);
use Mojo::Template;
use Mojo::UserAgent;
use Pod::Simple::Text;
use XML::LibXML;
use Mojo::Util qw(slugify trim xml_escape html_unescape);
use File::ShareDir 'dist_file';

use vars qw($log);
our $log = Mojo::Log->new;

my $xpc = XML::LibXML::XPathContext->new;
$xpc->registerNs('atom', 'http://www.w3.org/2005/Atom');
$xpc->registerNs('html', 'http://www.w3.org/1999/xhtml');
$xpc->registerNs('dc', 'http://purl.org/dc/elements/1.1/');
$xpc->registerNs('itunes', 'http://www.itunes.com/dtds/podcast-1.0.dtd');

my $undefined_date = DateTime->from_epoch( epoch => 0 );

my (%wday, %month, $wday_re, $month_re);
%wday = qw (lun. Mon mar. Tue mer. Wed jeu. Thu ven. Fri sam. Sat dim. Sun);
%month = qw (janv. Jan fÃ©vr. Feb mars Mar avr. Apr mai May juin Jun
	     juil. Jul aoÃ»t Aug sept. Sep oct. Oct nov. Nov dÃ©c. Dec);
$wday_re = join('|', map { quotemeta } keys %wday) unless $wday_re;
$month_re = join('|', map { quotemeta } keys %month) unless $month_re;

# Our tests don't want to call main
__PACKAGE__->main unless caller;

sub main {
  my ($log_level) = grep /^--log=/, @ARGV;
  $log->level(substr($log_level, 6)) if $log_level;
  my ($command) = grep /^[a-z]+$/, @ARGV;
  $command ||= 'help';
  if ($command eq 'update') {
    update_cache(@ARGV);
  } elsif ($command eq 'html') {
    make_html(@ARGV);
  } else {
    my $parser = Pod::Simple::Text->new();
    $parser->parse_file($0);
  }
}

sub update_cache {
  my ($feeds, $files) = read_opml(@_);
  make_directories($feeds);
  load_feed_metadata($feeds, $files);
  my $ua = Mojo::UserAgent->new->with_roles('+Queued')
      ->max_redirects(3)
      ->max_active(5);
  make_promises($ua, $feeds);
  fetch_feeds($feeds);
  save_feed_metadata($feeds, $files);
  cleanup_cache($feeds);
}

sub make_promises {
  my $ua = shift;
  my $feeds = shift;
  for my $feed (@$feeds) {
    my $url = html_unescape $feed->{url}; # undo xml_escape for the request
    $ua->on(start => sub {
      my ($ua, $tx) = @_;
      $tx->req->headers->if_none_match($feed->{etag}) if ($feed->{etag});
      $tx->req->headers->if_modified_since($feed->{last_modified}) if ($feed->{last_modified});
    });
    $feed->{promise} = $ua->get_p($url)
	->catch(sub {
	  $feed->{message} = "@_";
	  $feed->{code} = 521;
	  # returning 0 in the case of an error is important
	  0; })
	# sleeping to stop blogger.com from blocking us
	->finally(sub { $log->debug($url); sleep 2; });
  }
}

sub fetch_feeds {
  my $feeds = shift;
  $log->info("Fetching feeds...");
  Mojo::Promise->all(map { $_->{promise} } @$feeds)->then(sub {
    # all returns the values in the same order!
    for (my $i = 0; $i < @_; $i++) {
      my $feed = $feeds->[$i];
      my $value = $_[$i];
      my $tx = $value->[0];
      # relies on catch returning 0 above
      next unless $tx;
      $feed->{message} = $tx->result->message;
      $feed->{code} = $tx->result->code;
      $feed->{last_modified} = $tx->result->headers->last_modified;
      $feed->{etag} = $tx->result->headers->etag;
      # save raw bytes if this is a success
      eval { write_binary($feed->{cache_file}, $tx->result->body) } if $tx->result->is_success;
      warn "Unable to write $feed->{cache_file}: $@\n" if $@;
    }
  })->catch(sub {
    warn "Something went wrong: @_";
  })->wait;
}

sub load_feed_metadata {
  my $feeds = shift;
  my $files = shift;

script/jupiter view on Meta::CPAN

  for my $feed (@$feeds) {
    next unless -r $feed->{cache_file};
    my $doc = eval { XML::LibXML->load_xml(recover => 2, location => $feed->{cache_file} )};
    if (not $doc) {
      $feed->{message} = xml_escape "Parsing error: $@";
      $feed->{code} = 422; # unprocessable
      next;
    }
    $feed->{doc} = $doc;
    my @nodes = $xpc->findnodes("/rss/channel/item | /atom:feed/atom:entry", $doc);
    if (not @nodes) {
      $feed->{message} = "Empty feed";
      $feed->{code} = 204; # no content
      next;
    }
    # if this is an Atom feed, we need to sort the entries ourselves (older entries at the end)
    my @candidates = map {
      my $entry = {};
      $entry->{element} = $_;
      $entry->{id} = id($_);
      $entry->{date} = updated($_) || $undefined_date;
      $entry;
    } @nodes;
    @candidates = grep { DateTime->compare($_->{date}, $now) <= 0 } @candidates;
    @candidates = unique(sort { DateTime->compare( $b->{date}, $a->{date} ) } @candidates);
    @candidates = @candidates[0 .. min($#candidates, $limit - 1)];
    # now that we have limited the candidates, let's add more metadata from the feed
    for my $entry (@candidates) {
      $entry->{feed} = $feed;
      # these two are already escaped
      $entry->{blog_title} = $feed->{title};
      $entry->{blog_url} = $feed->{url};
    }
    add_age_warning($feed, \@candidates, $date);
    push @entries, @candidates;
  }
  return \@entries;
}

sub add_age_warning {
  my $feed = shift;
  my $entries = shift;
  my $date = shift;
  # feed modification date is smaller than the date given
  my ($node) = $xpc->findnodes("/rss/channel | /atom:feed", $feed->{doc});
  my $feed_date = updated($node);
  if ($feed_date and DateTime->compare($feed_date, $date) == -1) {
    $feed->{message} = "No feed updates in 90 days";
    $feed->{code} = 206; # partial content
    return;
  } else {
    # or no entry found with a modification date equal or bigger than the date given
    for my $entry (@$entries) {
      return if DateTime->compare($entry->{date}, $date) >= 0;
    }
    $feed->{message} = "No entry newer than 90 days";
    $feed->{code} = 206; # partial content
  }
}

sub updated {
  my $node = shift;
  return unless $node;
  my @nodes = $xpc->findnodes('pubDate | atom:published | atom:updated', $node) or return;
  my $date = $nodes[0]->textContent;
  my $dt = eval { DateTime::Format::Mail->parse_datetime($date) }
  || eval { DateTime::Format::ISO8601->parse_datetime($date) }
  || eval { DateTime::Format::Mail->parse_datetime(french($date)) };
  return $dt;
}

sub french {
  my $date = shift;
  $date =~ s/^($wday_re)/$wday{$1}/;
  $date =~ s/\b($month_re)/$month{$1}/;
  return $date;
}

sub id {
  my $node = shift;
  return unless $node;
  my $id = $xpc->findvalue('guid | atom:id', $node); # id is mandatory for Atom
  $id ||= $node->findvalue('link'); # one of the following three is mandatory for RSS
  $id ||= $node->findvalue('title');
  $id ||= $node->findvalue('description');
  return $id;
}

sub unique {
  my %seen;
  my @unique;
  for my $node (@_) {
    next if $seen{$node->{id}};
    $seen{$node->{id}} = 1;
    push(@unique, $node);
  }
  return @unique;
}

sub limit {
  my $entries = shift;
  my $limit = shift;
  # we want the most recent entries overall
  @$entries = sort { DateTime->compare( $b->{date}, $a->{date} ) } unique(@$entries);
  return [@$entries[0 .. min($#$entries, $limit - 1)]];
}

=head2 Writing templates for entries

Entries have the following keys available:

B<title> is the title of the post.

B<link> is the URL to the post on the web (probably a HTML page).

B<blog_title> is the title of the site.

B<blog_link> is the URL for the site on the web (probably a HTML page).

B<blog_url> is the URL for the site's feed (RSS or Atom).

( run in 0.784 second using v1.01-cache-2.11-cpan-9581c071862 )