App-scrape
view release on metacpan or search on metacpan
bin/scrape2rss.pl view on Meta::CPAN
Selector for the entry title
=item B<--summary>
Selector for the entry summary
=item B<--permalink>
Selector for the entry permalink
=item B<--pages>
Selector for the pagination links to follow
=item B<--date>
Selector for the entry publication date
=item B<--date-fmt>
C<strptime> format (as understood by L<Time::Piece>) that the entry
publication date is in, for conversion into a proper Atom timestamp
=item B<--outfile>
Name of the output file
Default is STDOUT
=item B<--debug>
Output information in clear text
=back
=cut
# Command line options. These lexicals are file-scoped on purpose:
# do_scrape() further down reads the selectors, $date_fmt, $category,
# $debug and $updated directly.
my ($help, $feed_url, $feed_title, $title, $summary, $permalink,
    $pages, $date, $date_fmt, $category, $outfile, $debug);

my @option_spec = (
    'help|h'         => \$help,
    'feed-url|b:s'   => \$feed_url,
    'feed-title|f:s' => \$feed_title,
    'title|t:s'      => \$title,
    'summary|s:s'    => \$summary,
    'permalink|l:s'  => \$permalink,
    'pages|p:s'      => \$pages,
    'date:s'         => \$date,
    'date-fmt:s'     => \$date_fmt,
    'category|c:s'   => \$category,
    'outfile|o:s'    => \$outfile,
    'debug|d'        => \$debug,
);
GetOptions(@option_spec)
    or pod2usage(2);

if ($help) {
    pod2usage(1);
}

# At least one URL (or '-' for STDIN) is required.
@ARGV
    or die "No URL given.\n";

# Defaults for everything the user left out (or passed as an empty value).
unless ($feed_url) {
    $feed_url = $outfile || 'feed.atom';
}
unless ($feed_title) {
    $feed_title = 'Atom feed';
}
unless ($category) {
    $category = '';
}

# Timestamp of this run (UTC); used as the feed's "updated" value.
my $updated = Time::Piece->gmtime->strftime('%Y-%m-%dT%H:%M:%SZ');

# The feed URL doubles as plain link, self link and feed id.
my @feed_properties = (
    title   => $feed_title,
    link    => $feed_url,
    link    => { rel => 'self', href => $feed_url, },
    author  => 'scrape2rss',
    id      => $feed_url,
    updated => $updated,
);
my $feed = XML::Atom::SimpleFeed->new(@feed_properties);
# Crawl every URL given on the command line plus any pagination links
# discovered along the way. %seen guarantees each URL is handled only once,
# which also protects against pagination loops.
my %seen;
while (@ARGV) {
    my $url = shift @ARGV;
    next if $seen{ $url }++;

    my $html;
    if ($url eq '-') {
        # Read the whole document from STDIN. The STDIN handle is named
        # explicitly: the bare <> operator would treat the entries still
        # queued in @ARGV (further URLs, pagination links) as file names
        # and try to read those instead.
        local $/;
        $html = <STDIN>;
    } else {
        # NOTE(review): get() is presumably LWP::Simple::get, which returns
        # undef when the request fails -- confirm against the file's imports.
        $html = get $url;
    };

    # Nothing to scrape without a document; report it and carry on with
    # the remaining URLs instead of handing undef to the scraper.
    if (! defined $html) {
        warn "Couldn't retrieve '$url', skipping.\n";
        next;
    };

    do_scrape($feed, $url, $html);

    if ($pages) {
        # Queue the pagination links found on this page for later rounds.
        my @pagination = scrape( $html,
            { page => $pages },
            { base => $url },
        );
        # Rows without a usable link would otherwise end up as undef URLs.
        push @ARGV,
            grep { defined $_ and length $_ and !$seen{ $_ } }
            map  { $_->{page} } @pagination;
    };
};

# Write the finished feed to --outfile if given, to STDOUT otherwise.
if ($outfile) {
    open STDOUT, '>', $outfile
        or die "Couldn't create '$outfile': $!";
};
print $feed->as_string;
# do_scrape( $feed, $url, $html )
#
# Scrapes $html with the selectors given on the command line ($summary,
# $permalink, $title, $date) and adds one entry per scraped row to $feed.
#
#   $feed - feed object; only its add_entry() method is used here
#   $url  - URL the document came from, passed to scrape() as "base"
#   $html - the document text
#
# The scraped date of a row is cut out of any surrounding text, parsed
# according to $date_fmt and converted into an Atom timestamp. Rows without
# a date, or whose date cannot be parsed, get the timestamp of this run
# ($updated) instead; an unparsable date additionally produces a warning.
# The file-level $updated itself is never modified.
#
# With $debug set, the fields of every entry are printed to STDOUT.
sub do_scrape {
    my ($feed, $url, $html) = @_;

    my @rows = scrape($html, {
        summary   => $summary,
        permalink => $permalink,
        title     => $title,
        date      => $date,
        #category => $category,
    }, {
        base => $url,
    });

    my $atom_fmt = '%Y-%m-%dT%H:%M:%SZ';

    # Build the extraction pattern once per call: every %-conversion in
    # $date_fmt becomes a run of digits, everything else has to match
    # literally (quotemeta keeps characters like "(" or "." from being
    # interpreted as regex syntax).
    # NOTE: conversions that produce text (month or weekday names) are
    # not covered by this digits-only extraction.
    my $extr;
    if (defined $date_fmt and length $date_fmt) {
        my $pattern = join '',
            map { /\A%\w\z/ ? '\d+' : quotemeta($_) }
            split /(%\w)/, $date_fmt;
        $extr = qr/($pattern)/;
    };

    for my $item (@rows) {
        # Default: the timestamp of this run, which is already in Atom format
        my $item_updated = $updated;

        my $raw_date = $item->{date};
        if (defined $raw_date and length $raw_date) {
            my $ts;
            # Extract the date, just in case there is "garbage" around it
            if ($extr and $raw_date =~ $extr) {
                my $date_text = $1;
                # strptime() dies on input it cannot parse
                $ts = eval { Time::Piece->strptime( $date_text, $date_fmt ) };
            };
            if (defined $ts) {
                $item_updated = $ts->strftime($atom_fmt);
            } else {
                warn "Is [$raw_date] a valid date?\n";
            };
        };

        my $enc_url = $item->{permalink};
        my %info = (
            title    => $item->{title},
            link     => $enc_url,
            id       => $enc_url,
            summary  => $item->{summary},
            updated  => $item_updated,
            category => ($item->{category} || $category),
        );

        if ($debug) {
            for my $key (sort keys %info) {
                # Fields the scraper did not find are shown as empty
                my $value = defined $info{ $key } ? $info{ $key } : '';
                printf "%10s : %s\n", $key, $value;
            };
        };

        # beware. XML::Atom::SimpleFeed uses warnings => fatal,
        # so all warnings within it die.
        $feed->add_entry(%info);
    };
};
=head1 REPOSITORY
The public repository of this module is
L<http://github.com/Corion/App-scrape>.
=head1 SUPPORT
The public support forum of this program is
L<http://perlmonks.org/>.
=head1 AUTHOR
Max Maischein C<corion@cpan.org>
=head1 COPYRIGHT (c)
Copyright 2011-2011 by Max Maischein C<corion@cpan.org>.
=head1 LICENSE
This module is released under the same terms as Perl itself.
=cut
( run in 1.062 second using v1.01-cache-2.11-cpan-5837b0d9d2c )