App-scrape

 view release on metacpan or  search on metacpan

bin/scrape2rss.pl  view on Meta::CPAN

Selector for the entry title

=item B<--summary>

Selector for the entry summary

=item B<--permalink>

Selector for the entry permalink

=item B<--pages>

Selector for the pagination links to follow

=item B<--date>

Selector for the entry publication date

=item B<--date-fmt>

C<sprintf> format that the entry publication date is in
for conversion into a proper Atom timestamp

=item B<--outfile>

Name of the output file

Default is STDOUT

=item B<--debug>

Output information in clear text

=back

=cut

GetOptions(
    'help|h' => \my $help,
    'feed-url|b:s' => \my $feed_url,
    'feed-title|f:s' => \my $feed_title,
    'title|t:s' => \my $title,
    'summary|s:s' => \my $summary,
    'permalink|l:s' => \my $permalink,
    'pages|p:s' => \my $pages,
    'date:s' => \my $date,
    'date-fmt:s' => \my $date_fmt,
    'category|c:s' => \my $category,
    'outfile|o:s' => \my $outfile,
    'debug|d' => \my $debug,
) or pod2usage(2);
pod2usage(1) if $help;

die "No URL given.\n"
    unless @ARGV;

$feed_url ||= $outfile || 'feed.atom';
$feed_title ||= 'Atom feed';
$category ||= '';

my $updated = Time::Piece->gmtime->strftime('%Y-%m-%dT%H:%M:%SZ');

my $feed = XML::Atom::SimpleFeed->new(
    title   => $feed_title,
    link    => $feed_url,
    link    => { rel => 'self', href => $feed_url, },
    author  => 'scrape2rss',
    id      => $feed_url,
    updated => $updated,
);

my %seen;
while (@ARGV) {
    my $url = shift @ARGV;
    next if $seen{ $url }++;
    
    my $html;
    if ($url eq '-') {
        # read from STDIN
        local $/;
        $html = <>;
    } else {
        $html = get $url;
    };
    
    do_scrape($feed, $url, $html);
    
    if ($pages) {
        my @pagination = scrape( $html,
            { page => $pages },
            { base => $url },
        );
        push @ARGV, grep { !$seen{ $_ }} map {  $_->{page} } @pagination;
    };
};

if ($outfile) {
    open STDOUT, '>', $outfile
        or die "Couldn't create '$outfile': $!";
};
    
print $feed->as_string;

sub do_scrape {
    my ($feed, $url, $html) = @_;

    my @fields;

    my @rows = scrape($html, {
            summary => $summary,
            permalink => $permalink,
            title => $title,
            date => $date,
            #category => $category,
        }, {
        base => $url,
    });

    for my $item (@rows) {
        my $item_updated = $item->{date} || $updated;
        
        # Now, extract the information, just in case there is "garbage"
        # around the string
        (my $extr = $date_fmt) =~ s!%\w!\\d+!g;
        $extr = qr/($extr)/;
        
        if ($item_updated =~ /$extr/) {
            $item_updated = $1;
        } else {
            warn "Is [$updated] a valid date?\n";
            $item_updated = $updated;
        };
        
        my $ts = Time::Piece->strptime( $item_updated, $date_fmt );
        $updated = $ts->strftime('%Y-%m-%dT%H:%M:%SZ');

        my $enc_url = $item->{permalink};
        
        my %info = (
            title     => $item->{title},
            link      => $enc_url,
            id        => $enc_url,
            summary   => $item->{summary},
            updated   => $item_updated,
            category  => ($item->{category} || $category),
        );
        
        if ($debug) {
            for (sort keys %info) {
                printf "%10s : %s\n", $_, $info{ $_ };
            };
        };
            
        # beware. XML::Atom::SimpleFeed uses warnings => fatal,
        # so all warnings within it die.
        $feed->add_entry(%info);
    };
};

=head1 REPOSITORY

The public repository of this module is 
L<http://github.com/Corion/App-scrape>.

=head1 SUPPORT

The public support forum of this program is
L<http://perlmonks.org/>.

=head1 AUTHOR

Max Maischein C<corion@cpan.org>

=head1 COPYRIGHT (c)

Copyright 2011-2011 by Max Maischein C<corion@cpan.org>.

=head1 LICENSE

This module is released under the same terms as Perl itself.

=cut



( run in 1.062 second using v1.01-cache-2.11-cpan-5837b0d9d2c )