Acme-Jungle-CrawlerExample

 view release on metacpan or  search on metacpan

lib/Acme/Jungle/CrawlerExample.pm  view on Meta::CPAN

package Acme::Jungle::CrawlerExample;
use Moose;
use Jungle;
our $VERSION     = '0.01';

# Crawler engine used by this example. When the caller does not pass a
# 'spider' to the constructor, a fresh Jungle instance is created.
has spider => (
    is      => 'ro',
    isa     => 'Jungle',
    default => sub { Jungle->new },
);



#################### main pod documentation begin ###################
## Below is the stub of documentation for your module. 
## You had better edit it!

lib/Data/News.pm  view on Meta::CPAN

package Data::News;
use Moose;
use Text::CSV_XS;
use DateTime;
use Digest::SHA1 qw(sha1_hex);
use HTML::Entities;

# Name of the CSV file associated with this object.
# When no name is supplied, it defaults to a timestamp in the local time
# zone, formatted as "DD-MM-YYYY_HH-MM-SS.csv".
has filename_csv => (
    is      => 'rw',
    isa     => 'Str',
    default => sub {
        my $now = DateTime->now( time_zone => 'local' );

        # Return the value to Moose rather than calling the writer on an
        # object that is still being constructed; Moose stores (and
        # type-checks) whatever the default sub returns.
        return $now->dmy('-') . '_' . $now->hms('-') . '.csv';
    },
);

# Site name; empty string until set.
has site_name => (
    is      => 'rw',
    isa     => 'Str',
    default => '',
);

# Runs after every call to the site_name accessor. When a true value was
# written, every "::" in it is replaced by "-" and the result is written
# back through the accessor.
after 'site_name' => sub {
    my ( $self, $name, $already_rewritten ) = @_;

    # Plain reads and writes of a false value need no post-processing.
    return if !$name;

    # The write-back below passes a true third argument, which is how this
    # modifier recognises its own call and avoids recursing forever.
    return if $already_rewritten;

    ( my $rewritten = $name ) =~ s{::}{-}g;
    $self->site_name( $rewritten, 1 );
};

# Free-form, read-write fields; each accepts any value ('Any').
has [
    qw/
        title
        author
        content
        webpage
        meta_keywords
        meta_description
    /
] => (
    is  => 'rw',
    isa => 'Any',
);

# Array reference of images; every object starts with its own empty array.
has images => (
    is      => 'rw',
    isa     => 'ArrayRef',
    default => sub { [] },
);

# Data::News object reachable through ->data. When none is supplied, the
# object itself is used as the default.
# NOTE(review): that default stores the object inside its own attribute,
# i.e. a self-reference -- confirm whether weak_ref => 1 is wanted here.
has data => (
    is      => 'rw',
    isa     => 'Data::News',
    default => sub {
        my $news = shift;
        return $news;
    },
);

# Text::CSV_XS instance configured to terminate each record with CRLF.
# Dies with the Text::CSV_XS diagnostic if the object cannot be created.
has csv => (
    is      => 'ro',
    isa     => 'Text::CSV_XS',
    default => sub {
        my $writer = Text::CSV_XS->new();
        if ( !$writer ) {
            die "Cannot use CSV: " . Text::CSV_XS->error_diag();
        }

        $writer->eol("\r\n");
        return $writer;
    },
);

sub save {    #saves the data to csv
    my ($self) = @_;
    my @rows = (

lib/NewsSpider/Terra.pm  view on Meta::CPAN

package NewsSpider::Terra; #has charset iso-8859-1
use Moose;
with qw/Jungle::Spider/;

# URL of the start page (a Terra "ultimas noticias" listing by default).
has startpage => (
    is      => 'rw',
    isa     => 'Str',
    default =>
      'http://noticias.terra.com.br/ultimasnoticias/0,,EI188,00.html',
);

# on_start: intentionally empty in this spider -- it only unpacks the
# invocant and performs no work.
# NOTE(review): presumably a hook expected by the Jungle::Spider role and
# called before crawling begins -- confirm against the role.
sub on_start { 
    my ( $self ) = @_; 
}

sub search {
    my ( $self ) = @_; 
    my $news = $self->tree->findnodes( '//div[@class="list articles"]/ol/li/a' );
    foreach my $item ( $news->get_nodelist ) {



( run in 0.708 second using v1.01-cache-2.11-cpan-0a6323c29d9 )