Acme-Jungle-CrawlerExample
view release on metacpan or search on metacpan
lib/Acme/Jungle/CrawlerExample.pm view on Meta::CPAN
package Acme::Jungle::CrawlerExample;
use Moose;
use Jungle;
our $VERSION = '0.01';
# The Jungle instance used by this example; read-only, and a fresh
# Jungle object is created for it when none is supplied.
has 'spider' => (
    isa     => 'Jungle',
    is      => 'ro',
    default => sub { Jungle->new },
);
#################### main pod documentation begin ###################
## Below is the stub of documentation for your module.
## You had better edit it!
lib/Data/News.pm view on Meta::CPAN
package Data::News;
use Moose;
use Text::CSV_XS;
use DateTime;
use Digest::SHA1 qw(sha1_hex);
use HTML::Entities;
# Name for our CSV file.
#
# When no name is supplied, one is derived from the current local time as
# "<day-month-year>_<hour-minute-second>.csv", with '-' as the separator
# inside both the date part and the time part.
has filename_csv => (
    is      => 'rw',
    isa     => 'Str',
    default => sub {
        my $now = DateTime->now( time_zone => 'local' );

        # Return the value and let Moose store it; a default builder should
        # not write to the attribute it is computing.
        return $now->dmy('-') . '_' . $now->hms('-') . '.csv';
    },
);
# Read/write string attribute holding the site name; empty by default.
has 'site_name' => (
    isa     => 'Str',
    is      => 'rw',
    default => '',
);
# Normalizes values written through the site_name accessor by replacing
# every "::" with "-".
#
# Nothing happens when the accessor was called without a true value. The
# cleaned value is stored by calling the accessor again with a true second
# argument, which turns the nested run of this modifier into a no-op and so
# stops the recursion.
after 'site_name' => sub {
    my ( $self, $name, $already_normalized ) = @_;

    return unless $name;
    return if $already_normalized;

    $name =~ s{::}{-}g;
    $self->site_name( $name, 1 );
};
# Unconstrained read/write fields describing a single news item.
has $_ => (
    isa => 'Any',
    is  => 'rw',
) for qw( title author content webpage meta_keywords meta_description );
# Read/write array reference of images; each object starts with its own
# empty array.
has 'images' => (
    isa     => 'ArrayRef',
    is      => 'rw',
    default => sub { [] },
);
# Read/write Data::News attribute that defaults to the object itself.
# NOTE(review): the default makes the object hold a strong reference to
# itself (a reference cycle) -- confirm whether that is intended.
has 'data' => (
    isa     => 'Data::News',
    is      => 'rw',
    default => sub { return shift },
);
# Read-only Text::CSV_XS instance, configured to end each record with CRLF.
# Dies with the Text::CSV_XS diagnostic if the object cannot be created.
has 'csv' => (
    isa     => 'Text::CSV_XS',
    is      => 'ro',
    default => sub {
        my $writer = Text::CSV_XS->new();
        die "Cannot use CSV: " . Text::CSV_XS->error_diag()
            unless $writer;

        $writer->eol("\r\n");
        return $writer;
    },
);
sub save { #saves the data to csv
my ($self) = @_;
my @rows = (
lib/NewsSpider/Terra.pm view on Meta::CPAN
package NewsSpider::Terra; #has charset iso-8859-1
use Moose;
with qw/Jungle::Spider/;
# Read/write URL string; defaults to the Terra "ultimas noticias" listing.
has 'startpage' => (
    isa     => 'Str',
    is      => 'rw',
    default => 'http://noticias.terra.com.br/ultimasnoticias/0,,EI188,00.html',
);
# Intentionally empty: this spider does nothing in on_start.
# NOTE(review): presumably a hook expected by the Jungle::Spider role --
# confirm against the role's requirements.
sub on_start {
    my ($spider) = @_;
}
sub search {
my ( $self ) = @_;
my $news = $self->tree->findnodes( '//div[@class="list articles"]/ol/li/a' );
foreach my $item ( $news->get_nodelist ) {
( run in 0.708 second using v1.01-cache-2.11-cpan-0a6323c29d9 )