Dezi-App
view release on metacpan or search on metacpan
lib/Dezi/Aggregator/Spider.pm view on Meta::CPAN
# Spider configuration attributes. All are read-write ('rw') accessors.

# Integer timeout associated with credentials; defaults to 30.
# NOTE(review): units are not visible here -- presumably seconds; confirm.
has 'credential_timeout' => ( is => 'rw', isa => Int, default => sub {30} );
# Credentials as a single string; no default, so unset unless supplied.
# NOTE(review): expected format is not visible here -- confirm against the POD.
has 'credentials' => ( is => 'rw', isa => Str );
# Minimum spacing between requests; defaults to 5. _make_request compares it
# against time() deltas, so the value is in seconds.
has 'delay' => ( is => 'rw', isa => Int, default => sub {5} );
# Email address string. The default is a deliberately invalid placeholder
# that makes it obvious the user never configured a real address.
has 'email' => (
is => 'rw',
isa => Str,
default => sub {'dezi@user.failed.to.set.email.invalid'},
);
# File rules object; coerce => 1 lets callers pass any value the
# DeziFileRules type knows how to coerce.
has 'file_rules' => ( is => 'rw', isa => DeziFileRules, coerce => 1, );
# Boolean flag, on by default.
has 'follow_redirects' => ( is => 'rw', isa => Bool, default => sub {1} );
# Boolean flag, off by default. When true, a connection cache is configured
# on the user agent and _make_request skips the inter-request delay.
has 'keep_alive' => ( is => 'rw', isa => Bool, default => sub {0} );
# Whitelist of the HTML tags we consider "links".
# Should be a subset of what HTML::LinkExtor considers links.
has 'link_tags' => (
is => 'rw',
isa => ArrayRef,
default => sub { [ 'a', 'frame', 'iframe' ] }
);
# Optional integer depth limit; no default, so it is undef unless set.
# NOTE(review): presumably undef means "no depth limit" -- confirm.
has 'max_depth' => ( is => 'rw', isa => Maybe [Int] );
lib/Dezi/Aggregator/Spider.pm view on Meta::CPAN
truncated per LWP::UserAgent.
Set max_size to zero for unlimited size.
=item modified_since I<date>
This optional parameter will skip any URIs that do not report having
been modified since I<date>. The C<Last-Modified> HTTP header is used to
determine modification time.
=item keep_alive I<1|0>
This optional parameter will enable keep-alive requests. This can dramatically speed
up spidering and reduce the load on the server being spidered. The default is to not use
keep-alive, although enabling it will probably be the right thing to do.
To get the most out of keep alives, you may want to set up your web server to
allow a lot of requests per single connection (e.g., MaxKeepAliveRequests on Apache).
Apache's default is 100, which should be good.
When a connection is not closed the spider does not wait for the "delay"
time when making the next request. In other words, there is no delay in
requesting documents while the connection is open.
Note: you must have at least libwww-perl-5.53_90 installed to use this feature.
=item delay I<n>
lib/Dezi/Aggregator/Spider.pm view on Meta::CPAN
$self->{ua}->delay(0);
$self->{ua}->timeout( $self->timeout );
# TODO we test this using HEAD request. Set here too?
#$self->{ua}->max_size( $self->{max_size} ) if $self->{max_size};
if ( $self->use_cookies ) {
$self->{ua}->cookie_jar( HTTP::Cookies->new() );
}
if ( $self->keep_alive ) {
if ( $self->{ua}->can('conn_cache') ) {
$self->{ua}
->conn_cache( { total_capacity => $self->keep_alive } );
}
else {
warn
"can't use keep-alive: conn_cache() method not available on ua "
. ref( $self->{ua} );
}
}
$self->{_current_depth} = 1;
$self->{same_host_lookup} = { map { $_ => 1 } @{ $self->{same_hosts} } };
if ( $self->use_md5 ) {
Class::Load::load_class('Digest::MD5');
lib/Dezi/Aggregator/Spider.pm view on Meta::CPAN
return $self->_make_request($uri);
}
sub _make_request {
my ( $self, $uri ) = @_;
# get our useragent
my $ua = $self->ua;
my $delay = 0;
if ( $self->{keep_alive} ) {
$delay = 0;
}
elsif ( !$self->{delay} or !$self->{_last_response_time} ) {
$delay = 0;
}
else {
my $elapsed = time() - $self->{_last_response_time};
$delay = $self->{delay} - $elapsed;
$delay = 0 if $delay < 0;
$self->debug
( run in 2.414 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )