ParallelUserAgent
view release on metacpan or search on metacpan
lib/LWP/Parallel/RobotUA.pm view on Meta::CPAN
# requests to an unvisited site.
# It will also observe the delay specified in our ->delay method
sub _make_connections_in_order {
my $self = shift;
LWP::Debug::trace('()');
my($failed_connections, $remember_failures, $ordpend_connections, $rules) =
@{$self}{qw(failed_connections remember_failures
ordpend_connections rules)};
my ($entry, @queue, %busy);
# get first entry from pending connections
while ( $entry = shift @$ordpend_connections ) {
my $request = $entry->request;
my $netloc = eval { local $SIG{__DIE__}; $request->url->host_port; };
if ( $remember_failures and $failed_connections->{$netloc} ) {
my $response = $entry->response;
$response->code (&HTTP::Status::RC_INTERNAL_SERVER_ERROR);
$response->message ("Server unavailable");
# simulate immediate response from server
$self->on_failure ($entry->request, $response, $entry);
next;
}
push (@queue, $entry), next if $busy{$netloc};
# Do we try to access a new server?
my $allowed = $rules->allowed($request->url);
# PS: pending Robots.txt requests are always allowed! (hopefully)
if ($allowed < 0) {
LWP::Debug::debug("Host not visited before, or robots.".
"txt expired: ($allowed) ".$request->url);
my $checking = $self->_checking_robots_txt ($netloc);
# let's see if we're already busy checkin' this host
if ( $checking > 0 ) {
LWP::Debug::debug("Already busy checking here. Request queued");
push (@queue, $entry);
} elsif ( $checking < 0 ) {
# We already checked here. Seems the robots.txt
# expired afterall. Pretend we're allowed
LWP::Debug::debug("Checked this host before. robots.txt".
" expired. Assuming access ok");
$allowed = 1;
} else {
# fetch "robots.txt"
my $robot_url = $request->url->clone;
lib/LWP/Parallel/RobotUA.pm view on Meta::CPAN
$rules->parse($robot_url, "", $fresh_until);
}
} else {
LWP::Debug::debug("No robots.txt file found at " .
$netloc);
$rules->parse($robot_url, "", $fresh_until);
}
},
};
# immediately try to connect (if bandwith available)
push (@queue, $robot_entry), $busy{$netloc}++
unless $self->_check_bandwith($robot_entry);
# mark this host as being checked
$self->_checking_robots_txt ($netloc, 1);
# don't forget to queue the entry that triggered this request
push (@queue, $entry);
}
}
unless ($allowed) {
# we're not allowed to connect to this host
lib/LWP/Parallel/RobotUA.pm view on Meta::CPAN
push (@queue, $entry);
# now we also have to raise a red flag for all
# remaining entries at this particular
# host. Otherwise we might block the first x
# requests to this server, but have finally waited
# long enough when the x+1 request comes off the
# queue, and then we would connect to the x+1
# request before any of the first x requests
# (which is not what we want!)
$busy{$netloc}++;
} else {
LWP::Debug::debug("'use_sleep' disabled, generating response");
my $res = new HTTP::Response
&HTTP::Status::RC_SERVICE_UNAVAILABLE, 'Please, slow down';
$res->header('Retry-After', time2str(time + $wait));
$entry->response($res);
}
} else { # check bandwith
unless ( $self->_check_bandwith($entry) ) {
# if _check_bandwith returns a value, it means that
# no bandwith is available: push $entry on queue
push (@queue, $entry);
$busy{$netloc}++;
} else {
$rules->visit($netloc);
}
}
}
}
# the un-connected entries form the new stack
$self->{'ordpend_connections'} = \@queue;
}
lib/LWP/Parallel/RobotUA.pm view on Meta::CPAN
# Do we try to access a new server?
my $allowed = $rules->allowed($request->url);
# PS: pending Robots.txt requests are always allowed! (hopefully)
if ($allowed < 0) {
LWP::Debug::debug("Host not visited before, or robots.".
"txt expired: ".$request->url);
my $checking = $self->_checking_robots_txt
($request->url->host_port);
# let's see if we're already busy checkin' this host
if ( $checking > 0 ) {
# if so, don't register yet another robots.txt request!
LWP::Debug::debug("Already busy checking here. ".
"Request queued");
unshift (@$queue, $entry);
next SERVER;
} elsif ( $checking < 0 ) {
# We already checked here. Seems the robots.txt
# expired afterall. Pretend we're allowed
LWP::Debug::debug("Checked this host before. ".
"robots.txt expired. Assuming access ok");
$allowed = 1;
} else {
lib/LWP/Parallel/UserAgent.pm view on Meta::CPAN
$self->_make_connections_in_order;
} else {
$self->_make_connections_unordered;
}
}
# Attempt pending connections in FIFO order, allowing at most one
# deferred entry per host per round.  An entry is held back when its
# host already has a deferred entry this round, or when
# _check_bandwith reports that it could not be started; everything
# held back becomes the new pending queue.
sub _make_connections_in_order {
    my $self = shift;
    LWP::Debug::trace('()');

    my @still_pending;    # entries we could not start this round
    my %host_deferred;    # hosts that already have a held-back entry

    while ( my $pending = shift @{ $self->{'ordpend_connections'} } ) {
        my $host = $self->_netloc($pending->request->url);

        # preserve ordering: once one entry for a host is deferred,
        # every later entry for that host is deferred too
        if ( $host_deferred{$host} ) {
            push @still_pending, $pending;
            next;
        }

        # _check_bandwith returns false when the connection could not
        # be made now (NOTE(review): sense inferred from usage here —
        # confirm against its definition elsewhere in the module)
        unless ( $self->_check_bandwith($pending) ) {
            push @still_pending, $pending;
            $host_deferred{$host}++;
        }
    }

    # the un-connected entries form the new pending queue
    $self->{'ordpend_connections'} = \@still_pending;
}
# unordered connections have the advantage that we do not have to
# care about screwing up our list of pending connections. This will
# speed up our iteration through the list
sub _make_connections_unordered {
( run in 0.278 second using v1.01-cache-2.11-cpan-87723dcf8b7 )