ParallelUserAgent
view release on metacpan or search on metacpan
lib/LWP/Parallel/RobotUA.pm view on Meta::CPAN
Returns the number of seconds you must wait before you can make a new
request to this server. This method keeps track of all of the robots
connection, and enforces the delay constraint specified via the delay
method above for each server individually.
Note: Although it says 'host', it really means 'netloc/server',
i.e. it differentiates between individual servers running on different
ports, even though they might be on the same machine ('host'). This
function is mostly used internally, where RobotUA calls it to find out
when to send the next request to a certain server.
=cut
sub host_wait
{
my($self, $netloc) = @_;
return undef unless defined $netloc;
my $last = $self->{'rules'}->last_visit($netloc);
if ($last) {
my $wait = int($self->{'delay'} * 60 - (time - $last));
$wait = 0 if $wait < 0;
return $wait;
}
return 0;
}
=head2 $ua->as_string
Returns a string that describes the state of the UA.
Mainly useful for debugging.
=cut
sub as_string
{
my $self = shift;
my @s;
push(@s, "Robot: $self->{'agent'} operated by $self->{'from'} [$self]");
push(@s, " Minimum delay: " . int($self->{'delay'}) . " minutes");
push(@s, " Rules = $self->{'rules'}");
join("\n", @s, '');
}
#
# private methods (reimplementations of LWP::Parallel::UserAgent methods)
#
# this method now first checks the robot rules. It will try to
# download the robots.txt file before proceeding with any more
# requests to an unvisited site.
# It will also observe the delay specified in our ->delay method
sub _make_connections_in_order {
my $self = shift;
LWP::Debug::trace('()');
my($failed_connections, $remember_failures, $ordpend_connections, $rules) =
@{$self}{qw(failed_connections remember_failures
ordpend_connections rules)};
my ($entry, @queue, %busy);
# get first entry from pending connections
while ( $entry = shift @$ordpend_connections ) {
my $request = $entry->request;
my $netloc = eval { local $SIG{__DIE__}; $request->url->host_port; };
if ( $remember_failures and $failed_connections->{$netloc} ) {
my $response = $entry->response;
$response->code (&HTTP::Status::RC_INTERNAL_SERVER_ERROR);
$response->message ("Server unavailable");
# simulate immediate response from server
$self->on_failure ($entry->request, $response, $entry);
next;
}
push (@queue, $entry), next if $busy{$netloc};
# Do we try to access a new server?
my $allowed = $rules->allowed($request->url);
# PS: pending Robots.txt requests are always allowed! (hopefully)
if ($allowed < 0) {
LWP::Debug::debug("Host not visited before, or robots.".
"txt expired: ($allowed) ".$request->url);
my $checking = $self->_checking_robots_txt ($netloc);
# let's see if we're already busy checkin' this host
if ( $checking > 0 ) {
LWP::Debug::debug("Already busy checking here. Request queued");
push (@queue, $entry);
} elsif ( $checking < 0 ) {
# We already checked here. Seems the robots.txt
# expired afterall. Pretend we're allowed
LWP::Debug::debug("Checked this host before. robots.txt".
" expired. Assuming access ok");
$allowed = 1;
} else {
# fetch "robots.txt"
my $robot_url = $request->url->clone;
$robot_url->path("robots.txt");
$robot_url->query(undef);
LWP::Debug::debug("Requesting $robot_url");
# make access to robot.txt legal since this might become
# a recursive call (in case we lack bandwith to connect
# immediately)
$rules->parse($robot_url, "");
my $robot_req = new HTTP::Request 'GET', $robot_url;
my $response = HTTP::Response->new(0, '<empty response>');
$response->request($robot_req);
my $robot_entry = new LWP::Parallel::UserAgent::Entry {
request => $robot_req,
response => $response,
size => 8192,
redirect_ok => 0,
arg => sub {
# callback function (closure)
my ($content, $robot_res, $protocol) = @_;
my $netloc = eval { local $SIG{__DIE__};
$request->url->host_port; };
# unset flag - we're done checking
$self->_checking_robots_txt ($netloc, -1);
$rules->visit($netloc);
my $fresh_until = $robot_res->fresh_until;
if ($robot_res->is_success) {
my $c = $robot_res->content;
if ($robot_res->content_type =~ m,^text/, &&
$c =~ /Disallow/) {
LWP::Debug::debug("Parsing robot rules for ".
$netloc);
$rules->parse($robot_url, $c, $fresh_until);
}
else {
LWP::Debug::debug("Ignoring robots.txt for ".
$netloc);
$rules->parse($robot_url, "", $fresh_until);
}
} else {
LWP::Debug::debug("No robots.txt file found at " .
$netloc);
$rules->parse($robot_url, "", $fresh_until);
}
},
};
# immediately try to connect (if bandwith available)
push (@queue, $robot_entry), $busy{$netloc}++
unless $self->_check_bandwith($robot_entry);
# mark this host as being checked
$self->_checking_robots_txt ($netloc, 1);
# don't forget to queue the entry that triggered this request
push (@queue, $entry);
}
}
unless ($allowed) {
# we're not allowed to connect to this host
my $res = new HTTP::Response
&HTTP::Status::RC_FORBIDDEN, 'Forbidden by robots.txt';
$entry->response($res);
# silently drop entry here from ordpend_connections
} elsif ($allowed > 0) {
# check robot-wait information to see if we have to wait
my $wait = $self->host_wait($netloc);
# if so, push on @queue queue
if ($wait) {
LWP::Debug::trace("Must wait $wait more seconds (sleep is ".
($self->{'use_sleep'} ? 'on' : 'off') . ")");
if ($self->{'use_sleep'}) {
# well, we don't really use sleep, but lets emulate
# the standard LWP behavior as closely as possible...
push (@queue, $entry);
# now we also have to raise a red flag for all
# remaining entries at this particular
# host. Otherwise we might block the first x
# requests to this server, but have finally waited
# long enough when the x+1 request comes off the
# queue, and then we would connect to the x+1
# request before any of the first x requests
# (which is not what we want!)
$busy{$netloc}++;
} else {
LWP::Debug::debug("'use_sleep' disabled, generating response");
my $res = new HTTP::Response
&HTTP::Status::RC_SERVICE_UNAVAILABLE, 'Please, slow down';
$res->header('Retry-After', time2str(time + $wait));
$entry->response($res);
}
} else { # check bandwith
unless ( $self->_check_bandwith($entry) ) {
# if _check_bandwith returns a value, it means that
# no bandwith is available: push $entry on queue
push (@queue, $entry);
$busy{$netloc}++;
} else {
$rules->visit($netloc);
}
}
}
}
# the un-connected entries form the new stack
$self->{'ordpend_connections'} = \@queue;
}
# this method now first checks the robot rules. It will try to
# download the robots.txt file before proceeding with any more
# requests to an unvisited site.
# It will also observe the delay specified in our ->delay method
sub _make_connections_unordered {
my $self = shift;
LWP::Debug::trace('()');
my($pending_connections, $failed_connections, $remember_failures, $rules) =
@{$self}{qw(pending_connections failed_connections
remember_failures rules)};
my ($entry, $queue, $netloc);
my %delete;
# check every host in sequence (use 'each' for better performance)
SERVER:
while (($netloc, $queue) = each %$pending_connections) {
# since we shouldn't alter the hash itself while iterating through it
# via 'each', we'll make a note here for each netloc that has an
# empty queue, so that we can explicitly delete them afterwards:
unless (@$queue) {
LWP::Debug::debug("Marking empty queue for '$netloc' for deletion");
$delete{$netloc}++;
next SERVER;
}
# check if we already tried to connect to this location, and failed
if ( $remember_failures and $failed_connections->{$netloc} ) {
LWP::Debug::debug("Removing all ". scalar @$queue .
" entries for unreachable host '$netloc'");
while ( $entry = shift @$queue ) {
my $response = $entry->response;
$response->code (&HTTP::Status::RC_INTERNAL_SERVER_ERROR);
$response->message ("Server unavailable");
# simulate immediate response from server
$self->on_failure ($entry->request, $response, $entry);
}
# make sure we delete this netloc-entry later
LWP::Debug::debug("Marking empty queue for '$netloc' for deletion");
$delete{$netloc}++;
next SERVER;
}
# get first entry from pending connections at this host
while ( $entry = shift @$queue ) {
my $request = $entry->request;
# Do we try to access a new server?
my $allowed = $rules->allowed($request->url);
# PS: pending Robots.txt requests are always allowed! (hopefully)
if ($allowed < 0) {
LWP::Debug::debug("Host not visited before, or robots.".
"txt expired: ".$request->url);
my $checking = $self->_checking_robots_txt
($request->url->host_port);
# let's see if we're already busy checkin' this host
if ( $checking > 0 ) {
# if so, don't register yet another robots.txt request!
LWP::Debug::debug("Already busy checking here. ".
"Request queued");
unshift (@$queue, $entry);
next SERVER;
} elsif ( $checking < 0 ) {
# We already checked here. Seems the robots.txt
# expired afterall. Pretend we're allowed
LWP::Debug::debug("Checked this host before. ".
"robots.txt expired. Assuming access ok");
$allowed = 1;
} else {
# queue the entry that triggered this request
unshift (@$queue, $entry);
# fetch "robots.txt" (i.e. create & issue robot request)
my $robot_url = $request->url->clone;
$robot_url->path("robots.txt");
$robot_url->query(undef);
LWP::Debug::debug("Requesting $robot_url");
# make access to robot.txt legal since this might become
# a recursive call (in case we lack bandwith to connect
# immediately)
$rules->parse($robot_url, "");
my $robot_req = new HTTP::Request 'GET', $robot_url;
my $response = HTTP::Response->new(0, '<empty response>');
$response->request($robot_req);
my $robot_entry = new LWP::Parallel::UserAgent::Entry {
request => $robot_req,
response => $response,
size => 8192,
redirect_ok => 0,
arg => sub {
# callback function (closure)
my ($content, $robot_res, $protocol) = @_;
my $netloc = eval { local $SIG{__DIE__};
$request->url->host_port; };
# unset flag - we're done checking
$self->_checking_robots_txt ($netloc, -1);
$rules->visit($netloc);
my $fresh_until = $robot_res->fresh_until;
if ($robot_res->is_success) {
my $c = $content; # thanks to Vlad Ciubotariu
if ($robot_res->content_type =~ m,^text/, &&
$c =~ /Disallow/) {
LWP::Debug::debug("Parsing robot rules for ".
$netloc);
$rules->parse($robot_url, $c, $fresh_until);
}
else {
LWP::Debug::debug("Ignoring robots.txt for ".
$netloc);
$rules->parse($robot_url, "", $fresh_until);
}
} else {
LWP::Debug::debug("No robots.txt file found at ".
$netloc);
$rules->parse($robot_url, "", $fresh_until);
}
( run in 0.685 second using v1.01-cache-2.11-cpan-e1769b4cff6 )