ParallelUserAgent

 view release on metacpan or  search on metacpan

lib/LWP/Parallel/RobotUA.pm  view on Meta::CPAN

# requests to an unvisited site.
# It will also observe the delay specified in our ->delay method
sub _make_connections_in_order {
    my $self = shift;
    LWP::Debug::trace('()');

    my($failed_connections, $remember_failures, $ordpend_connections, $rules) =
      @{$self}{qw(failed_connections remember_failures 
		  ordpend_connections rules)};

    my ($entry, @queue, %busy);
    # get first entry from pending connections
    while ( $entry = shift @$ordpend_connections ) {

	my $request = $entry->request;
	my $netloc  = eval { local $SIG{__DIE__}; $request->url->host_port; };

        if ( $remember_failures and $failed_connections->{$netloc} ) {
	    my $response = $entry->response;
	    $response->code (&HTTP::Status::RC_INTERNAL_SERVER_ERROR);
	    $response->message ("Server unavailable");
	    # simulate immediate response from server
	    $self->on_failure ($entry->request, $response, $entry);
	    next;
	  }

	push (@queue, $entry), next  if $busy{$netloc};

	# Do we try to access a new server?
	my $allowed = $rules->allowed($request->url);
	# PS: pending Robots.txt requests are always allowed! (hopefully)

	if ($allowed < 0) {
	  LWP::Debug::debug("Host not visited before, or robots.".
			    "txt expired: ($allowed) ".$request->url);
	    my $checking = $self->_checking_robots_txt ($netloc);
	    # let's see if we're already busy checkin' this host
	    if ( $checking > 0 ) {
	      LWP::Debug::debug("Already busy checking here. Request queued");
		push (@queue, $entry);
	    } elsif ( $checking < 0 ) {
		# We already checked here. Seems the robots.txt
		# expired afterall. Pretend we're allowed
	      LWP::Debug::debug("Checked this host before. robots.txt".
				" expired. Assuming access ok");
		$allowed = 1; 
	    } else { 
		# fetch "robots.txt"
		my $robot_url = $request->url->clone;

lib/LWP/Parallel/RobotUA.pm  view on Meta::CPAN

		            $rules->parse($robot_url, "", $fresh_until);
	                  }
			} else {
			  LWP::Debug::debug("No robots.txt file found at " . 
					    $netloc);
			    $rules->parse($robot_url, "", $fresh_until);
			}
		    },
		};
		# immediately try to connect (if bandwith available)
		push (@queue, $robot_entry), $busy{$netloc}++  
		    unless  $self->_check_bandwith($robot_entry);
		# mark this host as being checked
		$self->_checking_robots_txt ($netloc, 1);
		# don't forget to queue the entry that triggered this request
		push (@queue, $entry);
	    }
	} 

	unless ($allowed) {
	    # we're not allowed to connect to this host

lib/LWP/Parallel/RobotUA.pm  view on Meta::CPAN

		push (@queue, $entry);
		
		# now we also have to raise a red flag for all
		# remaining entries at this particular
		# host. Otherwise we might block the first x
		# requests to this server, but have finally waited
		# long enough when the x+1 request comes off the
		# queue, and then we would connect to the x+1
		# request before any of the first x requests
		# (which is not what we want!)
		$busy{$netloc}++;
              } else {
	        LWP::Debug::debug("'use_sleep' disabled, generating response");
	        my $res = new HTTP::Response
	          &HTTP::Status::RC_SERVICE_UNAVAILABLE, 'Please, slow down';
	        $res->header('Retry-After', time2str(time + $wait));
	        $entry->response($res);
	      }
	    } else { # check bandwith
		unless ( $self->_check_bandwith($entry) ) {
		    # if _check_bandwith returns a value, it means that
		    # no bandwith is available: push $entry on queue
		    push (@queue, $entry);
		    $busy{$netloc}++;
		} else {
		    $rules->visit($netloc);
		}
	    }
	}
    }
    # the un-connected entries form the new stack
    $self->{'ordpend_connections'} = \@queue;
}

lib/LWP/Parallel/RobotUA.pm  view on Meta::CPAN

	    
	    # Do we try to access a new server?
	    my $allowed = $rules->allowed($request->url);
	    # PS: pending Robots.txt requests are always allowed! (hopefully)
	    
	    if ($allowed < 0) {
	      LWP::Debug::debug("Host not visited before, or robots.".
				"txt expired: ".$request->url);
		my $checking = $self->_checking_robots_txt 
		    ($request->url->host_port);
		# let's see if we're already busy checkin' this host
		if ( $checking > 0 ) {
		    # if so, don't register yet another robots.txt request!
		  LWP::Debug::debug("Already busy checking here. ".
				    "Request queued");
		    unshift (@$queue, $entry);
		    next SERVER;
		} elsif ( $checking < 0 ) {
		    # We already checked here. Seems the robots.txt
		    # expired afterall. Pretend we're allowed
		  LWP::Debug::debug("Checked this host before. ".
				    "robots.txt expired. Assuming access ok");
		    $allowed = 1; 
		} else { 

lib/LWP/Parallel/UserAgent.pm  view on Meta::CPAN

    $self->_make_connections_in_order;
  } else {
    $self->_make_connections_unordered;
  }
}

# Walk the ordered list of pending connections, attempting each one in
# registration order while guaranteeing at most one connection attempt
# per host per pass: once a host is flagged (a request for it had to be
# deferred), every later request to that same host is deferred too, so
# per-host ordering is preserved.
sub _make_connections_in_order {
  my $self = shift;
  LWP::Debug::trace('()');

  my @deferred;       # entries we could not connect on this pass
  my %host_blocked;   # netloc -> true once we must stop connecting there

  while ( my $entry = shift @{ $self->{'ordpend_connections'} } ) {
    my $netloc = $self->_netloc($entry->request->url);

    # a request to this host was already deferred this pass;
    # keep ordering by deferring this one as well
    if ( $host_blocked{$netloc} ) {
      push @deferred, $entry;
      next;
    }

    # _check_bandwith returning false means no bandwidth was available
    # (NOTE(review): inferred from sibling code's usage — confirm);
    # defer the entry and block further attempts on this host
    if ( !$self->_check_bandwith($entry) ) {
      push @deferred, $entry;
      $host_blocked{$netloc} = 1;
    }
  }

  # whatever we could not connect becomes the new pending list
  $self->{'ordpend_connections'} = \@deferred;
}

# unordered connections have the advantage that we do not have to 
# care about screwing up our list of pending connections. This will
# speed up our iteration through the list
sub _make_connections_unordered {



( run in 0.263 second using v1.01-cache-2.11-cpan-3cd7ad12f66 )