ParallelUserAgent
view release on metacpan or search on metacpan
lib/LWP/Parallel.pm view on Meta::CPAN
require WWW::RobotRules::AnyDBM_File;
# shortcut for demo URLs
my $url = "http://www.cs.washington.edu/";
my $reqs = [
HTTP::Request->new('GET', $url),
# these are all redirects. depending on how you set
# 'redirect_ok' they either just return the status code for
# redirect (like 302 moved), or continue to follow redirection.
HTTP::Request->new('GET', $url."research/ahoy/"),
HTTP::Request->new('GET', $url."research/ahoy/doc/paper.html"),
HTTP::Request->new('GET', "http://metacrawler.cs.washington.edu:6060/"),
# these are all non-existant server. the first one should take
# some time, but the following ones should be rejected right
# away
HTTP::Request->new('GET', "http://www.foobar.foo/research/ahoy/"),
HTTP::Request->new('GET', "http://www.foobar.foo/foobar/foo/"),
HTTP::Request->new('GET', "http://www.foobar.foo/baz/buzz.html"),
# although server exists, file doesn't
HTTP::Request->new('GET', $url."foobar/bar/baz.html"),
];
my ($req,$res);
# establish persistant robot rules cache. See WWW::RobotRules for
# non-permanent version. you should probably adjust the agentname
# and cache filename.
my $rules = new WWW::RobotRules::AnyDBM_File 'ParallelUA', 'cache';
# create new UserAgent (actually, a Robot)
my $pua = new LWP::Parallel::RobotUA ("ParallelUA",
'yourname@your.site.com', $rules);
$pua->timeout (2); # in seconds
$pua->delay ( 5); # in seconds
$pua->max_req ( 2); # max parallel requests per server
$pua->max_hosts(10); # max parallel servers accessed
# for our own print statements that follow below:
local($\) = ""; # ensure standard $OUTPUT_RECORD_SEPARATOR
# register requests
foreach $req (@$reqs) {
print "Registering '".$req->url."'\n";
$pua->register ($req , \&handle_answer);
# Each request, even if it failed to # register properly, will
# show up in the final list of # requests returned by $pua->wait,
# so you can examine it # later.
}
# $pua->wait returns a pointer to an associative array, containing
# an '$entry' for each request made, sorted by its url. (as returned
# by $request->url->as_string)
my $entries = $pua->wait(); # give another timeout here, 25 seconds
# let's see what we got back (see also callback function!!)
foreach (keys %$entries) {
$res = $entries->{$_}->response;
# examine response to find cascaded requests (redirects, etc) and
# set current response to point to the very first response of this
# sequence. (not very exciting if you set '$pua->redirect(0)')
my $r = $res; my @redirects;
while ($r) {
$res = $r;
$r = $r->previous;
push (@redirects, $res) if $r;
}
# summarize response. see "perldoc HTTP::Response"
print "Answer for '",$res->request->url, "' was \t", $res->code,": ",
$res->message,"\n";
# print redirection history, in case we got redirected
foreach (@redirects) {
print "\t",$_->request->url, "\t", $_->code,": ", $_->message,"\n";
}
}
# our callback function gets called whenever some data comes in
# (same parameter format as standard LWP::UserAgent callbacks!)
sub handle_answer {
my ($content, $response, $protocol, $entry) = @_;
print "Handling answer from '",$response->request->url,": ",
length($content), " bytes, Code ",
$response->code, ", ", $response->message,"\n";
if (length ($content) ) {
# just store content if it comes in
$response->add_content($content);
} else {
# Having no content doesn't mean the connection is closed!
# Sometimes the server might return zero bytes, so unless
# you already got the information you need, you should continue
# processing here (see below)
# Otherwise you can return a special exit code that will
# determins how ParallelUA will continue with this connection.
# Note: We have to import those constants via "qw(:CALLBACK)"!
# return C_ENDCON; # will end only this connection
# (silly, we already have EOF)
# return C_LASTCON; # wait for remaining open connections,
# but don't issue any new ones!!
# return C_ENDALL; # will immediately end all connections
# and return from $pua->wait
}
# ATTENTION!! If you want to keep reading from your connection,
# you should have a final 'return undef' statement here. Even if
# you think that all data has arrived, it does not hurt to return
# undef here. The Parallel UserAgent will figure out by itself
# when to close the connection!
return undef; # just keep on connecting/reading/waiting
# until the server closes the connection.
}
=head1 AUTHOR
( run in 1.024 second using v1.01-cache-2.11-cpan-13bb782fe5a )