Dezi-Bot
view release on metacpan or search on metacpan
bin/dezibot view on Meta::CPAN
}
# Number of distinct URLs queued for crawling.
my $n_urls = @uniq_urls;

# Defaults: a single worker unless more were requested, and a per-worker
# pool size derived from the URL count unless one was supplied.
$workers   = 1                           unless $workers;
$pool_size = int( $n_urls / $workers )   unless $pool_size;

# In debug mode, show the effective bot configuration.
if ($debug) {
    dump $config;
}

# simple case, no parallel: crawl everything in this process and quit.
if ( $workers == 1 ) {
    my $crawler   = Dezi::Bot->new( %{$config} );
    my $n_crawled = $crawler->crawl(@uniq_urls);
    warn "crawled $n_crawled URLs\n" if $verbose;
    exit(0);
}
# set up the parallel manager
my $manager = Parallel::Forker->new( use_sig_child => 1 );

# signal handling (propagate death)

# Let the manager reap workers as they exit.
$SIG{CHLD} = sub { Parallel::Forker::sig_child($manager) };

# On TERM the parent forwards the signal to every worker it started, and
# then whichever process received the signal (parent or worker) dies.
# Signal handlers are inherited across fork, so the die must not be
# restricted to the parent: otherwise a worker would catch the forwarded
# TERM, do nothing, and keep crawling after the parent has gone away.
$SIG{TERM} = sub {
    if ( $manager && $manager->in_parent ) {
        $manager->kill_tree_all('TERM');
    }
    die "Quitting...\n";
};
# Parallel case: split the URLs into pools and crawl each pool in its own
# worker process.
#
#  * if urls <= workers, every URL gets a worker of its own (pool size 1).
#  * if urls > workers, URLs are grouped into pools of $pool_size. At most
#    $workers pools run at once; the remaining pools stay queued and are
#    started as running workers finish, so workers are kept busy instead of
#    waiting on the slowest one.

# How many URLs each worker process crawls. Clamp to at least 1 so that a
# zero, negative or fractional $pool_size can never stall the split below.
my $urls_per_pool = 1;
if ( $n_urls > $workers ) {
    $urls_per_pool = int( $pool_size || 1 );
    $urls_per_pool = 1 if $urls_per_pool < 1;
}

# Never run more than $workers processes simultaneously; poll() starts
# queued processes only while it is below this limit.
$manager->max_proc($workers);

# Schedule one worker process that crawls the given URLs and mark it ready.
# The process name must be unique; URLs are unique and pools are disjoint,
# so the space-joined URL list is used (a single URL is simply that URL).
my $schedule_pool = sub {
    my (@pool_urls) = @_;
    my $process = $manager->schedule(
        name         => join( ' ', @pool_urls ),    # unique
        run_on_start => sub {
            # runs in the child process
            my $proc = shift;
            if ($verbose) {
                warn sprintf( "[%s] starting %s\n", $$, $proc->name );
            }
            my $bot = Dezi::Bot->new(%$config);
            $bot->crawl(@pool_urls);
        },
        run_on_finish => sub {
            # runs in the parent once the child has exited
            my ( $proc, $exit_status ) = @_;
            if ($verbose) {
                warn sprintf( "crawl(%s) exited with %s\n",
                    $proc->name, $exit_status );
            }
        },
    );
    $process->ready();
    return $process;
};

# Carve the URL list into consecutive pools; the last pool may be short.
my @pending_urls = @uniq_urls;
while ( my @pool_urls = splice( @pending_urls, 0, $urls_per_pool ) ) {
    $schedule_pool->(@pool_urls);
}

$manager->poll();        # start ready workers (up to max_proc)
$manager->wait_all();    # block till we're done; starts queued pools as slots free up
exit(0);
__END__
=head1 NAME
dezibot - parallel web crawler
=head1 SYNOPSIS
# crawl 2 sites
% dezibot http://dezi.org http://swish-e.org
# crawl a list of sites
% dezibot --urls file_with_urls
# pass in stored config
% dezibot --config botconfig.pl
# crawl in parallel
% dezibot --workers 5 --urls file_with_urls
=head1 DESCRIPTION
B<dezibot> is a command line tool wrapping the Dezi::Bot module.
B<dezibot> can:
=over
=item
read from a config file or take options on the command line
=item
read URLs from a file or from @ARGV
=item
spawn multiple parallel spiders
=back
=head1 OPTIONS
The following options are supported.
=head2 --help
Print this message.
=head2 --debug
Spew lots of information to stderr. Overrides any setting in B<--config>.
( run in 1.337 second using v1.01-cache-2.11-cpan-ceb78f64989 )