Dezi-Bot

 view release on metacpan or  search on metacpan

bin/dezibot  view on Meta::CPAN

}
# Count the URLs to crawl and fill in defaults for the parallelism settings:
# one worker unless told otherwise, and a pool size equal to the integer
# quotient of URLs per worker unless one was supplied.
my $n_urls = @uniq_urls;
$workers   = 1 unless $workers;
$pool_size = int( $n_urls / $workers ) unless $pool_size;

# show the effective configuration when debugging
dump $config if $debug;

# Serial case: with a single worker there is nothing to fork, so crawl
# every URL in this process and finish.
if ( $workers == 1 ) {
    my $crawled = Dezi::Bot->new( %{$config} )->crawl(@uniq_urls);
    warn "crawled $crawled URLs\n" if $verbose;
    exit(0);
}

# set up the parallel manager
my $manager = Parallel::Forker->new( use_sig_child => 1 );

# signal handling (propagate death)

# Hand SIGCHLD to the manager so it learns when a worker has exited.
$SIG{CHLD} = sub { Parallel::Forker::sig_child($manager) };

# On SIGTERM the parent forwards the signal to every worker it spawned.
# Forked workers inherit this handler, so the die() must be unconditional:
# if it only ran in the parent, a worker receiving the forwarded TERM would
# fall through the handler and keep crawling.
$SIG{TERM} = sub {
    if ( $manager && $manager->in_parent ) {
        $manager->kill_tree_all('TERM');
    }
    die "Quitting...\n";
};

# Schedule one forked job that crawls the given URLs and mark it ready.
#   $name - unique job name, reported in the verbose messages
#   @urls - URLs the job's Dezi::Bot instance will crawl
my $schedule_crawl = sub {
    my ( $name, @urls ) = @_;
    my $process = $manager->schedule(
        name         => $name,    # unique
        run_on_start => sub {
            my $proc = shift;
            if ($verbose) {
                warn sprintf( "[%s] starting %s\n", $$, $proc->name );
            }

            # runs in the forked child: each job gets its own bot
            my $bot = Dezi::Bot->new(%$config);
            $bot->crawl(@urls);
        },
        run_on_finish => sub {
            my ( $proc, $exit_status ) = @_;
            if ($verbose) {
                warn sprintf( "crawl(%s) exited with %s\n",
                    $proc->name, $exit_status );
            }
        },
    );
    $process->ready();
    return;
};

# if urls <= workers, no problem: one job per URL, all started at once.
if ( $n_urls <= $workers ) {
    for my $url (@uniq_urls) {
        $schedule_crawl->( $url, $url );
    }
}

# if urls > workers, divide urls into pools of $pool_size and schedule one
# job per pool. max_proc caps the number of concurrently running jobs at
# $workers; the remaining pools stay ready and are started by the manager's
# polling as running jobs finish, so workers are kept busy rather than
# waiting on the slowest.
else {
    $manager->max_proc($workers);

    # guard against a zero, negative or fractional pool size, which would
    # make splice() skip URLs
    my $size = int($pool_size);
    $size = 1 if $size < 1;

    my @pending = @uniq_urls;
    my $pool_no = 0;
    while ( my @pool = splice( @pending, 0, $size ) ) {

        # zero-padded so name order matches scheduling order
        $schedule_crawl->( sprintf( 'pool-%06d', ++$pool_no ), @pool );
    }
}

$manager->poll();        # start ready workers
$manager->wait_all();    # block till we're done
exit(0);

__END__

=head1 NAME

dezibot - parallel web crawler

=head1 SYNOPSIS

 # crawl 2 sites
 % dezibot http://dezi.org http://swish-e.org
 
 # crawl a list of sites
 % dezibot --urls file_with_urls
 
 # pass in stored config
 % dezibot --config botconfig.pl
 
 # crawl in parallel
 % dezibot --workers 5 --urls file_with_urls
 
=head1 DESCRIPTION

B<dezibot> is a command line tool wrapping the Dezi::Bot module. 

B<dezibot> can:

=over

=item *

read from a config file or take options on the command line

=item *

read URLs from a file or from @ARGV

=item *

spawn multiple parallel spiders

=back

=head1 OPTIONS

The following options are supported.

=head2 --help

Print this message.

=head2 --debug

Spew lots of information to stderr. Overrides any setting in B<--config>.



( run in 1.337 second using v1.01-cache-2.11-cpan-ceb78f64989 )