AnyEvent-WebArchive
view release on metacpan or search on metacpan
lib/AnyEvent/WebArchive.pm view on Meta::CPAN
package AnyEvent::WebArchive;
use strict;
use warnings;
use AnyEvent::HTTP;
use Data::Dumper;
use base 'Exporter';
our $VERSION = '0.02';
our @EXPORT = qw(restore_url);
my $DEBUG = 0;   # set to 1 for verbose request tracing via warn()
# restore_url($url, [%options | \%options], $cb)
#
# Queries the Wayback Machine search listing for $url, then fetches every
# archived page found in the results.  Each page fetched with HTTP status
# 200 is handed to _save_file(); $cb is invoked exactly once, after the
# last outstanding fetch completes (or straight away if the search page
# lists no results).
#
# Recognized options:
#   user_agent   - UA string for all requests (defaults to an Opera 10 UA)
#   max_per_host - per-host connection limit ($AnyEvent::HTTP::MAX_PER_HOST)
#   active       - total concurrent-connection limit ($AnyEvent::HTTP::ACTIVE)
sub restore_url {
    my $url = shift;
    my $cb  = pop;                      # callback is always the last argument
    $url =~ s/^www\.//;                 # archive.org indexes domains without the www prefix
    my $opt = ref $_[0] ? $_[0] : {@_}; # accept either a hashref or a flat key/value list
    $AnyEvent::HTTP::USERAGENT = $opt->{'user_agent'} || 'Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.5.24 Version/10.52';
    # BUG FIX: the original did  $GLOBAL ||= $opt->{...}  — that only honours
    # the option when the global is currently false, and AnyEvent::HTTP ships
    # with true defaults, so the user's options were silently ignored.  Apply
    # an option when it was supplied; otherwise keep the module default.
    $AnyEvent::HTTP::MAX_PER_HOST = $opt->{'max_per_host'} if $opt->{'max_per_host'};
    $AnyEvent::HTTP::ACTIVE       = $opt->{'active'}       if $opt->{'active'};
    my $count = 0;                      # fetches still in flight
    my $worker = {};
    bless $worker, __PACKAGE__;
    $worker->{'domain'} = $url;
    http_get _search($url), sub {
        $DEBUG && warn "GET $url\n";
        my ($body, $headers) = @_;
        # Pull the block between the SEARCH RESULTS markers, split it into
        # per-entry chunks, and extract [href, link-text] pairs; entries
        # without an href are dropped.
        for my $job (grep { $_->[0] }
            map { [ /href="([^"]+)"/sg, />([^<]+)<\/a>/sg ] } map { split /(<br>){2}/ }
            $body =~ m{<!-- SEARCH RESULTS -->(.*?)<!-- /SEARCH RESULTS -->}si
        ) {
            $DEBUG && warn "GET $job->[0]\n";
            $count++;
            http_get $job->[0], sub {
                my ($body, $headers) = @_;
                if ($headers->{'Status'} == 200) {
                    $worker->_save_file($job->[1], $body);
                } else {
                    warn "Bad status for url $job->[0]: $_" for Dumper($headers);
                }
                $cb->() unless --$count;  # last outstanding fetch -> notify caller
            };
        }
        # BUG FIX: with zero search results the original never invoked the
        # callback, leaving the caller waiting forever.
        $cb->() unless $count;
    };
}
# Sanitize a string for use as a file name: every character outside the
# allowed set (ASCII letters — /i covers both cases — dot, comma,
# whitespace, semicolon, dash) is replaced with an underscore.
sub _filename {
    my ($name) = @_;
    (my $safe = $name) =~ s/[^a-z\.\,\s\;-]/_/sig;
    return $safe;
}
# Build the Wayback Machine search-listing URL for the given domain.
sub _search {
    my ($domain) = @_;
    return "http://web.archive.org/web/*sr_1nr_10000/$domain*";
}
sub _save_file {
my ($worker, $url,$body) = @_;
( run in 0.966 second using v1.01-cache-2.11-cpan-39bf76dae61 )