AnyEvent-WebArchive

 View release on MetaCPAN or search on MetaCPAN

lib/AnyEvent/WebArchive.pm  view on Meta::CPAN

package AnyEvent::WebArchive;

# Restore files for a domain from the Wayback Machine (web.archive.org)
# using non-blocking AnyEvent::HTTP requests.

use strict;
use warnings;
use AnyEvent::HTTP;
use Data::Dumper;
use base 'Exporter';
our $VERSION = '0.02';

our @EXPORT = qw(restore_url);
my $DEBUG = 0;
# restore_url($url, %options, $callback)
#
# Query the Wayback Machine search page for $url, fetch every archived
# page found, hand each (name, body) pair to _save_file(), and invoke
# $callback once all fetches have completed (or immediately when the
# search returns no results).
#
# Options: user_agent, max_per_host, active (see AnyEvent::HTTP).
sub restore_url {
	my $url = shift;
	my $cb  = pop;
	
	$url =~ s/^www\.//;
	my $opt = ref $_[0] ? $_[0] : {@_};
	
	$AnyEvent::HTTP::USERAGENT = $opt->{'user_agent'} || 'Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.5.24 Version/10.52';
	# BUGFIX: the original wrote "$GLOBAL ||= $opt->{...}", which never fires
	# because AnyEvent::HTTP initializes these globals to true defaults
	# ($MAX_PER_HOST = 4, $ACTIVE counter). Apply the caller's option when
	# given; otherwise leave the module default untouched.
	$AnyEvent::HTTP::MAX_PER_HOST = $opt->{'max_per_host'} if $opt->{'max_per_host'};
	$AnyEvent::HTTP::ACTIVE       = $opt->{'active'}       if $opt->{'active'};
	
	my $count = 0;                 # outstanding per-page fetches
	my $worker = bless { domain => $url }, __PACKAGE__;
	
	http_get _search($url), sub {
		$DEBUG && warn "GET $url\n";
		my ($body, $headers) = @_;
		
		# Each job is [archived_url, link_text] scraped from the
		# <!-- SEARCH RESULTS --> section of the search page.
		for my $job (grep { $_->[0] } # XXX
			map { [ /href="([^"]+)"/sg, />([^<]+)<\/a>/sg ] } map { split /(<br>){2}/ } 
			$body =~ m{<!-- SEARCH RESULTS -->(.*?)<!-- /SEARCH RESULTS -->}si
		) {
			$DEBUG && warn "GET $job->[0]\n";
			$count++;
			http_get $job->[0], sub {
				my ($body, $headers) = @_;
				if ($headers->{'Status'} == 200) {
					$worker->_save_file($job->[1], $body);
				} else {
					warn "Bad status for url $job->[0]: $_" for Dumper($headers);
				}
				
				# Fire the user callback once the last fetch finishes.
				$cb->() unless --$count;
			};
		}
		# BUGFIX: the original never invoked the callback when the search
		# page produced no results, leaving callers waiting forever.
		$cb->() unless $count;
	};
}

# _filename($string) -> $string
#
# Sanitize a string for use as a file name: every character outside the
# allowed set (letters, '.', ',', whitespace, ';', '-') becomes '_'.
# The input is left unmodified; a sanitized copy is returned.
sub _filename {
	my ($name) = @_;
	
	(my $safe = $name) =~ s/[^a-z\.\,\s\;-]/_/sig;
	
	return $safe;
}

# _search($domain) -> $url
#
# Build the Wayback Machine search URL listing archived snapshots
# for the given domain.
sub _search {
	my ($domain) = @_;
	return "http://web.archive.org/web/*sr_1nr_10000/$domain*";
}

sub _save_file {
	my ($worker, $url,$body) = @_;
	



( run in 0.966 second using v1.01-cache-2.11-cpan-39bf76dae61 )