WWW-Sitemapper
view release on metacpan or search on metacpan
t/sitemapper.t view on Meta::CPAN
use base qw( WWW::Sitemapper );
has 'url_parents' => (
is => 'rw',
isa => 'HashRef',
default => sub { +{} },
);
sub _build_robot_config {
my $self = shift;
return {
NAME => 'MyRobot',
EMAIL => 'me@domain.tld',
DELAY => 0,
};
}
sub url_test : Hook('follow-url-test') {
my $self = shift;
my ($robot, $hook_name, $uri) = @_;
my @restricted = (
qr{^/12.html},
);
my $url = $uri->path_query;
if ( $self->site->host eq $uri->host ) {
for my $re ( @restricted ) {
if ( $url =~ /$re/ ) {
return 0;
}
}
return 1;
}
return 0;
}
sub incorrect_attr : Attr(whatever) {
my $self = shift;
return 1;
}
around '_map_builder' => sub {
my $orig = shift;
my $self = shift;
my ($robot, $hook_name, $from_url, $to_url) = @_;
$self->url_parents->{$to_url->path}->{$from_url->path} = 1;
$self->$orig( $robot, $hook_name, $from_url, $to_url );
};
};
# We want to be safe from non-resolving local host names
delete $ENV{HTTP_PROXY};
my $d = HTTP::Daemon->new( LocalAddr => 'localhost' ) || die;
my $server_host = $d->url;
my $is_test;
my $STATUS_STORAGE_FILE = "t/status.storage";
my $TEST_TIME = time();
my $W3C_DATETIME = DateTime->from_epoch(
epoch => $TEST_TIME
)->strftime('%FT%T%z');
$W3C_DATETIME =~ s/(\d{2})$/:$1/;
# for qr
$W3C_DATETIME =~ s/([\-\+])/\\$1/g;
my $HTTP_DATE = time2str($TEST_TIME);
if ($is_test = fork ) {
# wait for server to start up
sleep 1;
my @valid_links = map {
"$server_host$_"
} qw(
index.html
1.html
11.html
3.html
31.html
32.html
2.html
21.html
22.html
);
my @valid_redirects = map {
"$server_host$_"
} qw(
friendly_url.html
);
my $mapper;
lives_ok {
$mapper = MyWebSite::Map->new(
site => "${server_host}index.html",
status_storage => $STATUS_STORAGE_FILE,
auto_save => 0.02,
);
} "mapper object created";
lives_ok {
$mapper->run();
} "run() works";
my $root = $mapper->tree;
is_deeply(
[ sort keys %{ $root->_dictionary } ],
[ sort @valid_links ],
"valid links were fetched"
);
is_deeply(
[ sort keys %{ $root->_redirects } ],
[ sort @valid_redirects ],
( run in 0.328 second using v1.01-cache-2.11-cpan-71847e10f99 )