Dezi-App
view release on metacpan or search on metacpan
lib/Dezi/Aggregator/Spider.pm view on Meta::CPAN
# authorize and try again
$self->write_log(
uri => $uri,
msg => sprintf( "retrying, %s", $http_response->status_line ),
);
return $self->get_authorized_doc( $uri, $response );
}
else {
$self->write_log(
uri => $uri,
msg => $http_response->status_line,
);
return $response->status;
}
return; # never get here.
}
# Work out the basic-auth username and password to send for a URI.
#
# Arguments:
#   $uri - object answering path() and userinfo()
#
# Returns the list ( $user, $pass ). Both are undef when no credential
# string is available from any source.
#
# Credential strings are expected in "user:password" form and the sources
# are consulted in this order:
#   1. $self->{last_auth}{auth}, but only when $self->{last_auth}{path}
#      equals the URI path with its final segment removed
#   2. the userinfo part of the URI itself
#   3. $self->credentials
sub _get_user_pass {
    my $self = shift;
    my $uri  = shift;

    # Set basic auth if defined - use URI specific first, then credentials.
    # this doesn't track what should have authorization
    my $last_auth;
    if ( $self->{last_auth} ) {
        my $path = $uri->path;
        $path =~ s!/[^/]*$!!;    # strip the last path segment
        $last_auth = $self->{last_auth}->{auth}
            if $self->{last_auth}->{path} eq $path;
    }

    my $userpass
        = $last_auth || $uri->userinfo || $self->credentials || '';

    # Only the first colon separates user from password. Limiting the
    # split to two fields keeps any further colons inside the password
    # instead of silently truncating it.
    my ( $user, $pass ) = split /:/, $userpass, 2;

    return ( $user, $pass );
}
=head2 looks_like_feed( I<http_response> )
Called internally to perform naive heuristics on I<http_response>
to determine whether it looks like an XML feed of some kind,
rather than an HTML page.
=cut
# Decide whether an HTTP response carries an XML feed.
#
# Arguments:
#   $response - response object answering headers() and decoded_content();
#               croaks with "response required" when missing or false
#
# Returns whatever XML::Feed->parse() produced when the content type is one
# of the XML/feed types and parsing succeeds; returns 0 in every other case
# (HTML content types, unrelated content types, unparsable content).
sub looks_like_feed {
    my $self     = shift;
    my $response = shift or croak "response required";
    my $headers  = $response->headers;
    my $ct       = $headers->content_type;

    # HTML is never treated as a feed.
    if ( $ct eq 'text/html' or $ct eq 'application/xhtml+xml' ) {
        return 0;
    }

    if (   $ct eq 'text/xml'
        or $ct eq 'application/rss+xml'
        or $ct eq 'application/rdf+xml'
        or $ct eq 'application/atom+xml' )
    {
        my $xml = $response->decoded_content;    # TODO or content()

        # This is only a heuristic, so a parser failure of any kind
        # (exception or empty return) means "not a feed" rather than a
        # fatal error; this mirrors how looks_like_sitemap() traps errors.
        my $feed = eval { XML::Feed->parse( \$xml ) };

        # Normalise failure to 0 so every negative answer is the same value.
        return $feed || 0;
    }

    return 0;
}
=head2 looks_like_sitemap( I<http_response> )
Called internally to perform naive heuristics on I<http_response>
to determine whether it looks like an XML sitemap,
rather than an HTML page.
=cut
# Decide whether an HTTP response carries an XML sitemap.
#
# Arguments:
#   $response - response object answering headers() and decoded_content();
#               croaks with "response required" when missing or false
#
# Returns the loaded WWW::Sitemap::XML object when the content type is
# text/xml or application/xml and load() completes without throwing;
# returns 0 in every other case.
sub looks_like_sitemap {
    my $self     = shift;
    my $response = shift or croak "response required";
    my $headers  = $response->headers;
    my $ct       = $headers->content_type;

    # HTML is never treated as a sitemap.
    if ( $ct eq 'text/html' or $ct eq 'application/xhtml+xml' ) {
        return 0;
    }

    if (   $ct eq 'text/xml'
        or $ct eq 'application/xml' )
    {
        my $xml     = $response->decoded_content;    # TODO or content()
        my $sitemap = WWW::Sitemap::XML->new();

        # Judge success by the eval's own return value rather than by $@:
        # $@ can be false after a failure (an exception object that is
        # false in boolean context, or $@ reset while unwinding), which
        # would make a failed load look like a success.
        my $loaded = eval { $sitemap->load( string => $xml ); 1 };
        return 0 unless $loaded;

        return $sitemap;
    }

    return 0;
}
=head2 crawl( I<uri> )
Implements the required crawl() method. Recursively fetches I<uri>
and its child links to a depth set in max_depth().
Will quit after max_files() unless max_files==0.
Will quit after max_time() seconds unless max_time==0.
=cut
# Crawl each start URL in turn, handing every fetched document to the
# indexer (when one is configured) and counting it.
#
# Arguments:
#   @urls - start URLs; each is canonicalised, recorded in uri_cache,
#           queued, and stored as the current base in $self->{_base}
#
# Returns $self->count after the crawl stops.
#
# Stop conditions:
#   * max_files (when true): the whole crawl ends as soon as count reaches
#     it, including any start URLs not yet visited.
#   * max_time (when true): applies per start URL, because the clock is
#     restarted for each one; only the current URL is abandoned.
sub crawl {
    my $self = shift;
    my @urls = @_;

    my $indexer = $self->indexer;    # may be undef

URL:
    for my $url (@urls) {
        my $started = time();

        $self->debug and $self->write_log(
            uri => $url,
            msg => "crawling",
        );

        my $uri = URI->new($url)->canonical;
        $self->uri_cache->add( "$uri" => 1 );
        $self->add_to_queue($uri);
        $self->{_base} = $uri->as_string;

    DOC:
        while ( my $doc = $self->get_doc ) {
            $self->debug and $self->write_log_line();

            # get_doc can hand back a true value that is not a document
            # object; those are skipped without being counted.
            next DOC unless blessed($doc);

            # indexer not required
            $indexer->process($doc) if $indexer;

            $self->_increment_count;

            # This sub never resets the count, so once the file limit is
            # hit no later URL may fetch anything either: leave the outer
            # loop. Leaving only the inner loop would let every remaining
            # URL index one extra document past the limit.
            last URL
                if $self->max_files
                and $self->count >= $self->max_files;

            # The time budget restarts with each URL, so only give up on
            # the current one.
            last DOC
                if $self->max_time
                and ( time() - $started ) > $self->max_time;
        }
    }

    return $self->count;
}
=head2 write_log( I<args> )
( run in 1.049 second using v1.01-cache-2.11-cpan-5a3173703d6 )