Dezi-App

 view release on metacpan or  search on metacpan

lib/Dezi/Aggregator/Spider.pm  view on Meta::CPAN

        # authorize and try again
        $self->write_log(
            uri => $uri,
            msg => sprintf( "retrying, %s", $http_response->status_line ),
        );
        return $self->get_authorized_doc( $uri, $response );
    }
    else {

        $self->write_log(
            uri => $uri,
            msg => $http_response->status_line,
        );
        return $response->status;
    }

    return;    # never get here.
}

# _get_user_pass( $uri )
#
# Returns a two-element list ( $user, $pass ) to use for basic auth
# against $uri. Sources are tried in this order and the first true
# value wins:
#
#   1. $self->{last_auth}->{auth}, but only when the directory part of
#      $uri's path equals $self->{last_auth}->{path}
#   2. the userinfo embedded in $uri
#   3. $self->credentials
#
# The chosen string is expected in "user:pass" form. Both elements are
# undef when no source yields a value.
# This doesn't track what should have authorization.
sub _get_user_pass {
    my $self = shift;
    my $uri  = shift;

    my $last_auth;
    if ( my $cached = $self->{last_auth} ) {

        # strip the final path segment so we compare directories
        my $dir = $uri->path;
        $dir =~ s!/[^/]*$!!;

        # guard against an undefined cached path to avoid a warning
        $last_auth = $cached->{auth}
            if defined $cached->{path} and $cached->{path} eq $dir;
    }

    my $auth = $last_auth || $uri->userinfo || $self->credentials || '';

    # LIMIT of 2: only the first colon separates user from password,
    # so a password that itself contains ':' is kept intact.
    my ( $user, $pass ) = split /:/, $auth, 2;

    return ( $user, $pass );
}

=head2 looks_like_feed( I<http_response> )

Called internally to perform naive heuristics on I<http_response>
to determine whether it looks like an XML feed of some kind,
rather than an HTML page.

=cut

# looks_like_feed( $http_response )
#
# Returns the object produced by XML::Feed->parse() when the response
# carries one of the XML feed media types and its decoded content
# parses as a feed. Returns 0 in every other case: a different media
# type (including text/html and application/xhtml+xml), empty content,
# a false parse result, or a parser exception.
# Croaks with "response required" when no response is passed.
sub looks_like_feed {
    my $self     = shift;
    my $response = shift or croak "response required";
    my $ct       = $response->headers->content_type;

    # only these media types are candidates for feed parsing
    my %is_feed_type = map { $_ => 1 } qw(
        text/xml
        application/rss+xml
        application/rdf+xml
        application/atom+xml
    );
    return 0 unless defined $ct and $is_feed_type{$ct};

    my $xml = $response->decoded_content;    # TODO or content()
    return 0 unless defined $xml and length $xml;

    # This is only a heuristic, so a parser exception (e.g. on
    # malformed XML) means "not a feed" rather than a fatal error.
    # local $@ keeps the caller's $@ untouched.
    my $feed = do {
        local $@;
        eval { XML::Feed->parse( \$xml ) };
    };

    return $feed ? $feed : 0;
}

=head2 looks_like_sitemap( I<http_response> )

Called internally to perform naive heuristics on I<http_response>
to determine whether it looks like an XML sitemap feed,
rather than an HTML page.

=cut

# looks_like_sitemap( $http_response )
#
# Returns a WWW::Sitemap::XML object loaded from the response content
# when the media type is text/xml or application/xml and load()
# completes without throwing. Returns 0 in every other case: a
# different media type, empty content, or a load() exception.
# Croaks with "response required" when no response is passed.
sub looks_like_sitemap {
    my $self     = shift;
    my $response = shift or croak "response required";
    my $ct       = $response->headers->content_type;

    return 0 unless defined $ct;
    return 0 unless $ct eq 'text/xml' or $ct eq 'application/xml';

    my $xml = $response->decoded_content;    # TODO or content()

    # nothing to parse; don't hand undef/empty input to the loader
    return 0 unless defined $xml and length $xml;

    my $sitemap = WWW::Sitemap::XML->new();

    # Detect success from the eval's own return value instead of
    # inspecting $@ afterwards, and localize $@ so the caller's copy
    # is left alone.
    my $loaded = do {
        local $@;
        eval { $sitemap->load( string => $xml ); 1 };
    };

    return $loaded ? $sitemap : 0;
}

=head2 crawl( I<uri> )

Implements the required crawl() method. Recursively fetches I<uri>
and its child links to a depth set in max_depth().

Will quit after max_files() unless max_files==0.

Will quit after max_time() seconds unless max_time==0.

=cut

# crawl( @urls )
#
# For each starting URL: canonicalize it, record it in uri_cache, put
# it on the queue, remember it in $self->{_base}, then drain get_doc()
# until it returns a false value. Every blessed document is handed to
# the indexer (when one is set) and counted.
#
# Limits:
#   max_files - once count() reaches it the whole crawl stops, and any
#               remaining starting URLs are not visited.
#   max_time  - measured from the moment each starting URL begins, so
#               it ends the current URL's loop only.
# A limit of 0 (or any false value) disables that check.
#
# Returns $self->count.
sub crawl {
    my $self = shift;
    my @urls = @_;

    my $indexer = $self->indexer;    # may be undef

URL:
    for my $url (@urls) {
        my $started = time();
        $self->debug and $self->write_log(
            uri => $url,
            msg => "crawling",
        );

        my $uri = URI->new($url)->canonical;
        $self->uri_cache->add( "$uri" => 1 );
        $self->add_to_queue($uri);
        $self->{_base} = $uri->as_string;

    DOC:
        while ( my $doc = $self->get_doc ) {
            $self->debug and $self->write_log_line();

            # get_doc() may return a true value that is not an object;
            # presumably a status for a skipped document - confirm
            # against get_doc(). Such values are neither indexed nor
            # counted.
            next DOC unless blessed($doc);

            # indexer not required
            $indexer->process($doc) if $indexer;

            $self->_increment_count;

            # max_files bounds the total count, so leave the outer
            # loop too; a plain "last" would let every remaining
            # starting URL fetch one more document.
            last URL
                if $self->max_files
                and $self->count >= $self->max_files;

            # the clock restarts per starting URL, so only this URL ends
            last DOC
                if $self->max_time
                and ( time() - $started ) > $self->max_time;
        }
    }

    return $self->count;
}

=head2 write_log( I<args> )



( run in 1.049 second using v1.01-cache-2.11-cpan-5a3173703d6 )