Dezi-App

 view release on metacpan or  search on metacpan

lib/Dezi/Aggregator/Spider.pm  view on Meta::CPAN

#
#

has 'agent' => (
    is      => 'rw',
    isa     => Str,
    default => sub {'dezi-spider http://dezi.org/'},
);
has 'authn_callback'     => ( is => 'rw', isa => CodeRef );
has 'credential_timeout' => ( is => 'rw', isa => Int, default => sub {30} );
has 'credentials'        => ( is => 'rw', isa => Str );
has 'delay'              => ( is => 'rw', isa => Int, default => sub {5} );
has 'email' => (
    is      => 'rw',
    isa     => Str,
    default => sub {'dezi@user.failed.to.set.email.invalid'},
);
has 'file_rules' => ( is => 'rw', isa => DeziFileRules, coerce => 1, );
has 'follow_redirects' => ( is => 'rw', isa => Bool, default => sub {1} );
has 'keep_alive' => ( is => 'rw', isa => Bool, default => sub {0} );

lib/Dezi/Aggregator/Spider.pm  view on Meta::CPAN

Get/set the number of seconds to wait between making requests. Default is
5 seconds (a very friendly delay).

=item timeout I<n>

Get/set the number of seconds to wait before considering the remote
server unresponsive. The default is 10.

=item authn_callback I<code_ref>

CODE reference to fetch username/password credentials when necessary. See also
C<credentials>.

=item credential_timeout I<n>

Number of seconds to wait before skipping manual prompt for username/password.

=item credentials I<user:pass>

String with C<username>:C<password> pair to be used when prompted by
the server.

=item follow_redirects I<1|0>

By default, 3xx responses from the server will be followed when
they are on the same hostname. Set to false (0) to not follow
redirects.

lib/Dezi/Aggregator/Spider.pm  view on Meta::CPAN

                    ->( $self, $uri, $response, $realm );
                $uri->userinfo($user_pass);

                #warn " >> set userinfo via authn_callback\n" if $self->debug;
                return 1;
            }
        }

        # otherwise, prompt (over and over)
        if ( !$user_pass ) {
            $user_pass = $self->_get_basic_credentials( $uri, $realm );
        }

        if ($user_pass) {
            $uri->userinfo($user_pass);
            $self->{cur_realm} = $realm;  # save so we can cache if it's valid
            return 1;
        }
    }

    return 0;

}

# From spider.pl
sub _get_basic_credentials {
    my ( $self, $uri, $realm ) = @_;

    # Exists but undefined means don't ask.
    return
        if exists $self->{credential_timeout}
        && !defined $self->{credential_timeout};

    my $netloc = $uri->canonical->host_port;

    my ( $user, $password );

lib/Dezi/Aggregator/Spider.pm  view on Meta::CPAN

    if ($doc) {
        $self->remove_from_queue($uri);
    }

    return $doc;
}

=head2 get_authorized_doc( I<uri>, I<response> )

Called internally when the server returns a 401 or 403 response.
Will attempt to determine the correct credentials for I<uri>
based on the previous attempt in I<response> and what you
have configured in B<credentials>, B<authn_callback> or when
manually prompted.

=cut

sub get_authorized_doc {
    my $self     = shift;
    my $uri      = shift or croak "uri required";
    my $response = shift or croak "response required";

    # set up credentials
    $self->_authorize( $uri, $response->http_response ) or return;

    return $self->_make_request($uri);
}

sub _make_request {
    my ( $self, $uri ) = @_;

    # get our useragent
    my $ua    = $self->ua;

lib/Dezi/Aggregator/Spider.pm  view on Meta::CPAN

        my $encoding = $headers->content_encoding || $charset;
        my %doc = (
            url     => $meta->{org_uri},
            modtime => ( $headers->last_modified || $headers->date ),
            type    => $meta->{ct},
            content => ( $encoding =~ m/utf-8/i ? to_utf8($buf) : $buf ),
            size => $headers->content_length || length( pack 'C0a*', $buf ),
            charset => $encoding,
        );

        # cache whatever credentials were used so we can re-use
        if ( $self->{cur_realm} and $uri->userinfo ) {
            my $key = $uri->canonical->host_port . ':' . $self->{cur_realm};
            $self->{_auth_cache}->add( $key => $uri->userinfo );

            # not too sure of the best logic here
            my $path = $uri->path;
            $path =~ s!/[^/]*$!!;
            $self->{last_auth} = {
                path => $path,
                auth => $uri->userinfo,

lib/Dezi/Aggregator/Spider.pm  view on Meta::CPAN

        return $response->status;
    }

    return;    # never get here.
}

sub _get_user_pass {
    my $self = shift;
    my $uri  = shift;

    # Set basic auth if defined - use URI specific first, then credentials.
    # this doesn't track what should have authorization
    my $last_auth;
    if ( $self->{last_auth} ) {
        my $path = $uri->path;
        $path =~ s!/[^/]*$!!;
        $last_auth = $self->{last_auth}->{auth}
            if $self->{last_auth}->{path} eq $path;
    }

    my ( $user, $pass ) = split /:/,
        ( $last_auth || $uri->userinfo || $self->credentials || '' );

    return ( $user, $pass );
}

=head2 looks_like_feed( I<http_response> )

Called internally to perform naive heuristics on I<http_response>
to determine whether it looks like an XML feed of some kind,
rather than a HTML page.

t/012-spider-server.t  view on Meta::CPAN

                'filename contains \?.*query=pass',    # specific query param
            ],

            # hurry up and fail
            delay => 0,

            filter => sub {
                $debug and diag( "doc filter on " . $_[0]->url );
                $debug and diag( "body:" . $_[0]->content );
            },
            credentials    => 'foo:bar',
            same_hosts     => ["127.0.0.1"],
            modified_since => time2str( time() ),
            max_size       => 4096,
        ),
        "new spider"
    );

    diag( "spidering " . $base_uri );
    is( $spider->crawl($base_uri), 5, "crawl" );



( run in 0.246 second using v1.01-cache-2.11-cpan-4d50c553e7e )