Dezi-App
view release on metacpan or search on metacpan
lib/Dezi/Aggregator/Spider.pm view on Meta::CPAN
#
#
has 'agent' => (
is => 'rw',
isa => Str,
default => sub {'dezi-spider http://dezi.org/'},
);
has 'authn_callback' => ( is => 'rw', isa => CodeRef );
has 'credential_timeout' => ( is => 'rw', isa => Int, default => sub {30} );
has 'credentials' => ( is => 'rw', isa => Str );
has 'delay' => ( is => 'rw', isa => Int, default => sub {5} );
has 'email' => (
is => 'rw',
isa => Str,
default => sub {'dezi@user.failed.to.set.email.invalid'},
);
has 'file_rules' => ( is => 'rw', isa => DeziFileRules, coerce => 1, );
has 'follow_redirects' => ( is => 'rw', isa => Bool, default => sub {1} );
has 'keep_alive' => ( is => 'rw', isa => Bool, default => sub {0} );
lib/Dezi/Aggregator/Spider.pm view on Meta::CPAN
Get/set the number of seconds to wait between making requests. Default is
5 seconds (a very friendly delay).
=item timeout I<n>
Get/set the number of seconds to wait before considering the remote
server unresponsive. The default is 10.
=item authn_callback I<code_ref>
CODE reference to fetch username/password credentials when necessary. See also
C<credentials>.
=item credential_timeout I<n>
Number of seconds to wait before skipping manual prompt for username/password.
=item credentials I<user:pass>
String with C<username>:C<password> pair to be used when prompted by
the server.
=item follow_redirects I<1|0>
By default, 3xx responses from the server will be followed when
they are on the same hostname. Set to false (0) to not follow
redirects.
lib/Dezi/Aggregator/Spider.pm view on Meta::CPAN
->( $self, $uri, $response, $realm );
$uri->userinfo($user_pass);
#warn " >> set userinfo via authn_callback\n" if $self->debug;
return 1;
}
}
# otherwise, prompt (over and over)
if ( !$user_pass ) {
$user_pass = $self->_get_basic_credentials( $uri, $realm );
}
if ($user_pass) {
$uri->userinfo($user_pass);
$self->{cur_realm} = $realm; # save so we can cache if it's valid
return 1;
}
}
return 0;
}
# From spider.pl
sub _get_basic_credentials {
my ( $self, $uri, $realm ) = @_;
# Exists but undefined means don't ask.
return
if exists $self->{credential_timeout}
&& !defined $self->{credential_timeout};
my $netloc = $uri->canonical->host_port;
my ( $user, $password );
lib/Dezi/Aggregator/Spider.pm view on Meta::CPAN
if ($doc) {
$self->remove_from_queue($uri);
}
return $doc;
}
=head2 get_authorized_doc( I<uri>, I<response> )
Called internally when the server returns a 401 or 403 response.
Will attempt to determine the correct credentials for I<uri>
based on the previous attempt in I<response> and what you
have configured in B<credentials>, B<authn_callback> or when
manually prompted.
=cut
sub get_authorized_doc {
my $self = shift;
my $uri = shift or croak "uri required";
my $response = shift or croak "response required";
# set up credentials
$self->_authorize( $uri, $response->http_response ) or return;
return $self->_make_request($uri);
}
sub _make_request {
my ( $self, $uri ) = @_;
# get our useragent
my $ua = $self->ua;
lib/Dezi/Aggregator/Spider.pm view on Meta::CPAN
my $encoding = $headers->content_encoding || $charset;
my %doc = (
url => $meta->{org_uri},
modtime => ( $headers->last_modified || $headers->date ),
type => $meta->{ct},
content => ( $encoding =~ m/utf-8/i ? to_utf8($buf) : $buf ),
size => $headers->content_length || length( pack 'C0a*', $buf ),
charset => $encoding,
);
# cache whatever credentials were used so we can re-use
if ( $self->{cur_realm} and $uri->userinfo ) {
my $key = $uri->canonical->host_port . ':' . $self->{cur_realm};
$self->{_auth_cache}->add( $key => $uri->userinfo );
# not too sure of the best logic here
my $path = $uri->path;
$path =~ s!/[^/]*$!!;
$self->{last_auth} = {
path => $path,
auth => $uri->userinfo,
lib/Dezi/Aggregator/Spider.pm view on Meta::CPAN
return $response->status;
}
return; # never get here.
}
sub _get_user_pass {
my $self = shift;
my $uri = shift;
# Set basic auth if defined - use URI specific first, then credentials.
# this doesn't track what should have authorization
my $last_auth;
if ( $self->{last_auth} ) {
my $path = $uri->path;
$path =~ s!/[^/]*$!!;
$last_auth = $self->{last_auth}->{auth}
if $self->{last_auth}->{path} eq $path;
}
my ( $user, $pass ) = split /:/,
( $last_auth || $uri->userinfo || $self->credentials || '' );
return ( $user, $pass );
}
=head2 looks_like_feed( I<http_response> )
Called internally to perform naive heuristics on I<http_response>
to determine whether it looks like an XML feed of some kind,
rather than a HTML page.
t/012-spider-server.t view on Meta::CPAN
'filename contains \?.*query=pass', # specific query param
],
# hurry up and fail
delay => 0,
filter => sub {
$debug and diag( "doc filter on " . $_[0]->url );
$debug and diag( "body:" . $_[0]->content );
},
credentials => 'foo:bar',
same_hosts => ["127.0.0.1"],
modified_since => time2str( time() ),
max_size => 4096,
),
"new spider"
);
diag( "spidering " . $base_uri );
is( $spider->crawl($base_uri), 5, "crawl" );
( run in 0.246 second using v1.01-cache-2.11-cpan-4d50c553e7e )