Dezi-App
view release on metacpan or search on metacpan
lib/Dezi/Aggregator/Spider/Response.pm view on Meta::CPAN
# TODO set by our UA. duplicate?
#$self->{link_tags} ||= { a => 1, frame => 1, iframe => 1, };
}
=head2 http_response
Returns internal HTTP::Response object.
=cut
=head2 success
Shortcut for $response->http_response->is_success.
=cut
sub success {
return shift->http_response->is_success;
}
=head2 status
Shortcut for $response->http_response->code.
=cut
sub status {
return shift->http_response->code;
}
=head2 ct
Shortcut for $response->response->header('content-type').
Any encoding will be stripped from the returned string.
=cut
sub ct {
my $self = shift;
my $ct = $self->http_response->header('content-type');
$ct =~ s/;.+// if $ct;
return $ct;
}
=head2 is_html
Returns true if ct() looks like HTML or XHTML.
=cut
sub is_html {
my $self = shift;
my $ct = $self->ct;
return defined $ct
&& ( $ct eq 'text/html' || $ct eq 'application/xhtml+xml' );
}
=head2 content
Shortcut for $response->http_response->decoded_content.
=cut
sub content {
return shift->http_response->decoded_content;
}
=head2 links
Returns array of href targets in content(). Parsed
using HTML::LinkExtor.
=cut
sub links {
my $self = shift;
my @links = ();
my $http_response = $self->http_response;
my $debug = $self->debug;
if ( $http_response and $self->is_html ) {
my $le = HTML::LinkExtor->new();
my $base = $http_response->base;
$le->parse( $self->content );
my %skipped_tags;
for my $link ( $le->links ) {
my ( $tag, %attr ) = @$link;
# which tags to use
my $attr = join ' ', map {qq[$_="$attr{$_}"]} keys %attr;
$debug and Dezi::Utils->write_log(
uri => $base,
msg => "extracted tag '<$tag $attr>'"
);
if ( !exists $self->link_tags->{$tag} ) {
$debug
and Dezi::Utils->write_log(
uri => $base,
msg => "skipping tag '<$tag $attr>', not on whitelist"
);
next;
}
# Grab which attribute(s) which might contain links for this tag
my $links = $HTML::Tagset::linkElements{$tag};
$links = [$links] unless ref $links;
my $found = 0;
# check each attribute to see if a link exists
for my $attribute (@$links) {
if ( $attr{$attribute} ) {
# strip any anchors as noise
$attr{$attribute} =~ s/#.*//;
my $u = URI->new_abs( $attr{$attribute}, $base );
push @links, $u;
$debug
and Dezi::Utils->write_log(
uri => $base,
( run in 2.531 seconds using v1.01-cache-2.11-cpan-98e64b0badf )