Dezi-App

 view release on metacpan or  search on metacpan

lib/Dezi/Aggregator/Spider/Response.pm  view on Meta::CPAN


    # TODO set by our UA. duplicate?
    #$self->{link_tags} ||= { a => 1, frame => 1, iframe => 1, };
}

=head2 http_response

Returns internal HTTP::Response object.

=cut

=head2 success

Shortcut for $response->http_response->is_success.

=cut

sub success {
    return shift->http_response->is_success;
}

=head2 status

Shortcut for $response->http_response->code.

=cut

sub status {
    return shift->http_response->code;
}

=head2 ct

Shortcut for $response->response->header('content-type').
Any encoding will be stripped from the returned string.

=cut

sub ct {
    my $self = shift;
    my $ct   = $self->http_response->header('content-type');
    $ct =~ s/;.+// if $ct;
    return $ct;
}

=head2 is_html

Returns true if ct() looks like HTML or XHTML.

=cut

sub is_html {
    my $self = shift;
    my $ct   = $self->ct;
    return defined $ct
        && ( $ct eq 'text/html' || $ct eq 'application/xhtml+xml' );
}

=head2 content

Shortcut for $response->http_response->decoded_content.

=cut

sub content {
    return shift->http_response->decoded_content;
}

=head2 links

Returns array of href targets in content(). Parsed
using HTML::LinkExtor.

=cut

sub links {
    my $self          = shift;
    my @links         = ();
    my $http_response = $self->http_response;
    my $debug         = $self->debug;

    if ( $http_response and $self->is_html ) {
        my $le   = HTML::LinkExtor->new();
        my $base = $http_response->base;
        $le->parse( $self->content );

        my %skipped_tags;

        for my $link ( $le->links ) {
            my ( $tag, %attr ) = @$link;

            # which tags to use
            my $attr = join ' ', map {qq[$_="$attr{$_}"]} keys %attr;

            $debug and Dezi::Utils->write_log(
                uri => $base,
                msg => "extracted tag '<$tag $attr>'"
            );

            if ( !exists $self->link_tags->{$tag} ) {
                $debug
                    and Dezi::Utils->write_log(
                    uri => $base,
                    msg => "skipping tag '<$tag $attr>', not on whitelist"
                    );
                next;
            }

            # Grab which attribute(s) which might contain links for this tag
            my $links = $HTML::Tagset::linkElements{$tag};
            $links = [$links] unless ref $links;

            my $found = 0;

            # check each attribute to see if a link exists
            for my $attribute (@$links) {
                if ( $attr{$attribute} ) {

                    # strip any anchors as noise
                    $attr{$attribute} =~ s/#.*//;

                    my $u = URI->new_abs( $attr{$attribute}, $base );
                    push @links, $u;
                    $debug
                        and Dezi::Utils->write_log(
                        uri => $base,



( run in 2.531 seconds using v1.01-cache-2.11-cpan-98e64b0badf )