HTML-Robot-Scrapper
view release on metacpan or search on metacpan
lib/HTML/Robot/Scrapper/UserAgent/Default.pm view on Meta::CPAN
=cut
# parse_content( $res )
#
# Records the response headers, content type and charset on this object,
# then passes $self->content to every parser method registered for the
# response's content type.
#
# Parameters:
#   $res - hash reference; only $res->{headers} is read here, and its
#          'content-type' entry drives the parser lookup.
#
# Parsers come from $self->robot->parser->content_types: a hash reference
# whose keys are content-type patterns and whose values are array references
# of hashes, each naming a 'parse_method' to invoke on the parser object.
#
# Prints a diagnostic line on the currently selected output handle when no
# parser was invoked.
sub parse_content {
    my ( $self, $res ) = @_;
    my $content_types_avail = $self->robot->parser->content_types;

    # set headers
    $self->headers( $res->{headers} );
    $self->response_headers( $res->{headers} );

    # content type
    my $content_type = $res->{headers}->{'content-type'};
    $self->content_type($content_type);

    # charset
    my $content_charset = $self->charset_from_headers( $res->{headers} );
    $self->charset($content_charset);

    # A response without a Content-Type header is matched (and reported) as
    # an empty string so it does not trigger "uninitialized" warnings.
    my $type_to_match = defined $content_type ? $content_type : '';

    my $content_type_found = 0;
    foreach my $ct ( keys %$content_types_avail ) {

        # The key is interpolated as a pattern anchored at the start of the
        # header value. No /g here: in scalar context /g leaves pos() set on
        # the matched string, which made every second attempt against the
        # same value fail and silently skipped parsers.
        next unless $type_to_match =~ m/^$ct/i;

        foreach my $parser ( @{ $content_types_avail->{$ct} } ) {
            my $parse_method = $parser->{parse_method};
            $self->robot->parser->$parse_method( $self->content );
            $content_type_found = 1;
        }
    }

    print "**** Content type not set for: " . $type_to_match . '... please configure it correctly adding a parser for that content type'."\n" if !$content_type_found;
}
# charset_from_headers( $headers )
#
# Extracts the charset parameter from the 'content-type' entry of $headers.
#
# Parameters:
#   $headers - hash reference of response headers; only the 'content-type'
#              key is read.
#
# Returns the charset value (e.g. 'utf-8') with any surrounding quotes
# removed, or undef when the header is missing or has no charset parameter.
sub charset_from_headers {
    my ( $self, $headers ) = @_;
    my $ct = $headers->{'content-type'};
    my $charset;

    # The value may be a quoted string (charset="utf-8"), so an optional
    # opening quote is skipped and quotes are excluded from the capture.
    # '|' and '^' remain excluded to stay compatible with the previous
    # pattern; whitespace and ';' end the parameter value.
    if ( defined $ct && $ct =~ m/charset=["']?([^;|^\s"']+)/i ) {
        $charset = $1;
    }

    return $charset;
}
sub normalize_url {
my ( $self, $url ) = @_;
# if ( ref $self->before_normalize_url eq ref {}
( run in 3.317 seconds using v1.01-cache-2.11-cpan-524268b4103 )