HTML-Robot-Scrapper
view release on metacpan or search on metacpan
lib/HTML/Robot/Scrapper.pm view on Meta::CPAN
This example shows an ASP.NET website that uses the '__EVENTVALIDATION' and '__VIEWSTATE' hidden fields, which must be sent back on every request. Here is an example of a crawler for such a website.
This example also demonstrates how one could easily log in to a website and then crawl it.
package WWW::Tabela::Fipe;
use Moose;
with 'HTML::Robot::Scrapper::Reader';
use Data::Printer;
use utf8;
use HTML::Entities;
use HTTP::Request::Common qw(POST);
# Plain read-write slots with no default value.  None of them is read or
# written by the subs visible in this excerpt.
has marcas          => ( is => 'rw' );
has viewstate       => ( is => 'rw' );
has eventvalidation => ( is => 'rw' );

# Read-write slot whose default builder returns a new empty array reference.
has veiculos => (
    is      => 'rw',
    default => sub {
        return [];
    },
);

# Read-write slot with no default value.
has referer => ( is => 'rw' );
# start($self)
#
# Does no work of its own: the body only unpacks the invocant.
# NOTE(review): presumably a hook the HTML::Robot::Scrapper reader role
# expects the consuming class to provide -- confirm against the role.
sub start {
my ( $self ) = @_;
}
# Seed pages for the crawl.  The default is an array reference holding one
# hash per vehicle category, each with a "tipo" label and the "url" of the
# page to fetch for it; on_start() walks this list.
has startpage => (
    is      => 'rw',
    default => sub {
        # [ tipo, url ] pairs, kept in crawl order.
        my @seeds = (
            [ 'moto',     'http://www.fipe.org.br/web/indices/veiculos/default.aspx?azxp=1&v=m&p=52' ],
            [ 'carro',    'http://www.fipe.org.br/web/indices/veiculos/default.aspx?p=51' ],
            [ 'caminhao', 'http://www.fipe.org.br/web/indices/veiculos/default.aspx?v=c&p=53' ],
        );

        return [ map { +{ tipo => $_->[0], url => $_->[1] } } @seeds ];
    },
);
# on_start($self)
#
# Queues every entry of the "startpage" attribute via $self->append, tagged
# with the name "search".  Each queued item carries passed_key_values holding
# the entry's category ("tipo") and its own URL as "referer"; search() reads
# both of those keys back.
# NOTE(review): presumably "search" names the method that will handle the
# fetched page -- confirm against the queue implementation.
sub on_start {
    my ( $self ) = @_;

    for my $page ( @{ $self->startpage } ) {
        my $url   = $page->{url};
        my %carry = (
            tipo    => $page->{tipo},
            referer => $url,
        );

        $self->append( search => $url, { passed_key_values => \%carry } );
    }
}
# _headers($self, $url, $form)
#
# Builds the request headers for an asynchronous form POST and returns them
# as a hash reference.
#
#   $url  - value sent as the Referer header.
#   $form - form data handed to POST() as its Content; search() passes the
#           array reference produced by _form().
#
# POST() is used here only to serialize $form so its length can be reported
# in Content-Length; 'url...' is a placeholder and no request is sent.
sub _headers {
    my ( $self, $url, $form ) = @_;

    my $encoded_body = POST( 'url...', [], Content => $form )->content;

    my %headers = (
        'Accept'          => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding' => 'gzip, deflate',
        'Accept-Language' => 'en-US,en;q=0.5',
        'Cache-Control'   => 'no-cache',
        'Connection'      => 'keep-alive',
        'Content-Length'  => length($encoded_body),
        'Content-Type'    => 'application/x-www-form-urlencoded; charset=utf-8',
        'DNT'             => '1',
        'Host'            => 'www.fipe.org.br',
        'Pragma'          => 'no-cache',
        'Referer'         => $url,
        'User-Agent'      => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:20.0) Gecko/20100101 Firefox/20.0',
        'X-MicrosoftAjax' => 'Delta=true',
    );

    return \%headers;
}
# _form($self, \%args)
#
# Returns the form fields for a postback as an array reference of
# key/value pairs, in the fixed order listed below.
#
# Recognised keys of %args:
#   script_manager    - value for ScriptManager1
#   event_target      - value for __EVENTTARGET
#   event_validation  - value for __EVENTVALIDATION
#   viewstate         - value for __VIEWSTATE
#   ano, marca,
#   modelo            - values for ddlAnoValor / ddlMarca / ddlModelo; each
#                       falls back to 0 when the key is absent (existence is
#                       tested, so an explicit undef is passed through as-is)
#   tabela_referencia - optional value for ddlTabelaReferencia; 154 is used
#                       when it is missing or undef, so existing callers are
#                       unaffected
sub _form {
    my ( $self, $args ) = @_;

    # Reference-table id: overridable, with the historical value as default.
    my $tabela_referencia =
        defined $args->{tabela_referencia}
        ? $args->{tabela_referencia}
        : 154;

    return [
        ScriptManager1      => $args->{script_manager},
        __ASYNCPOST         => 'true',
        __EVENTARGUMENT     => '',
        __EVENTTARGET       => $args->{event_target},
        __EVENTVALIDATION   => $args->{event_validation},
        __LASTFOCUS         => '',
        __VIEWSTATE         => $args->{viewstate},
        ddlAnoValor         => ( exists $args->{ano}    ? $args->{ano}    : 0 ),
        ddlMarca            => ( exists $args->{marca}  ? $args->{marca}  : 0 ),
        ddlModelo           => ( exists $args->{modelo} ? $args->{modelo} : 0 ),
        ddlTabelaReferencia => $tabela_referencia,
        txtCodFipe          => '',
    ];
}
# search($self)
#
# Handles a fetched start page: for every <option> of the "ddlMarca" select
# it queues (via $self->prepend, tagged "busca_marca") a POST back to the
# same page that selects that option.
#
# Each POST carries the page's __VIEWSTATE and __EVENTVALIDATION hidden
# fields, the form fields from _form() and the headers from _headers().
# The queued item's passed_key_values hold the option's text ("marca"), its
# value ("marca_id"), and the "tipo" and "referer" received with the
# current page.
#
# Dies with a message naming the field when either hidden input is missing
# from form1.
sub search {
    my ( $self ) = @_;

    my $tree   = $self->tree;
    my $marcas = $tree->findnodes( '//select[@name="ddlMarca"]/option' );

    # Fetch the value of a hidden input inside form1.  get_node() expects a
    # 1-based position, so the first match is requested explicitly; a missing
    # input is reported by name instead of failing on an undefined node.
    my $hidden_value = sub {
        my ( $id ) = @_;
        my $xpath = sprintf '//form[@id="form1"]//input[@id="%s"]', $id;
        my $node  = $tree->findnodes( $xpath )->get_node( 1 );
        die "search: hidden input '$id' not found in form1" unless $node;
        return $node->attr( 'value' );
    };

    my $viewstate        = $hidden_value->( '__VIEWSTATE' );
    my $event_validation = $hidden_value->( '__EVENTVALIDATION' );

    foreach my $marca ( $marcas->get_nodelist ) {
        # Resolve the accessor chain once per option instead of four times.
        my $passed   = $self->robot->reader->passed_key_values;
        my $referer  = $passed->{referer};
        my $marca_id = $marca->attr( 'value' );

        my $form = $self->_form( {
            script_manager   => 'UdtMarca|ddlMarca',
            event_target     => 'ddlMarca',
            event_validation => $event_validation,
            viewstate        => $viewstate,
            marca            => $marca_id,
        } );

        # POST() only serializes the form here; 'url...' is a placeholder and
        # the real target is the referer given in the request below.
        my $body = POST( 'url...', [], Content => $form )->content;

        $self->prepend( busca_marca => 'url', {
            passed_key_values => {
                marca    => $marca->as_text,
                marca_id => $marca_id,
                tipo     => $passed->{tipo},
                referer  => $referer,
            },
            request => [
                'POST',
                $referer,
                {
                    headers => $self->_headers( $referer, $form ),
                    content => $body,
                },
            ],
        } );
    }
}
( run in 1.141 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )