HTML-Robot-Scrapper

 view release on metacpan or  search on metacpan

lib/HTML/Robot/Scrapper.pm  view on Meta::CPAN

This example show an asp website that has those '__EVENTVALIDATION' and '__VIEWSTATE' which must be sent back again on each request... here is the example of such crawler for such website...

This example also demonstrates how one could easily login into a website and crawl it also.

    package WWW::Tabela::Fipe;
    use Moose;
    with 'HTML::Robot::Scrapper::Reader';
    use Data::Printer;
    use utf8;
    use HTML::Entities;
    use HTTP::Request::Common qw(POST);

    has [ qw/marcas viewstate eventvalidation/ ] => ( is => 'rw' );

    has veiculos => ( is => 'rw' , default => sub { return []; });
    has referer => ( is => 'rw' );

    sub start {
        my ( $self ) = @_;
    }

    has startpage => (
        is => 'rw',
        default => sub {
            return [
              {
                tipo => 'moto',
                url => 'http://www.fipe.org.br/web/indices/veiculos/default.aspx?azxp=1&v=m&p=52'
              },
              {
                tipo => 'carro',
                url => 'http://www.fipe.org.br/web/indices/veiculos/default.aspx?p=51'
              },
              {
                tipo => 'caminhao',
                url => 'http://www.fipe.org.br/web/indices/veiculos/default.aspx?v=c&p=53'
              },
            ]
        },
    );

    sub on_start {
      my ( $self ) = @_;
      foreach my $item ( @{ $self->startpage } ) {
        $self->append( search => $item->{ url }, {
            passed_key_values => {
                tipo => $item->{ tipo },
                referer => $item->{ url },
            }
        } );
      }
    }

    sub _headers {
        my ( $self , $url, $form ) = @_;
        return {
          'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Encoding' => 'gzip, deflate',
          'Accept-Language' => 'en-US,en;q=0.5',
          'Cache-Control' => 'no-cache',
          'Connection' => 'keep-alive',
          'Content-Length' => length( POST('url...', [], Content => $form)->content ),
          'Content-Type' => 'application/x-www-form-urlencoded; charset=utf-8',
          'DNT' => '1',
          'Host' => 'www.fipe.org.br',
          'Pragma' => 'no-cache',
          'Referer' => $url,
          'User-Agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:20.0) Gecko/20100101 Firefox/20.0',
          'X-MicrosoftAjax' => 'Delta=true',
        };
    }

    sub _form {
        my ( $self, $args ) = @_;
        return [
          ScriptManager1 => $args->{ script_manager },
          __ASYNCPOST => 'true',
          __EVENTARGUMENT => '',
          __EVENTTARGET => $args->{ event_target },
          __EVENTVALIDATION => $args->{ event_validation },
          __LASTFOCUS => '',
          __VIEWSTATE => $args->{ viewstate },
          ddlAnoValor => ( !exists $args->{ano} ) ? 0 : $args->{ ano },
          ddlMarca => ( !exists $args->{marca} ) ? 0 : $args->{ marca },
          ddlModelo => ( !exists $args->{modelo} ) ? 0 : $args->{ modelo },
          ddlTabelaReferencia => 154,
          txtCodFipe => '',
        ];
    }

    sub search {
      my ( $self ) = @_;
      my $marcas = $self->tree->findnodes( '//select[@name="ddlMarca"]/option' );
      my $viewstate = $self->tree->findnodes( '//form[@id="form1"]//input[@id="__VIEWSTATE"]' )->get_node->attr('value');
      my $event_validation = $self->tree->findnodes( '//form[@id="form1"]//input[@id="__EVENTVALIDATION"]' )->get_node->attr('value');
      foreach my $marca ( $marcas->get_nodelist ) {
        my $form = $self->_form( {
            script_manager => 'UdtMarca|ddlMarca',
            event_target => 'ddlMarca',
            event_validation=> $event_validation,
            viewstate => $viewstate,
            marca => $marca->attr( 'value' ),
        } );
        $self->prepend( busca_marca => 'url' , {
          passed_key_values => {
              marca => $marca->as_text,
              marca_id => $marca->attr( 'value' ),
              tipo => $self->robot->reader->passed_key_values->{ tipo },
              referer => $self->robot->reader->passed_key_values->{referer },
          },
          request => [
            'POST',
            $self->robot->reader->passed_key_values->{ referer },
            {
              headers => $self->_headers( $self->robot->reader->passed_key_values->{ referer } , $form ),
              content => POST('url...', [], Content => $form)->content,
            }
          ]
        } );
      }
    }



( run in 1.141 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )