Bundle-WWW-Scraper-Job

 view release on metacpan or  search on metacpan

lib/WWW/Scraper/Dice.pm  view on Meta::CPAN


# Scraper backend for the Dice.com job search; a subclass of WWW::Scraper.
package WWW::Scraper::Dice;
use strict;
# Package globals, declared so they can be used unqualified under "use strict".
use vars qw($VERSION @ISA);
# Inherit from WWW::Scraper.
@ISA = qw(WWW::Scraper);
# The match captures the major and minor numbers from the revision keyword and
# sprintf renders them as "major.minor" with a two-digit minor ("1.09" here).
# NOTE(review): the $Revision$ keyword is presumably expanded by the version
# control system (RCS/CVS style) - confirm before editing this line by hand.
$VERSION = sprintf("%d.%02d", q$Revision: 1.09 $ =~ /(\d+)\.(\d+)/);

# The whole list is handed to WWW::Scraper's import.
# NOTE(review): the leading 1.48 looks like a minimum-version requirement and
# the remaining names like imported helpers - confirm against WWW::Scraper.
use WWW::Scraper(qw(1.48 generic_option trimTags addURL));
# Class instantiated for the 'locations' field translation further down.
use WWW::Scraper::FieldTranslation;

# Describes how a search request to Dice is built: the HTTP method, the base
# URL, the default form values, and the mapping from Scraper-level field names
# to Dice's native form fields.
my $scraperRequest = {
    # The query is generated as a POST ...
    'type' => 'POST',

    # ... but the redirect that follows it must be fetched with GET.
    # To quote the W3C HTTP 1.1 specification
    # (http://www.w3.org/Protocols/rfc2068/rfc2068):
    #     Note: When automatically redirecting a POST request after receiving
    #     a 302 status code, some existing HTTP/1.0 user agents will
    #     erroneously change it into a GET request.
    # Dice.com *relies* on the browser changing the method to GET; otherwise
    # the search does not work. (That's what is so nice about standards -
    # there are so many to choose from!)
    'redirectMethod' => 'GET',

    # The basic URL on which to build the query.
    'url' => 'http://jobsearch.dice.com/jobsearch/jobsearch.cgi?',

    # Scraper attributes => native input fields mapping.
    'nativeQuery'         => 'query',
    'defaultRequestClass' => 'Job',

    # Default values for Dice's native form fields.
    'nativeDefaults' => {
        'Search.x'        => 1,
        'Search.y'        => 1,
        'banner'          => '0',
        'brief'           => 'true',
        'method'          => 'bool',
        'taxterm'         => '',
        'state'           => 'ALL',    # or two character abbreviation(s)
        'acode'           => '',       # multiple acode INPUT fields
        'daysback'        => 30,       # (1, 2, 7, 10, 14, 21, 30)
        'num_per_page'    => 50,       # (10, 20, 30, 40, 50)
        'num_to_retrieve' => 2000,     # (100, 200, 300, 400, 500, 600, 2000)
    },

    # Translations from Scraper field names to Dice field names.
    # NOTE(review): the '*' keys look like wildcards (any request class /
    # pass remaining fields through unchanged) - confirm against WWW::Scraper.
    'fieldTranslations' => {
        '*' => {
            'skills' => 'query',

            # Direct method call instead of the indirect-object form
            # ("new Class(...)"), whose parsing is ambiguous.
            'locations' => WWW::Scraper::FieldTranslation->new('Dice', 'Job', 'locations'),
            '*'         => '*',
        },
    },

    # Some more options for the Scraper operation.
    'cookies' => 0,

    # Some search engines don't connect every time - retry Dice this many times.
    'retry' => 2,
};

# Nested description of the Dice results page: where the NEXT link and the
# total count are found, and how each hit is laid out inside a definition list.
my $scraperFrame = [
    'HTML',
    [
        # How to find the NEXT button.
        [ 'NEXT', 1, 'Show next \d+ jobs' ],

        # The total count can be found here.
        [ 'COUNT', 'Jobs [-0123456789]+ of (\d+) matching your query' ],

        # The content of this element represents the hits.
        [
            'HIT*', 'Job',
            [
                # Each hit's detail is in a definition list.
                [
                    'DL',
                    [
                        # The definition term holds the job description link.
                        [ 'DT', [ [ 'F', \&titleJobID, 'url', 'title', 'jobID' ] ] ],

                        # The definition data holds the location.
                        [ 'DD', [ [ 'F', \&touchupLocation, 'location', 'description' ] ] ],

                        [ 'RESIDUE', 'residue' ],
                    ],
                ],
            ],
        ],
    ],
];


# Records the Dice home page URL on the scraper object via searchEngineHome()
# and returns the object itself.
sub init {
    my $self = shift;

    my $home_url = 'http://www.Dice.com';
    $self->searchEngineHome($home_url);

    return $self;
}

sub testParameters {



( run in 3.450 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )