Bundle-WWW-Scraper-Job
view release on metacpan or search on metacpan
lib/WWW/Scraper/Dice.pm view on Meta::CPAN
package WWW::Scraper::Dice;
use strict;
# Declare the package globals used below so they pass 'use strict'.
use vars qw($VERSION @ISA);
# This class inherits from WWW::Scraper.
@ISA = qw(WWW::Scraper);
# Derive the version string from the revision-control Revision keyword: the
# regex captures the major and minor numbers, and sprintf renders them as
# "major.minor" with the minor part zero-padded to two digits.
# NOTE(review): version-extraction tools typically evaluate only the single
# line that assigns $VERSION - confirm before splitting or reformatting it.
$VERSION = sprintf("%d.%02d", q$Revision: 1.09 $ =~ /(\d+)\.(\d+)/);
use WWW::Scraper(qw(1.48 generic_option trimTags addURL));
use WWW::Scraper::FieldTranslation;
# Description of how to build a search request against Dice.com: the HTTP
# method, the base URL, the native form-field defaults, and the mapping from
# Scraper request fields to Dice's native field names.
# NOTE(review): presumably handed to the WWW::Scraper framework by an accessor
# elsewhere in this file - not visible in this chunk, confirm.
my $scraperRequest =
{
    # Type of query generation is 'POST'.
    'type' => 'POST',

    # Follow the redirect with GET. To quote the W3C HTTP 1.1 Specification
    # (at http://www.w3.org/Protocols/rfc2068/rfc2068):
    #     Note: When automatically redirecting a POST request after receiving
    #     a 302 status code, some existing HTTP/1.0 user agents will
    #     erroneously change it into a GET request.
    # Yet Dice.com *relies* on the browser changing it to 'GET'; otherwise it
    # doesn't work! I guess that's what's so nice about standards - there are
    # so many to choose from!
    'redirectMethod' => 'GET',

    # This is the basic URL on which to build the query.
    'url' => 'http://jobsearch.dice.com/jobsearch/jobsearch.cgi?',

    # This is the Scraper attributes => native input fields mapping.
    'nativeQuery'         => 'query',
    'defaultRequestClass' => 'Job',

    # Native form fields sent with every request unless overridden.
    'nativeDefaults' =>
    {
        'Search.x'        => 1,
        'Search.y'        => 1,
        'banner'          => '0',
        'brief'           => 'true',
        'method'          => 'bool',
        'taxterm'         => '',
        'state'           => 'ALL',   # or two character abbreviation(s)
        'acode'           => '',      # multiple acode INPUT fields
        'daysback'        => 30,      # (1, 2, 7, 10, 14, 21, 30)
        'num_per_page'    => 50,      # (10, 20, 30, 40, 50)
        'num_to_retrieve' => 2000,    # (100, 200, 300, 400, 500, 600, 2000)
    },

    # Translations from Scraper request fields to native fields; the '*' key
    # holds the translation table used here.
    'fieldTranslations' =>
    {
        '*' =>
        {
            'skills' => 'query',
            # Direct method call rather than indirect object syntax
            # ("new Class(...)"), whose parse depends on which subs happen
            # to be in scope at compile time.
            'locations' => WWW::Scraper::FieldTranslation->new('Dice', 'Job', 'locations'),
            '*' => '*',
        },
    },

    # Some more options for the Scraper operation.
    'cookies' => 0,

    # Some search engines don't connect every time - retry Dice this many times.
    'retry' => 2,
};
# Parse frame for a Dice.com results page, written as nested
# [ TAG, arguments... ] lists. The two 'F' entries reference the callbacks
# titleJobID and touchupLocation, which are defined elsewhere in this file.
my $scraperFrame = [
    'HTML',
    [
        # How to find the NEXT button.
        [ 'NEXT', 1, 'Show next \d+ jobs' ],

        # The total count can be found here.
        [ 'COUNT', 'Jobs [-0123456789]+ of (\d+) matching your query' ],

        # The content of this array element represents hits.
        [
            'HIT*', 'Job',
            [
                # The detail of a hit is in a definition list.
                [
                    'DL',
                    [
                        # The job description link is here, in the
                        # definition term.
                        [ 'DT', [ [ 'F', \&titleJobID, 'url', 'title', 'jobID' ] ] ],

                        # The location is in the definition data.
                        [ 'DD', [ [ 'F', \&touchupLocation, 'location', 'description' ] ] ],

                        [ 'RESIDUE', 'residue' ],
                    ],
                ],
            ],
        ],
    ],
];
# Set the search engine's home page on the scraper object.
#   $_[0] - the scraper object (invocant).
# Returns the same object, after calling its searchEngineHome method with
# 'http://www.Dice.com'.
# NOTE(review): searchEngineHome is not defined in the visible part of this
# file; presumably inherited from WWW::Scraper - confirm.
sub init {
    my $scraper = shift;
    $scraper->searchEngineHome('http://www.Dice.com');
    return $scraper;
}
sub testParameters {
( run in 3.450 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )