App-Fetchware
view release on metacpan or search on metacpan
lib/App/FetchwareX/HTMLPageSync.pm view on Meta::CPAN
Mandatory => [ 'html_page_url', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a html_page_url configuration
option. Please add one, and try again.
EOM
Mandatory => [ 'destination_directory', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a destination_directory
configuration option. Please add one, and try again.
EOM
);
}
###BUGALERT### lookup() returns all files each time it is run; therefore, it
#breaks the way Fetchware is supposed to work! lookup() is supposed to return
#"the latest version." And in HTMLPageSync's case, it should not include files
#already downloaded, because it should only return "new files" by comparing the
#"available list of files" to the "already downloaded one."
sub lookup {
msg
"Looking up download urls using html_page_url [@{[config('html_page_url')]}]";
###BUGALERT### Create a user changeable version of lookup_check_args??(), so
#that App::Fetchware 'subclasses' can use it.
# Download the url the user specified.
my $filename = do {
if (defined config('user_agent')) {
download_http_url(config('html_page_url'),
user_agent => config('user_agent'));
} else {
download_http_url(config('html_page_url'));
}
};
vmsg "Downloaded html_page_url to local file [$filename].";
# Create a HTML::TreeBuilder object for the now downloaded file.
my $tree = HTML::TreeBuilder->new();
# Parse $filename into a HTML::Element tree.
$tree->parse_file($filename);
vmsg 'Created HTML::TreeBuilder object to parse downloaded html file.';
my $tree_callback = do {
if (config('html_treebuilder_callback')) {
vmsg <<EOM;
Using user supplied html_treebuilder_callback to parse downloaded HTML file:
[
@{[config('html_treebuilder_callback')]}
]
EOM
config('html_treebuilder_callback');
} else {
vmsg <<EOM;
Using built-in default html_treebuilder_callback that only wants images.
EOM
sub {
my $tag = shift;
my $link = $tag->attr('href');
if (defined $link) {
# If the anchor tag is an image...
if ($link =~ /\.(jpg|jpeg|png|bmp|tiff?|gif)$/) {
# ...return true...
return 'True';
} else {
# ...if not return false.
return undef; #false
}
}
};
}
};
# Find the links that match our default callback or the user specified one
# if the user specified one.
my @download_urls = $tree->look_down(
_tag => 'a',
$tree_callback
);
vmsg <<EOM;
Determined download urls to be:
@download_urls
EOM
# Sort through the list of HTML::Element tags to finalize the list to
# download.
my $links_callback = do {
if (config('download_links_callback')) {
vmsg <<EOM;
Determined download_links_callback to be user specified:
[
@{[config('download_links_callback')]}
]
EOM
config('download_links_callback');
} else {
# Strip off HTML::Element crap by default.
sub {
vmsg <<EOM;
Using built-in default download_links_callback that turns HTML::Elements into
download urls.
EOM
my @download_urls = @_;
for my $link (@download_urls) {
$link = $link->attr('href');
}
# Must return them, because this coderef was called by value not
# by reference.
return @download_urls;
};
}
};
# Call download_links_callback or call default one to strip off
# HTML::Element crap.
@download_urls = $links_callback->(@download_urls);
vmsg <<EOM;
Determined download urls to be:
[
@{[@download_urls]}
( run in 1.448 second using v1.01-cache-2.11-cpan-df04353d9ac )