App-Fetchware
view release on metacpan or search on metacpan
lib/App/FetchwareX/HTMLPageSync.pm view on Meta::CPAN
package App::FetchwareX::HTMLPageSync;
our $VERSION = '1.016'; # VERSION: generated by DZP::OurPkgVersion
# ABSTRACT: An App::Fetchware extension that downloads files based on an HTML page.
use strict;
use warnings;
# Enable Perl 6 knockoffs, and use 5.10.1, because smartmatching and other
# things in 5.10 were changed in 5.10.1+.
use 5.010001;
# Use fetchware's API's to help us out.
use App::Fetchware::Util ':UTIL';
use App::Fetchware::Config ':CONFIG';
use App::Fetchware::Fetchwarefile;
use App::Fetchware qw(
:OVERRIDE_NEW
:OVERRIDE_NEW_INSTALL
:OVERRIDE_CHECK_SYNTAX
);
# Local imports.
use File::Copy 'cp';
use File::Path 'remove_tree';
use URI::Split 'uri_split';
use File::Spec 'splitpath';
use Data::Dumper;
use Scalar::Util 'blessed';
# Use App::Fetchware::ExportAPI to specify which App::Fetchware API subroutines
# we are going to "KEEP", import from App::Fetchware, and which API subs we are
# going to "OVERRIDE", implement here in this package.
#
# ExportAPI takes care of the grunt work for us by setting our package's @EXPORT
# appropriately, and even importing Exporter's import() method into our package
# for us, so that our App::Fetchware API subroutines and configuration options
# specified below can be import()ed properly.
use App::Fetchware::ExportAPI
# KEEP or "inherit" new_install, because I want my new_install to just call
# ask_to_install_now_to_test_fetchwarefile(), and App::Fetchware's does that
# already for me. And start() and end() are to create and manage the
# temporary directory for me, so I don't have to worry about polluting the
# current working directory with temporary files.
KEEP => [qw(new_install start end)],
# OVERRIDE everything else.
OVERRIDE =>
[qw(new check_syntax lookup download verify unarchive build install
uninstall upgrade)]
;
# Use App::Fetchware::CreateConfigOptions to build our App::Fetchware
# configuration options for us. These are subroutines with correct prototypes to
# turn a perl code file into something that resembles a configuration file.
use App::Fetchware::CreateConfigOptions
ONE => [qw(
page_name
html_page_url
destination_directory
user_agent
html_treebuilder_callback
download_links_callback
)],
BOOLEAN => [qw(keep_destination_directory)]
;
use Exporter 'import';
# Optional exports, grouped under a single :TESTING tag: three helper
# subroutines plus new() and new_install().
# NOTE(review): going by the tag name these are presumably exported only so the
# test suite can call them directly -- confirm.
our %EXPORT_TAGS = (
TESTING => [qw(
get_html_page_url
get_destination_directory
ask_about_keep_destination_directory
new
new_install
)]
);
# Flatten every tag's list of names into @EXPORT_OK, so that each subroutine
# named in any tag can also be imported individually on request.
our @EXPORT_OK = map {@{$_}} values %EXPORT_TAGS;
sub new {
my ($term, $page_name) = @_;
# Instantiate a new Fetchwarefile object for managing and generating a
# Fetchwarefile, which we'll write to a file for the user or use to
# build a associated Fetchware package.
my $now = localtime;
my $fetchwarefile = App::Fetchware::Fetchwarefile->new(
header => <<EOF,
use App::FetchwareX::HTMLPageSync;
# Auto generated $now by HTMLPageSync's fetchware new command.
# However, feel free to edit this file if HTMLPageSync's new command's
# autoconfiguration is not enough.
#
# Please look up HTMLPageSync's documentation of its configuration file syntax at
# perldoc App::FetchwareX::HTMLPageSync, and only if its configuration file
# syntax is not malleable enough for your application should you resort to
# customizing fetchware's behavior. For extra flexible customization see perldoc
# App::Fetchwarex::HTMLPageSync.
EOF
descriptions => {
page_name => <<EOA,
page_name simply names the HTML page the Fetchwarefile is responsible for
lib/App/FetchwareX/HTMLPageSync.pm view on Meta::CPAN
# Relative ones could just be filenames without any knowledge of what the
# actual server or path or even scheme is. Fix this by prepending
# html_page_url to each link if there is no scheme.
for my $download_url (@download_urls) {
if ($download_url !~ m!^(ftp|http|file)://!) {
$download_url = config('html_page_url') . '/' . $download_url;
}
}
# Return a ref to the array of download urls, because lookup()'s API only
# allows it to return a single value, but that single value does not have to
# a scalar. It can be a array ref, which is used here. This works, because
# what is returned here by lookup() is passed unchanged to download(), which
# is also part of this API, so I can use what I return here as I please
# inside download().
return \@download_urls;
}
# download($temp_dir, $download_url)
#
# Downloads every URL in the array reference $download_url (the value that
# lookup() returns) using download_http_url(). If the user_agent configuration
# option is defined, it is passed along as download_http_url()'s agent option.
#
# $temp_dir is accepted as part of the API's calling convention, but is not
# used in this subroutine.
# NOTE(review): where the files end up is decided by download_http_url();
# presumably that is the current working directory -- confirm.
#
# Returns a reference to an array holding whatever download_http_url() returned
# for each URL, in the same order as the URLs (AKA $package_path).
sub download {
    my ($temp_dir, $download_url) = @_;

    msg 'Downloading the download urls lookup() determined.';

    my @download_file_paths;
    # Loop over @$download_url to download all user specified URLs.
    for my $url (@$download_url) {
        # Extra arguments for download_http_url(). Stays empty unless the user
        # specified a user_agent, so both cases share the single call below.
        my @agent_options;
        if (defined config('user_agent')) {
            vmsg <<EOM;
Downloading url
[$url]
using the user specified user_agent
[@{[config('user_agent')]}]
EOM
            @agent_options = (agent => config('user_agent'));
        } else {
            vmsg "Downloading url [$url].";
        }

        my $downloaded_file = download_http_url($url, @agent_options);
        push @download_file_paths, $downloaded_file;
    }

    local $" = "\n"; # print each @download_file_paths on its own line.
    vmsg <<EOM;
Downloaded specified urls to the following paths:
[
@{[@download_file_paths]}
]
EOM

    # AKA $package_path.
    return \@download_file_paths;
}
# verify()
#
# Intentional no-op: logs a verbose message saying that verification is
# skipped, then hands off to do_nothing() and returns whatever it returns.
sub verify {
    vmsg(
        "Skipping verify subroutine, because HTMLPageSync does not need to verify anything\n"
    );

    return do_nothing();
}
# unarchive()
#
# Intentional no-op: logs a verbose message saying that unarchiving is skipped,
# then hands off to do_nothing() and returns whatever it returns.
sub unarchive {
    vmsg(
        "Skipping unarchive subroutine, because HTMLPageSync does not need to unarchive\n"
        . "anything\n"
    );

    return do_nothing();
}
# build()
#
# Intentional no-op: logs a verbose message saying that building is skipped,
# then hands off to do_nothing() and returns whatever it returns.
sub build {
    vmsg(
        "Skipping build subroutine, because HTMLPageSync does not need to build anything\n"
    );

    return do_nothing();
}
# install($download_file_paths)
#
# Copies each file named in the array reference $download_file_paths (what
# download() returns, AKA $package_path) to the configured
# destination_directory using File::Copy's cp().
#
# Dies with a message naming the file, the destination directory and the OS
# error as soon as one copy fails; files copied before the failure are left in
# place. Returns a true string on success.
sub install {
    # AKA $package_path.
    my $download_file_paths = shift;

    msg <<EOM;
Copying files downloaded to a local temp directory to final destination directory.
EOM

    # Look the destination up once; it is the same for every file.
    my $destination_directory = config('destination_directory');

    # Copy over the files that have been returned by download().
    for my $file_path (@$download_file_paths) {
        vmsg <<EOM;
Copying [$file_path] -> [$destination_directory].
EOM
        ###BUGALERT### Should this die and all the rest be croaks instead???
        unless (cp($file_path, $destination_directory)) {
            # Save $! immediately: any later call may reset errno, which would
            # make the error message below report the wrong cause.
            my $os_error = $!;
            die <<EOD;
App-FetchwareX-HTMLPageSync: run-time error. Fetchware failed to copy the file [$file_path] to the
destination directory [$destination_directory].
The OS error was [$os_error].
EOD
        }
    }

    vmsg 'Successfully copied files to destination directory.';

    return 'True indicating success!';
}
sub uninstall {
my $build_path = shift;
# Only delete destination_directory if keep_destination_directory is false.
unless (config('keep_destination_directory')) {
lib/App/FetchwareX/HTMLPageSync.pm view on Meta::CPAN
1;
=pod
=head1 NAME
App::FetchwareX::HTMLPageSync - An App::Fetchware extension that downloads files based on an HTML page.
=head1 VERSION
version 1.016
=head1 SYNOPSIS
=head2 Example App::FetchwareX::HTMLPageSync Fetchwarefile.
page_name 'Cool Wallpapers';
html_page_url 'http://some-html-page-with-cool.urls';
destination_directory 'wallpapers';
# pretend to be firefox
user_agent 'Mozilla/5.0 (X11; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1';
# Customize the callbacks.
html_treebuilder_callback sub {
# Get one HTML::Element.
my $h = shift;
# Return true or false to indicate if this HTML::Element should be a
# download link.
if (something) {
return 'True';
} else {
return undef;
}
};
download_links_callback sub {
my @download_urls = @_;
my @wanted_download_urls;
for my $link (@download_urls) {
# Pick ones to keep.
push @wanted_download_urls, $link;
}
return @wanted_download_urls;
};
=head2 App::FetchwareX::HTMLPageSync App::Fetchware-like API.
my $temp_dir = start();
my $download_url = lookup();
download($temp_dir, $download_url);
verify($download_url, $package_path);
unarchive($package_path);
build($build_path);
install();
uninstall($build_path);
=head1 MOTIVATION
I want to automatically parse a Web page with links to wall papers that I want
to download. Only I want software to do it for me. That's where this
App::Fetchware extension comes in.
=head1 DESCRIPTION
App::FetchwareX::HTMLPageSync is an example App::Fetchware extension. It's not
a large extension, but instead is a simple one meant to show how easy it is
to extend App::Fetchware.
App::FetchwareX::HTMLPageSync parses the Web page you specify to create a list of
download links. Then it downloads those links, and installs them to your
C<destination_directory>.
In order to use App::FetchwareX::HTMLPageSync to help you mirror the download
links on a HTML page you need to create a App::FetchwareX::HTMLPageSync
Fetchwarefile, you can do this easily by just running C<fetchware new>, and
typing in C<HTMLPageSync> when it asks you what extension of Fetchwarefile you
want to create.
L<Or create a Fetchwarefile manually.|/"MANUALLY CREATING A App::FetchwareX::HTMLPageSync FETCHWAREFILE">
Then you'll need to
L<learn how to use that Fetchwarefile with fetchware.|/"USING YOUR App::FetchwareX::HTMLPageSync FETCHWAREFILE WITH FETCHWARE">
=head1 App::FetchwareX::HTMLPageSync API SUBROUTINES
This is App::FetchwareX::HTMLPageSync's API that fetchware uses to execute any
Fetchwarefile's that make use of App::FetchwareX::HTMLPageSync. This API is the
same that regular old App::Fetchware uses for most standard FOSS software, and
this internal documentation is only needed when debugging HTMLPageSync's code or
when studying it to create your own fetchware extension.
=head2 new()
my ($program_name, $fetchwarefile) = new($term, $program_name);
# Or in an extension, you can return whatever list of variables you want,
# and then cmd_new() will provide them as arguments to new_install() except
# a $term Term::ReadLine object will precede the others.
my ($term, $program_name, $fetchwarefile, $custom_argument1, $custom_argument2)
= new($term, $program_name);
new() is App::Fetchware's API subroutine that implements fetchware's new
command. It simply uses Term::UI to ask the user some questions that determine
what configuration options will be added to the generated Fetchwarefile. new()
takes a $term, Term::UI/Term::Readline object, and the optional name of the
program or Website in this case that HTMLPageSync is page syncing.
Whatever scalars (not references just regular strings) that new() returns will
be shared with new()'s sister API subroutine new_install() that is called after
lib/App/FetchwareX/HTMLPageSync.pm view on Meta::CPAN
=over
=item * page_name, html_page_url, and destination_directory are required for all Fetchwarefiles.
=back
=back
=over
=item drop_privs() NOTES
This section notes whatever problems you might come across implementing and
debugging your Fetchware extension due to fetchware's drop_privs mechanism.
See L<Util's drop_privs() subroutine for more info|App::Fetchware::Util/drop_privs()>.
=over
=item *
check_syntax() is run in the parent process before even start() has run, so no
temporary directory is available for use.
=back
=back
=head2 start()
my $temp_file = start();
start() creates a temp dir, chmod 700's it, and chdir()'s to it just like the one
in App::Fetchware does. App::FetchwareX::HTMLPageSync's
start() is imported using L<App::Fetchware::ExportAPI> from App::Fetchware,
and also exported by App::FetchwareX::HTMLPageSync. This is how
App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware.
=head2 lookup()
my $download_url = lookup();
lookup() downloads the user specified C<html_page_url>, parses it using
HTML::TreeBuilder, and uses C<html_treebuilder_callback> and
C<download_links_callback> if specified to manipulate the tree to determine what
download urls the user wants.
This list of download urls is returned as an array reference, $download_url.
=head2 download()
download($temp_dir, $download_url);
download() uses App::Fetchware's utility function download_http_url() to
download all of the urls that lookup() returned. If the user specified a
C<user_agent> configuration option, then that option is passed along to
download_http_url()'s call to HTTP::Tiny.
=head2 verify()
verify($download_url, $package_path);
verify() simply calls App::Fetchware's :UTIL subroutine do_nothing(), which as
you can tell from its name does nothing, but return. The reason for the useless
do_nothing() call is simply for better documentation, and standardizing how to
override a App::Fetchware API subroutine in order for it to do nothing at all,
so that you can prevent the original App::Fetchware subroutine from doing what
it normally does.
=head2 unarchive()
unarchive();
unarchive() does nothing by calling App::Fetchware's :UTIL subroutine
do_nothing(), which does nothing.
=head2 build()
build($build_path);
build() does the same thing as verify(), and that is nothing by calling
App::Fetchware's do_nothing() subroutine to better document the fact
that it does nothing.
=head2 install()
install($package_path);
install() takes the $package_path, which is really an array ref of the paths
of the files that download() downloaded, and copies them to the user specified
destination directory, C<destination_directory>.
=head2 end()
end();
end() chdir()s back to the original directory, and cleans up the temp directory
just like the one in App::Fetchware does. App::FetchwareX::HTMLPageSync's
end() is imported using L<App::Fetchware::ExportAPI> from App::Fetchware,
and also exported by App::FetchwareX::HTMLPageSync. This is how
App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware.
=head2 uninstall()
uninstall($build_path);
Uninstalls App::FetchwareX::HTMLPageSync by recursively deleting the
C<destination_directory> where it stores the wallpapers or whatever you
specified it to download for you. If you would like to keep your
C<destination_directory>, then set the C<keep_destination_directory> to true in
your Fetchwarefile, and Fetchware will I<not> delete your
C<destination_directory>, when you uninstall your Fetchware package.
=head2 upgrade()
my $upgrade = upgrade($download_path, $fetchware_package_path)
if ($upgrade) {
...
}
=over
=item Configuration subroutines used:
=over
=item none
=back
=back
Uses $download_path, an arrayref of URLs to download in HTMLPageSync, and
compares it against the list of files that has already been downloaded by
glob()ing C<destination_directory>. And then comparing the file names of the
specified files.
Returns true if $download_path has any URLs that have not already been
downloaded into C<destination_directory>. Note: HEAD HTTP queries are B<not>
lib/App/FetchwareX/HTMLPageSync.pm view on Meta::CPAN
of existing files is not supported. No timestamp checking is implemented
currently.
=item B<uninstall>
A C<fetchware uninstall> will cause fetchware to delete this fetchware package
from its database as well as recursively deleting everything inside your
C<destination_directory> as well as that directory itself. So when you uninstall
a HTMLPageSync fetchware package ensure that you really want to, because it will
delete whatever files it downloaded for you in the first place.
However, if you would like fetchware to preserve your C<destination_directory>,
you can set the boolean C<keep_destination_directory> configuration option to
true, like C<keep_destination_directory 'True';>, to keep HTMLPageSync from
deleting your destination directory.
=back
=head1 HOW App::FetchwareX::HTMLPageSync OVERRIDES App::Fetchware
This section documents how App::FetchwareX::HTMLPageSync overrides
App::Fetchware's API, and is only interesting if you're debugging
App::FetchwareX::HTMLPageSync, or you're writing your own App::Fetchware
extension. If not, you don't need to know these details.
=head2 App::Fetchware API Subroutines
=head3 new()
HTMLPageSync overrides new(), and implements its own Q&A wizard interface
helping users create HTMLPageSync Fetchwarefiles.
=head3 new_install()
HTMLPageSync just inherits App::Fetchware's new_install(), which just asks the
user if they would like Fetchware to install the already generated
Fetchwarefile.
=head3 check_syntax()
check_syntax() is also overridden to check HTMLPageSync's own Fetchware-level
syntax.
=head3 start() and end()
HTMLPageSync just imports start() and end() from App::Fetchware to take
advantage of their ability to manage a temporary directory.
=head3 lookup()
lookup() is overridden, and downloads the C<html_page_url>, which is the main
configuration option that HTMLPageSync uses. Then lookup() parses that
C<html_page_url>, and determines what the download urls should be. If the
C<html_treebuilder_callback> and C<download_links_callback> exist, then they are
called to customize lookup()'s default behavior. See their descriptions below.
=head3 download()
download() downloads the array ref of download links that lookup() returns.
=head3 verify()
verify() is overridden to do nothing.
=head3 unarchive()
unarchive() is overridden to do nothing.
=head3 build()
build() is overridden to do nothing.
=head3 install()
install() takes its argument, which is an arrayref of the paths of the
files that were downloaded to the tempdir created by start(), and copies them to
the user's provided C<destination_directory>.
=head3 end() and start()
HTMLPageSync just imports end() and start() from App::Fetchware to take
advantage of their ability to manage a temporary directory.
=head3 uninstall()
uninstall() recursively deletes your C<destination_directory> where it stores
whatever links you choose to download unless of course the
C<keep_destination_directory> configuration option is set to true.
=head3 upgrade()
Determines if any looked up URLs have not been downloaded yet, and returns true
if that is the case.
=head2 App::FetchwareX::HTMLPageSync's Configuration Subroutines
Because HTMLPageSync is a App::Fetchware extension, it can not just use the same
configuration subroutines that App::Fetchware uses. Instead, it must create its
own configuration subroutines with App::Fetchware::CreateConfigOptions. These
configuration subroutines are the configuration options that you use in your
App::Fetchware or App::Fetchware extension.
=head3 page_name [MANDATORY]
HTMLPageSync's equivalent to App::Fetchware's C<program_name>. It's simply the
name of the page or what you want to download on that page.
=head3 html_page_url [MANDATORY]
HTMLPageSync's equivalent to App::Fetchware's C<lookup_url>, and is just as
mandatory. This is the url of the HTML page that will be downloaded and
processed.
=head3 destination_directory [MANDATORY]
This option is also mandatory, and it specifies the directory where the files
that you want to download are downloaded to.
=head3 user_agent [OPTIONAL]
This option is optional, and it allows you to have HTTP::Tiny pretend to be a
Web browser or perhaps bot if you want to.
=head3 html_treebuilder_callback [OPTIONAL]
This optional option allows you to specify a perl C<CODEREF> that lookup() will
execute instead of its default callback that just looks for images.
( run in 0.754 second using v1.01-cache-2.11-cpan-39bf76dae61 )