App-Fetchware

 view release on metacpan or  search on metacpan

lib/App/FetchwareX/HTMLPageSync.pm  view on Meta::CPAN

        Mandatory => [ 'html_page_url', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a html_page_url configuration
option. Please add one, and try again.
EOM
        Mandatory => [ 'destination_directory', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a destination_directory
configuration option. Please add one, and try again.
EOM
    );
}





###BUGALERT### lookup() returns all files each time it is run; therefore, it
#breaks the way Fetchware is supposed to work! lookup() is supposed to return
#"the latest version." And in HTMLPageSync's case, it should not include files
#already downloaded, because it should only return "new files" by comparing the
#"list of available files" to the "list of already downloaded ones."
sub lookup {
    msg
    "Looking up download urls using html_page_url [@{[config('html_page_url')]}]";
    ###BUGALERT### Create a user changeable version of lookup_check_args??(), so
    #that App::Fetchware 'subclasses' can use it.
    # Download the url the user specified.
    my $filename = do {
        if (defined config('user_agent')) {
            download_http_url(config('html_page_url'),
                user_agent =>  config('user_agent'));
        } else {
            download_http_url(config('html_page_url'));
        }
    };
    vmsg "Downloaded html_page_url to local file [$filename].";

    # Create a HTML::TreeBuilder object for the now downloaded file.
    my $tree = HTML::TreeBuilder->new();
    # Parse $filename into a HTML::Element tree.
    $tree->parse_file($filename);
    vmsg 'Created HTML::TreeBuilder object to parse downloaded html file.';

    my $tree_callback = do {
        if (config('html_treebuilder_callback')) {
            vmsg <<EOM;
Using user supplied html_treebuilder_callback to parse downloaded HTML file:
[
@{[config('html_treebuilder_callback')]}
]
EOM
            config('html_treebuilder_callback');
        } else {
            vmsg <<EOM;
Using built-in default html_treebuilder_callback that only wants images.
EOM
            sub {
                my $tag = shift;
                my $link = $tag->attr('href');
                if (defined $link) {
                    # If the anchor tag is an image...
                    if ($link =~ /\.(jpg|jpeg|png|bmp|tiff?|gif)$/) {
                        # ...return true...
                        return 'True';
                    } else {
                        # ...if not return false.
                        return undef; #false
                    }
                }
            };
        }
    };

    # Find the links that match our default callback or the user specified one
    # if the user specified one.
    my @download_urls = $tree->look_down(
        _tag => 'a',
        $tree_callback
    );
    vmsg <<EOM;
Determined download urls to be:
@download_urls
EOM

    # Sort through the list of HTML::Element tags to finalize the list to
    # download.
    my $links_callback = do {
        if (config('download_links_callback')) {
            vmsg <<EOM;
Determined download_links_callback to be user specified:
[
@{[config('download_links_callback')]}
]
EOM
            config('download_links_callback');
        } else {
            # Strip off HTML::Element crap by default.
            sub {
                vmsg <<EOM;
Using built-in default download_links_callback that turns HTML::Elements into
download urls.
EOM
                my @download_urls = @_;

                for my $link (@download_urls) {
                    $link = $link->attr('href');
                }

                # Must return them, because this coderef was called by value not
                # by reference.
                return @download_urls;
            };
        }
    };

    # Call download_links_callback or call default one to strip off
    # HTML::Element crap.
    @download_urls = $links_callback->(@download_urls);
    vmsg <<EOM;
Determined download urls to be:
[
@{[@download_urls]}



( run in 1.448 second using v1.01-cache-2.11-cpan-df04353d9ac )