App-Fetchware
view release on metacpan or search on metacpan
lib/App/Fetchware.pm view on Meta::CPAN
# Use HTML::TreeBuilder to parse the scalar of html into a tree of tags.
my $tree = HTML::TreeBuilder->new_from_content($http_listing);
my @filename_listing;
my @matching_links = $tree->look_down(
_tag => 'a',
sub {
my $h = shift;
#parse out archive name.
my $link = $h->as_text();
# NOTE: The weird alternations adding .asc, .md5, and .sha.?,
# and also a KEYS file are to allow fetchware new to also use
# this subroutine to parse http file listings to analyze the
# contents of the user's lookup_url. It does not make any sense
# to copy and paste this function or even add a callback argument
# allowing you to change the regex.
if ($link =~
/(\.(tar\.(gz|bz2|xz)|(tgz|tbz2|txz))|(asc|md5|sha.?))|KEYS$/) {
# Should I strip out dirs just to be safe?
my $filename = $link;
# Obtain the tag to the right of the archive link to find the
# timestamp.
if (my $rh = $h->right()) {
my $listing_line;
if (blessed($rh)) {
$listing_line = $rh->as_text();
} else {
$listing_line = $rh;
}
my @fields = split ' ', $listing_line;
###BUGALERT### Internationalization probably breaks this
#datetime parsing? Can a library do it?
# day-month-year time
# $fields[0] $fields[1]
# Normalize format for lookup algorithms .
my ($day, $month, $year) = split /-/, $fields[0];
# Ditch the ':' in the time.
$fields[1] =~ s/://;
# Some dirlistings use string months Aug, Jun, etc...
if (looks_like_number($month)) {
# Strip leading 0 if it exists by converting the
# string with the useless leading 0 into an integer.
# The %num_month hash lookup will add back a leading
# 0 if there was one. This stupid roundabout code is
# to ensure that there always is a leading 0 if the
# number is less than 10 to ensure that all of the
# numbers this hacky datetime parser outputs all
# have the same length so that the numbers can
# easily be compared with each other.
$month = sprintf("%u", $month);
push @filename_listing, [$filename,
"$year$num_month{$month}$day$fields[1]"];
# ...and some use numbers 8, 6, etc....
} else {
push @filename_listing, [$filename,
"$year$month{$month}$day$fields[1]"];
}
} else {
###BUGALERT### Add support for other http servers such as lighttpd, nginx,
#cherokee, starman?, AND use the Server: header to determine which algorithm to
#use.
die <<EOD;
App-Fetchware: run-time error. A hardcoded algorithm to parse HTML directory
listings has failed! Fetchware currently only supports parseing Apache HTML
directory listings. This is a huge limitation, but surprisingly pretty much
everyone who runs a mirror uses apache for http support. This is a bug so
please report it. Also, if you want to try a possible workaround, just use a ftp
mirror instead of a http one, because ftp directory listings are a easy to
parse. See perldoc App::Fetchware.
EOD
}
}
}
);
# Delete the $tree, so perl can garbage collect it.
$tree = $tree->delete;
return \@filename_listing;
}
} # end bare block for %month.
sub file_parse_filelist {
my $file_listing = shift;
for my $file (@$file_listing) {
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, $atime,$mtime,$ctime,
$blksize,$blocks)
= stat($file) or die <<EOD;
App-Fetchware: Fetchware failed to stat() the file [$file] while trying to parse
your local [file://] lookup_url. The OS error was [$!]. This should not happen,
and is either a bug in fetchware or some sort of race condition.
EOD
# Replace scalar filename with a arrayref of the filename with its
# assocated timestamp for later processing for lookup().
#
# Use Path::Class's file() constructor & basename() method to strip out
# all unneeded directory information leaving just the file's name.
# Add all of the timestamp numbers together, so that only one numberical
# sort is needed instead of a descending list of numerical sorts.
$file = [file($file)->basename(), $mtime ];
}
return $file_listing;
}
sub lookup_by_timestamp {
my $file_listing = shift;
# Sort the timstamps to determine the latest one. The one with the higher
( run in 0.497 second using v1.01-cache-2.11-cpan-e93a5daba3e )