LoadWorm
view release on metacpan or search on metacpan
loadworm.pl view on Meta::CPAN
if ( $Link =~ /^\#/ ) {
$AlreadyVisited{$Link} += 1;
if ( $ENV{'VERBOSE'} ) { PrintTrace($Depth, "$Link skipped, this page refers to itself\n"); }
return;
}
$Link = url($Link, $BASE)->abs->as_string . $Parm;
# Remove any '#' label from the URL, since it never goes out anyway.
$Link =~ s/(\#\w+)//;
# We'll need to process this label reference somehow, later, but at
# least this prevents repeated downloads of the same page.
# $Link .= $1;
# Add it to the "Parents" list.
if ( $Referers ) {
if ( IsReferersURLs($Link) ) {
push @{ $Referers{$Link} }, $parent;
}
}
# Add it to the "already visited" list.
$AlreadyLink = $Link;
$AlreadyLink =~ s/nav=[^&]*/nav=[*]/g;
$AlreadyVisited{$AlreadyLink} += 1;
if ( $AlreadyVisited{$AlreadyLink} > $ENV{'RECURSE'} )
{
if ( $ENV{'VERBOSE'} )
{
if ( $AlreadyIgnored{$AlreadyLink} ) { $msg = "ignored.\n"; }
else { $msg = "visited.\n"; }
PrintTrace($Depth, "$Link already ".$msg);
}
return;
}
return if &IsLimitURL($Depth, $Link, $parent);
return if &IsIgnoreURL($Depth, $Link, $parent);
# Limit the depth!
if ( $Depth > $ENV{'DEPTH'} ) {
push @{ $Depths{$Link} }, $parent;
if ( $ENV{'VERBOSE'} ) { PrintTrace($Depth, "$Link skipped, it is too deep\n"); }
return;
}
PrintTrace($Depth, $Link);
print VISITS $Link."\n";
$response = &GetFile($GetOrPost, $Link);
unless (defined $response and &IsCheckedURL($Link, $response) )
{
print TIMING " failed\n";
print " failed\n";
return;
}
print "\n";
# We are done here, unless it is an HTML document . . .
unless ( ${$response}{'_headers'}{'_header'}{'content-type'}[0] eq "text/html" )
{print TIMING "\n"; return;}
# . . . then we must parse it, too.
($base, $html) = &ParseFile($response);
# I think that URI::URL::url() handles "func?..." wrong. It does not belong with the $BASE, so . . .
$base =~ s/\?.+//;
unless ( $ENV{NOIMAGES} )
{
@links = @{$html->extract_links(qw(img))};
if ( @links )
{
my $start = LoadWorm->GetTickCount();
for ( @links )
{
&ListFileLinks($Depth+1, 'GET', @$_[0], $base, $Link);
}
my $finish = LoadWorm->GetTickCount();
print TIMING "ALL_OF $Link\n$start,$finish 0\n";
}
}
unless ( $ENV{NOFRAMES} )
{
@links = @{$html->extract_links(qw(frame))};
if ( @links )
{
my $start = LoadWorm->GetTickCount();
for ( @links )
{
&ListFileLinks($Depth+1, 'GET', @$_[0], $base, $Link);
}
my $finish = LoadWorm->GetTickCount();
print TIMING "ALL_OF $Link\n$start,$finish 0\n";
}
}
&ProcessForms($base, $html);
my @anchors = @{$html->extract_links(qw(a))};
ANCHOR:
for my $anchor ( @anchors )
{
# Skip any ISMAP's in the anchor - these must be handled by <MAP>.
# (might handle it later as a specified [INPUT] value(s))
@map_links = @{@$anchor[1]->extract_links(qw(img))};
for $ml ( @map_links ) {
if ( @$ml[1]->{'ismap'} ) {
my $tmp = url(@$anchor[0], $base)->abs->as_string;
PrintTrace($Depth, "$tmp skipped. It is an ISMAP.\n");
next ANCHOR;
}
}
&ListFileLinks($Depth+1, 'GET', @$anchor[0], $base, $Link);
}
$html->delete();
}
loadworm.pl view on Meta::CPAN
sub ProcessForms { my($base, $html) = @_;
my (@forms, $form);
@forms = @{extract_tags($html, qw(form))};
for $form ( @forms )
{
my @linklist = IterateForm($base, $form);
for ( @linklist )
{
&ListFileLinks($Depth+1, $form->{'method'}, $_, $base, $html->{'_loadworm_Base'});
}
}
}
#sub process_forms { my($self, $flag, $depth) = @_;
#
# return 1 unless $flag;
# return 1 unless $self->{'_tag'} eq "form";
#
# print "FORM -> ${$self}{'action'}\n";
#
# @links = @{$html->extract_links(qw(input))};
# for ( @links ) {
# $inpt = @$_[1];
# print $inpt->as_HTML();
# }
#
# for $fmky ( sort keys %{$self} ) {
# if ( $fmky eq '_content' ) {
# for $cnt ( @{$self}{$fmky} ) {
# for ( @{$cnt} ) {
# print ${$_{_tag}}."=".$_."\n";
# }
# }
# }
# else {
# print $fmky."=".${$self}{$fmky}."\n";
# }
# }
# 0;
#}
sub PrintResponseParameters { ($response) = @_;
# %x = %{$response};
# %x = %{$x{'_headers'}};
# %x = %{$x{'_header'}};
# for ( sort keys %x ) {
# @y = @{$x{$_}};
# $yyy = "";
# for $yy ( @y) { $yyy .= $yy.","; }
# chop $yyy;
# print "$_=".$yyy."\n";
# }
print ${$response}{'_headers'}{'_header'}{'content-type'}[0]."\n";
}
# Skip certain documents, well, just because!
# (but don't skip them if they are listed in $TraversURLs)
#
sub IsIgnoreURL { my($Depth, $Link, $parent) = @_;
# We walk down http: links, only.
if ( $Link !~ /^https?:\/\// )
{
if ( $ENV{'VERBOSE'} )
{
PrintTrace($Depth, "$Link skipped, it is not an http link\n");
}
return 1;
}
# Do not ignore if it is explicitly listed in [Traverse].
for ( 0..$#TraverseURLs )
{
if ( $Link eq $TraverseURLs[$_] )
{
return 0;
}
}
for ( 0..$#IgnoreURLs )
{
if ( $Link =~ /$IgnoreURLs[$_]/ )
{
PrintTrace($Depth, "$Link skipped by [Ignore] $IgnoreURLs[$_]\n");
push @{@{$AlreadyIgnored{$AlreadyLink}}}, ($IgnoreURLs[$_], $parent);
return 1;
}
}
return 0;
}
# Skip certain documents if we've been the N times already!
#
sub IsLimitURL { my($Depth, $Link, $parent) = @_;
for ( @main::Limits )
{
my ($url_match, $url_limit) = split /=/;
if ( $Link =~ /$url_match/ )
{
$LimitCounts{$url_match} += 1;
if ( $LimitCounts{$url_match} > $url_limit )
{
PrintTrace($Depth, "$Link skipped by [Limit] on $url_match. This is $LimitCounts{$url_match} times.\n");
return 1;
}
}
}
return 0;
( run in 1.714 second using v1.01-cache-2.11-cpan-524268b4103 )