LoadWorm

 view release on metacpan or  search on metacpan

loadworm.pl  view on Meta::CPAN

	if ( $Link =~ /^\#/ ) {
		$AlreadyVisited{$Link} += 1;
		if ( $ENV{'VERBOSE'} ) { PrintTrace($Depth, "$Link skipped, this page refers to itself\n"); }
		return;
	}
	
	$Link = url($Link, $BASE)->abs->as_string . $Parm;
	
	# Remove any '#' label from the URL, since it never goes out anyway.
	$Link =~ s/(\#\w+)//;
# We'll need to process this label reference somehow, later, but at
#  least this prevents repeated downloads of the same page.
#	$Link .= $1;

	# Add it to the "Parents" list.
	if ( $Referers ) {
		if ( IsReferersURLs($Link) ) {
			push @{ $Referers{$Link} }, $parent;
		}
	}
	
	# Add it to the "already visited" list.
	$AlreadyLink = $Link;
	$AlreadyLink =~ s/nav=[^&]*/nav=[*]/g;
	$AlreadyVisited{$AlreadyLink} += 1;
	if ( $AlreadyVisited{$AlreadyLink} > $ENV{'RECURSE'} )
	{
		if ( $ENV{'VERBOSE'} )
		{
			if ( $AlreadyIgnored{$AlreadyLink} ) { $msg =  "ignored.\n"; }
			else { $msg = "visited.\n"; }
			PrintTrace($Depth, "$Link already ".$msg);
		}
		return;
	}

	return if &IsLimitURL($Depth, $Link, $parent);
	return if &IsIgnoreURL($Depth, $Link, $parent);

	# Limit the depth!
	if ( $Depth > $ENV{'DEPTH'} ) {
		push @{ $Depths{$Link} }, $parent;
		if ( $ENV{'VERBOSE'} ) { PrintTrace($Depth, "$Link skipped, it is too deep\n"); }
		return;
	}

	PrintTrace($Depth, $Link);
	print VISITS $Link."\n";

	$response = &GetFile($GetOrPost, $Link);
	
	unless (defined $response and &IsCheckedURL($Link, $response) )
	{
		print TIMING " failed\n";
		print " failed\n";
		return;
	}
	print "\n";
	
	# We are done here, unless it is an HTML document . . .
	unless ( ${$response}{'_headers'}{'_header'}{'content-type'}[0] eq "text/html" )
		{print TIMING "\n"; return;}
	# . . . then we must parse it, too.
	($base, $html) = &ParseFile($response);
	
	# I think that URI::URL::url() handles "func?..." wrong.  It does not belong with the $BASE, so . . .
	$base =~ s/\?.+//;

	unless ( $ENV{NOIMAGES} )
	{
		@links = @{$html->extract_links(qw(img))};
		if ( @links )
		{
         my $start = LoadWorm->GetTickCount();
			for ( @links )
			{
				&ListFileLinks($Depth+1, 'GET', @$_[0], $base, $Link);
			}
         my $finish = LoadWorm->GetTickCount();
			print TIMING "ALL_OF $Link\n$start,$finish 0\n";
		}
	}

	unless ( $ENV{NOFRAMES} )
	{
		@links = @{$html->extract_links(qw(frame))};
		if ( @links )
		{
         my $start = LoadWorm->GetTickCount();
			for ( @links )
			{
				&ListFileLinks($Depth+1, 'GET', @$_[0], $base, $Link);
			}
         my $finish = LoadWorm->GetTickCount();
			print TIMING "ALL_OF $Link\n$start,$finish 0\n";
		}
	}

	&ProcessForms($base, $html);
	
	my @anchors = @{$html->extract_links(qw(a))};
ANCHOR:	
	for my $anchor ( @anchors )
	{
		# Skip any ISMAP's in the anchor - these must be handled by <MAP>.
		# (might handle it later as a specified [INPUT] value(s))
		@map_links = @{@$anchor[1]->extract_links(qw(img))};
		for $ml ( @map_links ) {
			if ( @$ml[1]->{'ismap'} ) {
				my $tmp = url(@$anchor[0], $base)->abs->as_string;
				PrintTrace($Depth, "$tmp skipped.  It is an ISMAP.\n");
				next ANCHOR;
			}
		}
		&ListFileLinks($Depth+1, 'GET', @$anchor[0], $base, $Link);
	}
	
	
	$html->delete();
}

loadworm.pl  view on Meta::CPAN

sub ProcessForms { my($base, $html) = @_;
	my (@forms, $form);

	@forms = @{extract_tags($html, qw(form))};
	for $form ( @forms )
	{
		my @linklist = IterateForm($base, $form);
		for ( @linklist )
		{
			&ListFileLinks($Depth+1, $form->{'method'}, $_, $base, $html->{'_loadworm_Base'});
		}
	}
}




#sub process_forms { my($self, $flag, $depth) = @_;
#
#	return 1 unless $flag;
#	return 1 unless $self->{'_tag'} eq "form";
#
#	print "FORM -> ${$self}{'action'}\n";
#	
#	@links = @{$html->extract_links(qw(input))};
#	for ( @links ) {
#		$inpt = @$_[1];
#		print $inpt->as_HTML();
#	}
#	
#	for $fmky ( sort keys %{$self} ) {
#		if ( $fmky eq '_content' ) {
#			for $cnt ( @{$self}{$fmky} ) {
#				for ( @{$cnt} ) {
#					print ${$_{_tag}}."=".$_."\n";
#				}
#			}
#		}
#		else {
#			print $fmky."=".${$self}{$fmky}."\n";
#		}
#	}
#	0;
#}





sub PrintResponseParameters { ($response) = @_;
#	%x = %{$response};
#	%x = %{$x{'_headers'}};
#	%x = %{$x{'_header'}};
#	for ( sort keys %x ) {
#		@y = @{$x{$_}};
#		$yyy = "";
#		for $yy ( @y) { $yyy .= $yy.","; }
#		chop $yyy;
#		print "$_=".$yyy."\n";
#	}
	print ${$response}{'_headers'}{'_header'}{'content-type'}[0]."\n";
}



# Skip certain documents, well, just because!
# (but don't skip them if they are listed in $TraversURLs)
#
sub IsIgnoreURL { my($Depth, $Link, $parent) = @_;
	
	# We walk down http: links, only.
	if ( $Link !~ /^https?:\/\// )
	{
		if ( $ENV{'VERBOSE'} )
		{
			PrintTrace($Depth, "$Link skipped, it is not an http link\n");
		}
		return 1;
	}
	
	# Do not ignore if it is explicitly listed in [Traverse].
	for ( 0..$#TraverseURLs )
	{
		if ( $Link eq $TraverseURLs[$_] )
		{
			return 0;
		}
	}

	for ( 0..$#IgnoreURLs )
	{
		if ( $Link =~ /$IgnoreURLs[$_]/ )
		{
			PrintTrace($Depth, "$Link skipped by [Ignore] $IgnoreURLs[$_]\n");
			push @{@{$AlreadyIgnored{$AlreadyLink}}}, ($IgnoreURLs[$_], $parent);
			return 1;
		}
	}
	return 0;
}



# Skip certain documents if we've been the N times already!
#
sub IsLimitURL { my($Depth, $Link, $parent) = @_;
	
	for ( @main::Limits )
	{
      my ($url_match, $url_limit) = split /=/;
		if ( $Link =~ /$url_match/ )
		{
			$LimitCounts{$url_match} += 1;
			if ( $LimitCounts{$url_match} > $url_limit )
			{
				PrintTrace($Depth, "$Link skipped by [Limit] on $url_match.  This is $LimitCounts{$url_match} times.\n");
				return 1;
			}
		}
	}
	return 0;



( run in 1.714 second using v1.01-cache-2.11-cpan-524268b4103 )