Apache-Wyrd

 view release on metacpan or  search on metacpan

Wyrd/Site/IndexBot.pm  view on Meta::CPAN


=head1 BUGS/CAVEATS

Other bugs/caveats per C<Apache::Wyrd::Bot>.  Also reserves the methods
index_site and purge_missing.

=cut

sub index_site {
	my ($self, $index) = @_;
	my $lastindex = undef;
	my $hostname = $self->{'server_hostname'};
	my $root = $self->{'document_root'};
	my $lastfile = $root . '/var/lastindex.db';
	if ($self->{'basefile'}) {
		$lastfile = $self->{'basefile'} . '.last';
	}

	#purge_missing returns a list of existing files for which there is no
	#database entry and/or the entry has been deleted.
	my @no_skip = $self->purge_missing($index);
	my %no_skip = map {$_ , 1} @no_skip;
	if ($self->{'realclean'}) {
		print "Expired data purge complete.";
	}

	#create a user-agent to trigger the updates to the index with
	my $ua = $index->ua;
	$ua->timeout(60);
	local $| = 1;

	#go through the files in the document root that match ".html",
	#and read in the file that shows when the last update was done
	open (FILES, '-|', "/usr/bin/find $root -name \*.html");
	$lastindex = ${slurp_file($lastfile)};
	my $newest = $lastindex;
	my @files = ();
	while (<FILES>) {
		chomp;
		push @files, $_;
	}
	print "<P>" . scalar(@files) . " files to index.</p>";

	#For each file, try to navigate to it with the User-agent.  Use the auth
	#cookie of the viewer of this Wyrd.
	my $counter = 0;
	while ($_ = shift @files) {
		my @stats = stat($_);
		#warn "Document status/lastindex/current newest:" . join('/', $stats[9], $lastindex, $newest);
		$newest = $stats[9] if ($stats[9] > $newest);
		$counter++;
		s/$root//;
		unless ($no_skip{$_}) {
			next if ($self->{'fastindex'} and ($stats[9] <= $lastindex));
			next if $index->skip_file($_);
		}
		my $url = "http://$hostname$_";
		my $response = '';
		my $auth_cookie = $self->{'auth_cookie'};
		if ($auth_cookie) {
			$response = $ua->get($url, Cookie => $auth_cookie);
		} else {
			$response = $ua->get($url);
		}
		my $status = $response->status_line;
		if ($status =~ /200|OK/) {
			print "$counter. $_: OK";
		} else {
			print "$counter. $_: <span class=\"error\">$status</span>";
			system "touch $_" if (-f $_);
		}
	}
	print "<b><p>$counter files indexed</p></b>";

	#Save the date to the lastindex file.
	spit_file($lastfile, $newest);
	return;
}

sub purge_missing {
	my ($self, $instance) = @_;
	my @no_skip = ();
	my $root = $self->{'document_root'};
	print "<P>First checking for deleted documents and corrupt data";
	my $index = $instance->write_db;
	my %ismap = ();
	foreach my $value (keys %{$instance->maps}) {
		$value = $instance->attributes->{$value};
		$ismap{$value} = 1;
	}
	my %exists = ();
	my %reverse = ();
	my %force_purge = ();
	my $cursor = $index->db_cursor;
	$cursor->c_get(my $id, my $document, DB_FIRST);
	do {
		my ($first, $second, $identifier) = unpack('aaa*', $id);
		if ($second ne '%') {
			#if the metachar is not there, this is a primary filename map.
			$exists{$id} = $document || 'error: unnamed entry';
		} elsif ($first eq "\0") {
			#if the metachar is 0, this is a reversemap
			$reverse{$document} = $identifier;
		}
	} until ($cursor->c_get($id, $document, DB_NEXT));
	undef $cursor;
	foreach my $id (keys %exists) {
		my $document = $exists{$id};
		if ($reverse{$id} ne $exists{$id}) {
			print "Entry $id for $exists{$id} seems to be a duplicate entry.  Deleting it prior to purge...";
			my $result = $index->db_del($id);
			$force_purge{$id} = 1;
			if ($result) {
				print "Failed to delete dangling entry $id.  Manual repair may be necessary...";
			}
		} elsif (-f ($root . $document)) {
			#document exists as a file
			print"keeping $root$document";
		} else {
			my $entry = $instance->get_entry($id);
			my $file = $entry->{'file'};



( run in 1.241 second using v1.01-cache-2.11-cpan-39bf76dae61 )