Apache-Wyrd

 view release on metacpan or  search on metacpan

Wyrd/Services/Index.pm  view on Meta::CPAN

		unless ($item or ($item =~ /^0+$/o)) {
			warn 'null item here';
			warn 'but defined' if (defined($item));
		}
		my $value = undef;
		my $not_found = $self->db->db_get("$attribute\%$item", my $data);
		my(%entries) = ();
		%entries = unpack("n*", $data) unless ($not_found);
		$entries{$id} = $unique{$item};
		foreach my $item (keys %entries) {
			$value .= pack "n", $item;
			$value .= pack "n", $entries{$item};
		}
		#warn($self->translate_packed($attribute) . "\%$item: " . $self->translate_packed($value));
		$self->update_key("$attribute\%$item", $value);
	}
	if ($self->reversemaps) {
		my $rev_attribute = $self->attributes->{"_$attribute_name"};
		$self->update_key("$rev_attribute\%$id", join("\x00", @items));
	}
	return;
}

sub purge_map {
	my ($self, $attribute_name, $id) = @_;
	my $debug = $self->debug;
	$debug = 1 if ($self->{'runtime_flags'}->{'debug'});
	my $attribute = $self->attributes->{$attribute_name};
	my $rev_attribute = $self->attributes->{"_$attribute_name"};
	my $reverse_index = '';
	my $reversemap_notfound = 1;#by default, don't search for a reversemap unless it's supposed to have one.
	$reversemap_notfound = $self->db->db_get("$rev_attribute\%$id", $reverse_index) if ($self->reversemaps);
	my @updates = ();
	if (not($reversemap_notfound)) {
		$debug && warn ("Found reverse index for map $attribute_name.  Will purge based on that value.");
		foreach my $entry (split "\x00", $reverse_index) {
			#warn "purging $id from $entry";
			my $result = $self->db->db_get("$attribute\%$entry", my $current);
			if ($result) {
				$debug && warn "Reverse index for $attribute_name has a corrupt entry: $entry.  Will do a complete purge.";
				$reversemap_notfound = 1;
				$reversemap_notfound = 0 if ($attribute_name) eq 'word';
				last;
			}
			my(%entries) = unpack("n*", $current);
			#warn "$entry has " . scalar(keys(%entries)) . " documents";
			my $value = undef;
			foreach my $item (keys %entries) {
				#warn "$entry has doc $item";
				next if ($item eq $id);
				$value .= pack "n", $item;
				$value .= pack "n", $entries{$item};
			}
			push (@updates, "$attribute\%$entry", $value);
		}
		$self->db->db_del("$rev_attribute\%$id")  && $self->set_error("Could not remove reversemap for $attribute_name on id $id");
	}
	if ($reversemap_notfound) {
		$debug && $self->reversemaps && warn ("No reverse index for map $attribute_name.  Doing a full purge of $id from the map.");
		my ($key, $current, $removed) = ();
		my $cursor = $self->db->db_cursor;
		unless ($cursor) {
			$self->read_db;
			$cursor = $self->db->db_cursor;
			unless ($cursor) {
				warn 'Failed to obtain DB Cursor.  Aborting purge_map()';
				return 1;
			}
		}
		$cursor->c_get($key, $current, DB_FIRST);
		do {
			if (unpack("C", $key) == ord($attribute)) {
				my $value = undef;
				my $do_update = 0;
				use Apache::Wyrd::Services::SAK qw(spit_file);
				my @test = unpack("n*", $current);
				if (@test % 2) {
					warn 'broken at ' . ord($attribute);
					spit_file('/Users/barry/Desktop/dump', $current);
					die;
				}
				my(%entries) = unpack("n*", $current);
				
				foreach my $item (keys %entries) {
					if ($item eq $id) {
						$do_update = 1;
						next;
					}
					$value .= pack "n", $item;
					$value .= pack "n", $entries{$item};
				}
				push (@updates, $key, $value) if ($do_update);
			}
		} until ($cursor->c_get($key, $current, DB_NEXT));
		$cursor->c_close;
	}
	#cursors have fallen out of scope.  Time to perform the updates.
	while (@updates) {
		my $value = pop @updates;
		my $key = pop @updates;
		$self->update_key($key, $value);
	}
	return scalar($self->error);
}

sub index_words {
	my ($self, $id, $data) = @_;
	# Split text into Array of words
	my (@words) = split(/\s+/, $data);
	$self->index_map('word', $id, \@words);
	return scalar(@words);
}

=pod

=item (scalar) C<clean_html> (scalar)

Given a string of HTML, this method strips out all tags, comments, etc., and
returns only clean lowercase text for breaking down into tokens.

=cut

sub clean_html {
	my ($self, $data) = @_;
	$data = strip_html($data);
	$data = utf8_force($data);
	$data = lc($data);
	$data =~ s/\p{IsM}/ /gs; # Strip M_arks
	$data =~ s/\p{IsP}/ /gs; # Strip P_unct
	$data =~ s/\p{IsZ}/ /gs; # Strip S(Z_)eparators
	$data =~ s/\p{IsC}+/ /sg; # Flatten all whitespace & C_ontrol characters
	$data =~ s/^[\p{IsC} ]+//s; #Remove leading whitespace
	$data =~ s/[\p{IsC} ]+$//s; #Remove trailing whitespace
	$data =~ s/\+//g;
	$data = utf8_to_entities($data); #Encode all multibyte sequences to entities
	$data =~ s/([\x00-\x08\x0B\x0C\x0E-\x1F\x80-\xFF])/'&#x' . sprintf('%X', ord($1)) . ';'/gexs; #Encode all single-byte "unusual" characters
	return $data;
}

sub clean_searchstring {
	goto &clean_html;
}

=pod

=item (array) C<word_search> (scalar, [scalar])

return entries matching tokens in a string within a given map attribute.  As map
attributes store one token, such as a word, against which all entries are
indexed, the string is broken into tokens before processing, with commas and
whitespaces delimiting the tokens unless they are enclosed in double quotes.

If a token begins with a plus sign (+), results must have the word, with a minus
sign, (-) they must not.  These signs can also be placed left of phrases
enclosed by double quotes.

Results are returned in an array of hashrefs ranked by "score".  The attribute

Wyrd/Services/Index.pm  view on Meta::CPAN

	#map actual names to matches
	foreach my $key (keys(%match)) {
		$output{$key}=$self->get_entry($key, $params);
		$output{$key}->{'score'} = $match{$key};
	}
	$self->close_db;
	my %matches=();
	foreach my $id (keys(%output)) {
		$matches{$output{$id}->{'score'}}=1;
	}
	#put matches in order of highest relevance down to lowest by mapping known
	#counts of words against the pages that are known to match that word.
	foreach my $relevance (sort {$b <=> $a} keys %matches){
		next unless $relevance;
		foreach my $id (sort keys(%output)) {
			if ($output{$id}->{'score'} == $relevance){
				push (@out, $output{$id});
			}
		}
	}
	if ($self->dirty) {#Dirt has no name, so drop it if the database is dirty
		@out = grep {$_->{'name'}} @out;
	}
	return @out;
}

=pod

=item (array) C<search> (scalar, [scalar])

Alias for word_search.  Required by C<Apache::Wyrd::Services::SearchParser>.

=cut

sub search {
	my $self = shift;
	return $self->word_search(@_);
}

=pod

=item (array) C<parsed_search> (scalar, [scalar])

Same as word_search, but with the logical qualifiers AND, OR, NOT and
DIFF. More complex searches can be accomplished, at a cost of reduced
speed proportional to the complexity of the logical phrase.  See
C<Apache::Wyrd::Services::SearchParser> for a description of this type
of search.

=cut

sub parsed_search {
	my $self = shift;
	my $parser = Apache::Wyrd::Services::SearchParser->new($self);
	return $parser->parse(@_);
}

sub get_all_entries {
	my $self=shift;
	my @entries = ();
	my $cursor = $self->db->db_cursor;
	$cursor->c_get(my $id, my $entry, DB_FIRST);
	do {
		push @entries, $entry if ($id =~ /^\x00%/);
	} until ($cursor->c_get($id, $entry, DB_NEXT));
	$cursor->c_close;
	return @entries;
}

sub make_key {
	my ($self, $attribute, $id) = @_;
	return $self->attributes->{$attribute} . '%' . $id;
}

sub translate_packed {
	return join('',  map {(($_ + 0) < 33 or ($_ + 0) > 122) ? '{' . $_ . '}' : chr($_)} unpack('c*', $_[1]) );
}

=pod

=back

=head1 AUTHOR

Barry King E<lt>wyrd@nospam.wyrdwright.comE<gt>

=head1 SEE ALSO

=over

=item Apache::Wyrd

General-purpose HTML-embeddable perl object

=item Apache::Wyrd::Interfaces::Indexable

Methods to be implemented by any item that wants to be indexed.

=item Apache::Wyrd::Services::SearchParser

Parser for handling logical searches (AND/OR/NOT/DIFF).

=back

=head1 LICENSE

Copyright 2002-2007 Wyrdwright, Inc. and licensed under the GNU GPL.

See LICENSE under the documentation for C<Apache::Wyrd>.

=cut


1;



( run in 0.764 second using v1.01-cache-2.11-cpan-437f7b0c052 )