utf8 results from the CPAN

Apache-Wyrd

package Apache::Wyrd::Services::Index;
use 5.006;
use strict;
use warnings;
no warnings qw(uninitialized);
our $VERSION = '0.98';
use Apache::Wyrd::Services::SAK qw(token_parse strip_html utf8_force utf8_to_entities);
use Apache::Wyrd::Services::SearchParser;
use BerkeleyDB;
use BerkeleyDB::Btree;
use Digest::SHA qw(sha1_hex);
use Carp;

=pod

=head1 NAME

Apache::Wyrd::Services::Index - Metadata index for word/data search engines

=head1 SYNOPSIS

    my $init = {
      file => '/var/lib/Wyrd/pageindex.db',
      strict => 1,
      attributes => [qw(author text subjects)],
      maps => [qw(subjects)]
    };
    my $index = Apache::Wyrd::Services::Index->new($init);

    my @subject_is_foobar = $index->word_search('foobar', 'subjects');

    my @pages =
      $index->word_search('+musthaveword -mustnothaveword
        other words to search for and add to results');
    foreach my $page (@pages) {
      print "title: $$page{title}, author: $$page{author};
    }
    
    my @pages = $index->parsed_search('(this AND that) OR "the other"');
    foreach my $page (@pages) {
      print "title: $$page{title}, author: $$page{author};
    }


=head1 DESCRIPTION

General purpose Index object for retrieving a variety of information on a
class of objects.  The objects can have any type, but must implement at a
minimum the C<Apache::Wyrd::Interfaces::Indexable> interface.

The information stored is broken down into attributes.  The main builtin
(and not override-able) attributes are B<data>, B<word>, B<title>, and
B<description>, as well as four internal attributes of B<reverse>,
B<timestamp>, B<digest> and B<count>.  Additional attributes are specified
via the hashref argument to the C<new> method (see below).  There can be
only 254 total attributes, unless reversemaps are turned on, in which case
all map attributes count as two attributes.

Attributes are of two types, either regular or map, and these relate to the
main index, B<id>.  A regular attribute stores information on a
one-id-to-one-attribute basis, such as B<title> or B<description>.  A map
attribute provides a reverse lookup, such as words in a document, or
subjects covered by documents, such as documents with the word "foo" in them
or items classified as "bar".  One builtin map exists, B<word> which
reverse-indexes every word of the attribute B<data>.

The Index is meant to be used as a storage for meta-data about web pages,

Wyrd/Services/Index.pm view on Meta::CPAN

	$self->check_error;
	return;
}

sub delete_key {
	my ($self, $key) = @_;
	my $result = $self->db->db_del($key);
	return undef if (($result == DB_NOTFOUND) or !$result);
	$self->recover_db;
	$self->write_db;
	$result = $self->db->db_del($key);
	return undef if (($result == DB_NOTFOUND) or !$result);
	$self->set_error("Could not delete key: " . $result);
	$self->check_error;
}

sub process_html {
	my ($self, $id, $data) = @_;

	return undef if ($self->{'runtime_flags'}->{'no_data'});

	#Remove all punctuation noise from the data and turn all control characters
	#and unicode into entities
	$data = $self->clean_html($data);

	#if we're doing bigfiles, we get a chance to override the re-indexing
	#of large swaths of data if there has been no change to the html of the
	#indexed object
	if ($self->bigfile and length($data) >= 2048) {
		$self->db->db_get("\x03\%$id", my $old_key);
		$old_key =~ s/^\x00://;
		my $current_key = sha1_hex($data);
		if ($current_key ne $old_key) {
			$self->db_big->db_put($current_key, $data);
			my $wordcount = $self->index_words($id, $data);
			$self->update_key("\x03\%$id", "\x00:$current_key");
			$self->update_key("\x05\%$id", $wordcount);
		}
		return;
	}

	$self->update_key("\x03\%$id", $data);
	my $wordcount = $self->index_words($id, $data);
	$self->update_key("\x05\%$id", $wordcount);
	#warn "\x03\%$id updated to $data";
	return;
}

sub extract_html {
	my ($self, $id) = @_;
	$self->db->db_get("\x03\%$id", my $data);
	if ($data =~ s/^\x00:(.+)//) {
		$self->db_big->db_get($1, $data);
	}
	return $data;
}

sub index_map {
	my ($self, $attribute_name, $id, $data) = @_;

	use Encode qw(_utf8_off);
	_utf8_off($data);
	#warn "mapping $id - $attribute : " . join (':', @$data);
	my $attribute = $self->attributes->{$attribute_name};
	my (%unique, $item, @items) = (); # for unique-ifying word list
	#remove duplicates if necessary
	if (ref($data) eq 'ARRAY') {
		@items = grep { $unique{$_}++ == 0 } @$data;
	} elsif (ref($data) eq 'HASH') {
		%unique = %$data;
		@items = keys(%unique);
	} else {
		#not sure why you'd want to do this, but hey.
		@items = ($data);
	}
	if ($attribute_name eq 'word') {
		@items = grep {length($_) >= $self->wordmin} @items;
	}
	# For each item, add id to map
	foreach my $item (sort @items) {

		#This actually does happen, strangely enough.
		unless ($item or ($item =~ /^0+$/o)) {
			warn 'null item here';
			warn 'but defined' if (defined($item));
		}
		my $value = undef;
		my $not_found = $self->db->db_get("$attribute\%$item", my $data);
		my(%entries) = ();
		%entries = unpack("n*", $data) unless ($not_found);
		$entries{$id} = $unique{$item};
		foreach my $item (keys %entries) {
			$value .= pack "n", $item;
			$value .= pack "n", $entries{$item};
		}
		#warn($self->translate_packed($attribute) . "\%$item: " . $self->translate_packed($value));
		$self->update_key("$attribute\%$item", $value);
	}
	if ($self->reversemaps) {
		my $rev_attribute = $self->attributes->{"_$attribute_name"};
		$self->update_key("$rev_attribute\%$id", join("\x00", @items));
	}
	return;
}

sub purge_map {
	my ($self, $attribute_name, $id) = @_;
	my $debug = $self->debug;
	$debug = 1 if ($self->{'runtime_flags'}->{'debug'});
	my $attribute = $self->attributes->{$attribute_name};
	my $rev_attribute = $self->attributes->{"_$attribute_name"};
	my $reverse_index = '';
	my $reversemap_notfound = 1;#by default, don't search for a reversemap unless it's supposed to have one.
	$reversemap_notfound = $self->db->db_get("$rev_attribute\%$id", $reverse_index) if ($self->reversemaps);
	my @updates = ();
	if (not($reversemap_notfound)) {
		$debug && warn ("Found reverse index for map $attribute_name.  Will purge based on that value.");
		foreach my $entry (split "\x00", $reverse_index) {
			#warn "purging $id from $entry";
			my $result = $self->db->db_get("$attribute\%$entry", my $current);
			if ($result) {
				$debug && warn "Reverse index for $attribute_name has a corrupt entry: $entry.  Will do a complete purge.";

Wyrd/Services/Index.pm view on Meta::CPAN

				warn 'Failed to obtain DB Cursor.  Aborting purge_map()';
				return 1;
			}
		}
		$cursor->c_get($key, $current, DB_FIRST);
		do {
			if (unpack("C", $key) == ord($attribute)) {
				my $value = undef;
				my $do_update = 0;
				use Apache::Wyrd::Services::SAK qw(spit_file);
				my @test = unpack("n*", $current);
				if (@test % 2) {
					warn 'broken at ' . ord($attribute);
					spit_file('/Users/barry/Desktop/dump', $current);
					die;
				}
				my(%entries) = unpack("n*", $current);
				
				foreach my $item (keys %entries) {
					if ($item eq $id) {
						$do_update = 1;
						next;
					}
					$value .= pack "n", $item;
					$value .= pack "n", $entries{$item};
				}
				push (@updates, $key, $value) if ($do_update);
			}
		} until ($cursor->c_get($key, $current, DB_NEXT));
		$cursor->c_close;
	}
	#cursors have fallen out of scope.  Time to perform the updates.
	while (@updates) {
		my $value = pop @updates;
		my $key = pop @updates;
		$self->update_key($key, $value);
	}
	return scalar($self->error);
}

sub index_words {
	my ($self, $id, $data) = @_;
	# Split text into Array of words
	my (@words) = split(/\s+/, $data);
	$self->index_map('word', $id, \@words);
	return scalar(@words);
}

=pod

=item (scalar) C<clean_html> (scalar)

Given a string of HTML, this method strips out all tags, comments, etc., and
returns only clean lowercase text for breaking down into tokens.

=cut

sub clean_html {
	my ($self, $data) = @_;
	$data = strip_html($data);
	$data = utf8_force($data);
	$data = lc($data);
	$data =~ s/\p{IsM}/ /gs; # Strip M_arks
	$data =~ s/\p{IsP}/ /gs; # Strip P_unct
	$data =~ s/\p{IsZ}/ /gs; # Strip S(Z_)eparators
	$data =~ s/\p{IsC}+/ /sg; # Flatten all whitespace & C_ontrol characters
	$data =~ s/^[\p{IsC} ]+//s; #Remove leading whitespace
	$data =~ s/[\p{IsC} ]+$//s; #Remove trailing whitespace
	$data =~ s/\+//g;
	$data = utf8_to_entities($data); #Encode all multibyte sequences to entities
	$data =~ s/([\x00-\x08\x0B\x0C\x0E-\x1F\x80-\xFF])/'&#x' . sprintf('%X', ord($1)) . ';'/gexs; #Encode all single-byte "unusual" characters
	return $data;
}

sub clean_searchstring {
	goto &clean_html;
}

=pod

=item (array) C<word_search> (scalar, [scalar])

return entries matching tokens in a string within a given map attribute.  As map
attributes store one token, such as a word, against which all entries are
indexed, the string is broken into tokens before processing, with commas and
whitespaces delimiting the tokens unless they are enclosed in double quotes.

If a token begins with a plus sign (+), results must have the word, with a minus
sign, (-) they must not.  These signs can also be placed left of phrases
enclosed by double quotes.

Results are returned in an array of hashrefs ranked by "score".  The attribute
"score" is added to the hash, meaning number of matches for that given entry. 
All other regular attributes of the indexable object are values of the keys of
each hash returned.

The default map to use for this method is 'word'.  If the optional second
argument is given, that map will be used.

=cut

sub word_search { #accepts a search string, returns an arrayref of entry matches
	my ($self, $string, $attribute, $params) = @_;
	if ($attribute) {
		$self->_raise_exception("You cannot perform a word search on the attribute $attribute; It doesn't exist")
			unless ($self->maps->{$attribute});
		$attribute = $self->{'attributes'}->{$attribute};
	} else {
		$attribute = "\x04";
	}
	my $index = $self->read_db;
	my (@out, %match, %must, %mustnot, @match, @add, @remove, $restrict, @entries)=();
	$string =~ s/(\+|\-)\s+/$1/g;
	if ($string =~ /"/) {#first deal with exact word matches
		while ($string =~ m/(([\+-]?)"([^"]+?)")/) { #whole=1, modifier=2, phrase=3
			my $phrase = $self->clean_searchstring($3);
			my $modifier = $2;
			my $substring = $1;
			#escape out phrase and substring since they will be used in regexps
			#later in this subroutine.
			$substring =~ s/([\\\+\?\:\\*\&\@\$\!])/\\$1/g;
			$phrase =~ s/([\\\+\?\:\\*\&\@\$\!])/\\$1/g;
			$string =~ s/$substring//; #remove the phrase from the string;
			if ($modifier eq '+') {
				push (@add, "_$phrase");
				$restrict = 1;
			} elsif ($modifier eq '-') {
				push (@remove, "_$phrase");
			} else {
				push (@match, "_$phrase");

( run in 0.654 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )