Apache-Wyrd

 view release on metacpan or  search on metacpan

Wyrd/Services/Index.pm  view on Meta::CPAN

	my ($self, $data) = @_;
	$data = strip_html($data);
	$data = utf8_force($data);
	$data = lc($data);
	$data =~ s/\p{IsM}/ /gs; # Strip M_arks
	$data =~ s/\p{IsP}/ /gs; # Strip P_unct
	$data =~ s/\p{IsZ}/ /gs; # Strip S(Z_)eparators
	$data =~ s/\p{IsC}+/ /sg; # Flatten all whitespace & C_ontrol characters
	$data =~ s/^[\p{IsC} ]+//s; #Remove leading whitespace
	$data =~ s/[\p{IsC} ]+$//s; #Remove trailing whitespace
	$data =~ s/\+//g;
	$data = utf8_to_entities($data); #Encode all multibyte sequences to entities
	$data =~ s/([\x00-\x08\x0B\x0C\x0E-\x1F\x80-\xFF])/'&#x' . sprintf('%X', ord($1)) . ';'/gexs; #Encode all single-byte "unusual" characters
	return $data;
}

sub clean_searchstring {
	goto &clean_html;
}

=pod

=item (array) C<word_search> (scalar, [scalar])

return entries matching tokens in a string within a given map attribute.  As map
attributes store one token, such as a word, against which all entries are
indexed, the string is broken into tokens before processing, with commas and
whitespaces delimiting the tokens unless they are enclosed in double quotes.

If a token begins with a plus sign (+), results must have the word, with a minus
sign, (-) they must not.  These signs can also be placed left of phrases
enclosed by double quotes.

Results are returned in an array of hashrefs ranked by "score".  The attribute
"score" is added to the hash, meaning number of matches for that given entry. 
All other regular attributes of the indexable object are values of the keys of
each hash returned.

The default map to use for this method is 'word'.  If the optional second
argument is given, that map will be used.

=cut

sub word_search { #accepts a search string, returns an arrayref of entry matches
	my ($self, $string, $attribute, $params) = @_;
	if ($attribute) {
		$self->_raise_exception("You cannot perform a word search on the attribute $attribute; It doesn't exist")
			unless ($self->maps->{$attribute});
		$attribute = $self->{'attributes'}->{$attribute};
	} else {
		$attribute = "\x04";
	}
	my $index = $self->read_db;
	my (@out, %match, %must, %mustnot, @match, @add, @remove, $restrict, @entries)=();
	$string =~ s/(\+|\-)\s+/$1/g;
	if ($string =~ /"/) {#first deal with exact word matches
		while ($string =~ m/(([\+-]?)"([^"]+?)")/) { #whole=1, modifier=2, phrase=3
			my $phrase = $self->clean_searchstring($3);
			my $modifier = $2;
			my $substring = $1;
			#escape out phrase and substring since they will be used in regexps
			#later in this subroutine.
			$substring =~ s/([\\\+\?\:\\*\&\@\$\!])/\\$1/g;
			$phrase =~ s/([\\\+\?\:\\*\&\@\$\!])/\\$1/g;
			$string =~ s/$substring//; #remove the phrase from the string;
			if ($modifier eq '+') {
				push (@add, "_$phrase");
				$restrict = 1;
			} elsif ($modifier eq '-') {
				push (@remove, "_$phrase");
			} else {
				push (@match, "_$phrase");
			}
		}
	}
	my @word=split(/\s+/, $string); #then deal with single words
	foreach my $word (@word){
		my ($modifier) = $word =~ /^([\+\-])/;
		$word = $self->clean_searchstring($word);
		if ($modifier eq '+') {
			push (@add, $word);
			$restrict = 1;
		} elsif ($modifier eq '-') {
			push (@remove, $word);
		} else {
			push (@match, $word);
		}
	}
	#warn "searching for:";
	#warn map {"\nmatch - $_"} @match;
	#warn map {"\nadd - $_"} @add;
	#warn map {"\nremove - $_"} @remove;
	#if this is a 100% negative search, all entries match
	unless (scalar(@match) or scalar(@add)) {
		@entries = $self->get_all_entries;
		foreach my $key (@entries) {
			$match{$key}=1;
		}
	}
	foreach my $word (@match){
		if ($word =~ s/^_//) {
			foreach my $entry ($self->get_all_entries) {
				my $data = $self->extract_html($entry);
				my @count = $data =~ m/($word)/g;
				my $count = @count;
				$match{$entry} += $count;
			}
		} else {
			#warn ord($attribute) . "\%$word";
			$index->db_get("$attribute\%$word", my $keys);
			#warn Dumper($keys);
			#warn "match - '" . translate_packed($keys) . "'";
			my (@keys) = unpack("n*",$keys);
			while (@keys) {
				my $entry = shift @keys;
				my $count = shift @keys;
				#warn "entry: $entry, $count: $count";
				$match{$entry} += $count;
			}
		}
	}



( run in 1.313 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )