Apache-Wyrd
view release on metacpan or search on metacpan
Wyrd/Services/Index.pm view on Meta::CPAN
unless ($item or ($item =~ /^0+$/o)) {
warn 'null item here';
warn 'but defined' if (defined($item));
}
my $value = undef;
my $not_found = $self->db->db_get("$attribute\%$item", my $data);
my(%entries) = ();
%entries = unpack("n*", $data) unless ($not_found);
$entries{$id} = $unique{$item};
foreach my $item (keys %entries) {
$value .= pack "n", $item;
$value .= pack "n", $entries{$item};
}
#warn($self->translate_packed($attribute) . "\%$item: " . $self->translate_packed($value));
$self->update_key("$attribute\%$item", $value);
}
if ($self->reversemaps) {
my $rev_attribute = $self->attributes->{"_$attribute_name"};
$self->update_key("$rev_attribute\%$id", join("\x00", @items));
}
return;
}
sub purge_map {
my ($self, $attribute_name, $id) = @_;
my $debug = $self->debug;
$debug = 1 if ($self->{'runtime_flags'}->{'debug'});
my $attribute = $self->attributes->{$attribute_name};
my $rev_attribute = $self->attributes->{"_$attribute_name"};
my $reverse_index = '';
my $reversemap_notfound = 1;#by default, don't search for a reversemap unless it's supposed to have one.
$reversemap_notfound = $self->db->db_get("$rev_attribute\%$id", $reverse_index) if ($self->reversemaps);
my @updates = ();
if (not($reversemap_notfound)) {
$debug && warn ("Found reverse index for map $attribute_name. Will purge based on that value.");
foreach my $entry (split "\x00", $reverse_index) {
#warn "purging $id from $entry";
my $result = $self->db->db_get("$attribute\%$entry", my $current);
if ($result) {
$debug && warn "Reverse index for $attribute_name has a corrupt entry: $entry. Will do a complete purge.";
$reversemap_notfound = 1;
$reversemap_notfound = 0 if ($attribute_name) eq 'word';
last;
}
my(%entries) = unpack("n*", $current);
#warn "$entry has " . scalar(keys(%entries)) . " documents";
my $value = undef;
foreach my $item (keys %entries) {
#warn "$entry has doc $item";
next if ($item eq $id);
$value .= pack "n", $item;
$value .= pack "n", $entries{$item};
}
push (@updates, "$attribute\%$entry", $value);
}
$self->db->db_del("$rev_attribute\%$id") && $self->set_error("Could not remove reversemap for $attribute_name on id $id");
}
if ($reversemap_notfound) {
$debug && $self->reversemaps && warn ("No reverse index for map $attribute_name. Doing a full purge of $id from the map.");
my ($key, $current, $removed) = ();
my $cursor = $self->db->db_cursor;
unless ($cursor) {
$self->read_db;
$cursor = $self->db->db_cursor;
unless ($cursor) {
warn 'Failed to obtain DB Cursor. Aborting purge_map()';
return 1;
}
}
$cursor->c_get($key, $current, DB_FIRST);
do {
if (unpack("C", $key) == ord($attribute)) {
my $value = undef;
my $do_update = 0;
use Apache::Wyrd::Services::SAK qw(spit_file);
my @test = unpack("n*", $current);
if (@test % 2) {
warn 'broken at ' . ord($attribute);
spit_file('/Users/barry/Desktop/dump', $current);
die;
}
my(%entries) = unpack("n*", $current);
foreach my $item (keys %entries) {
if ($item eq $id) {
$do_update = 1;
next;
}
$value .= pack "n", $item;
$value .= pack "n", $entries{$item};
}
push (@updates, $key, $value) if ($do_update);
}
} until ($cursor->c_get($key, $current, DB_NEXT));
$cursor->c_close;
}
#cursors have fallen out of scope. Time to perform the updates.
while (@updates) {
my $value = pop @updates;
my $key = pop @updates;
$self->update_key($key, $value);
}
return scalar($self->error);
}
sub index_words {
my ($self, $id, $data) = @_;
# Split text into Array of words
my (@words) = split(/\s+/, $data);
$self->index_map('word', $id, \@words);
return scalar(@words);
}
=pod
=item (scalar) C<clean_html> (scalar)
Given a string of HTML, this method strips out all tags, comments, etc., and
returns only clean lowercase text for breaking down into tokens.
=cut
sub clean_html {
my ($self, $data) = @_;
$data = strip_html($data);
$data = utf8_force($data);
$data = lc($data);
$data =~ s/\p{IsM}/ /gs; # Strip M_arks
$data =~ s/\p{IsP}/ /gs; # Strip P_unct
$data =~ s/\p{IsZ}/ /gs; # Strip S(Z_)eparators
$data =~ s/\p{IsC}+/ /sg; # Flatten all whitespace & C_ontrol characters
$data =~ s/^[\p{IsC} ]+//s; #Remove leading whitespace
$data =~ s/[\p{IsC} ]+$//s; #Remove trailing whitespace
$data =~ s/\+//g;
$data = utf8_to_entities($data); #Encode all multibyte sequences to entities
$data =~ s/([\x00-\x08\x0B\x0C\x0E-\x1F\x80-\xFF])/'&#x' . sprintf('%X', ord($1)) . ';'/gexs; #Encode all single-byte "unusual" characters
return $data;
}
sub clean_searchstring {
goto &clean_html;
}
=pod
=item (array) C<word_search> (scalar, [scalar])
return entries matching tokens in a string within a given map attribute. As map
attributes store one token, such as a word, against which all entries are
indexed, the string is broken into tokens before processing, with commas and
whitespaces delimiting the tokens unless they are enclosed in double quotes.
If a token begins with a plus sign (+), results must have the word, with a minus
sign, (-) they must not. These signs can also be placed left of phrases
enclosed by double quotes.
Results are returned in an array of hashrefs ranked by "score". The attribute
Wyrd/Services/Index.pm view on Meta::CPAN
#map actual names to matches
foreach my $key (keys(%match)) {
$output{$key}=$self->get_entry($key, $params);
$output{$key}->{'score'} = $match{$key};
}
$self->close_db;
my %matches=();
foreach my $id (keys(%output)) {
$matches{$output{$id}->{'score'}}=1;
}
#put matches in order of highest relevance down to lowest by mapping known
#counts of words against the pages that are known to match that word.
foreach my $relevance (sort {$b <=> $a} keys %matches){
next unless $relevance;
foreach my $id (sort keys(%output)) {
if ($output{$id}->{'score'} == $relevance){
push (@out, $output{$id});
}
}
}
if ($self->dirty) {#Dirt has no name, so drop it if the database is dirty
@out = grep {$_->{'name'}} @out;
}
return @out;
}
=pod
=item (array) C<search> (scalar, [scalar])
Alias for word_search. Required by C<Apache::Wyrd::Services::SearchParser>.
=cut
sub search {
my $self = shift;
return $self->word_search(@_);
}
=pod
=item (array) C<parsed_search> (scalar, [scalar])
Same as word_search, but with the logical qualifiers AND, OR, NOT and
DIFF. More complex searches can be accomplished, at a cost of reduced
speed proportional to the complexity of the logical phrase. See
C<Apache::Wyrd::Services::SearchParser> for a description of this type
of search.
=cut
sub parsed_search {
my $self = shift;
my $parser = Apache::Wyrd::Services::SearchParser->new($self);
return $parser->parse(@_);
}
sub get_all_entries {
my $self=shift;
my @entries = ();
my $cursor = $self->db->db_cursor;
$cursor->c_get(my $id, my $entry, DB_FIRST);
do {
push @entries, $entry if ($id =~ /^\x00%/);
} until ($cursor->c_get($id, $entry, DB_NEXT));
$cursor->c_close;
return @entries;
}
sub make_key {
my ($self, $attribute, $id) = @_;
return $self->attributes->{$attribute} . '%' . $id;
}
sub translate_packed {
return join('', map {(($_ + 0) < 33 or ($_ + 0) > 122) ? '{' . $_ . '}' : chr($_)} unpack('c*', $_[1]) );
}
=pod
=back
=head1 AUTHOR
Barry King E<lt>wyrd@nospam.wyrdwright.comE<gt>
=head1 SEE ALSO
=over
=item Apache::Wyrd
General-purpose HTML-embeddable perl object
=item Apache::Wyrd::Interfaces::Indexable
Methods to be implemented by any item that wants to be indexed.
=item Apache::Wyrd::Services::SearchParser
Parser for handling logical searches (AND/OR/NOT/DIFF).
=back
=head1 LICENSE
Copyright 2002-2007 Wyrdwright, Inc. and licensed under the GNU GPL.
See LICENSE under the documentation for C<Apache::Wyrd>.
=cut
1;
( run in 0.764 second using v1.01-cache-2.11-cpan-437f7b0c052 )