Apache-Wyrd
view release on metacpan or search on metacpan
Wyrd/Services/Index.pm view on Meta::CPAN
sub get_id {
my ($self, $name) = @_;
my $result = $self->db->db_get("\x00%$name", my $id);
return $id unless ($result);
$result = $self->db->db_get("\xff%greatest_id", $id);
$id ||= 0;#make ID numerical
#warn ("Did not find $name. Higest ID found by metadata: " . ($result || $id));
#warn $self->_error("Index metadata failed to find a highest ID, scanning instead...") if ($result);
$id++;
while (not($self->db->db_get($id, my $null))) {
#make sure this really is a key, and if the metadata fails, we're scanning anyway
$id++;
}
return ($id, 1);#new id + flag
}
sub get_value {
my ($self, $key) = @_;
my $result = $self->db->db_get($key, my $value);
return undef if (($result eq 'DB_NOTFOUND') or !$result);
$self->recover_db;
$self->read_db;
$result = $self->db->db_get($key, $value);
return undef if (($result eq 'DB_NOTFOUND') or !$result);
$self->set_error("Could not get key: " . $result);
$self->check_error;
return;
}
sub update_key {
my ($self, $key, $value) = @_;
my $result = $self->db->db_put($key, $value);
return undef unless ($result);
$self->recover_db;
$self->write_db;
$result = $self->db->db_put($key, $value);
return undef unless ($result);
$self->set_error("Could not set key: " . $result);
$self->check_error;
return;
}
sub delete_key {
my ($self, $key) = @_;
my $result = $self->db->db_del($key);
return undef if (($result == DB_NOTFOUND) or !$result);
$self->recover_db;
$self->write_db;
$result = $self->db->db_del($key);
return undef if (($result == DB_NOTFOUND) or !$result);
$self->set_error("Could not delete key: " . $result);
$self->check_error;
}
sub process_html {
my ($self, $id, $data) = @_;
return undef if ($self->{'runtime_flags'}->{'no_data'});
#Remove all punctuation noise from the data and turn all control characters
#and unicode into entities
$data = $self->clean_html($data);
#if we're doing bigfiles, we get a chance to override the re-indexing
#of large swaths of data if there has been no change to the html of the
#indexed object
if ($self->bigfile and length($data) >= 2048) {
$self->db->db_get("\x03\%$id", my $old_key);
$old_key =~ s/^\x00://;
my $current_key = sha1_hex($data);
if ($current_key ne $old_key) {
$self->db_big->db_put($current_key, $data);
my $wordcount = $self->index_words($id, $data);
$self->update_key("\x03\%$id", "\x00:$current_key");
$self->update_key("\x05\%$id", $wordcount);
}
return;
}
$self->update_key("\x03\%$id", $data);
my $wordcount = $self->index_words($id, $data);
$self->update_key("\x05\%$id", $wordcount);
#warn "\x03\%$id updated to $data";
return;
}
sub extract_html {
my ($self, $id) = @_;
$self->db->db_get("\x03\%$id", my $data);
if ($data =~ s/^\x00:(.+)//) {
$self->db_big->db_get($1, $data);
}
return $data;
}
sub index_map {
my ($self, $attribute_name, $id, $data) = @_;
use Encode qw(_utf8_off);
_utf8_off($data);
#warn "mapping $id - $attribute : " . join (':', @$data);
my $attribute = $self->attributes->{$attribute_name};
my (%unique, $item, @items) = (); # for unique-ifying word list
#remove duplicates if necessary
if (ref($data) eq 'ARRAY') {
@items = grep { $unique{$_}++ == 0 } @$data;
} elsif (ref($data) eq 'HASH') {
%unique = %$data;
@items = keys(%unique);
} else {
#not sure why you'd want to do this, but hey.
@items = ($data);
}
if ($attribute_name eq 'word') {
@items = grep {length($_) >= $self->wordmin} @items;
}
# For each item, add id to map
foreach my $item (sort @items) {
#This actually does happen, strangely enough.
unless ($item or ($item =~ /^0+$/o)) {
( run in 1.099 second using v1.01-cache-2.11-cpan-39bf76dae61 )