unicode results from the CPAN

Apache-Wyrd
sub get_id {
	my ($self, $name) = @_;
	my $result = $self->db->db_get("\x00%$name", my $id);
	return $id unless ($result);
	$result = $self->db->db_get("\xff%greatest_id", $id);
	$id ||= 0;#make ID numerical
	#warn ("Did not find $name.  Higest ID found by metadata: " . ($result || $id));
	#warn $self->_error("Index metadata failed to find a highest ID, scanning instead...") if ($result);
	$id++;
	while (not($self->db->db_get($id, my $null))) {
		#make sure this really is a key, and if the metadata fails, we're scanning anyway
		$id++;
	}
	return ($id, 1);#new id + flag
}

sub get_value {
	my ($self, $key) = @_;
	my $result = $self->db->db_get($key, my $value);
	return undef if (($result eq 'DB_NOTFOUND') or !$result);
	$self->recover_db;
	$self->read_db;
	$result = $self->db->db_get($key, $value);
	return undef if (($result eq 'DB_NOTFOUND') or !$result);
	$self->set_error("Could not get key: " . $result);
	$self->check_error;
	return;
}

sub update_key {
	my ($self, $key, $value) = @_;
	my $result = $self->db->db_put($key, $value);
	return undef unless ($result);
	$self->recover_db;
	$self->write_db;
	$result = $self->db->db_put($key, $value);
	return undef unless ($result);
	$self->set_error("Could not set key: " . $result);
	$self->check_error;
	return;
}

sub delete_key {
	my ($self, $key) = @_;
	my $result = $self->db->db_del($key);
	return undef if (($result == DB_NOTFOUND) or !$result);
	$self->recover_db;
	$self->write_db;
	$result = $self->db->db_del($key);
	return undef if (($result == DB_NOTFOUND) or !$result);
	$self->set_error("Could not delete key: " . $result);
	$self->check_error;
}

sub process_html {
	my ($self, $id, $data) = @_;

	return undef if ($self->{'runtime_flags'}->{'no_data'});

	#Remove all punctuation noise from the data and turn all control characters
	#and unicode into entities
	$data = $self->clean_html($data);

	#if we're doing bigfiles, we get a chance to override the re-indexing
	#of large swaths of data if there has been no change to the html of the
	#indexed object
	if ($self->bigfile and length($data) >= 2048) {
		$self->db->db_get("\x03\%$id", my $old_key);
		$old_key =~ s/^\x00://;
		my $current_key = sha1_hex($data);
		if ($current_key ne $old_key) {
			$self->db_big->db_put($current_key, $data);
			my $wordcount = $self->index_words($id, $data);
			$self->update_key("\x03\%$id", "\x00:$current_key");
			$self->update_key("\x05\%$id", $wordcount);
		}
		return;
	}

	$self->update_key("\x03\%$id", $data);
	my $wordcount = $self->index_words($id, $data);
	$self->update_key("\x05\%$id", $wordcount);
	#warn "\x03\%$id updated to $data";
	return;
}

sub extract_html {
	my ($self, $id) = @_;
	$self->db->db_get("\x03\%$id", my $data);
	if ($data =~ s/^\x00:(.+)//) {
		$self->db_big->db_get($1, $data);
	}
	return $data;
}

sub index_map {
	my ($self, $attribute_name, $id, $data) = @_;

	use Encode qw(_utf8_off);
	_utf8_off($data);
	#warn "mapping $id - $attribute : " . join (':', @$data);
	my $attribute = $self->attributes->{$attribute_name};
	my (%unique, $item, @items) = (); # for unique-ifying word list
	#remove duplicates if necessary
	if (ref($data) eq 'ARRAY') {
		@items = grep { $unique{$_}++ == 0 } @$data;
	} elsif (ref($data) eq 'HASH') {
		%unique = %$data;
		@items = keys(%unique);
	} else {
		#not sure why you'd want to do this, but hey.
		@items = ($data);
	}
	if ($attribute_name eq 'word') {
		@items = grep {length($_) >= $self->wordmin} @items;
	}
	# For each item, add id to map
	foreach my $item (sort @items) {

		#This actually does happen, strangely enough.
		unless ($item or ($item =~ /^0+$/o)) {
( run in 0.792 second using v1.01-cache-2.11-cpan-600a1bdf6e4 )