Algorithm-MinPerfHashTwoLevel
view release on metacpan or search on metacpan
lib/Tie/Hash/MinPerfHashTwoLevel/OnDisk.pm view on Meta::CPAN
my ($self, $key)= @_;
return fetch_by_key($self->{mount},$key);
}
# FIRSTKEY - tie-interface hook that begins a new pass over the keys.
# Rewinds the stored iteration index to zero, then delegates to NEXTKEY
# and returns whatever NEXTKEY returns.
sub FIRSTKEY {
    my $self= $_[0];
    $self->{iter_idx}= 0;    # restart iteration from the first slot
    return $self->NEXTKEY();
}
# NEXTKEY - tie-interface hook that returns the next key of the iteration.
# Reads the current iteration index, advances it by one, and asks
# fetch_by_index() for the entry at the index that was current on entry.
# $lastkey is accepted as part of the tie interface but is not used.
# NOTE(review): fetch_by_index is defined outside this block; it presumably
# fills its third argument in place with the key (and leaves it undef once
# the index runs past the last entry) - confirm against its definition.
sub NEXTKEY {
    my ($self, $lastkey)= @_;
    my $index= $self->{iter_idx}++;    # post-increment: use current, then advance
    my $key;
    fetch_by_index($self->{mount}, $index, $key);
    return $key;
}
# SCALAR - tie-interface hook that supplies the value of the tied hash in
# scalar context. Returns the bucket count from the file header; when the
# file-level $scalar_has_slash flag is true the count is reported as
# "COUNT/COUNT" instead.
# NOTE(review): $scalar_has_slash is set outside this block; it presumably
# records whether this perl renders scalar(%hash) in "used/total" form.
sub SCALAR {
    my $self= $_[0];
    my $num_buckets= $self->get_hdr_num_buckets();
    return $scalar_has_slash
        ? $num_buckets . "/" . $num_buckets
        : $num_buckets;
}
# UNTIE - tie-interface hook that perl calls when the hash is untied.
# Deliberately does nothing beyond unpacking its argument; the mounted
# file is released by DESTROY, not here.
sub UNTIE {
my ($self)= @_;
}
# DESTROY - destructor. If the object holds a true 'mount' value, hand it
# to unmount_file(); objects without one are left alone.
sub DESTROY {
    my $self= $_[0];
    if ($self->{mount}) {
        unmount_file($self->{mount});
    }
}
# STORE - tie-interface hook for assignment. The hash is read-only, so
# this always throws (with a full backtrace via confess).
sub STORE {
    my ($self, $key, $value)= @_;
    my $complaint= join "", __PACKAGE__,
        " is readonly, STORE operations are not supported";
    confess $complaint;
}
# DELETE - tie-interface hook for delete(). The hash is read-only, so
# this always throws (with a full backtrace via confess).
sub DELETE {
    my ($self, $key)= @_;
    my $complaint= join "", __PACKAGE__,
        " is readonly, DELETE operations are not supported";
    confess $complaint;
}
# CLEAR - tie-interface hook for emptying the hash. The hash is read-only,
# so this always throws (with a full backtrace via confess).
sub CLEAR {
    my ($self)= @_;
    my $complaint= join "", __PACKAGE__,
        " is readonly, CLEAR operations are not supported";
    confess $complaint;
}
# make_file - build a minimal perfect hash image from a source hash and
# write it to disk.
#
# Class method; options are passed as key/value pairs:
#   file          - name of the file to produce (mandatory)
#   source_hash   - the data to build from (mandatory); handed to the
#                   hasher's compute() method
#   comment       - text stored in the image; must not contain "\0";
#                   defaults to the empty string
#   seed          - seed string, or a reference to one; a referenced scalar
#                   is updated with the seed the hasher reports after
#                   compute(). When omitted and deterministic mode is on,
#                   "MinPerfHash2Levl" is used.
#   variant       - format variant; defaults to $DEFAULT_VARIANT and must lie
#                   within MIN_VARIANT .. MAX_VARIANT
#   compute_flags - integer bitmask of MPH_F_* flags
#   no_dedupe, filter_undef, deterministic (alias: canonical)
#                 - boolean shortcuts that set the matching MPH_F_* bit;
#                   deterministic is on unless given a defined false value
#   max_tries, debug
#                 - passed through to Algorithm::MinPerfHashTwoLevel->new
#
# Returns the output file name. Dies on a missing or invalid option and on
# any I/O failure. The image is first written to "$file.$$" and then renamed
# over the target, so the target is only replaced once a complete image
# exists; on failure the temporary file is removed again.
sub make_file {
    my ($class, %opts)= @_;

    my $ofile= $opts{file}
        or die "file is a mandatory option to make_file";
    my $source_hash= $opts{source_hash}
        or die "source_hash is a mandatory option to make_file";

    $opts{comment}= "" unless defined $opts{comment};
    $opts{variant}= $DEFAULT_VARIANT unless defined $opts{variant};

    # The comment has already been defaulted above, so take it as is: a
    # "||" fallback here would silently turn a comment of "0" into "".
    my $comment= $opts{comment};
    my $debug= $opts{debug} || 0;
    my $variant= int($opts{variant});

    # "canonical" wins over "deterministic" when both are supplied; when
    # neither is supplied deterministic mode is switched on.
    my $deterministic;
    $deterministic //= delete $opts{canonical};
    $deterministic //= delete $opts{deterministic};
    $deterministic //= 1;

    # Fixed default seed so deterministic builds are repeatable.
                 #1234567812345678
    $opts{seed} = "MinPerfHash2Levl"
        if !defined($opts{seed}) and $deterministic;

    my $compute_flags= int($opts{compute_flags}||0);
    $compute_flags |= MPH_F_NO_DEDUPE if delete $opts{no_dedupe};
    $compute_flags |= MPH_F_DETERMINISTIC
        if $deterministic;
    $compute_flags |= MPH_F_FILTER_UNDEF
        if delete $opts{filter_undef};

    die "Unknown variant '$variant', max known is "
        . MAX_VARIANT . " default is " . $DEFAULT_VARIANT
        if $variant > MAX_VARIANT;
    die "Unknown variant '$variant', min known is "
        . MIN_VARIANT . " default is " . $DEFAULT_VARIANT
        if $variant < MIN_VARIANT;

    die "comment cannot contain null"
        if index($comment,"\0") >= 0;

    my $seed= $opts{seed};
    my $hasher= Algorithm::MinPerfHashTwoLevel->new(
        debug => $debug,
        seed => (ref $seed ? $$seed : $seed),
        variant => $variant,
        compute_flags => $compute_flags,
        max_tries => $opts{max_tries},
    );
    my $buckets= $hasher->compute($source_hash);
    my $buf_length= $hasher->{buf_length};
    my $state= $hasher->{state};
    my $buf= packed_xs($variant, $buf_length, $state, $comment, $compute_flags, @$buckets);

    # Report the seed the hasher ended up with back to the caller.
    $$seed= $hasher->get_seed if ref $seed;

    my $tmp_file= "$ofile.$$";
    # ":raw" because $buf is a binary image: no newline translation and no
    # encoding layer may touch the bytes on their way to disk.
    open my $ofh, ">:raw", $tmp_file
        or die "Failed to open $tmp_file for output: $!";
    my $written= eval {
        print {$ofh} $buf
            or die "failed to print to '$tmp_file': $!";
        close $ofh
            or die "failed to close '$tmp_file': $!";
        rename $tmp_file, $ofile
            or die "failed to rename '$tmp_file' to '$ofile': $!";
        1;
    };
    if (!$written) {
        my $error= $@;
        # Do not leave a partial temporary file behind; close first so the
        # unlink also works on platforms that refuse to remove open files.
        close $ofh if defined fileno($ofh);
        unlink $tmp_file;
        die $error;
    }
    return $ofile;
}
# validate_file - open a file with validation enabled and report on it.
#
# Class method; options are passed as key/value pairs:
#   file    - name of the file to check (mandatory)
#   verbose - when true, print the report for a valid file, or die with
#             the error message for an invalid one
#
# Returns the two-element list ($variant, $msg). When the constructor
# yields an object, $variant is the header variant and $msg is a
# human-readable summary; otherwise $variant is undef and $msg is whatever
# the constructor stored through the 'error_rsv' reference.
sub validate_file {
    my ($class, %opts)= @_;
    my $file= $opts{file}
        or die "file is a mandatory option to validate_file";
    my $verbose= $opts{verbose};

    my ($variant, $msg, $error_sv);
    my $self= $class->new(file => $file, flags => MPH_F_VALIDATE, error_rsv => \$error_sv);

    if (!$self) {
        # Construction failed: pass the recorded error on as the message.
        $msg= $error_sv;
    }
    else {
        my $format= join "",
            "file '%s' is a valid '%s' file\n",
            "  variant: %d\n",
            "  keys: %d\n",
            "  hash-state: %s\n",
            "  table  checksum: %016x\n",
            "  string checksum: %016x\n",
            "  comment: %s";
        my @details= (
            $file,
            MAGIC_STR,
            $self->get_hdr_variant,
            $self->get_hdr_num_buckets,
            unpack("H*", $self->get_state),    # hash state shown as hex
            $self->get_hdr_table_checksum,
            $self->get_hdr_str_buf_checksum,
            $self->get_comment,
        );
        $msg= sprintf $format, @details;
        $variant= $self->get_hdr_variant;
    }

    if ($verbose) {
        # An undefined variant is the marker for "validation failed".
        die $msg."\n" unless defined $variant;
        print $msg;
    }
    return ($variant, $msg);
}
1;
__END__
=head1 NAME
Tie::Hash::MinPerfHashTwoLevel::OnDisk - construct or tie a "two level" minimal perfect hash based on disk
=head1 SYNOPSIS
use Tie::Hash::MinPerfHashTwoLevel::OnDisk;
Tie::Hash::MinPerfHashTwoLevel::OnDisk->make_file(
file => $some_file,
source_hash => $some_hash,
comment => "this is a comment",
debug => 0,
);
my %hash;
tie %hash, "Tie::Hash::MinPerfHashTwoLevel::OnDisk", $some_file;
=head1 DESCRIPTION
This module allows one to either construct, or use, a precomputed minimal
perfect hash on disk via a tied interface. The disk image of the hash is
loaded by using mmap, which means that multiple processes may use the
same disk image at the same time without memory duplication. The hash
is readonly, and may only contain string values.
=head2 METHODS
=over 4
=item make_file
Construct a new file from a given 'source_hash' argument. Takes the following arguments:
=over 4
=item file
The file name to produce, mandatory.
=item comment
An arbitrary piece of text of your choosing. Can be extracted from
the file later if needed. Only practical restriction on the value is
that it cannot contain a null.
=item seed
A 16 byte string (or a reference to one) to use as the seed for
the hashing and generation process. If this is omitted a standard default
is chosen.
If it should prove impossible to construct a solution using the seed chosen
then a new one will be constructed deterministically from the old until a
solution is found (see L</max_tries>). Prior to version v0.10 this used rand().
Should you wish to access the seed actually used for the final solution
then you can pass in a reference to a scalar containing your chosen seed.
The reference scalar will be updated after successful construction.
Thus both of the following are valid:
Tie::Hash::MinPerfHashTwoLevel::OnDisk->make_file(seed => "1234567812345678", ...);
Tie::Hash::MinPerfHashTwoLevel::OnDisk->make_file(seed => \(my $seed= "1234567812345678"), ...);
=item compute_flags
This is an integer which contains various flags which control construction.
They are as follows:
MPH_F_FILTER_UNDEF => 1 - filter keys with undef values
MPH_F_DETERMINISTIC => 2 - repeatable results (sort keys during processing)
MPH_F_NO_DEDUPE => 4 - do not dedupe strings in final buffer
These constants can be imported via the ":flags" tag, but there are also options that
have the equivalent result, see C<no_dedupe>, C<deterministic> and C<filter_undef>.
=item no_dedupe
Speed up construction at the cost of a larger string buffer by disabling
deduplication of values and keys. Same as setting the MPH_F_NO_DEDUPE bit
in compute_flags.
=item deterministic
=item canonical
Produce a canonical result from the source data set, albeit somewhat less quickly
than normal. Note that this is independent of supplying a seed: without this option
the same seed may produce a different result for the same set of keys. Same
as setting the MPH_F_DETERMINISTIC bit in compute_flags. Enabled by default;
pass a defined false value to disable it.
=item filter_undef
Ignore keys with undef values during construction. This means that exists() checks
may differ between source and the constructed hash table, but avoids the need to
store such keys in the resulting file, saving space. Same as setting the
MPH_F_FILTER_UNDEF bit in compute_flags.
=item max_tries
The maximum number of attempts to make to find a solution for this keyset.
Defaults to 3.
=item debug
Enable debug during generation.
=item variant
Select which variant of construction algorithm and file format to produce.
When omitted, the variant is determined by the global var
$Tie::Hash::MinPerfHashTwoLevel::DEFAULT_VARIANT
which itself defaults to the latest version. This is mostly for testing.
Older variants will be deprecated and removed eventually.
The list of supported variants is as follows:
5 - Xor, siphash, with inthash, 8 byte alignment, one checksum.
In version 0.15 we switched hash functions to use SipHash(1-3), which
unfortunately made supporting variants prior to 5 impossible.
=back
=item validate_file
Validate the file specified by the 'file' argument. Returns a list of
two values, 'variant' and 'message'. If the file fails validation the 'variant'
will be undef and the 'message' will contain an error message. If the file
passes validation the 'variant' will specify the variant of the file
(currently only 5 is valid), and 'message' will contain some basic information
about the file, such as how many keys it contains, the comment it was
created with, etc.
=back
=head2 SUBS
=over 4
=item mph2l_tied_hashref
Simple wrapper to replace the cumbersome
tie my %hash, "Tie::Hash::MinPerfHashTwoLevel::OnDisk", $file;
with a simple sub that can be imported
my $hashref= mph2l_tied_hashref($file,$validate);
The validate flag causes MPH_F_VALIDATE validations to occur on load.
=item mph2l_make_file
Sub form of L</make_file>. E.g.:
use Tie::Hash::MinPerfHashTwoLevel::OnDisk;
Tie::Hash::MinPerfHashTwoLevel::OnDisk->make_file(@args);
is identical to
use Tie::Hash::MinPerfHashTwoLevel::OnDisk qw(mph2l_make_file);
mph2l_make_file(@args);
( run in 1.572 second using v1.01-cache-2.11-cpan-0bd6704ced7 )