App-BloomUtils
view release on metacpan or search on metacpan
lib/App/BloomUtils.pm view on Meta::CPAN
}
return $res unless $res->[0] == 200;
my $m = $args{num_bits} // $res->[2]{actual_m};
my $k = $args{num_hashes} // $res->[2]{actual_k};
log_info "Will be creating bloom filter with num_bits (m)=%d (actual %d), num_hashes (k)=%d, actual false-positive rate=%.5f%% (when num_items=%d), actual bloom filter size=%d bytes",
$m, $res->[2]{actual_m}, $k, $res->[2]{actual_p}*100, $res->[2]{n}, $res->[2]{actual_bloom_size};
my $bf = Algorithm::BloomFilter->new($m, $k);
my $i = 0;
while (defined(my $line = <STDIN>)) {
chomp $line;
$bf->add($line);
$i++;
if (defined $args{num_items} && $i == $args{num_items}+1) {
log_warn "You created bloom filter for num_items=%d, but now have added more than that", $args{num_items};
}
}
print $bf->serialize;
[200];
}
# Rinci function metadata for check_with_bloom_filter(): describes the
# command-line interface (one greedy positional array of items) of the
# routine that tests items against a bloom filter read from STDIN.
$SPEC{check_with_bloom_filter} = {
    v           => 1.1,
    summary     => 'Check with bloom filter',
    description => <<'_',
You supply the bloom filter in STDIN, items to check as arguments, and this
utility will print lines containing 0 or 1 depending on whether items in the
arguments are tested to be, respectively, not in the set (0) or probably in the
set (1).
_

    # NOTE(review): presumably tells the command-line wrapper not to format
    # the result, since the description says the utility prints the 0/1
    # lines itself -- confirm against the Perinci::CmdLine documentation.
    "cmdline.skip_format" => 1,

    args => {
        # Required, first positional argument; declared greedy and typed as
        # an array of strings.
        items => {
            summary => 'Items to check',
            schema  => [ 'array*', of => 'str*' ],
            req     => 1,
            pos     => 0,
            greedy  => 1,
        },
    },

    # No related links are declared.
    links => [],
};
# check_with_bloom_filter(%args) -> enveloped result
#
# Reads a serialized bloom filter from STDIN, then prints one line per element
# of $args{items}: "1" when the filter reports the item as (probably) present,
# "0" when it is definitely absent.
#
# Arguments (named):
#   items - array reference of strings to test against the filter.
#
# Returns an array-reference envelope whose first element is a status code:
#   [200]            on success (the 0/1 lines have been printed to STDOUT),
#   [400, $message]  when STDIN is empty or cannot be deserialized,
#   [500, $message]  when reading STDIN fails.
sub check_with_bloom_filter {
    my %args = @_;

    # The serialized filter is binary data: disable CRLF translation and any
    # encoding layer on STDIN so the bytes reach deserialize() unmodified.
    binmode(STDIN);

    my $bf_str = "";
    while (1) {
        my $nread = read(STDIN, my $block, 8192);

        # read() returns undef on error and 0 at end of file; the two must
        # not be confused, otherwise a truncated filter would be used silently.
        return [500, "Can't read bloom filter from STDIN: $!"]
            unless defined $nread;
        last unless $nread;

        $bf_str .= $block;
    }

    return [400, "No bloom filter data supplied on STDIN"]
        unless length $bf_str;

    # Loaded only once there is something to deserialize.
    require Algorithm::BloomFilter;

    # NOTE(review): whether deserialize() dies or returns undef on malformed
    # input is not visible from here, so both outcomes are handled.
    my $bf;
    my $ok = eval { $bf = Algorithm::BloomFilter->deserialize($bf_str); 1 };
    unless ($ok && defined $bf) {
        my $reason = $ok ? "" : ": $@";
        $reason =~ s/\s+\z//;
        return [400, "Can't deserialize bloom filter from STDIN$reason"];
    }

    for my $item (@{ $args{items} }) {
        say $bf->test($item) ? 1 : 0;
    }

    return [200];
}
# Rinci function metadata for bloom_filter_calculator(): declares the
# arguments used to derive bloom filter parameters (number of bits m, number
# of hash functions k) from the expected item count and the desired
# false-positive rate.
$SPEC{bloom_filter_calculator} = {
    v => 1.1,
    summary => 'Help calculate num_bits (m) and num_hashes (k)',
    description => $desc1,
    args => {
        num_items => {
            summary => 'Expected number of items to add to bloom filter',
            schema => 'posint*',
            pos => 0,
            req => 1,
            cmdline_aliases => {n=>{}},
        },
        num_bits => {
            summary => 'Number of bits to set for the bloom filter',
            schema => 'posint*',
            cmdline_aliases => {m=>{}},
        },
        false_positive_rate => {
            summary => 'Desired false-positive rate (p), greater than 0 and at most 0.5',
            # xmin=>0 is an exclusive lower bound. When num_bits is not
            # given, the calculator evaluates log(1/p), which dies for p = 0
            # (division by zero) and for p < 0 (logarithm of a negative
            # number); rejecting such values during argument validation gives
            # the user a proper error message instead.
            schema => ['float*', xmin=>0, max=>0.5],
            default => 0.02,
            cmdline_aliases => {
                fp_rate => {},
                p => {},
            },
        },
        num_hashes => {
            summary => 'Number of hash functions (k) to use',
            schema => 'posint*',
            cmdline_aliases => {k=>{}},
        },
        num_hashes_to_bits_per_item_ratio => {
            summary => '0.7 (the default) is optimal',
            schema => 'num*',
        },
    },
    args_rels => {
        # num_hashes can be given directly or derived from the ratio, so the
        # two arguments are declared as alternatives of each other.
        # NOTE(review): exact semantics of 'choose_one' (at most one vs.
        # exactly one) should be confirmed against the Sah/Rinci documentation.
        'choose_one&' => [
            [qw/num_hashes num_hashes_to_bits_per_item_ratio/],
        ],
    },
};
sub bloom_filter_calculator {
require Algorithm::BloomFilter;
my %args = @_;
my $num_hashes_to_bits_per_item_ratio = $args{num_hashes_to_bits_per_item_ratio};
$num_hashes_to_bits_per_item_ratio //= 0.7 unless defined($args{num_bits}) && defined($args{num_items});
my $num_items = $args{num_items};
my $fp_rate = $args{false_positive_rate} // 0.02;
my $num_bits = $args{num_bits} // ($num_items * log(1/$fp_rate)/ log(2)**2);
my $num_bits_per_item = $num_bits / $num_items;
my $num_hashes = $args{num_hashes} //
(defined $num_hashes_to_bits_per_item_ratio ? $num_hashes_to_bits_per_item_ratio*$num_bits_per_item : undef) //
( run in 0.980 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )