Acme-Tools
view release on metacpan or search on metacpan
142114221423142414251426142714281429143014311432143314341435143614371438143914401441
TTD
=> 1.150931,
#trinidad/tobago dollar
TWD
=> 0.267321,
#taiwan dollar
USD
=> 7.780201,
#us dollar
'$'
=> 7.780201,
#us doller, symbol
VEF
=> 0.778994,
#venezuelan bolivares fuertes
XBT
=> 84864.0984477,
#synonym for BTC
XRP
=> 8.96808208868,
#ripple
ZAR
=> 0.667117,
#south africa rand
},
numbers
=>{
dec
=>1,
hex
=>1,
bin
=>1,
oct
=>1,
roman
=>1,
des
=>1,
#des: spelling error in v0.15-0.16
dusin
=>1,
dozen
=>1,
doz
=>1,
dz
=>1,
gross
=>144,
gr
=>144,
gro
=>144,
great_gross
=>12*144,
small_gross
=>10*12,
}
);
# Timestamps of the last conversion-table preparations; 0 means "not yet
# prepared".  NOTE(review): presumably epoch seconds set by conv_prepare()
# and the money-rate loader — confirm against those subs.
our $conv_prepare_time       = 0;
our $conv_prepare_money_time = 0;
sub
conv_prepare {
my
%b
=(
da
=>1e+1,
h
=>1e+2,
k
=>1e+3,
M
=>1e+6,
G
=>1e+9,
T
=>1e+12,
P
=>1e+15,
E
=>1e+18,
Z
=>1e+21,
Y
=>1e+24,
H
=>1e+27);
my
%big
=(
deca
=>1e+1,
hecto
=>1e+2,
kilo
=>1e+3,
mega
=>1e+6,
giga
=>1e+9,
tera
=>1e+12,
peta
=>1e+15,
exa
=>1e+18,
zetta
=>1e+21,
yotta
=>1e+24,
hella
=>1e+27);
my
%s
=(
d
=>1e-1,
c
=>1e-2,
m
=>1e-3,
'µ'
=>1e-6,
u
=>1e-6,
n
=>1e-9,
p
=>1e-12,
f
=>1e-15,
a
=>1e-18,
z
=>1e-21,
y
=>1e-24);
my
%small
=(
deci
=>1e-1,
centi
=>1e-2,
milli
=>1e-3,
micro
=>1e-6,
nano
=>1e-9,
pico
=>1e-12,
femto
=>1e-15,
atto
=>1e-18,
zepto
=>1e-21,
yocto
=>1e-24);
Bloom filters can be used to check whether an element (a string) is a
member of a large set using much less memory or disk space than other
data structures, trading speed and accuracy for memory usage. While
risking false positives, Bloom filters have a very strong space
advantage over other data structures for representing sets.
In the example below, a set of 100000 phone numbers (or any strings of
any length) can be "stored" in just 91230 bytes, if you accept that you
can only check the data structure for the existence of a string and
accept false positives with an error rate of 0.03 (that is three
percent; error rates are given as numbers larger than 0 and smaller
than 1).
You can not retrieve the strings in the set without using "brute
force" methods and even then you would get slightly more strings than
you put in because of the error rate inaccuracy.
Bloom Filters have many uses.
See also: L<http://en.wikipedia.org/wiki/Bloom_filter>
See also: L<Bloom::Filter>
=head2 bfinit
Initialize a new Bloom Filter:
my $bf = bfinit( error_rate=>0.01, capacity=>100000 );
The same:
my $bf = bfinit( 0.01, 100000 );
since two arguments is interpreted as error_rate and capacity accordingly.
=head2 bfadd
bfadd($bf, $_) for @phone_numbers; # Adding strings one at a time
bfadd($bf, @phone_numbers); # ...or all at once (faster)
Returns 1 on success. Dies (croaks) if more strings than the capacity are added.
7391739273937394739573967397739873997400740174027403740474057406740774087409741074117412741374147415741674177418741974207421742274237424742574267427742874297430743174327433743474357436743774387439744074417442744374447445744674477448744974507451745274537454745574567457@not_found
=
grep
!bfcheck(
$bf
,
$_
),
@keys
);
# same but slower
=head2 bfdelete
Deletes from a counting bloom filter.
To enable deleting be sure to initialize the bloom filter with the
numeric C<counting_bits> argument. The number of bits could be 2 or 3*)
for small filters with a small capacity (a small number of keys), but
setting the number to 4 ensures that even very large filters with very
small error rates would not overflow.
*) Acme::Tools do not currently support C<< counting_bits => 3 >> so 4
and 8 are the only practical alternatives where 8 is almost always overkill.
my $bf=bfinit(
error_rate => 0.001,
capacity => 10000000,
counting_bits => 4 # power of 2, that is 2, 4, 8, 16 or 32
);
bfadd( $bf, @unique_phone_numbers);
bfdelete($bf, @unique_phone_numbers);
Example: examine the frequency of the counters with 4 bit counters and 4 million keys:
my $bf=bfinit( error_rate=>0.001, capacity=>4e6, counting_bits=>4 );
bfadd($bf,[1e3*$_+1 .. 1e3*($_+1)]) for 0..4000-1; # adding 4 million keys one thousand at a time
my %c; $c{vec($$bf{filter},$_,$$bf{counting_bits})}++ for 0..$$bf{filterlength}-1;
printf "%8d counters = %d\n",$c{$_},$_ for sort{$a<=>$b}keys%c;
The output:
28689562 counters = 0
19947673 counters = 1
6941082 counters = 2
1608250 counters = 3
280107 counters = 4
38859 counters = 5
4533 counters = 6
445 counters = 7
46 counters = 8
1 counters = 9
Even after the error_rate is changed from 0.001 to a percent of that, 0.00001, the limit of 16 (4 bits) is still far away:
47162242 counters = 0
33457237 counters = 1
11865217 counters = 2
2804447 counters = 3
497308 counters = 4
70608 counters = 5
8359 counters = 6
858 counters = 7
65 counters = 8
4 counters = 9
In algorithmic terms the number of bits needed is C<ln of ln of n>. That's why 4 bits (counters up
to 15) is "always" good enough except for extremely large capacities or extremely small error rates.
(Except when adding the same key many times, which should be avoided; Acme::Tools::bfadd does not
check for that, but perhaps will in future versions.)
Bloom filters of the counting type are not very space efficient: the tables above show that 84%-85%
of the counters are 0 or 1. This means most bits are zero-bits. This doesn't have to be a problem if
a counting bloom filter is to be sent over slow networks, because they are very compressible by
common compression tools like I<gzip> or L<Compress::Zlib> and such.
Deletion of non-existing keys makes C<bfdelete> die (croak).
Croaks (dies) on deleting a non-existing key or deleting from a previously overflowed counter in a counting bloom filter.
=head2 bfaddbf
Adds another bloom filter to a bloom filter.
Bloom filters have the property that bit-wise I<OR>-ing the bit-filters
of two filters with the same capacity and the same number and type of
hash functions adds the filters:
my $bf1=bfinit(error_rate=>0.01,capacity=>$cap,keys=>[1..500]);
my $bf2=bfinit(error_rate=>0.01,capacity=>$cap,keys=>[501..1000]);
bfaddbf($bf1,$bf2);
print "Yes!" if bfgrep($bf1, 1..1000) == 1000;
Prints yes since C<bfgrep> now returns an array of all the 1000 elements.
Croaks if the filters are of different dimensions.
Works for counting bloom filters as well (C<< counting_bits=>4 >> e.g.)
7492749374947495749674977498749975007501750275037504750575067507750875097510751175127513751475157516751775187519752075217522Returns the number of 1's in the filter.
my
$percent
=100
*bfsum
(
$bf
)/
$$bf
{filterlength};
printf
"The filter is %.1f%% filled\n"
,
$percent
;
#prints 50.0% or so if filled to capacity
Sums the counters
for
counting bloom filters (much slower than
for
non counting).
=head2 bfdimensions
Input, two numeric arguments: Capacity and error_rate.
Outputs an array of two numbers: m and k.
m = - n * log(p) / log(2)**2 # n = capacity, m = bits in filter (divide by 8 to get bytes)
k = log(1/p) / log(2) # p = error_rate, uses perls internal log() with base e (2.718)
...that is: m = the best number of bits in the filter and k = the best
number of hash functions optimized for the given capacity (n) and
error_rate (p). Note that k depends only on the error_rate. At an error
rate of about two percent, the bloom filter needs just about the same
number of bytes as the number of keys.
Storage (bytes):
Capacity Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate
0.000000001 0.00000001 0.0000001 0.000001 0.00001 0.0001 0.001 0.01 0.02141585 0.1 0.5 0.99
------------- ----------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
10 54.48 48.49 42.5 36.51 30.52 24.53 18.53 12.54 10.56 6.553 2.366 0.5886
100 539.7 479.8 419.9 360 300.1 240.2 180.3 120.4 100.6 60.47 18.6 0.824
1000 5392 4793 4194 3595 2996 2397 1798 1199 1001 599.6 180.9 3.177
10000 5.392e+04 4.793e+04 4.194e+04 3.594e+04 2.995e+04 2.396e+04 1.797e+04 1.198e+04 1e+04 5991 1804 26.71
76357636763776387639764076417642764376447645764676477648764976507651765276537654765576567657765876597660766176627663766476657666766776687669767076717672767376747675767676777678767976807681768276837684768576867687768876897690769176927693See also Scaleable Bloom Filters: L<http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf> (not implemented in Acme::Tools)
...and perhaps L<http://intertrack.naist.jp/Matsumoto_IEICE-ED200805.pdf>
=cut
sub bfinit {
    # Initialize a new bloom filter and return it as a hashref.
    # One argument: a filename -- delegate to bfretrieve().
    return bfretrieve(@_) if @_ == 1;

    # Two positional numbers: accept (error_rate, capacity) in either order.
    if (@_ == 2) {
        return bfinit(error_rate => $_[0], capacity => $_[1]) if 0 < $_[0] and $_[0] < 1 and $_[1] > 1;
        return bfinit(error_rate => $_[1], capacity => $_[0]) if 0 < $_[1] and $_[1] < 1 and $_[0] > 1;
    }
    require Digest::MD5;   # used by the hashing in bfadd/bfcheck
    croak "Arguments should be a hash of equal number of keys and values" if @_ % 2;
    my %arg = @_;

    # Validate the named parameters before building the filter.
    my @ok_param = qw/error_rate capacity min_hashfuncs max_hashfuncs hashfuncs counting_bits adaptive keys/;
    my @not_ok   = sort grep !in($_, @ok_param), keys %arg;
    croak "Not ok param to bfinit: ".join(", ", @not_ok) if @not_ok;
    croak "Not an arrayref in keys-param"
        if exists $arg{keys} and ref($arg{keys}) ne 'ARRAY';
    croak "Not implemented counting_bits=$arg{counting_bits}, should be 2, 4, 8, 16 or 32"
        if !in(nvl($arg{counting_bits},1), 1, 2, 4, 8, 16, 32);
    croak "An bloom filters here can not be in both adaptive and counting_bits modes"
        if $arg{adaptive} and $arg{counting_bits} > 1;

    my $bf = {
        error_rate    => 0.001,    # default p
        capacity      => 100000,   # default n
        min_hashfuncs => 1,
        max_hashfuncs => 100,
        counting_bits => 1,        # default: plain (non-counting) filter
        adaptive      => 0,
        %arg,                      # caller's arguments override the defaults above
        key_count     => 0,
        overflow      => {},
        version       => $Acme::Tools::VERSION,
    };
    croak "Error rate ($$bf{error_rate}) should be larger than 0 and smaller than 1"
        if $$bf{error_rate} <= 0 or $$bf{error_rate} >= 1;

    # An explicit hashfuncs argument pins both min and max to that value.
    @$bf{'min_hashfuncs','max_hashfuncs'} = ($arg{hashfuncs}) x 2 if $arg{hashfuncs};
    @$bf{'filterlength','hashfuncs'}      = bfdimensions($bf);    # m and k
    $$bf{filter} = pack("b*", '0' x ($$bf{filterlength}*$$bf{counting_bits}));  # new empty filter
    $$bf{unpack} = $$bf{filterlength} <= 2**16/4 ? "n*"  # /4 alleviates skewing if m just slightly < 2**x
                 : $$bf{filterlength} <= 2**32/4 ? "N*"
                 :                                 "Q*";
    bfadd($bf, @{$arg{keys}}) if $arg{keys};
    return $bf;
}
sub bfaddbf {
my($bf,$bf2)=@_;
my $differror=join"\n",
map "Property $_ differs ($$bf{$_} vs $$bf2{$_})",
grep $$bf{$_} ne $$bf2{$_},
qw/capacity counting_bits adaptive hashfuncs filterlength/; #not error_rate
croak $differror if $differror;
croak "Can not add adaptive bloom filters" if $$bf{adaptive};
my $count=$$bf{key_count}+$$bf2{key_count};
croak "Exceeded filter capacity $$bf{key_count} + $$bf2{key_count} = $count > $$bf{capacity}"
if $count > $$bf{capacity};
$$bf{key_count}+=$$bf2{key_count};
if($$bf{counting_bits}==1){
$$bf{filter} |= $$bf2{filter};
#$$bf{filter} = $$bf{filter} | $$bf2{filter}; #or-ing
}
else {
785378547855785678577858785978607861786278637864786578667867786878697870787178727873787478757876787778787879788078817882
my
$bf
=Storable::retrieve(
@_
);
carp
"Retrieved bloom filter was stored in version $$bf{version}, this is version $VERSION"
if
$$bf
{version}>
$VERSION
;
return
$bf
;
}
# Deep-copy a bloom filter so the clone can be modified independently of
# the original.  (Storable::dclone works but is not the fastest approach.)
sub bfclone {
    return Storable::dclone(@_);
}
# Older, iterative way of finding the filter length (m, in bits) and the
# number of hash functions (k): try every candidate k in
# [min_hashfuncs .. max_hashfuncs] and keep the one yielding the smallest
# filter for the wanted capacity (n) and error rate (p).
# Accepts either a bloom-filter hashref or (capacity, error_rate).
sub bfdimensions_old {
    my($n,$p,$mink,$maxk,$k,$flen,$m)=
         @_==1 ? (@{$_[0]}{'capacity','error_rate','min_hashfuncs','max_hashfuncs'},1)
        :@_==2 ? (@_,1,100,1)
        :        croak "Wrong number of arguments (".@_."), should be 2";
    croak "p ($p) should be > 0 and < 1" if not( 0<$p && $p<1 );
    for my $hashes ($mink..$maxk){
        # Bits needed when using $hashes hash functions.
        my $bits = -1*$hashes*$n/log(1-$p**(1/$hashes));
        if(not defined $flen or $bits < $flen){
            ($flen,$k)=($bits,$hashes);   # best (smallest) filter so far
        }
    }
    $flen=int(1+$flen);   # round up to a whole number of bits
    return ($flen,$k);
}
# Compute optimal bloom-filter dimensions for a wanted capacity (n) and
# error rate (p):
#   m = -n*ln(p)/ln(2)^2   bits in the filter
#   k =  ln(1/p)/ln(2)     hash functions (depends on the error rate only)
# Accepts either a bloom-filter hashref or (capacity, error_rate).
# Returns (m+0.5, k) with k rounded to the nearest integer and clamped to
# [min_hashfuncs .. max_hashfuncs].
sub bfdimensions {
    my($n,$p,$mink,$maxk)=
         @_==1 ? (@{$_[0]}{'capacity','error_rate','min_hashfuncs','max_hashfuncs'})
        :@_==2 ? (@_,1,100)
        :        croak "Wrong number of arguments (".@_."), should be 2";
    my $hashfuncs = log(1/$p)/log(2);        # k
    my $bits      = -$n*log($p)/log(2)**2;   # m
    return ($bits+0.5, min($maxk, max($mink, int($hashfuncs+0.5))));
}
#crontab -e
#01 4,10,16,22 * * * /usr/bin/perl -MAcme::Tools -e'Acme::Tools::_update_currency_file("/var/www/html/currency-rates")' > /dev/null 2>&1
80158016801780188019802080218022802380248025802680278028802980308031803280338034
z2z [-p -k -v -o -1 -2 -3 -4 -5 -6 -7 -8 -9 ] files
Converts (recompresses) files from one compression type to another. For instance from .gz to .bz2
Keeps uid, gid, mode (
chmod
) and mtime.
-p Show a progress meter using the pv program
if
installed
-k Keeps original file
-v Verbose, shows info on degree of compression and file
number
if
more than one file is being converted
-o Overwrites existing result file, otherwise stop
with
error msg
-1 .. -9 Degree of compression, -1 fastest .. -9 best
-e With -t xz (or 2xz) passes -e to xz (-9e = extreme compression)
-L rate With -p. Slow down, ex: -L 200K means 200 kilobytes per second
-D sec With -p. Only turn on progress meter (pv)
after
x seconds
-i sec With -p. Info update rate
-l With -p. Line mode
-I With -p. Show ETA as
time
of arrival as well as
time
left
-
q
With -p. Quiet. Useful
with
-L to limit rate, but
no
output
t/03_bloomfilter.t view on Meta::CPAN
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163# perl Makefile.PL;make;perl -Iblib/lib t/3_bloomfilter.t
# perl Makefile.PL;make;ATDEBUG=1 perl -Iblib/lib t/3_bloomfilter.t
# time ( perl Makefile.PL;make;ATDEBUG=1 perl -Iblib/lib t/03_bloomfilter.t )
# perl Makefile.PL;make;perl -Iblib/lib t/03_bloomfilter.t
# Build a plain bloom filter and fill it to capacity with the even numbers
# 0, 2, 4, ... so that the odd numbers can later probe the false-positive rate.
my $error_rate = 0.02;
my $capacity   = 10000;
my $bf         = bfinit($error_rate, $capacity);
my $t          = time_fp();
bfadd($bf, map $_*2, 0..$capacity-1);
#deb "Adds pr sec: ".int($capacity/(time_fp()-$t))."\n";
#bfadd($bf, $_) for map $_*2,0..$capacity-1;
deb serialize({
%$bf
,
filter
=>
''
},
'bf'
,
''
,1);
deb
"Filter has capacity $$bf{capacity}\n"
;
deb
"Filter has $$bf{key_count} keys\n"
;
deb
"Filter has "
.
length
(
$$bf
{filter}).
" bytes\n"
;
deb
"Filter has $$bf{filterlength} bits of which "
.bfsum(
$bf
).
" ("
.
int
(100
*bfsum
(
$bf
)/
$$bf
{filterlength}).
"%) are on\n"
;
deb
"Filter has $$bf{hashfuncs} hash functions\n"
;
my
@c
=bfcheck(
$bf
,0..
$capacity
*2);
#test next ok: $c[2000]=0;
#deb "$_->".bfcheck($bf,$_)."\n" for 0..200;
my
$sum
;
$sum
+=
$c
[
$_
*2+1 ],
for
0..
$capacity
-1;
deb
"Filter has $sum false positives\n"
;
ok(!(
grep
$c
[
$_
]!=1,
map
$_
*2, 0..
$capacity
-1),
'no false negatives'
);
ok(
$sum
>=
$capacity
*$error_rate
*80/100
&&
$sum
<=
$capacity
*$error_rate
*120/100
,
sprintf
"real error rate (%.6f) vs wanted error_rate ($error_rate) within ok ratio 80-120%% (%d%%)"
,
$sum
/
$capacity
,
100
*$sum
/(
$capacity
*$error_rate
)
);
eval
{bfinit(
a
=>1,
b
=>2)};
#deb $@;
ok($@=~/Not ok param to bfinit: a, b\b/,
'param check'
);
eval
{bfinit(
capacity
=>10,
keys
=>[1..11])};
ok($@=~/Exceeded filter capacity 10/,
'capacity check'
);
eval
{bfinit(
error_rate
=>0.0,
capacity
=>1e3)};ok($@=~/\QError rate (0) should be larger than 0 and smaller than 1\E/,
'error_rate check1'
);
eval
{bfinit(
error_rate
=>1.0,
capacity
=>1e3)};ok($@=~/\QError rate (1) should be larger than 0 and smaller than 1\E/,
'error_rate check2'
);
#deb "<<$@>>\n";
#---------- OO
my
$bfoo
=new Acme::Tools::BloomFilter(0.1,1000);
$bfoo
->add(1..500);
$bfoo
->add([501..1000]);
ok(0+
grep
(
$_
,
$bfoo
->check(1..1000)) == 1000,
'oo ok1'
);
ok(
$bfoo
->clone()->
grep
([1..1000]) == 1000,
'oo ok2'
);
ok(0+
grep
(
$_
,
$bfoo
->check(1001..2000)) < 150,
'oo ok3'
);
#---------- counting bloom filter
my
(
$er
,
$cap
,
$cb
)=(0.1,1000,4);
my
$cbf
=bfinit(
error_rate
=>
$er
,
capacity
=>
$cap
,
counting_bits
=>
$cb
,
keys
=>[1..
$cap
]);
ok(0+
grep
(
$_
,bfcheck(
$cbf
,1..
$cap
)) ==
$cap
,
'cbf no false negatives'
);
ok(bfgrepnot(
$cbf
,[1..
$cap
]) == 0,
'cbf grepnot'
);
my
$errs
=
grep
(
$_
,bfcheck(
$cbf
,
$cap
+1..
$cap
*2));
deb
"Errs $errs\n"
;
ok(between(
$errs
/
$cap
/
$er
,0.7,1.3),
'error rate rating '
.(
$errs
/
$cap
/
$er
).
' within ok range 0.7-1.3'
);
#---------- see doc about this example:
#do{
# my $bf=bfinit( error_rate=>0.00001, capacity=>4e6, counting_bits=>4 );
# bfadd($bf,[1000*$_+1 .. 1000*($_+1)]),deb"." for 0..4000-1; # adding 4 million keys one thousand at a time
# my %c; $c{vec($$bf{filter},$_,$$bf{counting_bits})}++ for 0..$$bf{filterlength}-1;
# deb sprintf("%8d counters is %2d\n",$c{$_},$_) for sort{$a<=>$b}keys%c;
#};
my
%c
;
$c
{
vec
(
$$cbf
{filter},
$_
,
$cb
)}++
for
0..
$$cbf
{filterlength}-1;
ok(sum(
map
$c
{
$_
}
*$_
,
keys
%c
)/
$$cbf
{key_count} ==
$$cbf
{hashfuncs},
'counter check'
);
#deb sprintf("%8d counters is %2d\n",$c{$_},$_) for sort{$a<=>$b}keys%c;
#---------- counting bloom filter, test delete
do
{
my
(
$er
,
$cap
,
$cb
)=(0.1,500,4);
my
$bf
=bfinit(
error_rate
=>
$er
,
capacity
=>
$cap
*2,
counting_bits
=>
$cb
,
keys
=>[1..
$cap
*2]);
bfdelete(
$bf
,
$cap
+1 ..
$cap
*1.5);
bfdelete(
$bf
,[
$cap
*1.5+1 ..
$cap
*2]);
ok(bfgrep(
$bf
,[1..
$cap
]) ==
$cap
,
'cbf, delete test, no false negatives'
);
my
$err
=bfgrep(
$bf
,[
$cap
+1..
$cap
*2]);
deb
"Err $err\n"
;
ok(
$err
/
$cap
/
$er
<1.3,
"cbf, delete test, after delete ($err)"
);
my
%c
=();
$c
{
vec
(
$$bf
{filter},
$_
,
$cb
)}++
for
0..
$$bf
{filterlength}-1;
ok(sum(
map
$c
{
$_
}
*$_
,
keys
%c
)/
$$bf
{key_count} ==
$$bf
{hashfuncs},
'cbf, delete test, counter check after delete'
);
eval
{ok(bfdelete(
$bf
,
'x'
))};ok($@=~/Cannot
delete
a non-existing key x/,
'delete non-existing key'
);
};
#---------- test filter lengths
my
$r
;
ok(between(
$r
=
length
(bfinit(
counting_bits
=>
$_
,
error_rate
=>0.01,
capacity
=>100)->{filter}) /
length
(bfinit(
counting_bits
=>1,
error_rate
=>0.01,
capacity
=>100)->{filter}) /
$_
, 0.95, 1.05),
"filter length ($r), cb $_"
)
for
qw/2 4 8 16/
;
eval
{bfinit(
counting_bits
=>2,
error_rate
=>0.1,
capacity
=>1000,
keys
=>[1..1000])};ok($@=~/Too many overflows/,
'overflow check'
);
#----------storing and retrieving
my
$tmp
=tmp();
if
(-w
$tmp
){
my
$file
=
"$tmp/cbf.bf"
;
bfstore(
$cbf
,
$file
);
deb
"Stored size of $file: "
.(-s
$file
).
" bytes\n"
;
my
$cbfr
=bfretrieve(
$file
);
ok(bfgrep(
$cbfr
,[1..
$cap
]) ==
$cap
,
'store+retrieve: cbf no false negatives'
);
$errs
=bfgrep(
$cbf
,[
$cap
+1..
$cap
*2]);
#deb "Errs $errs\n";
ok(between(
$errs
/
$cap
/
$er
,0.7,1.3),
'store+retrieve: error rate rating '
.(
$errs
/
$cap
/
$er
).
' within ok range 0.7-1.3'
);
my
$bf
=Acme::Tools::BloomFilter->new(
$file
);
ok(
$$bf
{key_count}==
$cap
,
'store+retrieve, oo'
);
unlink
$file
;
}
else
{
ok(1,
'skipped, not linux'
)
for
1..3;
}
#----------adaptive bloom filter, not implemented/tested, see http://intertrack.naist.jp/Matsumoto_IEICE-ED200805.pdf
# $cap=100;
# $bf=bfinit(adaptive=>0,error_rate=>0.001,capacity=>$cap,keys=>[1..$cap]);
# @c=bfcheck($bf,[1..$cap]);
# %c=(); $c{$_}++ for @c;
# deb "Filter has $$bf{filterlength} bits of which ".bfsum($bf)." (".int(100*bfsum($bf)/$$bf{filterlength})."%) are on\n";
# deb "Filter has ".int(1+$$bf{filterlength}/8)." bytes (".sprintf("%.1f",int(1+$$bf{filterlength}/8)/1024)." kb)\n";
# deb "Filter has $$bf{hashfuncs} hash functions\n";
# deb "Number of $_: $c{$_}\n" for sort{$a<=>$b}keys%c;
# deb "Sum bits ".sum(map $$bf{hashfuncs}+$_-1,bfcheck($bf,1..$cap))."\n";
# deb "False negatives: ".grep(!$_,@c)."\n";
# deb "Error rate: ".(($errs=grep($_,bfcheck($bf,$cap+1..$cap*2)))/$cap)."\n";
# deb "Errors: $errs\n";
#---------- bfaddbf, adding two bloom filters
do
{
my
$cap
=100;
my
$bf1
=bfinit(
error_rate
=>0.01,
capacity
=>
$cap
,
keys
=>[1..
$cap
/2]);
my
$bf2
=bfinit(
error_rate
=>0.01,
capacity
=>
$cap
,
keys
=>[
$cap
/2+1..
$cap
]);
deb
"bf1 key_count: $$bf1{key_count}, bf1 ones: "
.bfsum(
$bf1
).
"\n"
;
deb
"bf2 key_count: $$bf2{key_count}, bf2 ones: "
.bfsum(
$bf2
).
"\n"
;
bfaddbf(
$bf1
,
$bf2
);
deb
"bf1 key_count: $$bf1{key_count}, bf1 ones: "
.bfsum(
$bf1
).
"\n"
;
my
@found
=bfgrep(
$bf1
,[1..
$cap
]);
ok(
@found
==
$cap
,
"bfaddbf(), found "
.
@found
.
" of $cap"
);
};
do
{
my
$cap
=1000;
my
$er
=0.1;
my
$bf1
=bfinit(
counting_bits
=>4,
error_rate
=>
$er
,
capacity
=>
$cap
,
keys
=>[1..
$cap
/2]);
my
$bf2
=bfinit(
counting_bits
=>4,
error_rate
=>
$er
,
capacity
=>
$cap
,
keys
=>[
$cap
/2+1..
$cap
]);
deb
"bf1 key_count: $$bf1{key_count}, bf1 sum: "
.bfsum(
$bf1
).
"\n"
;
deb
"bf2 key_count: $$bf2{key_count}, bf2 sum: "
.bfsum(
$bf2
).
"\n"
;
deb serialize(
$$bf1
{overflow},
'bf1overflow'
);
deb serialize(
$$bf2
{overflow},
'bf2overflow'
);
bfaddbf(
$bf1
,
$bf2
);
deb
"bf1 key_count: $$bf1{key_count}, bf1 sum: "
.bfsum(
$bf1
).
"\n"
;
deb serialize(
$$bf1
{overflow},
'bf1overflow'
);
my
@found
=bfgrep(
$bf1
,[1..
$cap
]);
ok(
@found
==
$cap
,
"bfaddbf(), found "
.
@found
.
" of $cap"
);
my
$errs
=bfgrep(
$bf1
,[
$cap
+1..
$cap
*2]);
deb
"erate: "
.(
$errs
/
$cap
).
"\n"
;
my
$p
=100
*$errs
/
$cap
/
$er
;
ok(between(
$p
,70,130),
"error rate "
.(
$errs
/
$cap
).
" within 70%-130% of $er ($p%)"
);
# deb "Error rate: ".(($errs=grep($_,bfcheck($bf1,$cap+1..$cap*2)))/$cap)."\n";
};
t/test_fork_bloom.pl view on Meta::CPAN
123456789101112131415161718192021222324252627282930#!/usr/bin/perl
use
Acme::Tools;
my
$jobs
=16;
my
$cap
=1000000;
my
$error_rate
=0.01;
my
(
$pid
,
@pid
);
for
my
$job
(0..
$jobs
-1){
unlink
"/tmp/bf$job.bf"
;
next
if
fork
();
my
$t
=time_fp();
my
@keys
=
grep
$_
%$jobs
==
$job
,1..
$cap
;
#my @keys=map rand(), 1..$cap/$jobs;
my
$bf
=bfinit(
error_rate
=>
$error_rate
,
capacity
=>
$cap
,
keys
=>\
@keys
);
bfstore(
$bf
,
"/tmp/bf$job.bf"
);
"job $job finished, "
.(time_fp()-
$t
).
" sec\n"
;
exit
;
}
1
while
wait
() != -1;
"building finished\n"
;
my
$bf
=bfinit(
error_rate
=>
$error_rate
,
capacity
=>
$cap
);
for
my
$job
(0..
$jobs
-1){
"Adding bloom filter $job..."
;
my
$t
=time_fp();
bfaddbf(
$bf
,bfretrieve(
"/tmp/bf$job.bf"
));
"took "
.(time_fp()-
$t
).
" sec\n"
;
}
int
(
$$bf
{filterlength}/8),
" bytes\n"
;
printf
"%.1f%%\n"
,100
*bfsum
(
$bf
)/
$$bf
{filterlength};
"keys: $$bf{key_count}\n"
;
"found: "
.bfgrep(
$bf
,[1..
$cap
/10]).
"\n"
;
( run in 0.265 second using v1.01-cache-2.11-cpan-bb97c1e446a )