Acme-Tools

 view release on metacpan or  search on metacpan

Tools.pm  view on Meta::CPAN

1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
           TTD => 1.150931,             #trinidad/tobago dollar
           TWD => 0.267321,             #taiwan dollar
           USD => 7.780201,             #us dollar
          '$'  => 7.780201,             #us doller, symbol
           VEF => 0.778994,             #venezuelan bolivares fuertes
           XBT => 84864.0984477,        #synonym for BTC
           XRP => 8.96808208868,        #ripple
           ZAR => 0.667117,             #south africa rand
          },
          numbers =>{
            dec=>1,hex=>1,bin=>1,oct=>1,roman=>1,      des=>1,#des: spelling error in v0.15-0.16
            dusin=>1,dozen=>1,doz=>1,dz=>1,gross=>144,gr=>144,gro=>144,great_gross=>12*144,small_gross=>10*12,
          }
        );
our $conv_prepare_time=0;
our $conv_prepare_money_time=0;
sub conv_prepare {
  my %b    =(da  =>1e+1, h    =>1e+2, k    =>1e+3, M     =>1e+6,          G   =>1e+9, T   =>1e+12, P    =>1e+15, E   =>1e+18, Z    =>1e+21, Y    =>1e+24, H    =>1e+27);
  my %big  =(deca=>1e+1, hecto=>1e+2, kilo =>1e+3, mega  =>1e+6,          giga=>1e+9, tera=>1e+12, peta =>1e+15, exa =>1e+18, zetta=>1e+21, yotta=>1e+24, hella=>1e+27);
  my %s    =(d   =>1e-1, c    =>1e-2, m    =>1e-3,'µ'    =>1e-6, u=>1e-6, n   =>1e-9, p   =>1e-12, f    =>1e-15, a   =>1e-18, z    =>1e-21, y    =>1e-24);
  my %small=(deci=>1e-1, centi=>1e-2, milli=>1e-3, micro =>1e-6,          nano=>1e-9, pico=>1e-12, femto=>1e-15, atto=>1e-18, zepto=>1e-21, yocto=>1e-24);

Tools.pm  view on Meta::CPAN

7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
Bloom filters can be used to check whether an element (a string) is a
member of a large set using much less memory or disk space than other
data structures. Trading speed and accuracy for memory usage. While
risking false positives, Bloom filters have a very strong space
advantage over other data structures for representing sets.
 
In the example below, a set of 100000 phone numbers (or any string of
any length) can be "stored" in just 91230 bytes if you accept that you
can only check the data structure for existence of a string and accept
false positives with an error rate of 0.03 (that is three percent, error
rates are given in numbers larger than 0 and smaller than 1).
 
You can not retrieve the strings in the set without using "brute
force" methods and even then you would get slightly more strings than
you put in because of the error rate inaccuracy.
 
Bloom Filters have many uses.
 
 
See also: L<Bloom::Filter>
 
=head2 bfinit
 
Initialize a new Bloom Filter:
 
  my $bf = bfinit( error_rate=>0.01, capacity=>100000 );
 
The same:
 
  my $bf = bfinit( 0.01, 100000 );
 
since two arguments is interpreted as error_rate and capacity accordingly.
 
 
=head2 bfadd
 
  bfadd($bf, $_) for @phone_numbers;   # Adding strings one at a time
 
  bfadd($bf, @phone_numbers);          # ...or all at once (faster)
 
Returns 1 on success. Dies (croaks) if more strings than capacity is added.

Tools.pm  view on Meta::CPAN

7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
@not_found = grep !bfcheck($bf,$_), @keys);  # same but slower
 
=head2 bfdelete
 
Deletes from a counting bloom filter.
 
To enable deleting be sure to initialize the bloom filter with the
numeric C<counting_bits> argument. The number of bits could be 2 or 3*)
for small filters with a small capacity (a small number of keys), but
setting the number to 4 ensures that even very large filters with very
small error rates would not overflow.
 
*) Acme::Tools do not currently support C<< counting_bits => 3 >> so 4
and 8 are the only practical alternatives where 8 is almost always overkill.
 
 my $bf=bfinit(
   error_rate    => 0.001,
   capacity      => 10000000,
   counting_bits => 4              # power of 2, that is 2, 4, 8, 16 or 32
 );
 bfadd(   $bf, @unique_phone_numbers);
 bfdelete($bf, @unique_phone_numbers);
 
Example: examine the frequency of the counters with 4 bit counters and 4 million keys:
 
 my $bf=bfinit( error_rate=>0.001, capacity=>4e6, counting_bits=>4 );
 bfadd($bf,[1e3*$_+1 .. 1e3*($_+1)]) for 0..4000-1;  # adding 4 million keys one thousand at a time
 my %c; $c{vec($$bf{filter},$_,$$bf{counting_bits})}++ for 0..$$bf{filterlength}-1;
 printf "%8d counters = %d\n",$c{$_},$_ for sort{$a<=>$b}keys%c;
 
The output:
 
 28689562 counters = 0
 19947673 counters = 1
  6941082 counters = 2
  1608250 counters = 3
   280107 counters = 4
    38859 counters = 5
     4533 counters = 6
      445 counters = 7
       46 counters = 8
        1 counters = 9
 
Even after the error_rate is changed from 0.001 to a percent of that, 0.00001, the limit of 16 (4 bits) is still far away:
 
 47162242 counters = 0
 33457237 counters = 1
 11865217 counters = 2
  2804447 counters = 3
   497308 counters = 4
    70608 counters = 5
     8359 counters = 6
      858 counters = 7
       65 counters = 8
        4 counters = 9
 
In algorithmic terms the number of bits needed is C<ln of ln of n>.  Thats why 4 bits (counters up
to 15) is "always" good enough except for extremely large capasities or extremely small error rates.
(Except when adding the same key many times, which should be avoided, and Acme::Tools::bfadd do not
check for that, perhaps in future versions).
 
Bloom filters of the counting type are not very space efficient: The tables above shows that 84%-85%
of the counters are 0 or 1. This means most bits are zero-bits. This doesn't have to be a problem if
a counting bloom filter is used to be sent over slow networks because they are very compressable by
common compression tools like I<gzip> or L<Compress::Zlib> and such.
 
Deletion of non-existing keys makes C<bfdelete> die (croak).

Tools.pm  view on Meta::CPAN

7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
Croaks (dies) on deleting a non-existing key or deleting from an previouly overflown counter in a counting bloom filter.
 
=head2 bfaddbf
 
Adds another bloom filter to a bloom filter.
 
Bloom filters has the proberty that bit-wise I<OR>-ing the bit-filters
of two filters with the same capacity and the same number and type of
hash functions, adds the filters:
 
  my $bf1=bfinit(error_rate=>0.01,capacity=>$cap,keys=>[1..500]);
  my $bf2=bfinit(error_rate=>0.01,capacity=>$cap,keys=>[501..1000]);
 
  bfaddbf($bf1,$bf2);
 
  print "Yes!" if bfgrep($bf1, 1..1000) == 1000;
 
Prints yes since C<bfgrep> now returns an array of all the 1000 elements.
 
Croaks if the filters are of different dimensions.
 
Works for counting bloom filters as well (C<< counting_bits=>4 >> e.g.)

Tools.pm  view on Meta::CPAN

7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
Returns the number of 1's in the filter.
 
 my $percent=100*bfsum($bf)/$$bf{filterlength};
 printf "The filter is %.1f%% filled\n",$percent; #prints 50.0% or so if filled to capacity
 
Sums the counters for counting bloom filters (much slower than for non counting).
 
=head2 bfdimensions
 
Input, two numeric arguments: Capacity and error_rate.
 
Outputs an array of two numbers: m and k.
 
  m = - n * log(p) / log(2)**2   # n = capacity, m = bits in filter (divide by 8 to get bytes)
  k = log(1/p) / log(2)          # p = error_rate, uses perls internal log() with base e (2.718)
 
...that is: m = the best number of bits in the filter and k = the best
number of hash functions optimized for the given capacity (n) and
error_rate (p). Note that k is a dependent only of the error_rate.  At
about two percent error rate the bloom filter needs just the same
number of bytes as the number of keys.
 
 Storage (bytes):
 Capacity      Error-rate  Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate
               0.000000001 0.00000001 0.0000001  0.000001   0.00001    0.0001     0.001      0.01       0.02141585 0.1        0.5        0.99
 ------------- ----------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
            10 54.48       48.49      42.5       36.51      30.52      24.53      18.53      12.54      10.56      6.553      2.366      0.5886
           100 539.7       479.8      419.9      360        300.1      240.2      180.3      120.4      100.6      60.47      18.6       0.824
          1000 5392        4793       4194       3595       2996       2397       1798       1199       1001       599.6      180.9      3.177
         10000 5.392e+04   4.793e+04  4.194e+04  3.594e+04  2.995e+04  2.396e+04  1.797e+04  1.198e+04  1e+04      5991       1804       26.71

Tools.pm  view on Meta::CPAN

7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
 
See also Scaleable Bloom Filters: L<http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf> (not implemented in Acme::Tools)
 
 
=cut
 
sub bfinit {
  return bfretrieve(@_)                             if @_==1;
  return bfinit(error_rate=>$_[0], capacity=>$_[1]) if @_==2 and 0<$_[0] and $_[0]<1 and $_[1]>1;
  return bfinit(error_rate=>$_[1], capacity=>$_[0]) if @_==2 and 0<$_[1] and $_[1]<1 and $_[0]>1;
  require Digest::MD5;
  @_%2&&croak "Arguments should be a hash of equal number of keys and values";
  my %arg=@_;
  my @ok_param=qw/error_rate capacity min_hashfuncs max_hashfuncs hashfuncs counting_bits adaptive keys/;
  my @not_ok=sort(grep!in($_,@ok_param),keys%arg);
  croak "Not ok param to bfinit: ".join(", ",@not_ok) if @not_ok;
  croak "Not an arrayref in keys-param" if exists $arg{keys} and ref($arg{keys}) ne 'ARRAY';
  croak "Not implemented counting_bits=$arg{counting_bits}, should be 2, 4, 8, 16 or 32" if !in(nvl($arg{counting_bits},1),1,2,4,8,16,32);
  croak "An bloom filters here can not be in both adaptive and counting_bits modes" if $arg{adaptive} and $arg{counting_bits}>1;
  my $bf={error_rate    => 0.001,  #default p
          capacity      => 100000, #default n
          min_hashfuncs => 1,
          max_hashfuncs => 100,
          counting_bits => 1,      #default: not counting filter
          adaptive      => 0,
          %arg,                    #arguments
          key_count     => 0,
          overflow      => {},
          version       => $Acme::Tools::VERSION,
         };
  croak "Error rate ($$bf{error_rate}) should be larger than 0 and smaller than 1" if $$bf{error_rate}<=0 or $$bf{error_rate}>=1;
  @$bf{'min_hashfuncs','max_hashfuncs'}=(map$arg{hashfuncs},1..2) if $arg{hashfuncs};
  @$bf{'filterlength','hashfuncs'}=bfdimensions($bf); #m and k
  $$bf{filter}=pack("b*", '0' x ($$bf{filterlength}*$$bf{counting_bits}) ); #hm x   new empty filter
  $$bf{unpack}= $$bf{filterlength}<=2**16/4 ? "n*" # /4 alleviates skewing if m just slightly < 2**x
               :$$bf{filterlength}<=2**32/4 ? "N*"
               :                              "Q*";
  bfadd($bf,@{$arg{keys}}) if $arg{keys};
  return $bf;
}
sub bfaddbf {
  my($bf,$bf2)=@_;
  my $differror=join"\n",
    map "Property $_ differs ($$bf{$_} vs $$bf2{$_})",
    grep $$bf{$_} ne $$bf2{$_},
    qw/capacity counting_bits adaptive hashfuncs filterlength/; #not error_rate
  croak $differror if $differror;
  croak "Can not add adaptive bloom filters" if $$bf{adaptive};
  my $count=$$bf{key_count}+$$bf2{key_count};
  croak "Exceeded filter capacity $$bf{key_count} + $$bf2{key_count} = $count > $$bf{capacity}"
    if $count > $$bf{capacity};
  $$bf{key_count}+=$$bf2{key_count};
  if($$bf{counting_bits}==1){
    $$bf{filter} |= $$bf2{filter};
    #$$bf{filter} = $$bf{filter} | $$bf2{filter}; #or-ing
  }
  else {

Tools.pm  view on Meta::CPAN

7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
  my $bf=Storable::retrieve(@_);
  carp  "Retrieved bloom filter was stored in version $$bf{version}, this is version $VERSION" if $$bf{version}>$VERSION;
  return $bf;
}
sub bfclone {
  require Storable;
  return Storable::dclone(@_); #could be faster
}
sub bfdimensions_old {
  my($n,$p,$mink,$maxk, $k,$flen,$m)=
    @_==1 ? (@{$_[0]}{'capacity','error_rate','min_hashfuncs','max_hashfuncs'},1)
   :@_==2 ? (@_,1,100,1)
          : croak "Wrong number of arguments (".@_."), should be 2";
  croak "p ($p) should be > 0 and < 1" if not ( 0<$p && $p<1 );
  $m=-1*$_*$n/log(1-$p**(1/$_)) and (!defined $flen or $m<$flen) and ($flen,$k)=($m,$_) for $mink..$maxk;
  $flen = int(1+$flen);
  return ($flen,$k);
}
sub bfdimensions {
  my($n,$p,$mink,$maxk)=
    @_==1 ? (@{$_[0]}{'capacity','error_rate','min_hashfuncs','max_hashfuncs'})
   :@_==2 ? (@_,1,100)
          : croak "Wrong number of arguments (".@_."), should be 2";
  my $k=log(1/$p)/log(2);           # k hash funcs
  my $m=-$n*log($p)/log(2)**2;      # m bits in filter
  return ($m+0.5,min($maxk,max($mink,int($k+0.5))));
}
 
#crontab -e
#01 4,10,16,22 * * * /usr/bin/perl -MAcme::Tools -e'Acme::Tools::_update_currency_file("/var/www/html/currency-rates")' > /dev/null 2>&1

Tools.pm  view on Meta::CPAN

8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
 z2z [-p -k -v -o -1 -2 -3 -4 -5 -6 -7 -8 -9 ] files
 
Converts (recompresses) files from one compression type to another. For instance from .gz to .bz2
Keeps uid, gid, mode (chmod) and mtime.
 
 -p              Show a progress meter using the pv program if installed
 -k              Keeps original file
 -v              Verbose, shows info on degree of compression and file
                 number if more than one file is being converted
 -o              Overwrites existing result file, otherwise stop with error msg
 -1 .. -9        Degree of compression, -1 fastest .. -9 best
 -e              With -t xz (or 2xz) passes -e to xz (-9e = extreme compression)
 
 -L rate         With -p. Slow down, ex:  -L 200K  means 200 kilobytes per second
 -D sec          With -p. Only turn on progress meter (pv) after x seconds
 -i sec          With -p. Info update rate
 -l              With -p. Line mode
 -I              With -p. Show ETA as time of arrival as well as time left
 -            With -p. Quiet. Useful with -L to limit rate, but no output

t/03_bloomfilter.t  view on Meta::CPAN

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# perl Makefile.PL;make;perl -Iblib/lib t/3_bloomfilter.t
# perl Makefile.PL;make;ATDEBUG=1 perl -Iblib/lib t/3_bloomfilter.t
# time ( perl Makefile.PL;make;ATDEBUG=1 perl -Iblib/lib t/03_bloomfilter.t )
# perl Makefile.PL;make;perl -Iblib/lib t/03_bloomfilter.t
 
use lib '.'; BEGIN{require 't/common.pl'}
use Test::More tests => 28;
 
my $error_rate=0.02;
my $capacity=10000;
my $bf=bfinit($error_rate, $capacity);
my $t=time_fp();
bfadd($bf, map $_*2,0..$capacity-1);
#deb "Adds pr sec: ".int($capacity/(time_fp()-$t))."\n";
#bfadd($bf, $_) for map $_*2,0..$capacity-1;
 
deb serialize({%$bf,filter=>''},'bf','',1);
deb "Filter has capacity $$bf{capacity}\n";
deb "Filter has $$bf{key_count} keys\n";
deb "Filter has ".length($$bf{filter})." bytes\n";
deb "Filter has $$bf{filterlength} bits of which ".bfsum($bf)." (".int(100*bfsum($bf)/$$bf{filterlength})."%) are on\n";
deb "Filter has $$bf{hashfuncs} hash functions\n";
my @c=bfcheck($bf,0..$capacity*2); #test next ok: $c[2000]=0;
#deb "$_->".bfcheck($bf,$_)."\n" for 0..200;
 
my $sum; $sum+=$c[ $_*2+1 ],  for 0..$capacity-1;
deb "Filter has $sum false positives\n";
ok(!(grep $c[$_]!=1, map $_*2, 0..$capacity-1), 'no false negatives');
ok(
     $sum >= $capacity*$error_rate*80/100
  && $sum <= $capacity*$error_rate*120/100
  , sprintf "real error rate (%.6f) vs wanted error_rate ($error_rate) within ok ratio 80-120%% (%d%%)",
            $sum/$capacity,
            100*$sum/($capacity*$error_rate)
);
eval{bfinit(a=>1,b=>2)};
#deb $@;
ok($@=~/Not ok param to bfinit: a, b\b/,'param check');
 
eval{bfinit(capacity=>10,keys=>[1..11])};
ok($@=~/Exceeded filter capacity 10/,'capacity check');
 
eval{bfinit(error_rate=>0.0,capacity=>1e3)};ok($@=~/\QError rate (0) should be larger than 0 and smaller than 1\E/,'error_rate check1');
eval{bfinit(error_rate=>1.0,capacity=>1e3)};ok($@=~/\QError rate (1) should be larger than 0 and smaller than 1\E/,'error_rate check2');
#deb "<<$@>>\n";
 
#---------- OO
my $bfoo=new Acme::Tools::BloomFilter(0.1,1000);
$bfoo->add(1..500);
$bfoo->add([501..1000]);
ok(0+grep($_,$bfoo->check(1..1000)) == 1000, 'oo ok1');
ok($bfoo->clone()->grep([1..1000]) == 1000, 'oo ok2');
ok(0+grep($_,$bfoo->check(1001..2000)) < 150, 'oo ok3');
 
#---------- counting bloom filter
my($er,$cap,$cb)=(0.1,1000,4);
my $cbf=bfinit(error_rate=>$er,capacity=>$cap,counting_bits=>$cb,keys=>[1..$cap]);
ok(0+grep($_,bfcheck($cbf,1..$cap)) == $cap, 'cbf no false negatives');
ok(bfgrepnot($cbf,[1..$cap]) == 0, 'cbf grepnot');
my $errs=grep($_,bfcheck($cbf,$cap+1..$cap*2));
deb "Errs $errs\n";
ok(between($errs/$cap/$er,0.7,1.3),'error rate rating '.($errs/$cap/$er).' within ok range 0.7-1.3');
 
#---------- see doc about this example:
#do{
# my $bf=bfinit( error_rate=>0.00001, capacity=>4e6, counting_bits=>4 );
# bfadd($bf,[1000*$_+1 .. 1000*($_+1)]),deb"." for 0..4000-1;  # adding 4 million keys one thousand at a time
# my %c; $c{vec($$bf{filter},$_,$$bf{counting_bits})}++ for 0..$$bf{filterlength}-1;
# deb sprintf("%8d counters is %2d\n",$c{$_},$_) for sort{$a<=>$b}keys%c;
#};
 
my %c; $c{vec($$cbf{filter},$_,$cb)}++ for 0..$$cbf{filterlength}-1;
ok(sum(map$c{$_}*$_,keys%c)/$$cbf{key_count} == $$cbf{hashfuncs}, 'counter check');
#deb sprintf("%8d counters is %2d\n",$c{$_},$_) for sort{$a<=>$b}keys%c;
 
#---------- counting bloom filter, test delete
do{
  my($er,$cap,$cb)=(0.1,500,4);
  my $bf=bfinit(error_rate=>$er,capacity=>$cap*2,counting_bits=>$cb,keys=>[1..$cap*2]);
  bfdelete($bf, $cap+1 .. $cap*1.5);
  bfdelete($bf,[$cap*1.5+1 .. $cap*2]);
  ok(bfgrep($bf,[1..$cap]) == $cap, 'cbf, delete test, no false negatives');
  my $err=bfgrep($bf,[$cap+1..$cap*2]);
  deb "Err $err\n";
  ok($err/$cap/$er<1.3,"cbf, delete test, after delete ($err)");
  my %c=(); $c{vec($$bf{filter},$_,$cb)}++ for 0..$$bf{filterlength}-1;
  ok(sum(map$c{$_}*$_,keys%c)/$$bf{key_count} == $$bf{hashfuncs}, 'cbf, delete test, counter check after delete');
  eval{ok(bfdelete($bf,'x'))};ok($@=~/Cannot delete a non-existing key x/,'delete non-existing key');
};
 
#---------- test filter lengths
my $r;
ok(between($r=
length(bfinit(counting_bits=>$_,error_rate=>0.01,capacity=>100)->{filter}) /
length(bfinit(counting_bits=>1, error_rate=>0.01,capacity=>100)->{filter}) / $_, 0.95, 1.05), "filter length ($r), cb $_") for qw/2 4 8 16/;
 
eval{bfinit(counting_bits=>2,error_rate=>0.1,capacity=>1000,keys=>[1..1000])};ok($@=~/Too many overflows/,'overflow check');
 
#----------storing and retrieving
my $tmp=tmp();
if(-w$tmp){
  my $file="$tmp/cbf.bf";
  bfstore($cbf,$file);
  deb "Stored size of $file: ".(-s$file)." bytes\n";
  my $cbfr=bfretrieve($file);
  ok(bfgrep($cbfr,[1..$cap]) == $cap, 'store+retrieve: cbf no false negatives');
  $errs=bfgrep($cbf,[$cap+1..$cap*2]);
  #deb "Errs $errs\n";
  ok(between($errs/$cap/$er,0.7,1.3),'store+retrieve: error rate rating '.($errs/$cap/$er).' within ok range 0.7-1.3');
  my $bf=Acme::Tools::BloomFilter->new($file);
  ok($$bf{key_count}==$cap,'store+retrieve, oo');
  unlink $file;
}
else{
  ok(1,'skipped, not linux') for 1..3;
}
 
#----------adaptive bloom filter, not implemented/tested, see http://intertrack.naist.jp/Matsumoto_IEICE-ED200805.pdf
# $cap=100;
# $bf=bfinit(adaptive=>0,error_rate=>0.001,capacity=>$cap,keys=>[1..$cap]);
# @c=bfcheck($bf,[1..$cap]);
# %c=(); $c{$_}++ for @c;
# deb "Filter has $$bf{filterlength} bits of which ".bfsum($bf)." (".int(100*bfsum($bf)/$$bf{filterlength})."%) are on\n";
# deb "Filter has ".int(1+$$bf{filterlength}/8)." bytes (".sprintf("%.1f",int(1+$$bf{filterlength}/8)/1024)." kb)\n";
# deb "Filter has $$bf{hashfuncs} hash functions\n";
# deb "Number of $_: $c{$_}\n" for sort{$a<=>$b}keys%c;
# deb "Sum bits ".sum(map $$bf{hashfuncs}+$_-1,bfcheck($bf,1..$cap))."\n";
# deb "False negatives: ".grep(!$_,@c)."\n";
# deb "Error rate: ".(($errs=grep($_,bfcheck($bf,$cap+1..$cap*2)))/$cap)."\n";
# deb "Errors: $errs\n";
 
#---------- bfaddbf, adding two bloom filters
do{
  my $cap=100;
  my $bf1=bfinit(error_rate=>0.01,capacity=>$cap,keys=>[1..$cap/2]);
  my $bf2=bfinit(error_rate=>0.01,capacity=>$cap,keys=>[$cap/2+1..$cap]);
  deb "bf1 key_count: $$bf1{key_count}, bf1 ones: ".bfsum($bf1)."\n";
  deb "bf2 key_count: $$bf2{key_count}, bf2 ones: ".bfsum($bf2)."\n";
  bfaddbf($bf1,$bf2);
  deb "bf1 key_count: $$bf1{key_count}, bf1 ones: ".bfsum($bf1)."\n";
  my @found=bfgrep($bf1,[1..$cap]);
  ok(@found==$cap,"bfaddbf(), found ".@found." of $cap");
};
do{
  my $cap=1000;
  my $er=0.1;
  my $bf1=bfinit(counting_bits=>4,error_rate=>$er,capacity=>$cap,keys=>[1..$cap/2]);
  my $bf2=bfinit(counting_bits=>4,error_rate=>$er,capacity=>$cap,keys=>[$cap/2+1..$cap]);
  deb "bf1 key_count: $$bf1{key_count}, bf1 sum: ".bfsum($bf1)."\n";
  deb "bf2 key_count: $$bf2{key_count}, bf2 sum: ".bfsum($bf2)."\n";
  deb serialize($$bf1{overflow},'bf1overflow');
  deb serialize($$bf2{overflow},'bf2overflow');
  bfaddbf($bf1,$bf2);
  deb "bf1 key_count: $$bf1{key_count}, bf1 sum: ".bfsum($bf1)."\n";
  deb serialize($$bf1{overflow},'bf1overflow');
  my @found=bfgrep($bf1,[1..$cap]);
  ok(@found==$cap,"bfaddbf(), found ".@found." of $cap");
  my $errs=bfgrep($bf1,[$cap+1..$cap*2]);
  deb "erate: ".($errs/$cap)."\n";
  my $p=100*$errs/$cap/$er;
  ok(between($p,70,130),"error rate ".($errs/$cap)." within 70%-130% of $er ($p%)");
#  deb "Error rate: ".(($errs=grep($_,bfcheck($bf1,$cap+1..$cap*2)))/$cap)."\n";
};

t/test_fork_bloom.pl  view on Meta::CPAN

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#!/usr/bin/perl
my $jobs=16;
my $cap=1000000;
my $error_rate=0.01;
my($pid,@pid);
for my $job (0..$jobs-1){
  unlink"/tmp/bf$job.bf";
  next if fork();
  my $t=time_fp();
  my @keys=grep$_%$jobs==$job,1..$cap;
  #my @keys=map rand(), 1..$cap/$jobs;
  my $bf=bfinit(error_rate=>$error_rate,capacity=>$cap,keys=>\@keys);
  bfstore($bf,"/tmp/bf$job.bf");
  print "job $job finished, ".(time_fp()-$t)." sec\n";
  exit;
}
1 while wait() != -1;
print "building finished\n";
my $bf=bfinit(error_rate=>$error_rate,capacity=>$cap);
for my $job (0..$jobs-1){
  print "Adding bloom filter $job...";
  my $t=time_fp();
  bfaddbf($bf,bfretrieve("/tmp/bf$job.bf"));
  print "took ".(time_fp()-$t)." sec\n";
}
print int($$bf{filterlength}/8)," bytes\n";
printf "%.1f%%\n",100*bfsum($bf)/$$bf{filterlength};
print "keys: $$bf{key_count}\n";
print "found: ".bfgrep($bf,[1..$cap/10])."\n";



( run in 0.265 second using v1.01-cache-2.11-cpan-bb97c1e446a )