Acme-Tools

 view release on metacpan or  search on metacpan

Tools.pm  view on Meta::CPAN

#!/usr/bin/perl
package Acme::Tools;

our $VERSION = '0.27';

use 5.008;     #Perl 5.8 was released July 18th 2002
use strict;
use warnings;
use Carp;      #todo: rid of deps, make own carp+croak here

require Exporter;
our @ISA = qw(Exporter);
our %EXPORT_TAGS = ( all => [ qw() ] );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{all} } );
our @EXPORT = qw(
  min
  max
  mins
  maxs
  sum
  avg
  geomavg
  harmonicavg
  stddev
  rstddev
  median
  percentile
  $Resolve_iterations
  $Resolve_last_estimate
  $Resolve_time
  resolve
  resolve_equation
  conv
  rank
  rankstr
  egrep
  eqarr
  sorted
  sortedstr
  sortby
  subarrays
  pushsort
  pushsortstr
  binsearch
  binsearchstr
  random
  random_gauss
  big
  bigi
  bigf
  bigr
  bigscale
  nvl
  repl
  replace
  decode
  decode_num
  between
  btw
  curb
  bound
  log10
  log2
  logn
  distinct
  in
  in_num
  uniq
  union
  union_all
  minus
  minus_all
  intersect
  intersect_all
  not_intersect
  mix
  zip
  sim
  sim_perm
  subarr
  subhash
  hashtrans
  zipb64
  zipbin
  unzipb64
  unzipbin
  gzip

Tools.pm  view on Meta::CPAN

  eachr
  joinr
  pile
  aoh2sql
  aoh2xls
  base64
  unbase64
  opts
  ed
  changed
  $Edcursor
  brainfu
  brainfu2perl
  brainfu2perl_optimized
  bfinit
  bfsum
  bfaddbf
  bfadd
  bfcheck
  bfgrep
  bfgrepnot
  bfdelete
  bfstore
  bfretrieve
  bfclone
  bfdimensions
  $PI
  install_acme_command_tools

  $Dbh
  dlogin
  dlogout
  drow
  drows
  drowc
  drowsc
  dcols
  dpk
  dsel
  ddo
  dins
  dupd
  ddel
  dcommit
  drollback
);

our $PI = '3.141592653589793238462643383279502884197169399375105820974944592307816406286';

=head1 NAME

Acme::Tools - Lots of more or less useful subs lumped together and exported into your namespace

=head1 SYNOPSIS

 use Acme::Tools;

 print sum(1,2,3);                   # 6
 print avg(2,3,4,6);                 # 3.75
 print median(2,3,4,6);              # 3.5
 print percentile(25, 101..199);     # 125

 my @list = minus(\@listA, \@listB); # set operation
 my @list = union(\@listA, \@listB); # set operation

 print length(gzip("abc" x 1000));   # far less than 3000

 writefile("/dir/filename",$string); # convenient
 my $s=readfile("/dir/filename");    # also convenient

 print "yes!" if between($PI,3,4);

 print percentile(0.05, @numbers);

 my @even = range(1000,2000,2);      # even numbers between 1000 and 2000
 my @odd  = range(1001,2001,2);

 my $dice = random(1,6);
 my $color = random(['red','green','blue','yellow','orange']);

 pushr $arrayref[$num], @stuff;      # push @{ $arrayref[$num] }, @stuff ... popr, shiftr, unshiftr

 print 2**200;       # 1.60693804425899e+60
 print big(2)**200;  # 1606938044258990275541962092341162602522202993782792835301376

 ...and much more.

=encoding utf8

=head1 ABSTRACT

About 120 more or less useful perl subroutines lumped together and exported into your namespace.

=head1 DESCRIPTION

Subs created and collected since the mid-90s.

=head1 INSTALLATION

 sudo cpan Acme::Tools
 sudo cpanm Acme::Tools   # after: sudo apt-get install cpanminus make   # for Ubuntu 12.04

Or to get the very newest:

 git clone https://github.com/kjetillll/Acme-Tools.git
 cd Acme-Tools
 perl Makefile.PL
 make test
 sudo make install

=head1 EXPORT

Almost every sub, about 90 of them.

Beware of namespace pollution. But what did you expect from an Acme module?

=head1 NUMBERS

=head2 num2code

See L</code2num>

=head2 code2num

C<num2code()> convert numbers (integers) from the normal decimal system to some arbitrary other number system.
That can be binary (2), oct (8), hex (16) or others.

Example:

 print num2code(255,2,"0123456789ABCDEF");  # prints FF
 print num2code( 14,2,"0123456789ABCDEF");  # prints 0E

...because 255 are converted to hex FF (base C<< length("0123456789ABCDEF") >> ) which is 2 digits of 0-9 or A-F.

Tools.pm  view on Meta::CPAN

C<< Standard_Deviation = sqrt(variance) >>

Standard deviation (stddev) is a measurement of the width of a normal
distribution where one stddev on each side of the mean covers 68% and
two stddevs 95%.  Normal distributions are sometimes called Gauss curves
or Bell shapes. L<https://en.wikipedia.org/wiki/Standard_deviation>

 stddev(4,5,6,5,6,4,3,5,5,6,7,6,5,7,5,6,4)         # = 1.0914103126635
 avg(@testscores) + stddev(@testscores)            # = the score for one stddev above avg, 115
 avg(@testscores) - stddev(@testscores)            # = the score for one stddev below avg, 85

=cut

sub stddev {
  return undef        if @_==0;
  return stddev(\@_)  if @_>0 and !ref($_[0]);
  my $ar=shift;
  return undef        if @$ar==0;
  return 0            if @$ar==1;
  my $sumx2; $sumx2 += $_*$_ for @$ar;
  my $sumx;  $sumx  += $_    for @$ar;
  sqrt( (@$ar*$sumx2-$sumx*$sumx)/(@$ar*(@$ar-1)) );
}

=head2 rstddev

Relative stddev = stddev / avg

=cut

sub rstddev { stddev(@_) / avg(@_) }

=head2 median

Returns the median value of a list of numbers. The list do not have to
be sorted.

Example 1, list having an odd number of numbers:

 print median(1, 100, 101);   # 100

100 is the middlemost number after sorting.

Example 2, an even number of numbers:

 print median(1005, 100, 101, 99);   # 100.5

100.5 is the average of the two middlemost numbers.

=cut

sub median {
  no warnings;
  my @list = sort {$a<=>$b} @_;
  my $n=@list;
  $n%2 ?  $list[($n-1)/2]
       : ($list[$n/2-1] + $list[$n/2])/2;
}


=head2 percentile

Returns one or more percentiles of a list of numbers.

Percentile 50 is the same as the I<median>, percentile 25 is the first
quartile, 75 is the third quartile.

B<Input:>

First argument is your wanted percentile, or a refrence to a list of percentiles you want from the dataset.

If the first argument to percentile() is a scalar, this percentile is returned.

If the first argument is a reference to an array, then all those percentiles are returned as an array.

Second, third, fourth and so on argument are the numbers from which you want to find the percentile(s).

B<Examples:>

This finds the 50-percentile (the median) to the four numbers 1, 2, 3 and 4:

 print "Median = " . percentile(50, 1,2,3,4);   # 2.5

This:

 @data=(11, 5, 3, 5, 7, 3, 1, 17, 4, 2, 6, 4, 12, 9, 0, 5);
 @p = map percentile($_,@data), (25, 50, 75);

Is the same as this:

 @p = percentile([25, 50, 75], @data);

But the latter is faster, especially if @data is large since it sorts
the numbers only once internally.

B<Example:>

Data: 1, 4, 6, 7, 8, 9, 22, 24, 39, 49, 555, 992

Average (or mean) is 143

Median is 15.5 (which is the average of 9 and 22 who both equally lays in the middle)

The 25-percentile is 6.25 which are between 6 and 7, but closer to 6.

The 75-percentile is 46.5, which are between 39 and 49 but close to 49.

Linear interpolation is used to find the 25- and 75-percentile and any
other x-percentile which doesn't fall exactly on one of the numbers in
the set.

B<Interpolation:>

As you saw, 6.25 are closer to 6 than to 7 because 25% along the set of
the twelve numbers is closer to the third number (6) than to he fourth
(7). The median (50-percentile) is also really interpolated, but it is
always in the middle of the two center numbers if there are an even count
of numbers.

However, there is two methods of interpolation:

Example, we have only three numbers: 5, 6 and 7.

Method 1: The most common is to say that 5 and 7 lays on the 25- and
75-percentile. This method is used in Acme::Tools.

Method 2: In Oracle databases the least and greatest numbers
always lay on the 0- and 100-percentile.

As an argument on why Oracles (and others?) definition is not the best way is to
look at your data as for instance temperature measurements.  If you
place the highest temperature on the 100-percentile you are sort of
saying that there can never be a higher temperatures in future measurements.

A quick non-exhaustive Google survey suggests that method 1 here is most used.

The larger the data sets, the less difference there is between the two methods.

B<Extrapolation:>

In method one, when you want a percentile outside of any possible
interpolation, you use the smallest and second smallest to extrapolate
from. For instance in the data set C<5, 6, 7>, if you want an
x-percentile of x < 25, this is below 5.

If you feel tempted to go below 0 or above 100, C<percentile()> will
I<die> (or I<croak> to be more precise)

Another method could be to use "soft curves" instead of "straight
lines" in interpolation. Maybe B-splines or Bezier curves. This is not
used here.

For large sets of data Hoares algorithm would be faster than the
simple straightforward implementation used in C<percentile()>
here. Hoares don't sort all the numbers fully.

B<Differences between the two main methods described above:>

 Data: 1, 4, 6, 7, 8, 9, 22, 24, 39, 49, 555, 992

 Percentile    Method 1                      Method 2
               (Acme::Tools::percentile      (Oracle)
               and others)
 ------------- ----------------------------- ---------
 0             -2                            1
 1             -1.61                         1.33
 25            6.25                          6.75
 50 (median)   15.5                          15.5
 75            46.5                          41.5
 99            1372.19                       943.93
 100           1429                          992

Found like this:

 perl -MAcme::Tools -le 'print for percentile([0,1,25,50,75,99,100], 1,4,6,7,8,9,22,24,39,49,555,992)'

And like this in Oracle-databases:

 select
   percentile_cont(0.00) within group(order by n) per0,
   percentile_cont(0.01) within group(order by n) per1,
   percentile_cont(0.25) within group(order by n) per25,
   percentile_cont(0.50) within group(order by n) per50,
   percentile_cont(0.75) within group(order by n) per75,
   percentile_cont(0.99) within group(order by n) per99,
   percentile_cont(1.00) within group(order by n) per100
 from (
   select 0+regexp_substr('1,4,6,7,8,9,22,24,39,49,555,992','[^,]+',1,i) n
   from dual,(select level i from dual connect by level <= 12)
 );

(Oracle also provides a similar function: C<percentile_disc> where I<disc>
is short for I<discrete>, meaning no interpolation is taking
place. Instead the closest number from the data set is picked.)

=cut

sub percentile {
  my(@p,@t,@ret);
  if(ref($_[0]) eq 'ARRAY'){ @p=@{shift()} }
  elsif(not ref($_[0]))    { @p=(shift())  }
  else{croak()}
  @t=@_;
  return if !@p;
  croak if !@t;
  @t=sort{$a<=>$b}@t;
  push@t,$t[0] if @t==1;
  for(@p){
    croak if $_<0 or $_>100;
    my $i=(@t+1)*$_/100-1;
    push@ret,
      $i<0       ? $t[0]+($t[1]-$t[0])*$i:
      $i>$#t     ? $t[-1]+($t[-1]-$t[-2])*($i-$#t):
      $i==int($i)? $t[$i]:
                   $t[$i]*(int($i+1)-$i) + $t[$i+1]*($i-int($i));
  }
  return @p==1 ? $ret[0] : @ret;
}

=head1 RANDOM

=head2 random

B<Input:> One or two arguments.

B<Output:>

If two integer arguments: returns a random integer between the integers in argument one and two.

If the first argument is an arrayref: returns a random member of that array without changing the array.

If the first argument is an arrayref and there is a second arg: return that many random members of that array

If the first argument is an hashref and there is no second arg: return a random key weighted by the values of that hash

If the first argument is an hashref and there is a second arg: return that many random keys weighted by the values of that hash

If there is no second argument and the first is an integer, a random integer between 0 and that number is returned. Including 0 and the number itself.

B<Examples:>

 $dice=random(1,6);                                      # 1, 2, 3, 4, 5 or 6
 $dice=random([1..6]);                                   # same as previous
 @dice=random([1..6],10);                                # 10 dice tosses
 $dice=random({1=>1, 2=>1, 3=>1, 4=>1, 5=>1, 6=>2});     # weighted dice with 6 being twice as likely as the others
 @dice=random({1=>1, 2=>1, 3=>1, 4=>1, 5=>1, 6=>2},10);  # 10 weighted dice tosses
 print random({head=>0.4999,tail=>0.4999,edge=>0.0002}); # coin toss (sum 1 here but not required to be)
 print random(2);                                        # prints 0, 1 or 2
 print 2**random(7);                                     # prints 1, 2, 4, 8, 16, 32, 64 or 128
 @dice=map random([1..6]), 1..10;                        # as third example above, but much slower
 perl -MAcme::Tools -le 'print for random({head=>0.499,tail=>0.499,edge=>0.002},10000);' | sort | uniq -c

=cut

sub random {
  my($from,$to)=@_;
  my $ref=ref($from);
  if($ref eq 'ARRAY'){

Tools.pm  view on Meta::CPAN

 1998 Per   182    183    76     74
 1998 Tone                70     71

.

 my @reportB=pivot([map{$_=[@$_[0,3,2,1,4]]}(@t=@table)],"Year","Season");
 print "\n\nReport B\n\n".tablestring(\@reportB);

Will print:

 Report B

 Year Season Height Height Height Weight Weight Weight Weight
             Gerd   Hilde  Per    Gerd   Hilde  Per    Tone
 ---- ------ ------ ------ -----  -----  ------ ------ ------
 1997 Summer 170    168    182    66     62     75     70
 1997 Winter 158    164    180    64     61     73     69
 1998 Summer 171    168    182    64     62     76     70
 1998 Winter 171    168    183    64     62     74     71

.

 my @reportC=pivot([map{$_=[@$_[1,2,0,3,4]]}(@t=@table)],"Name","Attributt");
 print "\n\nReport C\n\n".tablestring(\@reportC);

Will print:

 Report C

 Name  Attributt 1997   1997   1998   1998
                 Summer Winter Summer Winter
 ----- --------- ------ ------ ------ ------
 Gerd  Height     170    158    171    171
 Gerd  Weight      66     64     64     64
 Hilde Height     168    164    168    168
 Hilde Weight      62     61     62     62
 Per   Height     182    180    182    183
 Per   Weight      75     73     76     74
 Tone  Weight      70     69     70     71

.

 my @reportD=pivot([map{$_=[@$_[1,2,0,3,4]]}(@t=@table)],"Name");
 print "\n\nReport D\n\n".tablestring(\@reportD);

Will print:

 Report D

 Name  Height Height Height Height Weight Weight Weight Weight
       1997   1997   1998   1998   1997   1997   1998   1998
       Summer Winter Summer Winter Summer Winter Summer Winter
 ----- ------ ------ ------ ------ ------ ------ ------ ------
 Gerd  170    158    171    171    66     64     64     64
 Hilde 168    164    168    168    62     61     62     62
 Per   182    180    182    183    75     73     76     74
 Tone                              70     69     70     71

Options:

Options to sort differently and show sums and percents are available. (...MORE DOC ON THAT LATER...)

See also L<Data::Pivot>

=cut

sub pivot {
  my($tabref,@vertikalefelt)=@_;
  my %opt=ref($vertikalefelt[-1]) eq 'HASH' ? %{pop(@vertikalefelt)} : ();
  my $opt_sum=1 if $opt{sum};
  my $opt_pro=exists $opt{prosent}?$opt{prosent}||0:undef;
  my $sortsub          = $opt{'sortsub'}          || \&_sortsub;
  my $sortsub_bortover = $opt{'sortsub_bortover'} || $sortsub;
  my $sortsub_nedover  = $opt{'sortsub_nedover'}  || $sortsub;
  #print serialize(\%opt,'opt');
  #print serialize(\$opt_pro,'opt_pro');
  my $antned=0+@vertikalefelt;
  my $bakerst=-1+@{$$tabref[0]};
  my(%h,%feltfinnes,%sum);
  #print "Bakerst<$bakerst>\n";
  for(@$tabref){
    my $rad=join($;,@$_[0..($antned-1)]);
    my $felt=join($;,@$_[$antned..($bakerst-1)]);
    my $verdi=$$_[$bakerst];
    length($rad) or $rad=' ';
    length($felt) or $felt=' ';
    $h{$rad}{$felt}=$verdi;
    $h{$rad}{"%$felt"}=$verdi;
    if($opt_sum or defined $opt_pro){
      $h{$rad}{Sum}+=$verdi;
      $sum{$felt}+=$verdi;
      $sum{Sum}+=$verdi;
    }
    $feltfinnes{$felt}++;
    $feltfinnes{"%$felt"}++ if $opt_pro;
  }
  my @feltfinnes = sort $sortsub_bortover keys%feltfinnes;
  push @feltfinnes, "Sum" if $opt_sum;
  my @t=([@vertikalefelt,map{replace($_,$;,"\n")}@feltfinnes]);
  #print serialize(\@feltfinnes,'feltfinnes');
  #print serialize(\%h,'h');
  #print "H = ".join(", ",sort _sortsub keys%h)."\n";
  for my $rad (sort $sortsub_nedover keys(%h)){
    my @rad=(split($;,$rad),
             map { defined($_)?$_:exists$opt{undefined}?$opt{undefined}:undef }
	     map {
	       if(/^\%/ and defined $opt_pro){
		 my $sum=$h{$rad}{Sum};
		 my $verdi=$h{$rad}{$_};
		 if($sum!=0){
		   defined $verdi
                   ?sprintf("%*.*f",3+1+$opt_pro,$opt_pro,100*$verdi/$sum)
		   :$verdi;
		 }
		 else{
		   $verdi!=0?"div0":$verdi;
		 }
	       }
	       else{
                 $h{$rad}{$_};
	       }

Tools.pm  view on Meta::CPAN


Example:

 print brainfu2perl('>++++++++[<++++++++>-]<++++++++.>++++++[<++++++>-]<---.');

Prints this string:

 my($c,$o,@b)=(0); sub out{$o.=chr($b[$c]) for 1..$_[0]||1}
 ++$c;++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];
 while($b[$c]){--$c;++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];
 ++$b[$c];++$c;--$b[$c];}--$c;++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];
 ++$b[$c];++$b[$c];out;++$c;++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];
 while($b[$c]){--$c;++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$b[$c];++$c;--$b[$c];}
 --$c;--$b[$c];--$b[$c];--$b[$c];out;$o;

=head2 brainfu2perl_optimized

Just as L</brainfu2perl> but optimizes the perl code. The same
example as above with brainfu2perl_optimized returns this equivalent
but shorter perl code:

 $b[++$c]+=8;while($b[$c]){$b[--$c]+=8;--$b[++$c]}$b[--$c]+=8;out;$b[++$c]+=6;
 while($b[$c]){$b[--$c]+=6;--$b[++$c]}$b[--$c]-=3;out;$o;

=cut

sub brainfu { eval(brainfu2perl(@_)) }

sub brainfu2perl {
  my($bf,$inp)=@_;
  my $perl='my($c,$inp,$o,@b)=(0,\''.$inp.'\'); no warnings; sub out{$o.=chr($b[$c]) for 1..$_[0]||1}'."\n";
  $perl.='sub inp{$inp=~s/(.)//s and $b[$c]=ord($1)}'."\n" if $inp and $bf=~/,/;
  $perl.=join("",map/\+/?'++$b[$c];':/\-/?'--$b[$c];':/\[/?'while($b[$c]){':/\]/?'}':/>/?'++$c;':/</?'--$c;':/\./?'out;':/\,/?'inp;':'',split//,$bf).'$o;';
  $perl;
}

sub brainfu2perl_optimized {
  my $perl=brainfu2perl(@_);
  $perl=~s{(((\+|\-)\3\$b\[\$c\];){2,})}{ '$b[$c]'.$3.'='.(grep/b/,split//,$1).';' }gisex;
  1 while $perl=~s/\+\+\$c;\-\-\$c;//g + $perl=~s/\-\-\$c;\+\+\$c;//g;
  $perl=~s{((([\-\+])\3\$c;){2,})}{"\$c$3=".(grep/c/,split//,$1).';'}gisex;
  $perl=~s{((\+\+|\-\-)\$c;([^;{}]+;))}{my($o,$s)=($2,$3);$s=~s/\$c/$o\$c/?$s:$1}ge;
  $perl=~s/\$c(\-|\+)=(\d+);(\+\+|\-\-)\$b\[\$c\]/$3.'$b[$c'.$1.'='.$2.'];'/ge;
  $perl=~s{((out;){2,})}{'out('.(grep/o/,split//,$1).');'}ge;
  $perl=~s/;}/}/g;$perl=~s/;+/;/g;
  $perl;
}


=head1 BLOOM FILTER SUBROUTINES

Bloom filters can be used to check whether an element (a string) is a
member of a large set using much less memory or disk space than other
data structures. Trading speed and accuracy for memory usage. While
risking false positives, Bloom filters have a very strong space
advantage over other data structures for representing sets.

In the example below, a set of 100000 phone numbers (or any string of
any length) can be "stored" in just 91230 bytes if you accept that you
can only check the data structure for existence of a string and accept
false positives with an error rate of 0.03 (that is three percent, error
rates are given in numbers larger than 0 and smaller than 1).

You can not retrieve the strings in the set without using "brute
force" methods and even then you would get slightly more strings than
you put in because of the error rate inaccuracy.

Bloom Filters have many uses.

See also: L<http://en.wikipedia.org/wiki/Bloom_filter>

See also: L<Bloom::Filter>

=head2 bfinit

Initialize a new Bloom Filter:

  my $bf = bfinit( error_rate=>0.01, capacity=>100000 );

The same:

  my $bf = bfinit( 0.01, 100000 );

since two arguments is interpreted as error_rate and capacity accordingly.


=head2 bfadd

  bfadd($bf, $_) for @phone_numbers;   # Adding strings one at a time

  bfadd($bf, @phone_numbers);          # ...or all at once (faster)

Returns 1 on success. Dies (croaks) if more strings than capacity is added.

=head2 bfcheck

  my $phone_number="99999999";
  if ( bfcheck($bf, $phone_number) ) {
    print "Yes, $phone_number was PROBABLY added\n";
  }
  else {
    print "No, $phone_number was DEFINITELY NOT added\n";
  }

Returns true if C<$phone_number> exists in C<@phone_numbers>.

Returns false most of the times, but sometimes true*), if C<$phone_number> doesn't exists in C<@phone_numbers>.

*) This is called a false positive.

Checking more than one key:

 @bools = bfcheck($bf, @keys);          # or ...
 @bools = bfcheck($bf, \@keys);         # better, uses less memory if @keys is large

Returns an array the same size as @keys where each element is true or false accordingly.

=head2 bfgrep

Same as C<bfcheck> except it returns the keys that exists in the bloom filter

 @found = bfgrep($bf, @keys);           # or ...
 @found = bfgrep($bf, \@keys);          # better, uses less memory if @keys is large, or ...
 @found = grep bfcheck($bf,$_), @keys;  # same but slower

=head2 bfgrepnot

Same as C<bfgrep> except it returns the keys that do NOT exists in the bloom filter:

 @not_found = bfgrepnot($bf, @keys);          # or ...
 @not_found = bfgrepnot($bf, \@keys);         # better, uses less memory if @keys is large, or ...
 @not_found = grep !bfcheck($bf,$_), @keys);  # same but slower

=head2 bfdelete

Deletes from a counting bloom filter.

To enable deleting be sure to initialize the bloom filter with the
numeric C<counting_bits> argument. The number of bits could be 2 or 3*)
for small filters with a small capacity (a small number of keys), but
setting the number to 4 ensures that even very large filters with very
small error rates would not overflow.

*) Acme::Tools do not currently support C<< counting_bits => 3 >> so 4
and 8 are the only practical alternatives where 8 is almost always overkill.

 my $bf=bfinit(
   error_rate    => 0.001,
   capacity      => 10000000,
   counting_bits => 4              # power of 2, that is 2, 4, 8, 16 or 32
 );
 bfadd(   $bf, @unique_phone_numbers);
 bfdelete($bf, @unique_phone_numbers);

Example: examine the frequency of the counters with 4 bit counters and 4 million keys:

 my $bf=bfinit( error_rate=>0.001, capacity=>4e6, counting_bits=>4 );
 bfadd($bf,[1e3*$_+1 .. 1e3*($_+1)]) for 0..4000-1;  # adding 4 million keys one thousand at a time
 my %c; $c{vec($$bf{filter},$_,$$bf{counting_bits})}++ for 0..$$bf{filterlength}-1;
 printf "%8d counters = %d\n",$c{$_},$_ for sort{$a<=>$b}keys%c;

The output:

 28689562 counters = 0
 19947673 counters = 1
  6941082 counters = 2
  1608250 counters = 3
   280107 counters = 4
    38859 counters = 5
     4533 counters = 6
      445 counters = 7
       46 counters = 8
        1 counters = 9

Even after the error_rate is changed from 0.001 to a percent of that, 0.00001, the limit of 16 (4 bits) is still far away:

 47162242 counters = 0
 33457237 counters = 1
 11865217 counters = 2
  2804447 counters = 3
   497308 counters = 4
    70608 counters = 5
     8359 counters = 6
      858 counters = 7
       65 counters = 8
        4 counters = 9

In algorithmic terms the number of bits needed is C<ln of ln of n>.  Thats why 4 bits (counters up
to 15) is "always" good enough except for extremely large capasities or extremely small error rates.
(Except when adding the same key many times, which should be avoided, and Acme::Tools::bfadd do not
check for that, perhaps in future versions).

Bloom filters of the counting type are not very space efficient: The tables above shows that 84%-85%
of the counters are 0 or 1. This means most bits are zero-bits. This doesn't have to be a problem if
a counting bloom filter is used to be sent over slow networks because they are very compressable by
common compression tools like I<gzip> or L<Compress::Zlib> and such.

Deletion of non-existing keys makes C<bfdelete> die (croak).

=head2 bfdelete

Deletes from a counting bloom filter:

 bfdelete($bf, @keys);
 bfdelete($bf, \@keys);

Returns C<$bf> after deletion.

Croaks (dies) on deleting a non-existing key or deleting from an previouly overflown counter in a counting bloom filter.

=head2 bfaddbf

Adds another bloom filter to a bloom filter.

Bloom filters has the proberty that bit-wise I<OR>-ing the bit-filters
of two filters with the same capacity and the same number and type of
hash functions, adds the filters:

  my $bf1=bfinit(error_rate=>0.01,capacity=>$cap,keys=>[1..500]);
  my $bf2=bfinit(error_rate=>0.01,capacity=>$cap,keys=>[501..1000]);

  bfaddbf($bf1,$bf2);

  print "Yes!" if bfgrep($bf1, 1..1000) == 1000;

Prints yes since C<bfgrep> now returns an array of all the 1000 elements.

Croaks if the filters are of different dimensions.

Works for counting bloom filters as well (C<< counting_bits=>4 >> e.g.)

=head2 bfsum

Returns the number of 1's in the filter.

 my $percent=100*bfsum($bf)/$$bf{filterlength};
 printf "The filter is %.1f%% filled\n",$percent; #prints 50.0% or so if filled to capacity

Sums the counters for counting bloom filters (much slower than for non counting).

=head2 bfdimensions

Input, two numeric arguments: Capacity and error_rate.

Outputs an array of two numbers: m and k.

  m = - n * log(p) / log(2)**2   # n = capacity, m = bits in filter (divide by 8 to get bytes)
  k = log(1/p) / log(2)          # p = error_rate, uses perls internal log() with base e (2.718)

...that is: m = the best number of bits in the filter and k = the best
number of hash functions optimized for the given capacity (n) and
error_rate (p). Note that k is a dependent only of the error_rate.  At
about two percent error rate the bloom filter needs just the same
number of bytes as the number of keys.

 Storage (bytes):
 Capacity      Error-rate  Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate Error-rate
               0.000000001 0.00000001 0.0000001  0.000001   0.00001    0.0001     0.001      0.01       0.02141585 0.1        0.5        0.99
 ------------- ----------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
            10 54.48       48.49      42.5       36.51      30.52      24.53      18.53      12.54      10.56      6.553      2.366      0.5886
           100 539.7       479.8      419.9      360        300.1      240.2      180.3      120.4      100.6      60.47      18.6       0.824
          1000 5392        4793       4194       3595       2996       2397       1798       1199       1001       599.6      180.9      3.177
         10000 5.392e+04   4.793e+04  4.194e+04  3.594e+04  2.995e+04  2.396e+04  1.797e+04  1.198e+04  1e+04      5991       1804       26.71
        100000 5.392e+05   4.793e+05  4.193e+05  3.594e+05  2.995e+05  2.396e+05  1.797e+05  1.198e+05  1e+05      5.991e+04  1.803e+04  262
       1000000 5.392e+06   4.793e+06  4.193e+06  3.594e+06  2.995e+06  2.396e+06  1.797e+06  1.198e+06  1e+06      5.991e+05  1.803e+05  2615
      10000000 5.392e+07   4.793e+07  4.193e+07  3.594e+07  2.995e+07  2.396e+07  1.797e+07  1.198e+07  1e+07      5.991e+06  1.803e+06  2.615e+04
     100000000 5.392e+08   4.793e+08  4.193e+08  3.594e+08  2.995e+08  2.396e+08  1.797e+08  1.198e+08  1e+08      5.991e+07  1.803e+07  2.615e+05
    1000000000 5.392e+09   4.793e+09  4.193e+09  3.594e+09  2.995e+09  2.396e+09  1.797e+09  1.198e+09  1e+09      5.991e+08  1.803e+08  2.615e+06
   10000000000 5.392e+10   4.793e+10  4.193e+10  3.594e+10  2.995e+10  2.396e+10  1.797e+10  1.198e+10  1e+10      5.991e+09  1.803e+09  2.615e+07
  100000000000 5.392e+11   4.793e+11  4.193e+11  3.594e+11  2.995e+11  2.396e+11  1.797e+11  1.198e+11  1e+11      5.991e+10  1.803e+10  2.615e+08
 1000000000000 5.392e+12   4.793e+12  4.193e+12  3.594e+12  2.995e+12  2.396e+12  1.797e+12  1.198e+12  1e+12      5.991e+11  1.803e+11  2.615e+09

 Error rate:               0.99   Hash functions:  1
 Error rate:                0.5   Hash functions:  1
 Error rate:                0.1   Hash functions:  3
 Error rate: 0.0214158522653385   Hash functions:  6
 Error rate:               0.01   Hash functions:  7
 Error rate:              0.001   Hash functions: 10
 Error rate:             0.0001   Hash functions: 13
 Error rate:            0.00001   Hash functions: 17
 Error rate:           0.000001   Hash functions: 20
 Error rate:          0.0000001   Hash functions: 23
 Error rate:         0.00000001   Hash functions: 27
 Error rate:        0.000000001   Hash functions: 30

=head2 bfstore

Storing and retrieving bloom filters to and from disk uses L<Storable>s C<store> and C<retrieve>. This:

 bfstore($bf,'filename.bf');

It the same as:

 use Storable qw(store retrieve);
 ...
 store($bf,'filename.bf');

=head2 bfretrieve

This:

 my $bf=bfretrieve('filename.bf');

Or this:

 my $bf=bfinit('filename.bf');

Is the same as:

 use Storable qw(store retrieve);
 my $bf=retrieve('filename.bf');

=head2 bfclone

Tools.pm  view on Meta::CPAN

 ccmd grep string /huge/file   #caches stdout+stderr for 15 minutes (default) for much faster results later
 ccmd "sleep 2;echo hello"     #slow first time. Note the quotes!
 ccmd "du -s ~/*|sort -n|tail" #ccmd store stdout+stderr in /tmp files (default)
 z2z [-pvk1-9oe -t type] files #convert from/to .gz/bz2/xz files, -p progress, -v verbose (output result),
                               #-k keep org file, -o overwrite, 1-9 compression degree, -e for xz does "extreme"
                               #compressions, very slow. For some data types this reduces size significantly
                               #2xz and 2bz2 depends on xz and bzip2 being installed on system
 2xz                           #same as z2z with -t xz
 2bz2                          #same as z2z with -t bz2
 2gz                           #same as z2z with -t gz

 rttop
 trunc file(s)
 wipe file(s)

=head3 z2z

=head3 2xz

=head3 2bz2

=head3 2gz

The commands C<2xz>, C<2bz2> and C<2gz> are just synonyms for C<z2z> with an implicitly added option C<-t xz>, C<-t xz> or C<-t gz> accordingly.

 z2z [-p -k -v -o -1 -2 -3 -4 -5 -6 -7 -8 -9 ] files

Converts (recompresses) files from one compression type to another. For instance from .gz to .bz2
Keeps uid, gid, mode (chmod) and mtime.

 -p              Show a progress meter using the pv program if installed
 -k              Keeps original file
 -v              Verbose, shows info on degree of compression and file
                 number if more than one file is being converted
 -o              Overwrites existing result file, otherwise stop with error msg
 -1 .. -9        Degree of compression, -1 fastest .. -9 best
 -e              With -t xz (or 2xz) passes -e to xz (-9e = extreme compression)

 -L rate         With -p. Slow down, ex:  -L 200K  means 200 kilobytes per second
 -D sec          With -p. Only turn on progress meter (pv) after x seconds
 -i sec          With -p. Info update rate
 -l              With -p. Line mode
 -I              With -p. Show ETA as time of arrival as well as time left
 -q              With -p. Quiet. Useful with -L to limit rate, but no output

The options -L -D -i -l -I -q implicitly turns on -p. Those options are passed
through to pv. See: man pv.

=head3 due

Like C<du> command but views space used by file extentions instead of dirs. Options:

 due [-options] [dirs] [files]
 due -h          View bytes "human readable", i.e. C<8.72 MB> instead of C<9145662 b> (bytes)
 due -k | -m     View bytes in kilobytes | megabytes (1024 | 1048576)
 due -K          Like -k but uses 1000 instead of 1024
 due -z          View two extentions if .z .Z .gz .bz2 .rz or .xz (.tar.gz, not just .gz)
 due -M          Also show min, medium and max date (mtime) of files, give an idea of their age
 due -C          Like -M, but create time instead (ctime)
 due -A          Like -M, but access time instead (atime)
 due -P          Also show 10, 50 (medium) and 90 percentile of file date
 due -MP         Both -M and -P, shows min, 10p, 50p, 90p and max
 due -a          Sort output alphabetically by extention (default order is by size)
 due -c          Sort output by number of files
 due -i          Ignore case, .GZ and .gz is the same, output in lower case
 due -t          Adds time of day to -M and -P output
 due -e 'regex'  Exclude files (full path) matching regex. Ex: due -e '\.git'
 TODO: due -l    TODO: Exclude hardlinks (dont count "same" file more than once, "man du")
 ls -l | due     Parses output of ls -l, find -ls, tar tvf for size+filename and reports
 find | due      List of filenames from stdin produces same as just command 'due'
 ls | due        Reports on just files in current dir without recursing into subdirs

=head3 finddup

Find duplicate files. Three steps to speed this up in case of many
large files: 1) Find files of same size, 2) of those: find files with
the same first 8 kilobytes, 3) of those: find duplicate files by
finding the MD5sums of the whole files.

 finddup [-d -s -h] paths/ files/* ...  #reports (+deletes with -d) duplicate files
                                        #-s for symlinkings dups, -h for hardlink
 finddup <files>    # print duplicate files, <files> might be filenames and directories
 finddup -a <files> # print duplicate files, also print the first file
 finddup -d <files> # delete duplicate files, use -v to also print them before deletion
 finddup -s <files> # make symbolic links of duplicate files
 finddup -h <files> # make hard links of duplicate files
 finddup -v ...     # verbose, print before -d, -s or -h
 finddup -n -d <files>  # dry run: show rm commands without actually running them
 finddup -n -s <files>  # dry run: show ln commands to make symlinks of duplicate files todo:NEEDS FIX!
 finddup -n -h <files>  # dry run: show ln commands to make hard links of duplicate files
 finddup -q ...         # quiet
 finddup -k o           # keep oldest with -d, -s, -h, consider newer files duplicates
 finddup -k n           # keep newest with -d, -s, -h, consider older files duplicates
 finddup -k O           # same as -k o, just use access time instead of modify time
 finddup -k N           # same as -k n, just use access time instead of modify time
 finddup -0 ...         # use ascii 0 instead of the normal \n, for xargs -0
 finddup -P n           # use n bytes from start of file in 1st md5 check (default 8192)
 finddup -p             # view progress in last and slowest of the three steps

Default ordering of files without C<-k n> or C<-k o> is the order they
are mentioned on the command line. For directory args the order might be
random: use C<< dir/* >> to avoid that (but then dot files are not included).

=cut

sub install_acme_command_tools {
  my $dir=(grep -d$_, @_, '/usr/local/bin', '/usr/bin')[0];
  for( qw( conv due xcat freq finddup ccmd trunc wipe rttop  z2z 2gz 2gzip 2bz2 2bzip2 2xz resubst zsize) ){
    unlink("$dir/$_");
    writefile("$dir/$_", "#!$^X\nuse Acme::Tools;\nAcme::Tools::cmd_$_(\@ARGV);\n");
    sys("/bin/chmod +x $dir/$_"); #hm umask
    print "Wrote executable $dir/$_\n";
  }
}
sub cmd_conv { print conv(@ARGV)."\n"  }
our @Due_fake_stdin;
#TODO: output from tar tvf and ls and find -ls
sub cmd_due {
  my %o;
  my @argv=opts("zkKmhciMCAPate:lE:t",\%o,@_);
  require File::Find;
  no warnings 'uninitialized';
  die"$0: -l not implemented yet\n"                if $o{l}; #man du: default is not to count hardlinks more than once, with -l it does
  die"$0: -h, -k or -m can not be used together\n" if $o{h}+$o{k}+$o{m}>1;
  die"$0: -c and -a can not be used together\n"    if $o{a}+$o{c}>1;
  die"$0: -k and -m can not be used together\n"    if $o{k}+$o{m}>1;
  die"$0: -M, -C, -A can not be used together\n"   if $o{M}+$o{C}+$o{A}>1;
  my(%c,%b,$cnt,$bts,%xtime);
  my $zext=$o{z}?'(\.(z|Z|gz|bz2|xz|rz|kr|lrz|rz))?':'';
  $o{E}||=11;
  my $r=qr/(\.[^\.\/]{1,$o{E}}$zext)$/i;
  my $qrexcl=exists$o{e}?qr/$o{e}/:0;
  #TODO: ought to work: tar cf - .|tar tvf -|due
  my $x=$o{M}?9:$o{C}?10:$o{A}?8:9;
  if(-p STDIN or @Due_fake_stdin){
    die "due: can not combine STDIN and args\n" if @argv;
    my $stdin=join"",map"$_\n",@Due_fake_stdin; #test
    open(local *STDIN, '<', \$stdin) or die "ERR: $! $?\n" if $stdin;
    my $rl=qr/(^| )\-[rwx\-sS]{9}\s+(?:\d )?(?:[\w\-]+(?:\/|\s+)[\w\-]+)\s+(\d+)\s+.*?([^\/]*\.[\w,\-]+)?$/;
    my $MorP=$o{M}||$o{C}||$o{A}||$o{P}?"due: -M, -C, -A and -P not yet implemented for STDIN unless list of filenames only\n":0;
    while(<STDIN>){
      chomp;
      next if /\/$/;
      my($f,$sz,$xtime)=(/$rl/?($3,$2):-f$_?($_,(stat)[7,$x]):next);
      #   1576142    240 -rw-r--r--   1 root     root       242153 april  4  2016 /opt/wine-staging/share/wine/wine.inf
      my $ext=$f=~$r?$1:'';
      $ext=lc($ext) if $o{i};
      $cnt++;    $c{$ext}++;
      $bts+=$sz; $b{$ext}+=$sz;
      defined $xtime and $xtime{$ext}.=",$xtime" or die $MorP if $MorP;
    }
  }
  else { #hm DRY
    @argv=('.') if !@argv;
    File::Find::find({follow=>0, wanted =>
      sub {
        return if !-f$_;
        return if $qrexcl and defined $File::Find::name and $File::Find::name=~$qrexcl;
        my($sz,$xtime)=(stat($_))[7,$x];
        my $ext=m/$r/?$1:'';
        $ext=lc($ext) if $o{i};
        $cnt++;    $c{$ext}++;
        $bts+=$sz; $b{$ext}+=$sz;
        $xtime{$ext}.=",$xtime" if $o{M} || $o{C} || $o{A} || $o{P};
	1;
      } },@argv);
  }
  my($f,$s)=$o{k}?("%14.2f kb",sub{$_[0]/1024})
           :$o{K}?("%14.2f Kb",sub{$_[0]/1000})
           :$o{m}?("%14.2f mb",sub{$_[0]/1024**2})
           :$o{h}?("%14s",     sub{bytes_readable($_[0])})
           :      ("%14d b",   sub{$_[0]});
  my @e=$o{a}?(sort(keys%c))
       :$o{c}?(sort{$c{$a}<=>$c{$b} or $a cmp $b}keys%c)
       :      (sort{$b{$a}<=>$b{$b} or $a cmp $b}keys%c);
  my $perc=!$o{M}&&!$o{C}&&!$o{A}&&!$o{P}?sub{""}:
    sub{
      my @p=$o{P}?(10,50,90):(50);
      my @m=@_>0 ? do {grep$_, split",", $xtime{$_[0]}}
                 : do {grep$_, map {split","} values %xtime};
      my @r=percentile(\@p,@m);
      @r=(min(@m),@r,max(@m)) if $o{M}||$o{C}||$o{A};
      @r=map int($_), @r;
      my $fmt=$o{t}?'YYYY/MM/DD-MM:MI:SS':'YYYY/MM/DD';
      @r=map tms($_,$fmt), @r;
      "  ".join(" ",@r);
  };
  my $width=max( 10, grep $_, map length($_), @e );
  @e=@e[-10..-1] if $o{t} and @e>10; #-t tail
  printf("%-*s %8d $f %7.2f%%%s\n",$width,$_,$c{$_},&$s($b{$_}),100*$b{$_}/$bts,&$perc($_)) for @e;
  printf("%-*s %8d $f %7.2f%%%s\n",$width,"Sum",$cnt,&$s($bts),100,&$perc());
}
sub cmd_resubst {
  my %o;
  my $zo="123456789e";
  my @argv=opts("f:t:vno:gi$zo",\%o,@_);
  if(exists$o{t}){ $o{t}=~s,\\,\$, } else { $o{t}='' }
  my($i,$tc,$tbfr,$tbto)=(0,0,0,0);
  for my $file (@argv){
      my $zopt=join" ",map"-$_",grep$o{$_},split//,$zo;
      my $oext=$o{o}?$o{o}:$file=~/\.(gz|bz2|xz)$/i?$1:'';
      my $open_out_pre=$oext?"|".openstr_prog({qw/gz gzip bz2 bzip2 xz xz/}->{lc($oext)})." $zopt":'';
      my $open_out="$open_out_pre > $file.tmp$$";
      my $open_in=openstr($file);
      #      die srlz(\%o,'o','',1);
      open my $I, $open_in  or croak"ERR: open $open_in failed. $! $?\n";
      open my $O, $open_out or croak"ERR: open $open_out failed. $! $?\n";
      my $c=0;
      my $mod=join"",grep$o{$_},qw(g i);
      eval"while(<\$I>){ \$c+=s/\$o{f}/$o{t}/$mod;print \$O \$_ }";
      $tc+=$c;
      close($I);close($O);
      chall($file,"$file.tmp$$") or croak"ERR: chall $file\n" if !$o{n};
      my($bfr,$bto)=(-s$file,-s"$file.tmp$$");
      unlink $file or croak"ERR: cant rm $file\n";
      my $newfile=$o{o}?repl($file,qr/\.(gz|bz2|xz)$/i,".$oext"):$file;
      rename("$file.tmp$$",$newfile) or croak"ERR: rename $file.tmp$$ -> $newfile failed\n";
      if($o{v}){
	my $pr=$bfr?100*$bto/$bfr:0;
	printf "%*d/%d %*s %7d =>%8d b (%2d%%) %s\n",
	  length(0+@argv), ++$i, 0+@argv, -15, "$tc/$c", $bfr, $bto, $pr, $file;
	$tbfr+=$bfr;
	$tbto+=$bto;
      }
  }
  if($o{v} and @argv>1){
      printf "Replaces: %d  Bytes before: %d  After: %d   Change: %.1f%%\n",
        $tc, $tbfr, $tbto, $tbfr?100*($tbto-$tbfr)/$tbfr:0
  }
  $tc;
}
sub cmd_xcat {
  for my $fn (@_){
    my $os=openstr($fn);
    open my $FH, $os or warn "xcat: cannot open $os ($!)\n" and next;
    #binmode($FH);#hm?
    print while <$FH>;
    close($FH);
  }
}
sub cmd_freq {



( run in 3.309 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )