Acme-Tools
view release on metacpan or search on metacpan
require Digest::MD5;
@_%2&&croak "Arguments should be a hash of equal number of keys and values";
my %arg=@_;
my @ok_param=qw/error_rate capacity min_hashfuncs max_hashfuncs hashfuncs counting_bits adaptive keys/;
my @not_ok=sort(grep!in($_,@ok_param),keys%arg);
croak "Not ok param to bfinit: ".join(", ",@not_ok) if @not_ok;
croak "Not an arrayref in keys-param" if exists $arg{keys} and ref($arg{keys}) ne 'ARRAY';
croak "Not implemented counting_bits=$arg{counting_bits}, should be 2, 4, 8, 16 or 32" if !in(nvl($arg{counting_bits},1),1,2,4,8,16,32);
croak "An bloom filters here can not be in both adaptive and counting_bits modes" if $arg{adaptive} and $arg{counting_bits}>1;
my $bf={error_rate => 0.001, #default p
capacity => 100000, #default n
min_hashfuncs => 1,
max_hashfuncs => 100,
counting_bits => 1, #default: not counting filter
adaptive => 0,
%arg, #arguments
key_count => 0,
overflow => {},
version => $Acme::Tools::VERSION,
};
croak "Error rate ($$bf{error_rate}) should be larger than 0 and smaller than 1" if $$bf{error_rate}<=0 or $$bf{error_rate}>=1;
@$bf{'min_hashfuncs','max_hashfuncs'}=(map$arg{hashfuncs},1..2) if $arg{hashfuncs};
@$bf{'filterlength','hashfuncs'}=bfdimensions($bf); #m and k
$$bf{filter}=pack("b*", '0' x ($$bf{filterlength}*$$bf{counting_bits}) ); #hm x new empty filter
$$bf{unpack}= $$bf{filterlength}<=2**16/4 ? "n*" # /4 alleviates skewing if m just slightly < 2**x
:$$bf{filterlength}<=2**32/4 ? "N*"
: "Q*";
bfadd($bf,@{$arg{keys}}) if $arg{keys};
return $bf;
}
sub bfaddbf {
my($bf,$bf2)=@_;
my $differror=join"\n",
map "Property $_ differs ($$bf{$_} vs $$bf2{$_})",
grep $$bf{$_} ne $$bf2{$_},
qw/capacity counting_bits adaptive hashfuncs filterlength/; #not error_rate
croak $differror if $differror;
croak "Can not add adaptive bloom filters" if $$bf{adaptive};
my $count=$$bf{key_count}+$$bf2{key_count};
croak "Exceeded filter capacity $$bf{key_count} + $$bf2{key_count} = $count > $$bf{capacity}"
if $count > $$bf{capacity};
$$bf{key_count}+=$$bf2{key_count};
if($$bf{counting_bits}==1){
$$bf{filter} |= $$bf2{filter};
#$$bf{filter} = $$bf{filter} | $$bf2{filter}; #or-ing
}
else {
my $cb=$$bf{counting_bits};
for(0..$$bf{filterlength}-1){
my $sum=
vec($$bf{filter}, $_,$cb)+
vec($$bf2{filter},$_,$cb);
if( $sum>2**$cb-1 ){
$sum=2**$cb-1;
$$bf{overflow}{$_}++;
}
vec($$bf{filter}, $_,$cb)=$sum;
no warnings;
$$bf{overflow}{$_}+=$$bf2{overflow}{$_}
and keys(%{$$bf{overflow}})>10 #hmm, arbitrary limit
and croak "Too many overflows, concider doubling counting_bits from $cb to ".(2*$cb)
if exists $$bf2{overflow}{$_};
}
}
return $bf; #for convenience
}
sub bfsum {
my($bf)=@_;
return unpack( "%32b*", $$bf{filter}) if $$bf{counting_bits}==1;
my($sum,$cb)=(0,$$bf{counting_bits});
$sum+=vec($$bf{filter},$_,$cb) for 0..$$bf{filterlength}-1;
return $sum;
}
sub bfadd {
require Digest::MD5;
my($bf,@keys)=@_;
return if !@keys;
my $keysref=@keys==1 && ref($keys[0]) eq 'ARRAY' ? $keys[0] : \@keys;
my($m,$k,$up,$n,$cb,$adaptive)=@$bf{'filterlength','hashfuncs','unpack','capacity','counting_bits','adaptive'};
for(@$keysref){
#croak "Key should be scalar" if ref($_);
$$bf{key_count} >= $n and croak "Exceeded filter capacity $n" or $$bf{key_count}++;
my @h; push @h, unpack $up, Digest::MD5::md5($_,0+@h) while @h<$k;
if ($cb==1 and !$adaptive) { # normal bloom filter
vec($$bf{filter}, $h[$_] % $m, 1) = 1 for 0..$k-1;
}
elsif ($cb>1) { # counting bloom filter
for(0..$k-1){
my $pos=$h[$_] % $m;
my $c=
vec($$bf{filter}, $pos, $cb) =
vec($$bf{filter}, $pos, $cb) + 1;
if($c==0){
vec($$bf{filter}, $pos, $cb) = -1;
$$bf{overflow}{$pos}++
and keys(%{$$bf{overflow}})>10 #hmm, arbitrary limit
and croak "Too many overflows, concider doubling counting_bits from $cb to ".(2*$cb);
}
}
}
elsif ($adaptive) { # adaptive bloom filter
my($i,$key,$bit)=(0+@h,$_);
for(0..$$bf{filterlength}-1){
$i+=push(@h, unpack $up, Digest::MD5::md5($key,$i)) if !@h;
my $pos=shift(@h) % $m;
$bit=vec($$bf{filter}, $pos, 1);
vec($$bf{filter}, $pos, 1)=1;
last if $_>=$k-1 and $bit==0;
}
}
else {croak}
}
return 1;
}
sub bfcheck {
require Digest::MD5;
my($bf,@keys)=@_;
return if !@keys;
my $keysref=@keys==1 && ref($keys[0]) eq 'ARRAY' ? $keys[0] : \@keys;
my($m,$k,$up,$cb,$adaptive)=@$bf{'filterlength','hashfuncs','unpack','counting_bits','adaptive'};
my $wa=wantarray();
if(!$adaptive){ # normal bloom filter or counting bloom filter
return map {
my $match = 1; # match if every bit is on
my @h; push @h, unpack $up, Digest::MD5::md5($_,0+@h) while @h<$k;
vec($$bf{filter}, $h[$_] % $m, $cb) or $match=0 or last for 0..$k-1;
return $match if !$wa;
$match;
} @$keysref;
}
else { # adaptive bloom filter
return map {
my($match,$i,$key,$bit,@h)=(1,0,$_);
for(0..$$bf{filterlength}-1){
$i+=push(@h, unpack $up, Digest::MD5::md5($key,$i)) if !@h;
my $pos=shift(@h) % $m;
$bit=vec($$bf{filter}, $pos, 1);
$match++ if $_ > $k-1 and $bit==1;
$match=0 if $_ <= $k-1 and $bit==0;
last if $bit==0;
}
return $match if !$wa;
$match;
} @$keysref;
}
}
sub bfgrep { # just a copy of bfcheck with map replaced by grep
require Digest::MD5;
my($bf,@keys)=@_;
return if !@keys;
my $keysref=@keys==1 && ref($keys[0]) eq 'ARRAY' ? $keys[0] : \@keys;
my($m,$k,$up,$cb)=@$bf{'filterlength','hashfuncs','unpack','counting_bits'};
return grep {
my $match = 1; # match if every bit is on
my @h; push @h, unpack $up, Digest::MD5::md5($_,0+@h) while @h<$k;
vec($$bf{filter}, $h[$_] % $m, $cb) or $match=0 or last for 0..$k-1;
$match;
#http://www.sitepoint.com/web-foundations/mime-types-complete-list/
croak "todo: ext2mime not yet implemented";
#return "application/json";#feks
}
sub base64 ($;$) { #
if ($] >= 5.006) {
require bytes;
croak "base64 failed: only defined for bytes"
if bytes::length($_[0]) > length($_[0])
or $] >= 5.008 && $_[0] =~ /[^\0-\xFF]/
}
my $eol=defined$_[1]?$_[1]:"\n";
my $res=pack("u",$_[0]);
$res=~s/^.//mg;
$res=~s/\n//g;
$res=~tr|` -_|AA-Za-z0-9+/|;
my $pad=(3-length($_[0])%3)%3;
$res=~s/.{$pad}$/'=' x $pad/e if $pad;
$res=~s/(.{1,76})/$1$eol/g if length($eol); #todo !=76
$res;
}
our $Fix_unbase64=0;
sub unbase64 ($) {
my $s=shift;
$s=~tr,0-9a-zA-Z+=/,,cd;
if($Fix_unbase64){ $s.='=' while length($s)%4 }
croak "unbase64 failed: length ".length($s)." not multiple of 4" if length($s)%4;
$s=~s/=+$//;
$s=~tr|A-Za-z0-9+/| -_|;
length($s) ? unpack("u",join'',map(chr(32+length($_)*3/4).$_,$s=~/(.{1,60})/gs)) : "";
}
=head1 COMMANDS
=head2 install_acme_command_tools
sudo perl -MAcme::Tools -e install_acme_command_tools
Wrote executable /usr/local/bin/conv
Wrote executable /usr/local/bin/due
Wrote executable /usr/local/bin/xcat
Wrote executable /usr/local/bin/freq
Wrote executable /usr/local/bin/deldup
Wrote executable /usr/local/bin/ccmd
Wrote executable /usr/local/bin/z2z
Wrote executable /usr/local/bin/2gz
Wrote executable /usr/local/bin/2gzip
Wrote executable /usr/local/bin/2bz2
Wrote executable /usr/local/bin/2bzip2
Wrote executable /usr/local/bin/2xz
Wrote executable /usr/local/bin/resubst
Examples of commands then made available:
conv 1 USD EUR #might show 0.88029 if thats the current currency rate. Uses conv()
conv .5 in cm #reveals that 1/2 inch is 1.27 cm, see doc on conv() for all supported units
due [-h] /path/1/ /path/2/ #like du, but show statistics on file extentions instead of subdirs
xcat file #like cat, zcat, bzcat or xzcat in one. Uses file extention to decide. Uses openstr()
freq file #reads file(s) or stdin and view counts of each byte 0-255
ccmd grep string /huge/file #caches stdout+stderr for 15 minutes (default) for much faster results later
ccmd "sleep 2;echo hello" #slow first time. Note the quotes!
ccmd "du -s ~/*|sort -n|tail" #ccmd store stdout+stderr in /tmp files (default)
z2z [-pvk1-9oe -t type] files #convert from/to .gz/bz2/xz files, -p progress, -v verbose (output result),
#-k keep org file, -o overwrite, 1-9 compression degree, -e for xz does "extreme"
#compressions, very slow. For some data types this reduces size significantly
#2xz and 2bz2 depends on xz and bzip2 being installed on system
2xz #same as z2z with -t xz
2bz2 #same as z2z with -t bz2
2gz #same as z2z with -t gz
rttop
trunc file(s)
wipe file(s)
=head3 z2z
=head3 2xz
=head3 2bz2
=head3 2gz
The commands C<2xz>, C<2bz2> and C<2gz> are just synonyms for C<z2z> with an implicitly added option C<-t xz>, C<-t xz> or C<-t gz> accordingly.
z2z [-p -k -v -o -1 -2 -3 -4 -5 -6 -7 -8 -9 ] files
Converts (recompresses) files from one compression type to another. For instance from .gz to .bz2
Keeps uid, gid, mode (chmod) and mtime.
-p Show a progress meter using the pv program if installed
-k Keeps original file
-v Verbose, shows info on degree of compression and file
number if more than one file is being converted
-o Overwrites existing result file, otherwise stop with error msg
-1 .. -9 Degree of compression, -1 fastest .. -9 best
-e With -t xz (or 2xz) passes -e to xz (-9e = extreme compression)
-L rate With -p. Slow down, ex: -L 200K means 200 kilobytes per second
-D sec With -p. Only turn on progress meter (pv) after x seconds
-i sec With -p. Info update rate
-l With -p. Line mode
-I With -p. Show ETA as time of arrival as well as time left
-q With -p. Quiet. Useful with -L to limit rate, but no output
The options -L -D -i -l -I -q implicitly turns on -p. Those options are passed
through to pv. See: man pv.
=head3 due
Like C<du> command but views space used by file extentions instead of dirs. Options:
due [-options] [dirs] [files]
due -h View bytes "human readable", i.e. C<8.72 MB> instead of C<9145662 b> (bytes)
due -k | -m View bytes in kilobytes | megabytes (1024 | 1048576)
due -K Like -k but uses 1000 instead of 1024
due -z View two extentions if .z .Z .gz .bz2 .rz or .xz (.tar.gz, not just .gz)
due -M Also show min, medium and max date (mtime) of files, give an idea of their age
due -C Like -M, but create time instead (ctime)
( run in 0.702 second using v1.01-cache-2.11-cpan-e1769b4cff6 )