Algorithm-HyperLogLog

 view release on metacpan or  search on metacpan

lib/Algorithm/HyperLogLog/PP.pm  view on Meta::CPAN

package Algorithm::HyperLogLog::PP;
use strict;
use warnings;
use 5.008008;
use Carp ();
use Digest::MurmurHash3::PurePerl qw(murmur32);
use constant {
    HLL_HASH_SEED => 313,
    TWO_32        => 4294967296.0,
    NEG_TWO_32    => -4294967296.0,
};

our $VERSION = "0.24";

require Algorithm::HyperLogLog;

# Make this pure-Perl package the parent class of Algorithm::HyperLogLog, so
# that method calls on Algorithm::HyperLogLog objects resolve to the subs
# defined in this file. The bare block limits the scope of the temporary
# `package` switch; the assignment replaces whatever @ISA held before.
{

    package Algorithm::HyperLogLog;
    our @ISA = qw(Algorithm::HyperLogLog::PP);
}

# Construct a counter with 2**$k registers.
#
# Arguments:
#   $class - class name the new object is blessed into
#   $k     - exponent that fixes the register count (m = 2**$k); must be
#            defined and lie in the range [4,16]
#
# Returns a blessed hash reference with the keys:
#   k         - the exponent as given
#   m         - number of registers (1 << $k)
#   registers - array reference of m zeroes
#   alphaMM   - alpha * m * m, precomputed once for the estimator
#
# Croaks when $k is undefined or outside [4,16].
sub new {
    my ( $class, $k ) = @_;

    # Test definedness first so that a missing argument fails with the
    # documented error instead of first emitting an "uninitialized value"
    # warning from the numeric comparison. The message text is kept verbatim
    # because callers may match on it.
    if ( !defined $k || $k < 4 || $k > 16 ) {
        Carp::croak("Number of ragisters must be in the range [4,16]");
    }

    my $m = 1 << $k;

    # alpha has fixed values for the three smallest register counts and a
    # closed-form expression for every larger one.
    my %alpha_for = (
        16 => 0.673,
        32 => 0.697,
        64 => 0.709,
    );
    my $alpha =
      exists $alpha_for{$m}
      ? $alpha_for{$m}
      : 0.7213 / ( 1.0 + 1.079 / $m );

    my $self = {
        k         => $k,
        m         => $m,
        registers => [ (0) x $m ],
        alphaMM   => $alpha * $m * $m,
    };
    return bless $self, $class;
}

# Rebuild a counter from previously dumped register data.
#
# Arguments:
#   $class - class to construct through its new() method
#   $k     - exponent forwarded unchanged to new()
#   $data  - register values; stored as-is (the reference is neither copied
#            nor checked against the register count)
#
# Returns the object produced by new() with its registers replaced by $data.
sub _new_from_dump {
    my $class = shift;
    my ( $k, $data ) = @_;

    my $restored = $class->new($k);

    # Swap the freshly created register array for the supplied one.
    $restored->{registers} = $data;

    return $restored;
}

# Expose the raw register storage.
# Returns the array reference held by the object itself, not a copy.
sub _dump_register {
    my ($hll) = @_;
    return $hll->{registers};
}

# Report how many registers this counter has (the stored `m` value).
sub register_size {
    my ($hll) = @_;
    return $hll->{m};
}

# Feed one or more values into the counter.
#
# Each value is hashed with murmur32 using HLL_HASH_SEED. The hash shifted
# right by (32 - k) picks the register; the hash shifted left by k, together
# with the remaining width (32 - k), is handed to _rho to obtain a rank. A
# register is only ever raised, never lowered.
#
# NOTE(review): assumes murmur32 yields an unsigned 32-bit integer, and _rho
# is defined outside this block -- confirm both before relying on the above.
sub add {
    my ( $self, @values ) = @_;

    foreach my $value (@values) {
        my $digest = murmur32( $value, HLL_HASH_SEED );
        my $width  = $self->{k};
        my $remain = 32 - $width;

        my $slot = $digest >> $remain;
        my $rho  = _rho( $digest << $width, $remain );

        # Keep the current register unless the new rank beats it.
        next unless $rho > $self->{registers}[$slot];
        $self->{registers}[$slot] = $rho;
    }
}

sub estimate {
    my $self = shift;
    my $m    = $self->{m};

    my $rank = 0;
    my $sum  = 0.0;
    for my $i ( 0 .. ( $m - 1 ) ) {
        $rank = $self->{registers}[$i];
        $sum += 1.0 / ( 2.0**$rank );
    }

    my $estimate = $self->{alphaMM} * ( 1.0 / $sum );    # E in the original paper
    if ( $estimate <= 2.5 * $m ) {
        my $v = 0;
        for my $i ( 0 .. ( $m - 1 ) ) {
            if ( $self->{registers}[$i] == 0 ) {
                $v++;
            }
        }

        if ( $v != 0 ) {
            $estimate = $m * log( $m / $v );
        }
    }
    elsif ( $estimate > ( 1.0 / 30.0 ) * TWO_32 ) {
        $estimate = NEG_TWO_32 * log( 1.0 - ( $estimate / TWO_32 ) );
    }
    return $estimate;



( run in 0.513 second using v1.01-cache-2.11-cpan-39bf76dae61 )