Algorithm-LossyCount

 view release on metacpan or  search on metacpan

README  view on Meta::CPAN

VERSION
    version 0.03

SYNOPSIS
      use strict;
      use warnings;
      use Algorithm::LossyCount;
  
      my @samples = qw/a b a c d f a a d b b c a a .../;
  
      my $counter = Algorithm::LossyCount->new(max_error_ratio => 0.005);
      $counter->add_sample($_) for @samples;
  
      my $frequencies = $counter->frequencies;
      say $frequencies->{a};  # Approximate freq. of 'a'.
      say $frequencies->{b};  # Approximate freq. of 'b'.
      ...

DESCRIPTION
    Lossy-Counting is a approximate frequency counting algorithm proposed by
    Manku and Motwani in 2002 (refer "SEE ALSO" section below.)

    The main advantage of the algorithm is memory efficiency. You can get
    approximate count of appearance of items with very low memory footprint,
    compared with total inspection. Furthermore, Lossy-Counting is an online
    algorithm. It is applicable to data set such that the size is unknown,
    and you can take intermediate result anytime.

METHODS
  new(max_error_ratio => $num)
    Construcotr. "max_error_ratio" is the only mandatory parameter, that
    specifies acceptable error ratio. It is an error that give zero or a
    negative number as the value.

  add_sample($sample)
    Add given $sample to count.

  frequencies([support => $num])
    Returns current result as HashRef. Its keys and values are samples and
    corresponding counts respectively.

    If optional named parameter "support" is specified, returned HashRef
    will contain only samples having frequency greater than "($support -
    $max_error_ratio) * $num_samples".

  max_error_ratio
    Returns "max_error_ratio" you've given to the constructor.

  num_samples
    Returns the total number of samples you've added.

SEE ALSO
    Manku, Gurmeet Singh, and Rajeev Motwani. "Approximate frequency counts
    over data streams." Proceedings of the 28th international conference on
    Very Large Data Bases. VLDB Endowment, 2002.

AUTHOR

lib/Algorithm/LossyCount.pm  view on Meta::CPAN

use v5.10;
use Algorithm::LossyCount::Entry;
use Carp;
use POSIX qw//;

our $VERSION = 0.03;

sub new {
  my ($class, %params) = @_;

  my $max_error_ratio = delete $params{max_error_ratio}
    // Carp::croak('Missing mandatory parameter: "max_error_ratio"');
  if (%params) {
    Carp::croak(
      'Unknown parameter(s): ',
      join ', ', map { qq/"$_"/ } sort keys %params,
    )
  }

  Carp::croak('max_error_ratio must be positive.') if $max_error_ratio <= 0;

  my $self = bless +{
    bucket_size => POSIX::ceil(1 / $max_error_ratio),
    current_bucket => 1,
    entries => +{},
    max_error_ratio => $max_error_ratio,
    num_samples => 0,
    num_samples_in_current_bucket => 0,
  } => $class;

  return $self;
}

sub add_sample {
  my ($self, $sample) = @_;

  Carp::croak('add_sample() requires 1 parameter.') unless defined $sample;

  if (defined (my $entry = $self->entries->{$sample})) {
    $entry->increment_frequency;
    $entry->num_allowed_errors($self->current_bucket - 1);
  } else {
    $self->entries->{$sample} = Algorithm::LossyCount::Entry->new(
      num_allowed_errors => $self->current_bucket - 1,
    );
  }

  ++$self->{num_samples};
  ++$self->{num_samples_in_current_bucket};
  $self->clear_bucket if $self->bucket_is_full;
}

sub bucket_is_full {
  my ($self) = @_;

lib/Algorithm/LossyCount.pm  view on Meta::CPAN

  my ($self, %params) = @_;

  my $support = delete $params{support} // 0;
  if (%params) {
    Carp::croak(
      'Unknown parameter(s): ',
      join ', ', map { qq/"$_"/ } sort keys %params,
    )
  }

  my $threshold = ($support - $self->max_error_ratio) * $self->num_samples;
  my %frequencies = map {
    my $frequency = $self->entries->{$_}->frequency;
    $frequency < $threshold ? () : ($_ => $frequency);
  } keys %{ $self->entries };
  return \%frequencies;
}

sub max_error_ratio { $_[0]->{max_error_ratio} }

sub num_samples { $_[0]->{num_samples} }

sub num_samples_in_current_bucket { $_[0]->{num_samples_in_current_bucket} }

1;

__END__

=pod

lib/Algorithm/LossyCount.pm  view on Meta::CPAN

version 0.03

=head1 SYNOPSIS

  use strict;
  use warnings;
  use Algorithm::LossyCount;
  
  my @samples = qw/a b a c d f a a d b b c a a .../;
  
  my $counter = Algorithm::LossyCount->new(max_error_ratio => 0.005);
  $counter->add_sample($_) for @samples;
  
  my $frequencies = $counter->frequencies;
  say $frequencies->{a};  # Approximate freq. of 'a'.
  say $frequencies->{b};  # Approximate freq. of 'b'.
  ...

=head1 DESCRIPTION

Lossy-Counting is a approximate frequency counting algorithm proposed by Manku and Motwani in 2002 (refer L<SEE ALSO> section below.)

The main advantage of the algorithm is memory efficiency. You can get approximate count of appearance of items with very low memory footprint, compared with total inspection.
Furthermore, Lossy-Counting is an online algorithm. It is applicable to data set such that the size is unknown, and you can take intermediate result anytime.

=head1 METHODS

=head2 new(max_error_ratio => $num)

Construcotr. C<max_error_ratio> is the only mandatory parameter, that specifies acceptable error ratio. It is an error that give zero or a negative number as the value.

=head2 add_sample($sample)

Add given C<$sample> to count.

=head2 frequencies([support => $num])

Returns current result as HashRef. Its keys and values are samples and corresponding counts respectively.

If optional named parameter C<support> is specified, returned HashRef will contain only samples having frequency greater than C<($support - $max_error_ratio) * $num_samples>.

=head2 max_error_ratio

Returns C<max_error_ratio> you've given to the constructor.

=head2 num_samples

Returns the total number of samples you've added.

=head1 SEE ALSO

=over 4

=item Manku, Gurmeet Singh, and Rajeev Motwani. "Approximate frequency counts over data streams." Proceedings of the 28th international conference on Very Large Data Bases. VLDB Endowment, 2002.

lib/Algorithm/LossyCount/Entry.pm  view on Meta::CPAN

package Algorithm::LossyCount::Entry;

use v5.10;

sub new {
  my ($class, %params) = @_;

  my $num_allowed_errors = delete $params{num_allowed_errors}
    // Carp::croak('Missing mandatory parameter: "num_allowed_errors"');
  if (%params) {
    Carp::croak(
      'Unknown parameter(s): ',
      join ', ', map { qq/"$_"/ } sort keys %params,
    )
  }

  bless +{
    frequency => 1,
    num_allowed_errors => $num_allowed_errors,
  } => $class;
}

sub frequency { $_[0]->{frequency} }

sub increment_frequency { ++$_[0]->{frequency} }

sub num_allowed_errors {
  my ($self, $new_value) = @_;

  $self->{num_allowed_errors} = $new_value if defined $new_value;
  $self->{num_allowed_errors};
}

sub survive_in_bucket {
  my ($self, $current_bucket) = @_;

  unless (defined $current_bucket) {
    Carp::croak('survive_in_bucket() requires 1 parameter.');
  }

  $self->frequency + $self->num_allowed_errors > $current_bucket;
}

1;

__END__

=pod

=encoding UTF-8

t/Algorithm/LossyCount.t  view on Meta::CPAN


  my $partition_function = sum map { 1 / $_ } 1 .. $num_samples;
  return sub {
    my ($i) = @_;
    1 / ($i * $partition_function);
  };
}

throws_ok {
  Algorithm::LossyCount->new;
} qr/max_error_ratio/, 'max_error_ratio is a mandatory parameter.';

my $num_samples = 20000;
my $distribution = zipf_distribution($num_samples);
my %sample_frequencies;
for my $i (1 .. $num_samples) {
  my $probability = $distribution->($i);
  my $frequency = int ($probability * $num_samples);
  next if $frequency == 0;
  $sample_frequencies{$i} = $frequency;
}

subtest 'Basic' => sub {
  my $counter = new_ok 'Algorithm::LossyCount' => [ max_error_ratio => 0.005 ];

  my @samples =
    shuffle map { ($_) x $sample_frequencies{$_} } keys %sample_frequencies;
  $counter->add_sample($_) for @samples;

  my $frequencies = $counter->frequencies;
  my @frequent_samples = (
    sort { $frequencies->{$b} <=> $frequencies->{$a} } keys %$frequencies
  )[0 .. keys(%$frequencies) / 100];
  for my $sample (@frequent_samples) {
    my $errors = $sample_frequencies{$sample} - $frequencies->{$sample};
    my $error_ratio = $errors / $sample_frequencies{$sample};
    cmp_ok $error_ratio, '<=', $counter->max_error_ratio;
  }
};

done_testing;



( run in 0.746 second using v1.01-cache-2.11-cpan-65fba6d93b7 )