App-BloomUtils

 view release on metacpan or  search on metacpan

lib/App/BloomUtils.pm  view on Meta::CPAN

    }
    return $res unless $res->[0] == 200;
    my $m = $args{num_bits} // $res->[2]{actual_m};
    my $k = $args{num_hashes} // $res->[2]{actual_k};
    log_info "Will be creating bloom filter with num_bits (m)=%d (actual %d), num_hashes (k)=%d, actual false-positive rate=%.5f%% (when num_items=%d), actual bloom filter size=%d bytes",
        $m, $res->[2]{actual_m}, $k, $res->[2]{actual_p}*100, $res->[2]{n}, $res->[2]{actual_bloom_size};

    my $bf = Algorithm::BloomFilter->new($m, $k);
    my $i = 0;
    while (defined(my $line = <STDIN>)) {
        chomp $line;
        $bf->add($line);
        $i++;
        if (defined $args{num_items} && $i == $args{num_items}+1) {
            log_warn "You created bloom filter for num_items=%d, but now have added more than that", $args{num_items};
        }
    }

    print $bf->serialize;

    [200];
}

# Rinci metadata for check_with_bloom_filter(). Per the description below, the
# bloom filter itself arrives on STDIN while the items to test are given as
# arguments.
$SPEC{check_with_bloom_filter} = {
    v           => 1.1,
    summary     => "Check with bloom filter",
    description => <<'END_DESCRIPTION',

You supply the bloom filter in STDIN, items to check as arguments, and this
utility will print lines containing 0 or 1 depending on whether items in the
arguments are tested to be, respectively, not in the set (0) or probably in the
set (1).

END_DESCRIPTION

    # The function prints its own 0/1 lines; presumably this key tells the
    # command-line framework not to format the returned envelope -- confirm
    # against the framework documentation.
    'cmdline.skip_format' => 1,

    links => [],

    args => {
        # Required, declared at position 0 and greedy.
        items => {
            summary => "Items to check",
            schema  => [ 'array*', 'of', 'str*' ],
            req     => 1,
            pos     => 0,
            greedy  => 1,
        },
    },
};
# Test items against a serialized bloom filter supplied on STDIN.
#
# Arguments (named, passed as a flat key/value list):
#   items - arrayref of strings to test against the filter.
#
# For every item, one line is printed to the currently selected output handle:
# "1" when the filter's test() is true for the item (probably in the set),
# "0" otherwise (not in the set).
#
# Returns an enveloped result:
#   [200]        on success
#   [400, $msg]  when STDIN contained no data at all
#   [500, $msg]  when reading from STDIN failed
sub check_with_bloom_filter {
    require Algorithm::BloomFilter;

    my %args = @_;

    # The serialized filter is binary data. Drop any newline-translation or
    # encoding layers from STDIN so the bytes reach deserialize() unchanged.
    binmode(STDIN);

    my $bf_str = "";
    while (1) {
        my $nread = read(STDIN, my $block, 8192);

        # read() returns undef on error and 0 at end of file. Keep the two
        # apart so a failed read is not mistaken for a complete filter.
        return [500, "Can't read bloom filter from STDIN: $!"]
            unless defined $nread;
        last unless $nread;

        $bf_str .= $block;
    }

    return [400, "No bloom filter data supplied in STDIN"]
        unless length $bf_str;

    my $bf = Algorithm::BloomFilter->deserialize($bf_str);

    for my $item (@{ $args{items} }) {
        say( $bf->test($item) ? 1 : 0 );
    }

    [200];
}

# Rinci metadata for bloom_filter_calculator().
$SPEC{bloom_filter_calculator} = {
    v => 1.1,
    summary => 'Help calculate num_bits (m) and num_hashes (k)',
    description => $desc1,
    args => {
        num_items => {
            summary => 'Expected number of items to add to bloom filter',
            schema => 'posint*',
            pos => 0,
            req => 1,
            cmdline_aliases => {n=>{}},
        },
        num_bits => {
            summary => 'Number of bits to set for the bloom filter',
            schema => 'posint*',
            cmdline_aliases => {m=>{}},
        },
        false_positive_rate => {
            summary => 'Desired false-positive rate, greater than 0 and at most 0.5',
            # xmin is an exclusive lower bound. The calculator evaluates
            # log(1/$fp_rate), which fails for a rate of 0 (division by zero)
            # or a negative rate (log of a negative number), so such values
            # must be rejected at validation time.
            schema => ['float*', xmin=>0, max=>0.5],
            default => 0.02,
            cmdline_aliases => {
                fp_rate => {},
                p => {},
            },
        },
        num_hashes => {
            schema => 'posint*',
            cmdline_aliases => {k=>{}},
        },
        num_hashes_to_bits_per_item_ratio => {
            summary => '0.7 (the default) is optimal',
            schema => 'num*',
        },
    },
    args_rels => {
        # num_hashes and num_hashes_to_bits_per_item_ratio are alternatives:
        # the caller picks one of them.
        'choose_one&' => [
            [qw/num_hashes num_hashes_to_bits_per_item_ratio/],
        ],
    },
};
sub bloom_filter_calculator {
    require Algorithm::BloomFilter;

    my %args = @_;

    my $num_hashes_to_bits_per_item_ratio = $args{num_hashes_to_bits_per_item_ratio};
    $num_hashes_to_bits_per_item_ratio //= 0.7 unless defined($args{num_bits}) && defined($args{num_items});

    my $num_items = $args{num_items};
    my $fp_rate   = $args{false_positive_rate} // 0.02;
    my $num_bits = $args{num_bits} // ($num_items * log(1/$fp_rate)/ log(2)**2);

    my $num_bits_per_item = $num_bits / $num_items;
    my $num_hashes = $args{num_hashes} //
        (defined $num_hashes_to_bits_per_item_ratio ? $num_hashes_to_bits_per_item_ratio*$num_bits_per_item : undef) //



( run in 0.980 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )