Catmandu-Stat

 view release on metacpan or  search on metacpan

lib/Catmandu/Exporter/Stat.pm  view on Meta::CPAN


    my $is_exact = $sample_count == $sample_cardinality ? 1 : 0;

    my $cnt = 0;
    my $has_unit_values = 0;

    for my $k (keys %values) {
        $cnt += $values{$k};
        $has_unit_values = 1 if $values{$k} == 1;
    }

    my $missing_values = $sample_cardinality - $sample_count;

    if ($missing_values > 0 && ! $has_unit_values) {
        print STDERR "Statistics::TopK bin not big enough to estimate the entropy\n";
        print STDERR "Increate --topk to a value > " . $self->topk . "\n";
        return 'n/a';
    }

    $cnt += $missing_values;

    return 'n/a' unless $cnt > 0;

    my $h = 0;
    for my $k (keys %values) {
        my $p = $values{$k}/$cnt;
        $h += $p * log($p)/log(2);
    }

    if ($has_unit_values) {
        my $p = 1 / $cnt;
        $h += $missing_values * $p * log($p)/log(2);
    }

    return sprintf "%.1f/%.1f" , -1 * $h ,log($cnt)/log(2);
}

# commit()
#
# Emit the collected statistics through a Catmandu exporter and finish it.
#
# The exporter is created with Catmandu->exporter, using $self->as as the
# exporter name and $self->file as the output target. The first row written
# is a summary row named '#' that carries $self->counter as its count. One
# row is then written for every field name listed in the comma separated
# $self->fields attribute.
#
# Per field the row contains:
#   name, count     - the field name and $self->{res}{$key}{count}
#   min, max, mean,
#   variance, stdev - taken from the object returned by get_stat($key)
#                     (presumably a Statistics::Descriptive object - confirm)
#   zeros, zeros%   - $self->{res}{$key}{zero} (0 when unset) and its
#                     percentage of get_stat($key)->count(); 100 when that
#                     count is not positive
#   uniq~, uniq%    - floor of get_key_uniq($key) and its percentage of count
#   entropy         - the value returned by $self->entropy($key)
#
# When the uniq percentage exceeds 100 (the uniqueness estimate is larger
# than the number of counted values) the uniq~, uniq% and entropy cells are
# suffixed with " (!)" and a single warning is printed to STDERR after the
# exporter has been committed.
#
# Returns nothing meaningful.
sub commit {
    my ($self) = @_;

    my @keys = split(/,/, $self->fields);

    my $fields = [qw(name count zeros zeros% min max mean variance stdev uniq~ uniq% entropy)];

    my $exporter = Catmandu->exporter(
                        $self->as,
                        fields => $fields,
                        file => $self->file
                   );

    # Summary row: '#' carries the overall counter.
    $exporter->add(
        { name => '#' , count => $self->counter }
    );

    my $has_overflow = 0;

    for my $key (@keys) {
        # Look up the per-key helpers once; they do not change while the
        # row is being assembled.
        my $stat = $self->get_stat($key);

        # The raw count is exported as-is; the defaulted copy is only used
        # for arithmetic so an unset count cannot trigger "uninitialized
        # value" warnings (the zero counter is defaulted the same way).
        my $raw_count    = $self->{res}->{$key}->{count};
        my $values_count = $raw_count // 0;
        my $zeros        = $self->{res}->{$key}->{zero} // 0;
        my $occur_count  = $stat->count();

        my $stats = {};
        $stats->{name}     = $key;
        $stats->{count}    = $raw_count;
        $stats->{min}      = $stat->min();
        $stats->{max}      = $stat->max();
        $stats->{mean}     = $stat->mean();
        $stats->{variance} = sprintf "%.1f" , $stat->variance();
        $stats->{stdev}    = sprintf "%.1f" , $stat->standard_deviation();

        # Percentage of zero observations; 100 when nothing was observed.
        my $zerosp = sprintf "%.1f" ,
            $occur_count > 0 ? 100 * $zeros / $occur_count : 100;

        # Uniqueness estimate, computed once and reused for the formatted
        # percentage, the overflow test and the uniq~ column.
        my $uniq     = $self->get_key_uniq($key);
        my $uniq_pct = $values_count > 0 ? 100 * $uniq / $values_count : 0.0;
        my $overflow = $uniq_pct > 100 ? 1 : 0;

        $stats->{zeros}    = $zeros;
        $stats->{'zeros%'} = $zerosp;
        $stats->{'uniq~'}  = floor($uniq);
        $stats->{'uniq%'}  = sprintf "%.1f" , $uniq_pct;
        $stats->{entropy}  = $self->entropy($key);

        # Flag every estimate-based cell when the estimate is implausible.
        if ($overflow) {
            $stats->{$_} .= " (!)" for ('uniq%', 'uniq~', 'entropy');
            $has_overflow = 1;
        }

        $exporter->add($stats);
    }

    $exporter->commit;

    if ($has_overflow) {
        print STDERR "Overflow warning - probably your dataset is too small for an accurate uniq~, uniq% and entropy count...\n";
    }
}

1;

=head1 NAME

Catmandu::Exporter::Stat - a statistical export

=head1 SYNOPSIS

    # Calculate statistics on the availability of the ISBN fields in the dataset
    cat data.json | catmandu convert -v JSON to Stat --fields isbn

    # Export the statistics as YAML
    cat data.json | catmandu convert -v JSON to Stat --fields isbn --as YAML

=head1 DESCRIPTION

The L<Catmandu::Stat> package can be used to calculate statistics on the availability of
fields in a data file. Use this exporter to count the availability of fields or count
the number of duplicate values. For each field the exporter calculates the following
statistics:

  * name    : the name of a field
  * count   : the number of occurrences of a field in all records
  * zeros   : the number of records without a field
  * zeros%  : the percentage of records without a field
  * min     : the minimum number of occurrences of a field in any record
  * max     : the maximum number of occurrences of a field in any record
  * mean    : the mean number of occurrences of a field in all records
  * variance : the variance of the number of times a field occurs per record
  * stdev   : the standard deviation of the number of times a field occurs per record
  * uniq~   : the estimated number of unique records



( run in 1.621 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )