Catmandu-Stat
view release on metacpan or search on metacpan
lib/Catmandu/Exporter/Stat.pm view on Meta::CPAN
my $is_exact = $sample_count == $sample_cardinality ? 1 : 0;
my $cnt = 0;
my $has_unit_values = 0;
for my $k (keys %values) {
$cnt += $values{$k};
$has_unit_values = 1 if $values{$k} == 1;
}
my $missing_values = $sample_cardinality - $sample_count;
if ($missing_values > 0 && ! $has_unit_values) {
print STDERR "Statistics::TopK bin not big enough to estimate the entropy\n";
print STDERR "Increate --topk to a value > " . $self->topk . "\n";
return 'n/a';
}
$cnt += $missing_values;
return 'n/a' unless $cnt > 0;
my $h = 0;
for my $k (keys %values) {
my $p = $values{$k}/$cnt;
$h += $p * log($p)/log(2);
}
if ($has_unit_values) {
my $p = 1 / $cnt;
$h += $missing_values * $p * log($p)/log(2);
}
return sprintf "%.1f/%.1f" , -1 * $h ,log($cnt)/log(2);
}
# Write the collected statistics to the configured exporter.
#
# The comma separated list in $self->fields determines which fields are
# reported. The output starts with one summary row named '#' carrying the
# value of $self->counter, followed by one row per field. Rows are sent to
# the exporter created by Catmandu->exporter using $self->as as exporter
# name and $self->file as target.
#
# When the estimated number of unique values of a field is larger than the
# number of values counted for that field, the uniq~, uniq% and entropy
# columns of that row are suffixed with " (!)" and a single warning is
# printed on STDERR after the exporter has been committed.
sub commit {
    my ($self) = @_;

    my @keys   = split(/,/, $self->fields);
    my $fields = [qw(name count zeros zeros% min max mean variance stdev uniq~ uniq% entropy)];

    my $exporter = Catmandu->exporter(
        $self->as,
        fields => $fields,
        file   => $self->file
    );

    # Summary row
    $exporter->add({ name => '#', count => $self->counter });

    my $has_overflow = 0;

    for my $key (@keys) {
        # Look up the per-field statistics object and the uniqueness
        # estimate only once; both are reused several times below.
        my $stat = $self->get_stat($key);
        my $uniq = $self->get_key_uniq($key);

        my $count = $self->{res}->{$key}->{count};
        my $zeros = $self->{res}->{$key}->{zero} // 0;

        # Defined numeric copies for the comparisons and divisions below;
        # the raw $count is still what gets exported.
        my $values_count = $count // 0;
        my $occur_count  = $stat->count() // 0;

        # Without any occurrence count the zeros percentage is reported as 100
        my $zeros_pct = $occur_count > 0 ? 100 * $zeros / $occur_count : 100;

        # An estimate above the counted number of values (more than 100%)
        # cannot be right and is flagged as overflow.
        my $uniq_pct = $values_count > 0 ? 100 * $uniq / $values_count : 0.0;
        my $overflow = $uniq_pct > 100 ? 1 : 0;
        my $mark     = $overflow ? " (!)" : "";

        my $stats = {
            name     => $key,
            count    => $count,
            min      => $stat->min(),
            max      => $stat->max(),
            mean     => $stat->mean(),
            variance => sprintf("%.1f", $stat->variance()),
            stdev    => sprintf("%.1f", $stat->standard_deviation()),
            zeros    => $zeros,
            'zeros%' => sprintf("%.1f", $zeros_pct),
            'uniq~'  => floor($uniq) . $mark,
            'uniq%'  => sprintf("%.1f", $uniq_pct) . $mark,
            entropy  => $self->entropy($key) . $mark,
        };

        $exporter->add($stats);

        $has_overflow = 1 if $overflow;
    }

    $exporter->commit;

    if ($has_overflow) {
        print STDERR "Overflow warning - probably your dataset is too small for an accurate uniq~, uniq% and entropy count...\n";
    }
}
1;
=head1 NAME
Catmandu::Exporter::Stat - a statistical export
=head1 SYNOPSIS
# Calculate statistics on the availability of the ISBN fields in the dataset
cat data.json | catmandu convert -v JSON to Stat --fields isbn
# Export the statistics as YAML
cat data.json | catmandu convert -v JSON to Stat --fields isbn --as YAML
=head1 DESCRIPTION
The L<Catmandu::Stat> package can be used to calculate statistics on the availability of
fields in a data file. Use this exporter to count the availability of fields or count
the number of duplicate values. For each field the exporter calculates the following
statistics:
* name : the name of a field
* count : the number of occurrences of a field in all records
* zeros : the number of records without a field
* zeros% : the percentage of records without a field
* min : the minimum number of occurrences of a field in any record
* max : the maximum number of occurrences of a field in any record
* mean : the mean number of occurrences of a field in all records
* variance : the variance of the field number
* stdev : the standard deviation of the field number
* uniq~ : the estimated number of unique records
( run in 1.621 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )