File-FormatIdentification-RandomSampling
view release on metacpan or search on metacpan
lib/File/FormatIdentification/RandomSampling.pm view on Meta::CPAN
package File::FormatIdentification::RandomSampling;
# ABSTRACT: methods to identify content of device o media files using random sampling
our $VERSION = '0.006'; # VERSION:
# (c) 2020/2021 by Andreas Romeyke
# licensed via GPL v3.0 or later
use strict;
use warnings;
use feature qw(say);
use Moose;
has 'bytegram' => (
is => 'rw',
isa => 'ArrayRef',
default => sub {[]},
);
sub init_bytegrams {
my $self = shift;
my $bytegram_ref = $self->{'bytegram'};
$bytegram_ref->[0] = [(0) x 256]; # onegram
$bytegram_ref->[1] = [(0) x 65536]; #bigram
return 1;
}
sub BUILD {
my $self = shift;
$self->init_bytegrams();
return 1;
}
sub update_bytegram {
my $self = shift;
my $buffer = shift;
if (defined $buffer) {
my $bytegram_ref = $self->{'bytegram'};
my @bytes = unpack "C*", $buffer;
my @words = unpack "S*", $buffer;
# my @bytes = map{ ord($_)} split //, $buffer;
if (scalar @bytes > 0) {
my @onegram = @{$bytegram_ref->[0]};
my @bigram = @{$bytegram_ref->[1]};
foreach my $byte (@bytes) {
$onegram[$byte]++;
}
foreach my $word (@words) {
$bigram[$word]++;
}
$bytegram_ref->[0] = \@onegram;
$bytegram_ref->[1] = \@bigram;
}
}
return 1;
}
sub calc_histogram { # use only the most significant first 8 entries
my $self = shift;
my $bytegram_ref = $self->{'bytegram'};
my @bytes_sorted = sort {$bytegram_ref->[0]->[$b] <=> $bytegram_ref->[0]->[$a]} (0..255);
my @words_sorted = sort {$bytegram_ref->[1]->[$b] <=> $bytegram_ref->[1]->[$a]} (0 .. 65535);
# show only 8 most onegrame bytes
my @bytes_truncated = @bytes_sorted[0..7];
my @words_truncated = @words_sorted[0..7];
my %histogram;
foreach my $byte (@bytes_truncated) {
push @{$histogram{onegram}}, $byte; #$bytegram_ref->[0]->[$byte];
}
foreach my $word (@words_truncated) {
push @{$histogram{bigram}}, $word; #$bytegram_ref->[1]->[$word];
}
return \%histogram;
}
sub is_uniform {
my $self = shift;
#say "is_uniform?";
my $bytegram_ref = $self->{'bytegram'};
my $sum = 0;
my $n = 0;
my @unigram = @{$bytegram_ref->[0]};
foreach my $byte (0 .. 255) {
if ($unigram[$byte] > 0) {
$n += $unigram[$byte];
$sum += ($unigram[$byte] * $byte);
}
}
if ($n == 0) { return;}
my $expected = (256)/2;
my $mean = ($sum/$n);
#say "expected=$expected, sum=$sum, mean=$mean";
( run in 1.258 second using v1.01-cache-2.11-cpan-acebb50784d )