File-FormatIdentification-RandomSampling

 view release on metacpan or  search on metacpan

lib/File/FormatIdentification/RandomSampling.pm  view on Meta::CPAN

package File::FormatIdentification::RandomSampling;
# ABSTRACT: methods to identify content of device o media files using random sampling
our $VERSION = '0.006'; # VERSION:
# (c) 2020/2021 by Andreas Romeyke
# licensed via GPL v3.0 or later


use strict;
use warnings;
use feature qw(say);
use Moose;

has 'bytegram' => (
    is      => 'rw',
    isa     => 'ArrayRef',
    default => sub {[]},
);




sub init_bytegrams {
    my $self = shift;
    my $bytegram_ref = $self->{'bytegram'};
    $bytegram_ref->[0] = [(0) x 256]; # onegram
    $bytegram_ref->[1] = [(0) x 65536]; #bigram
    return 1;
}

sub BUILD {
    my $self = shift;
    $self->init_bytegrams();
    return 1;
}



sub update_bytegram {
    my $self = shift;
    my $buffer = shift;
    if (defined $buffer) {
        my $bytegram_ref = $self->{'bytegram'};
        my @bytes = unpack "C*", $buffer;
        my @words = unpack "S*", $buffer;
        #    my @bytes = map{ ord($_)} split //, $buffer;
        if (scalar @bytes > 0) {
            my @onegram = @{$bytegram_ref->[0]};
            my @bigram = @{$bytegram_ref->[1]};
            foreach my $byte (@bytes) {
                $onegram[$byte]++;
            }
            foreach my $word (@words) {
                $bigram[$word]++;
            }
            $bytegram_ref->[0] = \@onegram;
            $bytegram_ref->[1] = \@bigram;
        }
    }
    return 1;
}


sub calc_histogram { # use only the most significant first 8 entries
    my $self = shift;
    my $bytegram_ref = $self->{'bytegram'};
    my @bytes_sorted = sort {$bytegram_ref->[0]->[$b] <=> $bytegram_ref->[0]->[$a]} (0..255);
    my @words_sorted = sort {$bytegram_ref->[1]->[$b] <=> $bytegram_ref->[1]->[$a]} (0 .. 65535);
    # show only 8 most onegrame bytes
    my @bytes_truncated = @bytes_sorted[0..7];
    my @words_truncated = @words_sorted[0..7];
    my %histogram;
    foreach my $byte (@bytes_truncated) {
        push @{$histogram{onegram}}, $byte; #$bytegram_ref->[0]->[$byte];
    }
    foreach my $word (@words_truncated) {
        push @{$histogram{bigram}}, $word; #$bytegram_ref->[1]->[$word];
    }
    return \%histogram;
}


sub is_uniform {
    my $self = shift;
    #say "is_uniform?";
    my $bytegram_ref = $self->{'bytegram'};
    my $sum = 0;
    my $n = 0;
    my @unigram = @{$bytegram_ref->[0]};
    foreach my $byte (0 .. 255) {
        if ($unigram[$byte] > 0) {
            $n +=  $unigram[$byte];
            $sum += ($unigram[$byte] * $byte);
        }
    }
    if ($n == 0) { return;}
    my $expected = (256)/2;
    my $mean = ($sum/$n);
    #say "expected=$expected, sum=$sum, mean=$mean";



( run in 1.258 second using v1.01-cache-2.11-cpan-acebb50784d )