EBook-Ishmael

 view release on metacpan or  search on metacpan

contrib/512-bigram.pl  view on Meta::CPAN

        return 1.0;
    }

    my $total = sum values %$bigram;
    my @pairs = sort { $b->[1] <=> $a->[1] }
                map { [ $_, $bigram->{ $_ } ] }
                keys %$bigram;

    if ($top > @pairs) {
        return 1.0;
    }

    my $top_sum = 0;
    for my $p (@pairs[0 .. $top-1]) {
        $top_sum += $p->[1];
    }

    return $top_sum / $total;

}

sub print_histogram {

    my ($bigram, $num) = @_;

    my @pairs = sort { $b->[1] <=> $a->[1] }
                map { [ $_, $bigram->{ $_ } ] }
                keys %$bigram;

    if (@pairs > $num) {
        @pairs = @pairs[0 .. $num-1];
    }

    for my $p (@pairs) {
        say "$p->[0] : $p->[1]";
    }

}

sub print_perl {

    my ($bigram, $name, $encoding, $num) = @_;
    if ($name !~ /^\w+$/) {
        die "'$name' is not a valid name";
    }
    if (not exists $recognized_encodings{ $encoding }) {
        die "'$encoding' is not a valid encoding";
    }

    my @pairs = sort { $b->[1] <=> $a->[1] }
                map { [ $_, $bigram->{ $_ } ] }
                keys %$bigram;

    if (@pairs > $num) {
        @pairs = @pairs[0 .. $num-1];
    }

    say "my %$name = map { \$_ => 1 } (";
    my $l = '';
    for my $p (@pairs) {
        my $decoded = encode($encoding, $p->[0], Encode::FB_CROAK);
        my $b1 = ord(substr $decoded, 0, 1);
        my $b2 = ord(substr $decoded, 1, 1);
        my $s = sprintf q{"\x%02x\x%02x",}, $b1, $b2;
        if (length($l) + length($s) > 80) {
            say $l;
            $l = '';
        }
        $l .= $s;
    }
    if ($l ne '') {
        say $l;
    }
    say ");";

}

my $num = 512;
my $output_perl = 0;
my $get_percentage = 0;

GetOptions(
    'R'   => \$get_percentage,
    'p'   => \$output_perl,
    'n=i' => \$num,
    'h'   => sub { print $USAGE; exit 0 },
) or die $USAGE;

my $encoding = shift @ARGV;
if (not defined $encoding) {
    die $USAGE;
}
$encoding = lc $encoding;
if (not exists $recognized_encodings{ $encoding }) {
    die "Invalid encoding\n";
}

my @files = @ARGV;
if (!@files) {
    @files = ('-');
}

my %bigram;
for my $f (@files) {
    my $h = build_bigram($f, $encoding);
    for my $k (keys %$h) {
        $bigram{ $k } += $h->{ $k };
    }
}

if ($get_percentage) {
    my $p = bigram_percentage(\%bigram, $num);
    $p = round($p * 100);
    say $p;
} elsif ($output_perl) {
    print_perl(\%bigram, $encoding_bigram_names{ $encoding },
               $encoding, $num);
} else {
    print_histogram(\%bigram, $num);
}



( run in 1.194 second using v1.01-cache-2.11-cpan-13bb782fe5a )