EBook-Ishmael
view release on metacpan or search on metacpan
contrib/512-bigram.pl view on Meta::CPAN
return 1.0;
}
my $total = sum values %$bigram;
my @pairs = sort { $b->[1] <=> $a->[1] }
map { [ $_, $bigram->{ $_ } ] }
keys %$bigram;
if ($top > @pairs) {
return 1.0;
}
my $top_sum = 0;
for my $p (@pairs[0 .. $top-1]) {
$top_sum += $p->[1];
}
return $top_sum / $total;
}
sub print_histogram {
my ($bigram, $num) = @_;
my @pairs = sort { $b->[1] <=> $a->[1] }
map { [ $_, $bigram->{ $_ } ] }
keys %$bigram;
if (@pairs > $num) {
@pairs = @pairs[0 .. $num-1];
}
for my $p (@pairs) {
say "$p->[0] : $p->[1]";
}
}
sub print_perl {
my ($bigram, $name, $encoding, $num) = @_;
if ($name !~ /^\w+$/) {
die "'$name' is not a valid name";
}
if (not exists $recognized_encodings{ $encoding }) {
die "'$encoding' is not a valid encoding";
}
my @pairs = sort { $b->[1] <=> $a->[1] }
map { [ $_, $bigram->{ $_ } ] }
keys %$bigram;
if (@pairs > $num) {
@pairs = @pairs[0 .. $num-1];
}
say "my %$name = map { \$_ => 1 } (";
my $l = '';
for my $p (@pairs) {
my $decoded = encode($encoding, $p->[0], Encode::FB_CROAK);
my $b1 = ord(substr $decoded, 0, 1);
my $b2 = ord(substr $decoded, 1, 1);
my $s = sprintf q{"\x%02x\x%02x",}, $b1, $b2;
if (length($l) + length($s) > 80) {
say $l;
$l = '';
}
$l .= $s;
}
if ($l ne '') {
say $l;
}
say ");";
}
my $num = 512;
my $output_perl = 0;
my $get_percentage = 0;
GetOptions(
'R' => \$get_percentage,
'p' => \$output_perl,
'n=i' => \$num,
'h' => sub { print $USAGE; exit 0 },
) or die $USAGE;
my $encoding = shift @ARGV;
if (not defined $encoding) {
die $USAGE;
}
$encoding = lc $encoding;
if (not exists $recognized_encodings{ $encoding }) {
die "Invalid encoding\n";
}
my @files = @ARGV;
if (!@files) {
@files = ('-');
}
my %bigram;
for my $f (@files) {
my $h = build_bigram($f, $encoding);
for my $k (keys %$h) {
$bigram{ $k } += $h->{ $k };
}
}
if ($get_percentage) {
my $p = bigram_percentage(\%bigram, $num);
$p = round($p * 100);
say $p;
} elsif ($output_perl) {
print_perl(\%bigram, $encoding_bigram_names{ $encoding },
$encoding, $num);
} else {
print_histogram(\%bigram, $num);
}
( run in 1.194 second using v1.01-cache-2.11-cpan-13bb782fe5a )