App-freqtable
view release on metacpan or search on metacpan
script/freqtable view on Meta::CPAN
#!perl
use strict;
use warnings;
use Getopt::Long qw(:config bundling no_ignore_case);
use POSIX qw(ceil);
our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
our $DATE = '2025-08-03'; # DATE
our $DIST = 'App-freqtable'; # DIST
our $VERSION = '0.010'; # VERSION
# Raw --rank argument as typed by the user; parse_cmdline() splits it into
# $Opts{min_rank} and $Opts{max_rank}.
my $tmp_rank;

# Effective settings. The values below are the defaults; parse_cmdline()
# overrides them from the command line.
my %Opts = (
    # What counts as one item: 'byte', 'char', 'word', 'line', 'number' or
    # 'integer'.
    mode                => 'line',
    ignore_case         => 0,

    # Layout of each output line.
    print_freq          => 1,
    print_total         => 0,
    percent             => 0,       # bumped once per --percent/-p
    format              => undef,   # printf template; when set it replaces the layout above

    # XXX options to limit memory usage, e.g. max keys, max line length, --md5 (like in nauniq), ...

    # Rank window (1-based position in the sorted table); undef = unbounded.
    min_rank            => undef,
    max_rank            => undef,

    # Sort::Sub routine and its arguments; when sort_sub is undef the table
    # is ordered by frequency instead.
    sort_sub            => undef,
    sort_args           => {},

    # Running-table output.
    output_every        => undef,
    clear_before_output => 0,
);

# When true, items with equal frequency are ordered numerically rather than
# as strings (see _display_table).
my $Numeric;

# item => number of times it has been seen.
my %Occurences;
# Parse the command line with Getopt::Long and store the result in %Opts.
#
# --help/-h prints the usage text and exits with status 0. The process exits
# with status 99 when GetOptions() reports a failure or when the --rank value
# is not one of the forms N, M-N, M- or -N.
sub parse_cmdline {
    # Handler factory: switch the counting mode to $name.
    my $mode = sub {
        my ($name) = @_;
        return sub { $Opts{mode} = $name };
    };

    # Handler factory: force option $key to the fixed value $value.
    my $set = sub {
        my ($key, $value) = @_;
        return sub { $Opts{$key} = $value };
    };

    # --help handler: show the usage summary and stop.
    my $usage = sub {
        print <<USAGE;
Usage:
freqtable [OPTIONS]... < INPUT
freqtable --help (or -h)
Options:
--bytes, -c
--chars, -m
--words, -w
--lines, -l
--number, -n
--integer, -i
--ignore-case, -f
--print-freq
--no-print-freq, -F
--print-total, -t
--no-print-total, -T
--percent, -p
--format=FMT
--rank N|M-N|M-|-N, -r
--sort-sub=SPEC
--sort-arg=ARG=VAL
-a
--output-every=i
--clear-before-output
For more details, see the manpage/documentation.
USAGE
        exit 0;
    };

    my $res = GetOptions(
        # what to count
        'bytes|c'             => $mode->('byte'),
        'chars|m'             => $mode->('char'),
        'words|w'             => $mode->('word'),
        'lines|l'             => $mode->('line'),
        'number|n'            => $mode->('number'),
        'integer|i'           => $mode->('integer'),
        'ignore-case|f'       => \$Opts{ignore_case},

        # output line layout
        'no-print-freq|F'     => $set->('print_freq', 0),
        'print-freq'          => $set->('print_freq', 1),
        'no-print-total|T'    => $set->('print_total', 0),
        'print-total|t'       => $set->('print_total', 1),
        'percent|p'           => sub { $Opts{percent}++ },
        'format=s'            => \$Opts{format},

        # filtering and sorting
        'rank|r=s'            => \$tmp_rank,
        'sort-sub=s'          => \$Opts{sort_sub},
        'sort-arg=s%'         => $Opts{sort_args},
        'a'                   => $set->('sort_sub', 'asciibetically'),

        # running table
        'clear-before-output' => $set->('clear_before_output', 1),
        'output-every=i'      => \$Opts{output_every},

        'help|h'              => $usage,
    );

    # Translate the textual rank spec into min_rank/max_rank bounds.
    if (defined $tmp_rank) {
        for ($tmp_rank) {
            if (/\A\d+\z/) {
                # N: exactly one rank
                @Opts{qw(min_rank max_rank)} = ($_, $_);
            }
            elsif (/\A-(\d+)\z/) {
                # -N: upper bound only
                $Opts{max_rank} = $1;
            }
            elsif (/\A(\d+)-\z/) {
                # M-: lower bound only
                $Opts{min_rank} = $1;
            }
            elsif (/\A(\d+)-(\d+)\z/) {
                # M-N: both bounds
                @Opts{qw(min_rank max_rank)} = ($1, $2);
            }
            else {
                warn "freqtable: Invalid value for --rank: '$tmp_rank', ".
                    "please specify N|M-N|M-|-N\n";
                $res = 0;
            }
        }
    }

    exit 99 if !$res;
}
# Print one row of the frequency table to the currently selected output handle.
#
# Arguments:
#   $count - number of occurrences of the item
#   $item  - the item itself (or a label such as "TOTAL")
#   $total - sum of all occurrences, used for the percentage
#
# When $Opts{format} is set it is used as a printf template and receives
# ($count, $item, percentage), followed by a newline. Otherwise the row is the
# optional count and/or percentage columns followed by the item.
sub _print_freqline {
    my ($count, $item, $total) = @_;

    # Percentage of the total, computed only when a branch needs it; an
    # empty table yields 0 instead of dividing by zero.
    my $share = sub { $total == 0 ? 0 : $count / $total * 100 };

    if (defined $Opts{format}) {
        my $pct = $share->();
        {
            no warnings; # XXX only disable warning 'redundant argument in printf'
            printf $Opts{format}, $count, $item, $pct;
            print "\n";
        }
    } else {
        if ($Opts{print_freq}) {
            my $level = $Opts{percent};
            # The count column is shown without -p, and again from -pp on.
            printf "%9d ", $count if !$level || $level >= 2;
            printf "%6.2f%% ", $share->() if $level;
        }
        print $item, "\n";
    }
}
# Print the current frequency table built from %Occurences.
#
# Rows are ordered by the Sort::Sub routine named in $Opts{sort_sub} when one
# is given; otherwise by descending frequency, with ties broken by key
# (numerically when $Numeric is true, as strings otherwise). Only rows whose
# 1-based position lies within [min_rank, max_rank] are printed; an undefined
# bound is open-ended. A "TOTAL" row is appended when $Opts{print_total} is
# set, and the screen-clearing escape is emitted first when
# $Opts{clear_before_output} is set.
sub _display_table {
    my @keys = keys %Occurences;

    my $totoccurrences = 0;
    $totoccurrences += $Occurences{$_} for @keys;

    if (defined $Opts{sort_sub}) {
        require Sort::Sub;
        my $sorter = Sort::Sub::get_sorter($Opts{sort_sub}, $Opts{sort_args});
        @keys = sort $sorter @keys;
    } else {
        @keys = sort {
            $Occurences{$b} <=> $Occurences{$a} ||
                ($Numeric ? $a <=> $b : $a cmp $b)
        } @keys;
    }

    print "\033[2J" if $Opts{clear_before_output};

    my ($min_rank, $max_rank) = @Opts{qw(min_rank max_rank)};
    my $rank = 0;
    for my $k (@keys) {
        $rank++;
        next if defined $min_rank && $rank < $min_rank;
        # Ranks only increase, so nothing after max_rank can qualify: stop
        # here instead of walking the remainder of the key list.
        last if defined $max_rank && $rank > $max_rank;
        _print_freqline($Occurences{$k}, $k, $totoccurrences);
    }

    _print_freqline($totoccurrences, "TOTAL", $totoccurrences) if $Opts{print_total};
}
sub run {
$|++ if $Opts{output_every};
my $i = 0;
if ($Opts{mode} eq 'byte' || $Opts{mode} eq 'char') {
@ARGV = (\*STDIN) unless @ARGV;
for my $fn (@ARGV) {
my $fh;
if (ref $fn) {
$fh = $fn;
} else {
open $fh, "<", $fn or do {
warn "freqtable: Can't open '$fn': $!\n";
next;
};
}
if ($Opts{mode} eq 'byte') {
binmode $fh;
} else {
binmode $fh, ":encoding(utf8)";
}
script/freqtable view on Meta::CPAN
Display frequency table (words):
% freqtable -w input-words.txt
3 five
2 eight
2 one
1 four
1 nine
1 seven
1 six
1 three
1 two
Display frequency table (characters):
% freqtable -m input-words.txt
12
12 e
7 i
5 n
4 f
4 o
4 t
4 v
3 h
2 g
2 r
2 s
1
1 u
1 w
1 x
Display frequency table (nums):
% freqtable -n input-nums.txt
2 9.99
1 9
Display frequency table (integers):
% freqtable -i input-nums.txt
3 9
=head2 Formatting the output line: omitting the frequency (-F option)
Don't display the frequencies:
% freqtable -F input-lines.txt
five
eight
one
four
nine
seven
six
three
two
=head2 Formatting the output line: showing the percentages (`--percent`, `-p` option)
The default is to show frequencies as numbers:
% freqtable input-lines.txt
3 five
...
You can display frequencies as percent instead:
% freqtable -p input-lines.txt
23.08% five
...
Specify another `-p` if you want to display frequencies as integers as well as
percent:
% freqtable -pp input-lines.txt
3 23.08% five
...
=head2 Formatting the output line: custom formatting (`--format` option)
% freqtable --format '%04d: %s' input-lines.txt
0003: five
=head2 Filter by rank
Only display the top 3 ranks:
% freqtable input-lines.txt -r -3
% freqtable input-lines.txt -r 1-3
3 five
2 eight
2 one
=head2 Sorting
Instead of the default sorting by frequency (descending order), if you specify
C<--sort-sub> (and optionally one or more C<--sort-arg>) you can sort by the
keys using one of L<Sort::Sub>::* subroutines. Examples:
# sort by keys, asciibetically
% freqtable input-lines.txt --sort-sub asciibetically
2 eight
3 five
1 four
1 nine
2 one
1 seven
1 six
1 three
1 two
# sort by keys, asciibetically (descending order)
% freqtable input-lines.txt --sort-sub 'asciibetically<r>'
1 two
1 three
1 six
1 seven
2 one
1 nine
1 four
3 five
2 eight
# sort by keys, randomly using perl code (essentially, shuffling)
% freqtable input-lines.txt --sort-sub 'by_perl_code' --sort-arg 'code=int(rand()*3)-1'
3 five
1 three
2 eight
1 seven
2 one
1 six
1 nine
1 two
1 four
=head2 Running table (`--output-every` option)
If you have streaming input, you can instruct `freqtable` to print the result
periodically after a number of input lines/words/characters/bytes. You can also
instruct it to clear the terminal screen before every output
(`--clear-before-output`).
% perl -MArray::Sample::WeightedRandom=sample_weighted_random_with_replacement \
-E'say sample_weighted_random_with_replacement(
[ ["a", 1], ["b", 2], ["c", 3], ["d",5] ], 1) while 1' | \
freqtable --output-every 10000 --clear --percent
Sample output:
45.43% d
27.28% c
18.20% b
9.10% a
=head1 DESCRIPTION
This utility counts the occurrences of lines (or words/characters) in the input,
then displays each unique line along with its number of occurrences. You can
also instruct it to only show lines that have a specified number of occurrences.
You can use the following Unix command to count occurrences of lines:
% sort input-lines.txt | uniq -c | sort -nr
and with a bit more work you can also use a combination of existing Unix
commands to count occurrences of words/characters, as well as filter items that
have a specified number of occurrences; freqtable basically offers convenience.
=head1 EXIT CODES
0 on success.
255 on I/O error.
99 on command-line options error.
=head1 OPTIONS
=over
=item * --bytes, -c
=item * --chars, -m
=item * --words, -w
=item * --lines, -l
=item * --number, -n
Treat each line as a number. A line like this:
9.99 cents
will be regarded as:
9.99
=item * --integer, -i
Treat each line as an integer. A line like this:
9.99 cents
will be regarded as:
9
=item * --ignore-case, -f
=item * --no-print-freq, -F
Will not print the frequencies.
=item * --print-total, -t
Print the total line at the bottom.
=item * --no-print-total, -T
Do not print the total line at the bottom (the default).
=item * --rank=s, -r
Filter by rank. There are several ways you can do this:
C<-N> to only display the top I<N> ranks.
C<N> to only display the I<N>'th rank.
C<M-N> to only display the I<M>'th to I<N>'th rank.
C<M-> to only display the I<M>'th rank and lower items.
=item * --sort-sub=s
This will cause C<freqtable> to sort by key name instead of by frequencies. You
pass this option to specify a L<Sort::Sub> routine, which is the name of a
C<Sort::Sub::*> module without the C<Sort::Sub::> prefix, e.g.
C<asciibetically>. The name can optionally be followed by C<< <i> >>, or C<< <r>
>>, or C<< <ir> >> to mean case-insensitive sorting, reverse order, and reverse
order case-insensitive sorting, respectively. When you use one of these suffixes
on the command-line, remember to quote since C<< < >> and C<< > >> can be
interpreted by the shell.
Examples:
asciibetically
asciibetically<i>
by_length<r>
=item * --sort-arg=ARGNAME=ARGVALUE
Pass argument(s) to the sort subroutine. Can be specified multiple times, once
for every argument.
=item * -a
Shortcut for C<--sort-sub=asciibetically>.
=item * --percent, -p
Show frequencies as percentages instead of integers. If you specify this option
one more time, will show frequencies as integers I<as well as> percentages.
=item * --format=s
Format frequency line using `sprintf()` template. `freqtable` will supply these
arguments after the template: frequency integer, item string, and frequency as
percent. For example:
%04d: %s # sample output: 0003: five
If you want to display the item first, you can use something like:
%2$-12s: %d
# sample output:
five : 3
eight : 2
=item * --output-every=i
If set, then after every specified number of input data
(bytes/characters/words/lines), will output the "running" (current) frequency
table.
=item * --clear-before-output
Emit ANSI escape codes "\033[2J" before each output to clear the screen.
=back
=head1 FAQ
=head1 HOMEPAGE
Please visit the project's homepage at L<https://metacpan.org/release/App-freqtable>.
=head1 SOURCE
Source repository is at L<https://github.com/perlancar/perl-App-freqtable>.
=head1 SEE ALSO
Unix commands B<wc>, B<sort>, B<uniq>
L<wordstat> from L<App::wordstat>
L<csv-freqtable> from L<App::CSVUtils>
=head1 AUTHOR
perlancar <perlancar@cpan.org>
=head1 CONTRIBUTING
To contribute, you can send patches by email/via RT, or send pull requests on
GitHub.
Most of the time, you don't need to build the distribution yourself. You can
simply modify the code, then test via:
% prove -l
If you want to build the distribution (e.g. to try to install it locally on your
system), you can install L<Dist::Zilla>,
L<Dist::Zilla::PluginBundle::Author::PERLANCAR>,
L<Pod::Weaver::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
Dist::Zilla- and/or Pod::Weaver plugins. Any additional steps required beyond
( run in 3.056 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )