percent results from the CPAN

App-freqtable

#!perl

use strict;
use warnings;

use Getopt::Long qw(:config bundling no_ignore_case);
use POSIX qw(ceil);

our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
our $DATE = '2025-08-03'; # DATE
our $DIST = 'App-freqtable'; # DIST
our $VERSION = '0.010'; # VERSION

# Raw --rank value as typed on the command line; parse_cmdline() validates it
# and splits it into $Opts{min_rank} / $Opts{max_rank}.
my $tmp_rank;
# Run-time settings with their defaults; parse_cmdline() overrides them from
# the command-line options.
my %Opts = (
    mode => 'line', # unit being counted: byte, char, word, line, number or integer
    ignore_case => 0, # set by --ignore-case/-f
    print_total => 0, # true: append a "TOTAL" row after the table
    print_freq => 1, # false (-F): default layout prints the items only
    # XXX options to limit memory usage, e.g. max keys, max line length, --md5 (like in nauniq), ...
    min_rank => undef, # first rank to display (undef: no lower bound)
    max_rank => undef, # last rank to display (undef: no upper bound)
    sort_args => {}, # --sort-arg NAME=VALUE pairs, handed to Sort::Sub::get_sorter()
    sort_sub => undef, # Sort::Sub spec; when defined, replaces the by-count ordering
    percent => 0, # number of -p seen: 1 = percentage only, 2 or more = count and percentage
    clear_before_output => 0, # true: print "\033[2J" before each table
    output_every => undef, # integer given to --output-every
    format => undef, # printf template; when defined, replaces the default row layout
);
# When true, items with equal counts are ordered numerically instead of as
# strings in the default sort. It is not assigned in this part of the file.
my $Numeric;
# Number of occurrences seen so far, keyed by item.
my %Occurences;

# Read the command-line options into %Opts.
#
# --help/-h prints the usage summary and exits with status 0. If GetOptions()
# reports a failure, or the --rank value is not one of N, M-N, M- or -N, the
# program exits with status 99.
sub parse_cmdline {
    # Handler factories for the options that just store a constant in %Opts.
    my $set      = sub { my ($key, $val) = @_; return sub { $Opts{$key} = $val } };
    my $set_mode = sub { return $set->(mode => $_[0]) };

    my $show_usage = sub {
        print <<'USAGE';
Usage:
  freqtable [OPTIONS]... < INPUT
  freqtable --help (or -h)
Options:
  --bytes, -c
  --chars, -m
  --words, -w
  --lines, -l
  --number, -n
  --integer, -i
  --ignore-case, -f
  --print-freq
  --no-print-freq, -F
  --print-total, -t
  --no-print-total, -T
  --percent, -p
  --format=FMT
  --rank N|M-N|M-|-N, -r
  --sort-sub=SPEC
  --sort-arg=ARG=VAL
  -a
  --output-every=i
  --clear-before-output
For more details, see the manpage/documentation.
USAGE
        exit 0;
    };

    my $ok = GetOptions(
        'bytes|c'   => $set_mode->('byte'),
        'chars|m'   => $set_mode->('char'),
        'words|w'   => $set_mode->('word'),
        'lines|l'   => $set_mode->('line'),
        'number|n'  => $set_mode->('number'),
        'integer|i' => $set_mode->('integer'),
        'ignore-case|f' => \$Opts{ignore_case},
        'no-print-freq|F' => $set->(print_freq => 0),
        'print-freq'      => $set->(print_freq => 1),
        'no-print-total|T' => $set->(print_total => 0),
        'print-total|t'    => $set->(print_total => 1),
        'rank|r=s' => \$tmp_rank,
        'sort-sub=s' => \$Opts{sort_sub},
        'sort-arg=s%' => $Opts{sort_args},
        'a'   => $set->(sort_sub => 'asciibetically'),
        # each -p bumps the level: 1 = percent only, 2+ = count and percent
        'percent|p' => sub { $Opts{percent}++ },
        'clear-before-output' => $set->(clear_before_output => 1),
        'output-every=i' => \$Opts{output_every},
        'format=s' => \$Opts{format},
        'help|h'  => $show_usage,
    );

    # Translate the textual rank filter into numeric bounds.
    if (defined $tmp_rank) {
        if (my ($only) = $tmp_rank =~ /\A(\d+)\z/) {
            # N: exactly that rank
            @Opts{qw(min_rank max_rank)} = ($only, $only);
        } elsif (my ($last) = $tmp_rank =~ /\A-(\d+)\z/) {
            # -N: the top N ranks
            $Opts{max_rank} = $last;
        } elsif (my ($first) = $tmp_rank =~ /\A(\d+)-\z/) {
            # M-: rank M and everything after it
            $Opts{min_rank} = $first;
        } elsif (my ($from, $to) = $tmp_rank =~ /\A(\d+)-(\d+)\z/) {
            # M-N: a closed range of ranks
            @Opts{qw(min_rank max_rank)} = ($from, $to);
        } else {
            warn "freqtable: Invalid value for --rank: '$tmp_rank', "
                . "please specify N|M-N|M-|-N\n";
            $ok = 0;
        }
    }

    exit 99 unless $ok;
}

# Print one row of the frequency table.
#
# Arguments:
#   $n              - count of the item
#   $k              - the item itself
#   $totoccurrences - sum of all counts (denominator of the percentage)
#
# When $Opts{format} is defined, the row is that printf template applied to
# (count, item, percentage), followed by a newline. Otherwise the row is the
# item, preceded (unless $Opts{print_freq} is false) by the count and/or the
# percentage as selected by $Opts{percent}.
sub _print_freqline {
    my ($n, $k, $totoccurrences) = @_;

    # Share of the total as a percentage, evaluated only where it is needed.
    # A zero total yields 0 rather than a division by zero.
    my $share = sub { $totoccurrences == 0 ? 0 : $n/$totoccurrences*100 };

    if (defined $Opts{format}) {
        my $pct = $share->();
        {
            no warnings; # XXX only disable warning 'redundant argument in printf'
            printf $Opts{format}, $n, $k, $pct;
            print "\n";
        }
    } else {
        if ($Opts{print_freq}) {
            my $level = $Opts{percent};
            # the count column is hidden only when exactly one -p was given
            printf "%9d ", $n if !$level || $level >= 2;
            printf "%6.2f%% ", $share->() if $level;
        }
        print $k, "\n";
    }
}

# Print the frequency table for the current contents of %Occurences.
#
# Items are ordered by the Sort::Sub routine named in $Opts{sort_sub} when
# one is set; otherwise by descending count, with ties broken by key
# (numerically when $Numeric is true, as strings otherwise). Only rows whose
# 1-based position lies within $Opts{min_rank} .. $Opts{max_rank} are shown.
# A "TOTAL" row is appended when $Opts{print_total} is set.
sub _display_table {
    my @items = keys %Occurences;

    my $totoccurrences = 0;
    $totoccurrences += $_ for values %Occurences;

    my @ordered;
    if (defined $Opts{sort_sub}) {
        require Sort::Sub;
        my $by_key = Sort::Sub::get_sorter($Opts{sort_sub}, $Opts{sort_args});
        @ordered = sort $by_key @items;
    } else {
        @ordered = sort {
            $Occurences{$b} <=> $Occurences{$a}
                or ($Numeric ? $a <=> $b : $a cmp $b)
        } @items;
    }

    if ($Opts{clear_before_output}) {
        print "\033[2J";
    }

    my ($lowest, $highest) = @Opts{qw(min_rank max_rank)};
    my $rank = 0;
    for my $item (@ordered) {
        ++$rank;
        my $too_early = defined $lowest  && $rank < $lowest;
        my $too_late  = defined $highest && $rank > $highest;
        next if $too_early || $too_late;
        _print_freqline($Occurences{$item}, $item, $totoccurrences);
    }

    _print_freqline($totoccurrences, "TOTAL", $totoccurrences) if $Opts{print_total};
}

sub run {
    $|++ if $Opts{output_every};

    my $i = 0;
    if ($Opts{mode} eq 'byte' || $Opts{mode} eq 'char') {
        @ARGV = (\*STDIN) unless @ARGV;
        for my $fn (@ARGV) {
            my $fh;
            if (ref $fn) {
                $fh = $fn;
            } else {
                open $fh, "<", $fn or do {
                    warn "freqtable: Can't open '$fn': $!\n";
                    next;
                };
            }
            if ($Opts{mode} eq 'byte') {
                binmode $fh;
            } else {
                binmode $fh, ":encoding(utf8)";
            }

script/freqtable view on Meta::CPAN

Display frequency table (words):

 % freqtable -w input-words.txt
 3       five
 2       eight
 2       one
 1       four
 1       nine
 1       seven
 1       six
 1       three
 1       two

Display frequency table (characters):

 % freqtable -c input-words.txt
 12
 12      e
  7      i
  5      n
  4      f
  4      o
  4      t
  4      v
  3      h
  2      g
  2      r
  2      s
  1

  1      u
  1      w
  1      x

Display frequency table (nums):

 % freqtable -n input-nums.txt
 2      9.99
 1      9

Display frequency table (integers):

 % freqtable -i input-nums.txt
 3      9

=head2 Formatting the output line: omitting the frequency (-F option)

Don't display the frequencies:

 % freqtable -F input-lines.txt
 five
 eight
 one
 four
 nine
 seven
 six
 three
 two

=head2 Formatting the output line: showing the percentages (`--percent`, `-p` option)

The default is to show frequencies as numbers:

 % freqtable input-lines.txt
         3 five
 ...

You can display frequencies as percent instead:

 % freqtable -p input-lines.txt
  23.08% five
 ...

Specify another `-p` if you want to display frequencies as integers as well as
percent:

 % freqtable -pp input-lines.txt
         3  23.08% five
 ...

=head2 Formatting the output line: custom formatting (`--format` option)

 % freqtable --format '%04d: %s' input-lines.txt
 0003: five

=head2 Filter by rank

Only display the top 3 ranks:

 % freqtable input-lines.txt -r -3
 % freqtable input-lines.txt -r 1-3
         3 five
         2 eight
         2 one

=head2 Sorting

Instead of the default sorting by frequency (descending order), if you specify
C<--sort-sub> (and optionally one or more C<--sort-arg>) you can sort by the
keys using one of L<Sort::Sub>::* subroutines. Examples:

 # sort by keys, asciibetically
 % freqtable input-lines.txt --sort-sub asciibetically
 2       eight
 3       five
 1       four
 1       nine
 2       one
 1       seven
 1       six
 1       three
 1       two

 # sort by keys, asciibetically (descending order)
 % freqtable input-lines.txt --sort-sub 'asciibetically<r>'
 1       two
 1       three
 1       six
 1       seven
 2       one
 1       nine
 1       four
 3       five
 2       eight

 # sort by keys, randomly using perl code (essentially, shuffling)
 % freqtable input-lines.txt --sort-sub 'by_perl_code' --sort-arg 'code=int(rand()*3)-1'
 3       five
 1       three
 2       eight
 1       seven
 2       one
 1       six
 1       nine
 1       two
 1       four

=head2 Running table (`--output-every` option)

If you have streaming input, you can instruct `freqtable` to print the result
periodically after a number of input lines/words/characters/bytes. You can also
instruct to clear the terminal screen before every output
(`--clear-before-output`).

 % perl -MArray::Sample::WeightedRandom=sample_weighted_random_with_replacement \
     -E'say sample_weighted_random_with_replacement(
          [ ["a", 1], ["b", 2], ["c", 3], ["d",5] ], 1) while 1' | \
   freqtable --output-every 10000 --clear --percent

Sample output:

 45.43%  d
 27.28%  c
 18.20%  b
  9.10%  a

=head1 DESCRIPTION

This utility counts the occurrences of lines (or words/characters) in the input,
then displays each unique line along with its number of occurrences. You can
also instruct it to only show lines that have a specified number of occurrences.

You can use the following Unix command to count occurrences of lines:

 % sort input-lines.txt | uniq -c | sort -nr

and with a bit more work you can also use a combination of existing Unix
commands to count occurrences of words/characters, as well as filter items that
have a specified number of occurrences; freqtable basically offers convenience.

=head1 EXIT CODES

0 on success.

255 on I/O error.

99 on command-line options error.

=head1 OPTIONS

=over

=item * --bytes, -c

=item * --chars, -m

=item * --words, -w

=item * --lines, -l

=item * --number, -n

Treat each line as a number. A line like this:

 9.99 cents

will be regarded as:

 9.99

=item * --integer, -i

Treat each line as an integer. A line like this:

 9.99 cents

will be regarded as:

 9

=item * --ignore-case, -f

=item * --no-print-freq, -F

Will not print the frequencies.

=item * --print-total, -t

Print the total line at the bottom.

=item * --no-print-total, -T

Do not print the total line at the bottom (the default).

=item * --rank=s, -r

Filter by rank. There are several ways you can do this:

C<-N> to only display the top I<N> ranks.

C<N> to only display the I<N>'th rank.

C<M-N> to only display the I<M>'th to I<N>'th rank.

C<M-> to only display the I<M>'th rank and lower items.

=item * --sort-sub=s

This will cause C<freqtable> to sort by key name instead of by frequencies. You
pass this option to specify a L<Sort::Sub> routine, which is the name of a
C<Sort::Sub::*> module without the C<Sort::Sub::> prefix, e.g.
C<asciibetically>. The name can optionally be followed by C<< <i> >>, or C<< <r>
>>, or C<< <ir> >> to mean case-insensitive sorting, reverse order, and reverse
order case-insensitive sorting, respectively. When you use one of these suffixes
on the command-line, remember to quote since C<< < >> and C<< > >> can be
interpreted by the shell.

Examples:

 asciibetically
 asciibetically<i>
 by_length<r>

=item * --sort-arg=ARGNAME=ARGVALUE

Pass argument(s) to the sort subroutine. Can be specified multiple times, once
for every argument.

=item * -a

Shortcut for C<--sort-sub=asciibetically>.

=item * --percent, -p

Show frequencies as percentages instead of integers. If you specify this option
one more time, will show frequencies as integers I<as well as> percentages.

=item * --format=s

Format frequency line using `sprintf()` template. `freqtable` will supply these
arguments after the template: frequency integer, item string, and frequency as
percent. For example:

 %04d: %s              # sample output: 0004: five

If you want to display the item first, you can use something like:

 %2$-12s: %d
 # sample output:
 five        : 3
 eight       : 2

=item * --output-every=i

If set to I<N>, the "running" (current) frequency table is printed after every
I<N> items of input (bytes/characters/words/lines, depending on the counting
mode).

=item * --clear-before-output

Emit ANSI escape codes "\033[2J" before each output to clear the screen.

=back

=head1 FAQ

=head1 HOMEPAGE

Please visit the project's homepage at L<https://metacpan.org/release/App-freqtable>.

=head1 SOURCE

Source repository is at L<https://github.com/perlancar/perl-App-freqtable>.

=head1 SEE ALSO

Unix commands B<wc>, B<sort>, B<uniq>

L<wordstat> from L<App::wordstat>

L<csv-freqtable> from L<App::CSVUtils>

=head1 AUTHOR

perlancar <perlancar@cpan.org>

=head1 CONTRIBUTING


To contribute, you can send patches by email/via RT, or send pull requests on
GitHub.

Most of the time, you don't need to build the distribution yourself. You can
simply modify the code, then test via:

 % prove -l

If you want to build the distribution (e.g. to try to install it locally on your
system), you can install L<Dist::Zilla>,
L<Dist::Zilla::PluginBundle::Author::PERLANCAR>,
L<Pod::Weaver::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
Dist::Zilla- and/or Pod::Weaver plugins. Any additional steps required beyond

( run in 0.598 second using v1.01-cache-2.11-cpan-7fcb06a456a )