App-csvtool

 view release on metacpan or  search on metacpan

lib/App/csvtool.pm  view on Meta::CPAN

#  You may distribute under the terms of either the GNU General Public License
#  or the Artistic License (the same terms as Perl itself)
#
#  (C) Paul Evans, 2021-2024 -- leonerd@leonerd.org.uk

package App::csvtool 0.04;

use v5.26;
use warnings;
use experimental 'signatures';

use Commandable 0.11;

=head1 NAME

C<App::csvtool> - implements the F<csvtool> core commands

=head1 DESCRIPTION

This module provides the main commands for the F<csvtool> wrapper script.

=head1 COMMANDS

=cut

package App::csvtool::cut
{

=head2 cut

   $ csvtool cut -fFIELDS INPUT...

Extracts the given field column(s).

=head3 --fields, -f

A comma-separated list of field indexes (defaults to 1).

A field index of C<u> will result in an undefined (i.e. empty) field being
emitted. This can be used to create spaces and pad out the data.

=cut

   use constant COMMAND_DESC => "Extract the given field(s) to output";

   use constant COMMAND_OPTS => (
      { name => "fields|f=", description => "Comma-separated list of fields to extract",
          default => "1" },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my @FIELDS = split m/,/, $opts->{fields};

      # 1-indexed
      $_ eq "u" || $_-- for @FIELDS;

      while( my $row = $reader->() ) {
         $output->( [ map { $_ eq "u" ? undef : $row->[$_] } @FIELDS ] );
      }
   }
}

package App::csvtool::grep
{

=head2 grep

   $ csvtool grep PATTERN INPUT...

Filter rows by the given pattern. The pattern is always interpreted as a Perl
regular expression.

=head3 --ignore-case, -i

Ignore case when matching.

=head3 --invert-match, -v

Output only the lines that do not match the filter pattern.

=cut

   use constant COMMAND_DESC => "Filter rows based on a regexp pattern";

   use constant COMMAND_OPTS => (
      { name => "field|f=", description => "Field to filter by",
         default => 1 },
      { name => "ignore-case|i", description => "Match ignoring case" },
      { name => "invert-match|v", description => "Selects only the non-matching rows" },
   );

   use constant COMMAND_ARGS => (
      { name => "pattern", description => "regexp pattern for filtering" },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $pattern, $reader, $output )
   {
      my $FIELD = $opts->{field};
      my $INVERT = $opts->{invert_match} // 0;

      $pattern = "(?i:$pattern)" if $opts->{ignore_case};

      # 1-based
      $FIELD--;

      my $re = qr/$pattern/;

      while( my $row = $reader->() ) {
         $output->( $row ) if $INVERT ^ $row->[ $FIELD ] =~ $re;
      }
   }
}

package App::csvtool::head
{

=head2 head

   $ csvtool head -nLINES INPUT...

Output only the first few rows.

=head3 --lines, -n

Number of lines to output. If negative, will output all but the final few rows
of the given number.

=cut

   use constant COMMAND_DESC => "Select the first few rows";

   use constant COMMAND_OPTS => (
      { name => "lines|n=i", description => "Number of rows to select",
         default => 10 },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my $LINES = $opts->{lines};

      if( $LINES > 0 ) {
         while( $LINES-- > 0 and my $row = $reader->() ) {
            $output->( $row );
         }
      }
      elsif( $LINES < 0 ) {
         my @ROWS;
         while( $LINES++ < 0 and my $row = $reader->() ) {
            push @ROWS, $row;
         }
         while( my $row = $reader->() ) {
            $output->( shift @ROWS );
            push @ROWS, $row;
         }
      }
   }
}

package App::csvtool::join
{

=head2 join

   $ csvtool join -fFIELD FILE1 FILE2

Reads two files and outputs rows joined by a common key.

The second file is read entirely into memory and indexed by its key field.
Then the first file is read a row at a time, and each row has the
corresponding data from the second file appended to it when output.

This is more flexible than the F<join(1)> UNIX tool that inspires it, because
C<FILE2> does not need to be sorted in key order in the same way as C<FILE1>.
Additionally, rows of C<FILE2> may be emitted zero, one, or more times as
required by matches from C<FILE1>.

=head3 --field1, -1

The field index in FILE1 to use as the lookup key.

=head3 --field2, -2

The field index in FILE2 to use as the storage key.

=head3 --field, -f

Use the same field index for both files.

=cut

   use constant COMMAND_DESC => "Join two files by a common key";

   use constant COMMAND_OPTS => (
      { name => "field|f=", description => "Field of both files to join by" },
      { name => "field1|1=", description => "Field of FILE1 to join by" },
      { name => "field2|2=", description => "Field of FILE2 to join by" },
   );

   use constant WANT_READER => 2;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader1, $reader2, $output )
   {
      my $FIELD1 = $opts->{field1} // $opts->{field}; $FIELD1--;
      my $FIELD2 = $opts->{field2} // $opts->{field}; $FIELD2--;

      # Load the joindata from second reader
      my %rows_by_key;
      while( my $row = $reader2->() ) {
         my $key = splice @$row, $FIELD2, 1, ();
         warn "FILE2 contains duplicate key '$key'\n"
            if exists $rows_by_key{$key};
         $rows_by_key{$key} = $row;
      }

      while( my $row = $reader1->() ) {
         my $key = $row->[$FIELD1];
         $output->( [ @$row, @{ $rows_by_key{$key} // [] } ] );
      }
   }
}

package App::csvtool::sort
{

=head2 sort

   $ csvtool sort INPUT...

Sorts the rows according to the given field.

=head3 --field, -f

The field index to sort by (defaults to 1).

=head3 --numerical, -n

Sorts numerically. If absent, sorting happens alphabetically.

=head3 --reverse, -r

Reverses the order of sorting.

=cut

   use constant COMMAND_DESC => "Sort lexicographically (or numerically) by the given FIELD";

   use constant COMMAND_OPTS => (
      { name => "numerical|n", description => "Sort numerically" },
      { name => "reverse|r", description => "Reverse order of sorting" },
      { name => "field|f=", description => "Field to key by",
         default => 1 },
   );

   use List::UtilsBy qw( sort_by nsort_by );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my $FIELD = $opts->{field};

      # 1-indexed
      $FIELD--;

      my @rows;
      while( my $row = $reader->() ) {
         push @rows, $row;
      }

      if( $opts->{numerical} ) {
         @rows = nsort_by { $_->[$FIELD] } @rows;
      }
      else {
         @rows = sort_by { $_->[$FIELD] } @rows;
      }

      if( $opts->{reverse} ) {
         $output->( $_ ) for reverse @rows;
      }
      else {
         $output->( $_ ) for @rows;
      }
   }
}

package App::csvtool::tail
{

=head2 tail

   $ csvtool tail -nLINES INPUT...

Output only the final few rows.

=head3 --lines, -n

Number of lines to output. If negative, will output all but the first few rows
of the given number.

=cut

   use constant COMMAND_DESC => "Select the final few rows";

   use constant COMMAND_OPTS => (
      { name => "lines|n=i", description => "Number of rows to select",
         default => 10 },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my $LINES = $opts->{lines};

      if( $LINES > 0 ) {
         my @ROWS;
         while( my $row = $reader->() ) {
            shift @ROWS if @ROWS >= $LINES;
            push @ROWS, $row;
         }
         $output->( $_ ) for @ROWS;
      }
      elsif( $LINES < 0 ) {
         while( $LINES++ < 0 and my $row = $reader->() ) {
            # discard it
         }
         while( my $row = $reader->() ) {
            $output->( $row );
         }
      }
   }
}

package App::csvtool::uniq
{

=head2 uniq

   $ csvtool uniq -fFIELD INPUT...

Filters rows for unique values of the given field.

=head3 --field, -f

The field index to select rows on (defaults to 1).

=cut

   use constant COMMAND_DESC => "Filter rows for unique values of the given FIELD";

   use constant COMMAND_OPTS => (
      { name => "field|f=", description => "Field to key by",
         default => 1 },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my $FIELD = $opts->{field};

      # 1-based
      $FIELD--;

      my %seen;

      while( my $row = $reader->() ) {
         $output->( $row ) unless $seen{ $row->[$FIELD] }++;
      }
   }
}

=head1 AUTHOR

Paul Evans <leonerd@leonerd.org.uk>

=cut

0x55AA;



( run in 0.995 second using v1.01-cache-2.11-cpan-13bb782fe5a )