App-csvtool

 view release on metacpan or  search on metacpan

README  view on Meta::CPAN

    App::csvtool - implements the csvtool core commands

DESCRIPTION

    This module provides the main commands for the csvtool wrapper script.

COMMANDS

 cut

       $ csvtool cut -fFIELDS INPUT...

    Extracts the given field column(s).

  --fields, -f

    A comma-separated list of field indexes (defaults to 1).

    A field index of u will result in an undefined (i.e. empty) field being
    emitted. This can be used to create spaces and pad out the data.

 grep

       $ csvtool grep PATTERN INPUT...

    Filter rows by the given pattern. The pattern is always interpreted as
    a Perl regular expression.

  --ignore-case, -i

    Ignore case when matching.

  --invert-match, -v

    Output only the lines that do not match the filter pattern.

 head

       $ csvtool head -nLINES INPUT...

    Output only the first few rows.

  --lines, -n

    Number of lines to output. If negative, will output all but the final
    few rows of the given number.

 join

README  view on Meta::CPAN

  --field2, -2

    The field index in FILE2 to use as the storage key.

  --field, -f

    Use the same field index for both files.

 sort

       $ csvtool sort INPUT...

    Sorts the rows according to the given field.

  --field, -f

    The field index to sort by (defaults to 1).

  --numerical, -n

    Sorts numerically. If absent, sorting happens alphabetically.

  --reverse, -r

    Reverses the order of sorting.

 tail

       $ csvtool tail -nLINES INPUT...

    Output only the final few rows.

  --lines, -n

    Number of lines to output. If negative, will output all but the first
    few rows of the given number.

 uniq

       $ csvtool uniq -fFIELD INPUT...

    Filters rows for unique values of the given field.

  --field, -f

    The field index to select rows on (defaults to 1).

AUTHOR

    Paul Evans <leonerd@leonerd.org.uk>

bin/csvtool  view on Meta::CPAN

   foreach ( 1 .. $count ) {
      my $path = shift @ARGV;
      my $fh;
      $path eq "-" ? $fh = \*STDIN
                   : open( $fh, "<", $path ) || die "Cannot read $path - $!";
      my $csv = new_csv();
      push @args, sub { $csv->getline( $fh ) };
   }
}

if( $toolpkg->can( "WANT_OUTPUT" ) and $toolpkg->WANT_OUTPUT ) {
   my $csv = new_csv( eol => $/ );
   push @args, sub { $csv->print( \*STDOUT, $_[0] ) or die "Cannot print - $!" };
}

push @args, @ARGV;

$toolpkg->run( @args );

__END__

=head1 NAME

F<csvtool> - command-line tools for operating on CSV-formatted data

=head1 SYNOPSIS

   $ csvtool COMMAND OPTS... INPUT...

=head1 DESCRIPTION

This tool provides several named sub-commands that act similarly to UNIX
commands of the same names, but operate on CSV-formatted data rather than
simple lines of text.

Input is taken from one or more files named on the commandline, and output is
printed to standard output using CSV formatting.

Columns in the data are named from 1 onwards. Thus, C<-f1> refers to the first
column of data, C<-f2> the second, and so on.

=head1 COMMANDS

=head2 cut

   $ csvtool cut -fFIELDS INPUT...

Extracts the given field column(s).

=head3 --fields, -f

A comma-separated list of field indexes (defaults to 1).

=head2 grep

   $ csvtool grep PATTERN INPUT...

Filter rows by the given pattern. The pattern is always interpreted as a Perl
regular expression.

=head3 --ignore-case, -i

Ignore case when matching.

=head3 --invert-match, -v

Output only the lines that do not match the filter pattern.

=head2 head

   $ csvtool head -nLINES INPUT...

Output only the first few rows.

=head3 --lines, -n

Number of lines to output. If negative, will output all but the final few rows
of the given number.

=head2 join

bin/csvtool  view on Meta::CPAN

=head3 --field2, -2

The field index in FILE2 to use as the storage key.

=head3 --field, -f

Use the same field index for both files.

=head2 sort

   $ csvtool sort INPUT...

Sorts the rows according to the given field.

=head3 --field, -f

The field index to sort by (defaults to 1).

=head3 --numerical, -n

Sorts numerically. If absent, sorting happens alphabetically.

=head3 --reverse, -r

Reverses the order of sorting.

=head2 tail

   $ csvtool tail -nLINES INPUT...

Output only the final few rows.

=head3 --lines, -n

Number of lines to output. If negative, will output all but the first few rows
of the given number.

=head2 uniq

   $ csvtool uniq -fFIELD INPUT...

Filters rows for unique values of the given field.

=head3 --field, -f

The field index to select rows on (defaults to 1).

=head1 AUTHOR

Paul Evans <leonerd@leonerd.org.uk>

lib/App/csvtool.pm  view on Meta::CPAN


=head1 COMMANDS

=cut

package App::csvtool::cut
{

=head2 cut

   $ csvtool cut -fFIELDS INPUT...

Extracts the given field column(s).

=head3 --fields, -f

A comma-separated list of field indexes (defaults to 1).

A field index of C<u> will result in an undefined (i.e. empty) field being
emitted. This can be used to create spaces and pad out the data.

=cut

   use constant COMMAND_DESC => "Extract the given field(s) to output";

   use constant COMMAND_OPTS => (
      { name => "fields|f=", description => "Comma-separated list of fields to extract",
          default => "1" },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my @FIELDS = split m/,/, $opts->{fields};

      # 1-indexed
      $_ eq "u" || $_-- for @FIELDS;

      while( my $row = $reader->() ) {
         $output->( [ map { $_ eq "u" ? undef : $row->[$_] } @FIELDS ] );
      }
   }
}

package App::csvtool::grep
{

=head2 grep

   $ csvtool grep PATTERN INPUT...

Filter rows by the given pattern. The pattern is always interpreted as a Perl
regular expression.

=head3 --ignore-case, -i

Ignore case when matching.

=head3 --invert-match, -v

lib/App/csvtool.pm  view on Meta::CPAN

         default => 1 },
      { name => "ignore-case|i", description => "Match ignoring case" },
      { name => "invert-match|v", description => "Selects only the non-matching rows" },
   );

   use constant COMMAND_ARGS => (
      { name => "pattern", description => "regexp pattern for filtering" },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $pattern, $reader, $output )
   {
      my $FIELD = $opts->{field};
      my $INVERT = $opts->{invert_match} // 0;

      $pattern = "(?i:$pattern)" if $opts->{ignore_case};

      # 1-based
      $FIELD--;

lib/App/csvtool.pm  view on Meta::CPAN

         $output->( $row ) if $INVERT ^ $row->[ $FIELD ] =~ $re;
      }
   }
}

package App::csvtool::head
{

=head2 head

   $ csvtool head -nLINES INPUT...

Output only the first few rows.

=head3 --lines, -n

Number of lines to output. If negative, will output all but the final few rows
of the given number.

=cut

   use constant COMMAND_DESC => "Select the first few rows";

   use constant COMMAND_OPTS => (
      { name => "lines|n=i", description => "Number of rows to select",
         default => 10 },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my $LINES = $opts->{lines};

      if( $LINES > 0 ) {
         while( $LINES-- > 0 and my $row = $reader->() ) {
            $output->( $row );
         }
      }

lib/App/csvtool.pm  view on Meta::CPAN


   use constant COMMAND_DESC => "Join two files by a common key";

   use constant COMMAND_OPTS => (
      { name => "field|f=", description => "Field of both files to join by" },
      { name => "field1|1=", description => "Field of FILE1 to join by" },
      { name => "field2|2=", description => "Field of FILE2 to join by" },
   );

   use constant WANT_READER => 2;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader1, $reader2, $output )
   {
      my $FIELD1 = $opts->{field1} // $opts->{field}; $FIELD1--;
      my $FIELD2 = $opts->{field2} // $opts->{field}; $FIELD2--;

      # Load the joindata from second reader
      my %rows_by_key;
      while( my $row = $reader2->() ) {
         my $key = splice @$row, $FIELD2, 1, ();

lib/App/csvtool.pm  view on Meta::CPAN

         $output->( [ @$row, @{ $rows_by_key{$key} // [] } ] );
      }
   }
}

package App::csvtool::sort
{

=head2 sort

   $ csvtool sort INPUT...

Sorts the rows according to the given field.

=head3 --field, -f

The field index to sort by (defaults to 1).

=head3 --numerical, -n

Sorts numerically. If absent, sorting happens alphabetically.

lib/App/csvtool.pm  view on Meta::CPAN

   use constant COMMAND_OPTS => (
      { name => "numerical|n", description => "Sort numerically" },
      { name => "reverse|r", description => "Reverse order of sorting" },
      { name => "field|f=", description => "Field to key by",
         default => 1 },
   );

   use List::UtilsBy qw( sort_by nsort_by );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my $FIELD = $opts->{field};

      # 1-indexed
      $FIELD--;

      my @rows;
      while( my $row = $reader->() ) {

lib/App/csvtool.pm  view on Meta::CPAN

         $output->( $_ ) for @rows;
      }
   }
}

package App::csvtool::tail
{

=head2 tail

   $ csvtool tail -nLINES INPUT...

Output only the final few rows.

=head3 --lines, -n

Number of lines to output. If negative, will output all but the first few rows
of the given number.

=cut

   use constant COMMAND_DESC => "Select the final few rows";

   use constant COMMAND_OPTS => (
      { name => "lines|n=i", description => "Number of rows to select",
         default => 10 },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my $LINES = $opts->{lines};

      if( $LINES > 0 ) {
         my @ROWS;
         while( my $row = $reader->() ) {
            shift @ROWS if @ROWS >= $LINES;
            push @ROWS, $row;

lib/App/csvtool.pm  view on Meta::CPAN

         }
      }
   }
}

package App::csvtool::uniq
{

=head2 uniq

   $ csvtool uniq -fFIELD INPUT...

Filters rows for unique values of the given field.

=head3 --field, -f

The field index to select rows on (defaults to 1).

=cut

   use constant COMMAND_DESC => "Filter rows for unique values of the given FIELD";

   use constant COMMAND_OPTS => (
      { name => "field|f=", description => "Field to key by",
         default => 1 },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my $FIELD = $opts->{field};

      # 1-based
      $FIELD--;

      my %seen;

lib/App/csvtool/Smudge.pm  view on Meta::CPAN


=head1 COMMANDS

=cut

package App::csvtool::smudge
{

=head2 smudge

   $ csvtool smudge -F IDX:FILTER INPUT...

Applies smudging filters to columns of the input, generating new data that is
output in the same shape. A "smudge" filter is one that operates on numerical
data, attempting to reduce the imact of any individual values and smooth out
small variations, emitting values that follow general trends. This assumes
that successive rows of data represent successive moments in time, containing
measurements or readings taken at each instant.

Different filters can be applied to individual columns, as specified by the
C<--filter> (or C<-F>) argument. Any columns that are not filtered are simply

lib/App/csvtool/Smudge.pm  view on Meta::CPAN

=cut

   use constant COMMAND_DESC => "Apply smudge filtering to columns of data";

   use constant COMMAND_OPTS => (
      { name => "filter|F=", description => "filters to apply to each column",
         multi => 1, },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   use Carp;

   use List::Util qw( sum );

=head2 FILTERS

The following name templates may be used. Names are parametric,
and encode information about how the filter acts.

lib/App/csvtool/Summarizing.pm  view on Meta::CPAN


C<App::csvtool::Summarizing> - summarize tabular data in F<csvtool>

=cut

package App::csvtool::count
{

=head2 count

   $ csvtool count -fFIELD INPUT...

Counts the number of rows that have distinct values for the selected field.

Outputs a new table having only two columns. The first column will be the
distinct values of the selected field that were found in the input, the second
column will be an integer giving the number of rows of the input which had
that that value. Rows are output in order of the first time each distinct
value was seen in the input.

Besides the selected key field, all other fields of the input are ignored.

lib/App/csvtool/Summarizing.pm  view on Meta::CPAN

=cut

   use constant COMMAND_DESC => "Count the number of rows by the value in FIELD";

   use constant COMMAND_OPTS => (
      { name => "field|f=", description => "Field to extract",
         default => 1 },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my $FIELD = $opts->{field};

      # 1-indexed
      $FIELD--;

      my @keys; # maintain original first-seen order
      my %count_for_key;

lib/App/csvtool/Timetools.pm  view on Meta::CPAN


   use constant COMMAND_DESC => "Format a timestamp from UNIX time";

   use constant COMMAND_OPTS => (
      __PACKAGE__->COMMON_COMMAND_OPTS,
      { name => "field|f=", description => "Field to use for timestamp",
         default => 1 },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my $FIELD = $opts->{field}; $FIELD--;

      while( my $row = $reader->() ) {
         $row->[$FIELD] = __PACKAGE__->formattime( $opts, $row->[$FIELD] );

         $output->( $row );
      }

lib/App/csvtool/Timetools.pm  view on Meta::CPAN


   use constant COMMAND_DESC => "Parse a timestamp into UNIX time";

   use constant COMMAND_OPTS => (
      __PACKAGE__->COMMON_COMMAND_OPTS,
      { name => "field|f=", description => "Field to use for timestamp",
         default => 1 },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my $FIELD = $opts->{field}; $FIELD--;

      while( my $row = $reader->() ) {
         $row->[$FIELD] = __PACKAGE__->parsetime( $opts, $row->[$FIELD] );

         $output->( $row );
      }

lib/App/csvtool/Timetools.pm  view on Meta::CPAN

   use constant COMMAND_DESC => "Sort rows into chronological order by a timestamp";

   use constant COMMAND_OPTS => (
      __PACKAGE__->COMMON_COMMAND_OPTS,
      { name => "field|f=", description => "Field to use for timestamp",
         default => 1 },
      { name => "reverse|r", description => "Reverse order of sorting" },
   );

   use constant WANT_READER => 1;
   use constant WANT_OUTPUT => 1;

   sub run ( $pkg, $opts, $reader, $output )
   {
      my $FIELD = $opts->{field}; $FIELD--;

      my @rows;
      while( my $row = $reader->() ) {
         # Parse the timestamps on each line, rather than doing them all at
         # once later using e.g. nsort_by {}, so that warnings come out at the
         # right time

t/10cut.t  view on Meta::CPAN


use App::csvtool;

use Commandable::Invocation;

ok( my $cmd = finder->find_command( "cut" ), 'cut command exists' );

my $toolpkg = $cmd->package;

ok( $toolpkg->WANT_READER, 'cut command wants reader' );
ok( $toolpkg->WANT_OUTPUT, 'cut command wants output' );

my @DATA = (
   [ 1,2,3 ],
   [ 4,5,6 ],
);

is(
   run_cmd( $cmd, "-f1", \@DATA ),
   [
      [ 1 ], [ 4 ],

t/10grep.t  view on Meta::CPAN


use App::csvtool;

use Commandable::Invocation;

ok( my $cmd = finder->find_command( "grep" ), 'grep command exists' );

my $toolpkg = $cmd->package;

ok( $toolpkg->WANT_READER, 'grep command wants reader' );
ok( $toolpkg->WANT_OUTPUT, 'grep command wants output' );

my @DATA = (
   [ 1, "one" ],
   [ 50, "fifty" ],
   [ 5, "five" ],
   [ 10, "ten" ]
);

is(
   run_cmd( $cmd, "1", \@DATA ),

t/10head.t  view on Meta::CPAN


use App::csvtool;

use Commandable::Invocation;

ok( my $cmd = finder->find_command( "head" ), 'head command exists' );

my $toolpkg = $cmd->package;

ok( $toolpkg->WANT_READER, 'head command wants reader' );
ok( $toolpkg->WANT_OUTPUT, 'head command wants output' );

my @DATA = (
   [ 1, "one" ],
   [ 2, "two" ],
   [ 3, "three" ],
   [ 4, "four" ],
   [ 5, "five" ],
);

is(

t/10join.t  view on Meta::CPAN


use App::csvtool;

use Commandable::Invocation;

ok( my $cmd = finder->find_command( "join" ), 'join command exists' );

my $toolpkg = $cmd->package;

is( $toolpkg->WANT_READER, 2, 'join command wants 2 readers' );
ok( $toolpkg->WANT_OUTPUT, 'join command wants output' );

my @DAT2 = (
   [ 1, "one" ],
   [ 2, "two" ],
   [ 3, "three" ],
);

is(
   run_cmd( $cmd, "-f1", [ [ 2, "second" ], [ 4, "fourth" ] ], \@DAT2 ),
   [ [ 2, "second", "two" ], [ 4, "fourth" ] ],

t/10sort.t  view on Meta::CPAN


use App::csvtool;

use Commandable::Invocation;

ok( my $cmd = finder->find_command( "sort" ), 'sort command exists' );

my $toolpkg = $cmd->package;

ok( $toolpkg->WANT_READER, 'sort command wants reader' );
ok( $toolpkg->WANT_OUTPUT, 'sort command wants output' );

my @DATA = (
   [ 1, "one" ],
   [ 50, "fifty" ],
   [ 5, "five" ],
   [ 10, "ten" ]
);

is(
   run_cmd( $cmd, "", \@DATA ),

t/10tail.t  view on Meta::CPAN


use App::csvtool;

use Commandable::Invocation;

ok( my $cmd = finder->find_command( "tail" ), 'tail command exists' );

my $toolpkg = $cmd->package;

ok( $toolpkg->WANT_READER, 'tail command wants reader' );
ok( $toolpkg->WANT_OUTPUT, 'tail command wants output' );

my @DATA = (
   [ 1, "one" ],
   [ 2, "two" ],
   [ 3, "three" ],
   [ 4, "four" ],
   [ 5, "five" ],
);

is(

t/10uniq.t  view on Meta::CPAN


use App::csvtool;

use Commandable::Invocation;

ok( my $cmd = finder->find_command( "uniq" ), 'uniq command exists' );

my $toolpkg = $cmd->package;

ok( $toolpkg->WANT_READER, 'uniq command wants reader' );
ok( $toolpkg->WANT_OUTPUT, 'uniq command wants output' );

my @DATA = (
   [ 1, "one" ],
   [ 1, "one again" ],
   [ 2, "two" ],
   [ 3, "three" ],
);

is(
   run_cmd( $cmd, "", \@DATA ),

t/20strftime.t  view on Meta::CPAN


use App::csvtool;

use Commandable::Invocation;

ok( my $cmd = finder->find_command( "strftime" ), 'strftime command exists' );

my $toolpkg = $cmd->package;

ok( $toolpkg->WANT_READER, 'strftime command wants reader' );
ok( $toolpkg->WANT_OUTPUT, 'strftime command wants output' );

is(
   run_cmd( $cmd, "-f1 -U", [
      [ 1704808800, "first", ],
      [ 1704809100, "second", ],
      [ 1704812400, "third", ],
   ] ),
   [
      [ "2024-01-09T14:00:00", "first" ],
      [ "2024-01-09T14:05:00", "second" ],

t/20strptime.t  view on Meta::CPAN


use App::csvtool;

use Commandable::Invocation;

ok( my $cmd = finder->find_command( "strptime" ), 'strptime command exists' );

my $toolpkg = $cmd->package;

ok( $toolpkg->WANT_READER, 'strptime command wants reader' );
ok( $toolpkg->WANT_OUTPUT, 'strptime command wants output' );

is(
   run_cmd( $cmd, "-f1 -U", [
      [ "2024-01-09T14:00:00", "first" ],
      [ "2024-01-09T14:05:00", "second" ],
      [ "2024-01-09T15:00:00", "third" ],
   ] ),
   [
      [ 1704808800, "first", ],
      [ 1704809100, "second", ],

t/20tsort.t  view on Meta::CPAN


use App::csvtool;

use Commandable::Invocation;

ok( my $cmd = finder->find_command( "tsort" ), 'tsort command exists' );

my $toolpkg = $cmd->package;

ok( $toolpkg->WANT_READER, 'tsort command wants reader' );
ok( $toolpkg->WANT_OUTPUT, 'tsort command wants output' );

# Use a format whose strings wouldn't be sorted alphabetically, to demonstrate
# that the sort order works
is(
   run_cmd( $cmd, qq(-f1 --timefmt=%Y/%b/%d), [
      [ "2024/Apr/01", "second" ],
      [ "2024/Dec/01", "fifth" ],
      [ "2024/Feb/01", "first" ],
      [ "2024/Jun/01", "third" ],
      [ "2024/Oct/01", "fourth" ],

t/30smudge.t  view on Meta::CPAN


use App::csvtool;

use Commandable::Invocation;

ok( my $cmd = finder->find_command( "smudge" ), 'smudge command exists' );

my $toolpkg = $cmd->package;

ok( $toolpkg->WANT_READER, 'smudge command wants reader' );
ok( $toolpkg->WANT_OUTPUT, 'smudge command wants output' );

is(
   # No filters just passes data
   run_cmd( $cmd, "", [
      [ "one",   "1", "1.0" ],
      [ "two",   "2", "1.1" ],
      [ "three", "3", "1.2" ],
   ] ),
   [
      [ "one",   "1", "1.0" ],

t/40count.t  view on Meta::CPAN


use App::csvtool;

use Commandable::Invocation;

ok( my $cmd = finder->find_command( "count" ), 'count command exists' );

my $toolpkg = $cmd->package;

ok( $toolpkg->WANT_READER, 'count command wants reader' );
ok( $toolpkg->WANT_OUTPUT, 'count command wants output' );

is(
   run_cmd( $cmd, "-f 1", [
      map { [ $_ ] } qw( a b c d e ),
   ] ),
   [
      map { [ $_, 1 ] } qw( a b c d e ),
   ],
   'count -f 1 on unique values' );



( run in 0.625 second using v1.01-cache-2.11-cpan-c6e0e5ac2a7 )