App-combinesheets

 view release on metacpan or  search on metacpan

lib/App/combinesheets.pm  view on Meta::CPAN

#-----------------------------------------------------------------
# App::combinesheets
# Author: Martin Senger <martin.senger@gmail.com>
# For copyright and disclaimer se below.
#
# ABSTRACT: command-line tool merging CSV and TSV spreadsheets
# PODNAME: App::combinesheets
#-----------------------------------------------------------------
use warnings;
use strict;

package App::combinesheets;

our $VERSION = '0.2.14'; # VERSION

use base 'App::Cmd::Simple';

use Pod::Usage;
use Pod::Find qw(pod_where);

use Text::CSV::Simple;
use Text::CSV_XS;
use File::Spec;
use File::Temp;
use File::Which;
use File::BOM qw( :all );
use Algorithm::Loops qw( NestedLoops );
use autouse 'IO::CaptureOutput' => qw(capture_exec);

# reserved keywords in the configuration
use constant {
    CFG_MATCH => 'MATCH',
    CFG_PROG  => 'PROG',
    CFG_PROGS => 'PROGS',
    CFG_PERL  => 'PERL',
};

# types of input files
use constant {
    TYPE_CSV => 'csv',
    TYPE_TSV => 'tsv',
#    TYPE_XSL => 'xsl',    # not-yet-supported
};

# hash keys describing an input ($inputs)
use constant {
    INPUT_FILE              => 'file',
    INPUT_TYPE              => 'type',
    INPUT_MATCHED_BY        => 'matched_by',
    INPUT_MATCHED_BY_INDEX  => 'matched_by_index',
    INPUT_HEADERS           => 'headers',
    INPUT_CONTENT           => 'content',
};

# hash keys describing wanted fields ($wanted_columns)
use constant {
    CFG_TYPE     => 'type',   # what kind of input (MATCH, PROG, PROGS or PERL)
    CFG_OUT_COL  => 'ocol',   # a name for this column used in the output
    # keys used for the normal (MATCH) columns
    CFG_ID       => 'id',     # which input
    CFG_IN_COL   => 'icol',   # a column in such input
    # keys used for the calculated columns (i.e. of type PROG, PROGS or PERL)
    CFG_EXT      => 'id',     # name of the external program or Perl external subroutine
    PERL_DETAILS => '_perl_details_', # added during the config processing
};

# ----------------------------------------------------------------
# Command-line arguments and script usage
# ----------------------------------------------------------------
sub usage_desc {
     my $self = shift;
     return "%c -config <config-file> -inputs <inputs> [other otions...]";
}
sub opt_spec {
    return (
        [ 'h'               => "display a short usage message" ],
        [ 'help'            => "display a full usage message"  ],
        [ 'man|m'           => "display a full manual page"    ],
        [ 'version|v'       => "display a version"             ],
        [],
        [ 'config|cfg=s'    => "<configuration file>"          ],
        [ 'inputs|i=s@{1,}' => "<input files> in the form: <input-ID>=<filename>[,<input-ID>=<filename>...] (e.g. PERSON=<persons.tsv>,CAR=<cars.csv>)"                                  ],
        [ 'outfile|o=s'     => "<output file>"                 ],
        [ 'check|c'         => "only check the configuration"  ],

        { getopt_conf => ['no_bundling', 'no_ignore_case', 'auto_abbrev'] }
        );
}
sub validate_args {
    my ($self, $opt, $args) = @_;

    # show various levels of help and exit
    my $pod_where = pod_where ({-inc => 1}, __PACKAGE__);
    if ($opt->h) {
        print "Usage: " . $self->usage();
        if ($^S) { die "Okay\n" } else { exit (0) };
    }
    pod2usage (-input => $pod_where, -verbose => 1, -exitval => 0) if $opt->help;
    pod2usage (-input => $pod_where, -verbose => 2, -exitval => 0) if $opt->man;

    # show version and exit
    if ($opt->version) {
        ## no critic
        no strict;    # because the $VERSION will be added only when
        no warnings;  # the distribution is fully built up
        print "$VERSION\n";
        if ($^S) { die "Okay\n" } else { exit (0) };
    }

    # check required command-line arguments
    $self->usage_error ("Parameter '-config' is required.")
        unless $opt->config;

lib/App/combinesheets.pm  view on Meta::CPAN

            # locate the external program
            $col->{CFG_EXT()} = find_prog ($col->{CFG_EXT()});

        } elsif ($col->{CFG_TYPE()} eq CFG_PERL) {

            # load the wanted Perl module
            my $call = $col->{CFG_EXT()};
            $call =~ m{^(.+)((::)|(->))(.*)$};
            my $module = $1;
            my $subroutine = $5;
            my $how_to_call = $2;   # can be '::' or '->'
            unless ($module) {
                warn "[WR11] Missing module name in '[PERL] " . $col->{CFG_OUT_COL()} . "'. Column ignored.\n";
                $col->{ignored} = 1;
                next;
            }
            unless ($subroutine) {
                warn "[WR12] Missing subroutine name in '[PERL] " . $col->{CFG_OUT_COL()} . "'. Column ignored.\n";
                $col->{ignored} = 1;
                next;
            }
            if ($module =~ m{^:+}) {
                warn "[WR13] Uncomplete module name in '[PERL] " . $col->{CFG_OUT_COL()} . "'. Column ignored.\n";
                $col->{ignored} = 1;
                next;
            }
            eval "require $module";  ## no critic
            if ($@) {
                warn "[WR14] Cannot load module '$module': $@. Column '" . $col->{CFG_OUT_COL()} . " ignored\n";
                $col->{ignored} = 1;
                next;
            }
            $module->import();

            # remember what we just parsed and checked
            $col->{PERL_DETAILS()} = {};
            $col->{PERL_DETAILS()}->{what_to_call} = $module . $how_to_call . $subroutine;
            $col->{PERL_DETAILS()}->{module} = $module;
            $col->{PERL_DETAILS()}->{subroutine} = $subroutine;
            $col->{PERL_DETAILS()}->{how_to_call} = $how_to_call;
        }
    }
    $wanted_cols = [ grep { not $_->{ignored} } @$wanted_cols ];

    # locate expected inputs
    my $primary_input;   # ID of the first input
    foreach my $opt_input (@opt_inputs) {
        my ($key, $value) = split (m{\s*=\s*}, $opt_input, 2);
        next unless $key;
        next unless $value;
        $key = uc ($key);
        unless (exists $known_inputs->{$key}) {
            warn "[WR03] Configuration does not recognize the input named '$key'. Input ignored.\n";
            next;
        }
        unless (exists $matches->{$key}) {
            warn "[WR04] Input named '$key' does not have any MATCH column defined in configuration. Input ignored.\n";
            next;
        }
        $primary_input = $key unless $primary_input;   # remember which input came first
        my $input = { INPUT_FILE()       => $value,
                      INPUT_MATCHED_BY() => $matches->{$key} };
        if ($value =~ m{\.csv$}i) {
            $input->{INPUT_TYPE()} = TYPE_CSV;
        } else {
            $input->{INPUT_TYPE()} = TYPE_TSV;
        }
        $inputs->{$key} = $input;
    }
    die "[ER01] No valid inputs specified. Exiting.\n"
        unless scalar keys (%$inputs) > 0;

    # read headers from all inputs
    my $headers_by_id = {};  # used for re-using the same headers once read, and for some checks
    foreach my $input_id (keys %$inputs) {
        my $input = $inputs->{$input_id};
        my $headers;
        if (exists $headers_by_id->{$input_id}) {
            $headers = $headers_by_id->{$input_id};   # copy already known headers
        } else {
            $headers = read_headers ($input);
        }

        # add new properties to $input
        unless (exists $headers->{ $input->{INPUT_MATCHED_BY()} }) {
            warn ("[WR05] Input '$input_id' does not contain the matching header '" . $input->{INPUT_MATCHED_BY()} .
                  "'. Input ignored\n");
            delete $inputs->{$input_id};
            next;
        }
        $headers_by_id->{$input_id} = $headers
            unless exists $headers_by_id->{$input_id};
        $input->{INPUT_HEADERS()} = $headers;
        $input->{INPUT_MATCHED_BY_INDEX()} = $headers->{ $input->{INPUT_MATCHED_BY()} };
    }

    # check real headers vs. headers as defined in configuration
    my $already_reported = {};
    foreach my $col (@$wanted_cols) {
        next if $col->{CFG_TYPE()} ne CFG_MATCH;   # check is done only for normal columns
        my $input_id = $col->{CFG_ID()};
        if (exists $headers_by_id->{$input_id}) {
            # does the requested column exist in this input's headers?
            unless (column_exists ($input_id, $col->{CFG_IN_COL()})) {
                warn "[WR06] Column '$col->{CFG_IN_COL()}' not found in the input '$input_id'. Column will be ignored.\n";
                $col->{ignored} = 1;
            }
            next;

        } elsif (!exists $already_reported->{$input_id}) {
            $already_reported->{$input_id} = 1;
            warn "[WR07] Configuration defines columns from an input '$input_id' but no such input given (or was ignored). These columns will be ignored.\n";
        }
        $col->{ignored} = 1;
    }
    $wanted_cols = [ grep { not $_->{ignored} } @$wanted_cols ];

    foreach my $input_id (keys %$matches) {
        next unless exists $inputs->{$input_id};   # ignoring matches whose inputs are already ignored
        # does the matching column exist in this input's headers?
        unless (column_exists ($input_id, $matches->{$input_id})) {
            die "[ER02] Matching column '$matches->{$input_id}' not found in the input '$input_id'. Must exit.\n";
        }
    }

    # do we still have a primary input?
    unless (exists $inputs->{$primary_input}) {
        die "[ER03] Due to errors, the primary input '$primary_input' is now ignored. Must exit.\n";
    }

    # end of checking
    exit (0) if $opt->check;

    # read all inputs into memory
    foreach my $input_id (keys %$inputs) {
        my $input = $inputs->{$input_id};
        my $content = read_content ($input);
        $input->{INPUT_CONTENT()} = $content;
    }

    # output combined headers
    my @header_line = ();
    foreach my $col (@$wanted_cols) {
        push (@header_line, $col->{CFG_OUT_COL()});
    }
    print $combined join ("\t", @header_line) . "\n"
        unless scalar @header_line == 0;

    # combine all inputs and make output lines
    foreach my $matching_content (sort keys %{ $inputs->{$primary_input}->{INPUT_CONTENT()} }) {
        # $matching_content is, for example, a publication title ("An Atlas of....")

        # inputs may have more lines with the same value in the matching columns
        # therefore, extract first the matching lines from all inputs
        my $lines_to_combine = [];
        my $inputs_to_combine = {};  # keys are inputs' CFG_IDs, values are indeces into $lines_to_combine

        foreach my $col (@$wanted_cols) {
            if ($col->{CFG_TYPE()} eq CFG_MATCH) {
                unless (exists $inputs_to_combine->{ $col->{CFG_ID()} }) {
                    # remember the same lines (from the same input) only once
                    my $input = $inputs->{ $col->{CFG_ID()} };
                    push (@$lines_to_combine, $input->{INPUT_CONTENT()}->{$matching_content} || [undef]);
                    $inputs_to_combine->{ $col->{CFG_ID()} } = $#$lines_to_combine;
                }
            }
        }

        # make all combinantions of matching lines

        # let's have 3 inputs, identified by K, L and M
        # there are three matching lines in K, two in L and one in M:
        # my $lines_to_combine = [
        #       [ "line1", "line2", "line3", ], # from input K
        #       [ "lineX", "lineY", ],          # from input L
        #       [ "lineQ", ],                   # from input M
        #       );
        # my $inputs_to_combine = { K => 0, L => 1, M => 2 };
        #
        # the subroutine create_output_line() will be called 6 times
        # with the following arguments:
        #   line1, lineX, lineQ
        #   line1, lineY, lineQ
        #   line2, lineX, lineQ
        #   line2, lineY, lineQ
        #   line3, lineX, lineQ
        #   line3, lineY, lineQ

        NestedLoops ($lines_to_combine,
                     sub {
                         my @input_lines = @_;
                         my @output_line = ();
                         my @calculated = ();   # indeces of the yet-to-be-calculated elements
                         my $column_count = -1;
                         foreach my $col (@$wanted_cols) { # $col defines what data to push into @output_line
                             $column_count++;
                             if ($col->{CFG_TYPE()} eq CFG_MATCH) {
                                 my $input = $inputs->{ $col->{CFG_ID()} };
                                 my $input_line = @input_lines[$inputs_to_combine->{ $col->{CFG_ID()} }];
                                 # use Data::Dumper;
                                 # print Dumper (\@input_lines);
                                 # print Dumper ($inputs_to_combine);
                                 my $idx = $input->{INPUT_HEADERS()}->{ $col->{CFG_IN_COL()} };
                                 my $value = $input_line->[$idx] || '';
                                 push (@output_line, $value);
                             } else {
                                 push (@calculated, $column_count);
                                 push (@output_line, '');
                             }
                         }
                         # insert the calculated columns
                         foreach my $idx (@calculated) {
                             if ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PROG) {
                                 $output_line[$idx] = call_prog ($wanted_cols->[$idx], \@header_line, \@output_line);
                             } elsif ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PROGS) {
                                 $output_line[$idx] = call_prog_simple ($wanted_cols->[$idx]);
                             } elsif ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PERL) {
                                 $output_line[$idx] = call_perl ($wanted_cols->[$idx], \@header_line, \@output_line);
                             }
                         }

                         print $combined join ("\t", @output_line) . "\n"
                             unless scalar @output_line == 0;
                     });
    }
    close $combined if $opt_outfile;
}

# ----------------------------------------------------------------
# Call a Perl subroutine (from any module) in order to get a value for
# a "calculated" column. $column defines which column to fill,
# $header_line is an arra is an arrayref with column headers and the
# $data_line is another arrayref with the values for the current row.
#
# $column->{PERL_DETAILS} contains all details needed for the call
# ----------------------------------------------------------------
sub call_perl {
    my ($column, $header_line, $data_line) = @_;

    no strict;  ## no critic
    my $what_to_call = $column->{PERL_DETAILS()}->{what_to_call};
    my $how_to_call  = $column->{PERL_DETAILS()}->{how_to_call};
    my $module       = $column->{PERL_DETAILS()}->{module};
    my $subroutine   = $column->{PERL_DETAILS()}->{subroutine};

    if ($how_to_call eq '->') {
        return $module->$subroutine ($column, $header_line, $data_line);
    } else {
        return &$what_to_call ($column, $header_line, $data_line);
    }
}

# ----------------------------------------------------------------
# Call an external program in order to get a value for a "calculated"
# column. $column defines which column to fill, $header_line is an
# arra is an arrayref with column headers and the $data_line is
# another arrayref with the values for the current row.
#
# $column->{CFG_EXT} contains a program name to call
# ----------------------------------------------------------------
sub call_prog {
    my ($column, $header_line, $data_line) = @_;

lib/App/combinesheets.pm  view on Meta::CPAN

# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub _call_it {
    my @command = @_;
    my ($stdout, $stderr, $success, $exit_code) = capture_exec (@command);
    if ($exit_code != 0 or $stderr) {
        my $errmsg = '[ER05] Failed command: ' . join (' ', map {"'$_'"} @command) . "\n";
        $errmsg .= "STDERR: $stderr\n" if $stderr;
        $errmsg .= "EXIT CODE: $exit_code\n";
        die $errmsg;
    }
    chomp $stdout;         # remove the last newline
    $stdout =~ s{\n}{ }g;  # better to replace newlines
    return $stdout;
}

# ----------------------------------------------------------------
# Locate given $prgname and return it, usually with an added path. Or
# die if such program cannot be found or it is not executable.
# ----------------------------------------------------------------
sub find_prog {
    my $prgname = shift;
    my $full_name;

    # 1) try the name as it is (e.g. the ones with an absolute path)
    if (-e $prgname and -x $prgname and
        File::Spec->file_name_is_absolute ($prgname)) {
        return $prgname;
    }

    # 2) try to find it on system PATH
    $full_name = which ($prgname);
    if ($full_name ) {
        chomp $full_name;
        return $full_name;
    }

    # 3) try the environment variable with a path
    if (exists $ENV{COMBINE_SHEETS_EXT_PATH}) {
        $full_name = File::Spec->catfile ($ENV{COMBINE_SHEETS_EXT_PATH}, $prgname);
        return maybe_die ($full_name);
    }

    # 4) try to find it in the current directory
    $full_name = File::Spec->catfile ('./', $prgname);
    return maybe_die ($full_name);
}
sub maybe_die {
    my $prg = shift;
    die "[ER04] '$prg' not found or is not executable.\n"
        unless -e $prg and -x $prg;
    return $prg;
}

# ----------------------------------------------------------------
# Does the requested $column exist in the given input's headers?
# ----------------------------------------------------------------
sub column_exists {
    my ($input_id, $column) = @_;
    return exists $inputs->{$input_id}->{INPUT_HEADERS()}->{$column};
}

# ----------------------------------------------------------------
# Read the headers (the first line) form an input file (given in
# hashref $input) and store them in the hashref $headers, each od them
# with its index as it appears in the read file. Do nothing if
# $headers already contains headers from the same input identifier.
# ----------------------------------------------------------------
sub read_headers {
    my ($input) = @_;

    my $headers;
    if ($input->{INPUT_TYPE()} eq TYPE_CSV) {
        $headers = read_csv_headers ($input->{INPUT_FILE()});
    } else {
        $headers = read_tsv_headers ($input->{INPUT_FILE()});
    }
    my $new_headers = {};
    my $column_index = 0;
    foreach my $column (@$headers) {
        $new_headers->{$column} = $column_index++;
    }
    return $new_headers;
}

# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub read_csv_headers {
    my ($file) = @_;
    my $line = read_first_line ($file);

    my $parser = Text::CSV_XS->new ({
        allow_loose_quotes => 1,
        escape_char        => "\\",
                                    });
    if ($parser->parse ($line)) {
        return [ $parser->fields ];
    } else {
        die "[ER04] Parsing CSV file $file failed: " .
            $parser->error_input  . "\n" .
            $parser->error_diag() . "\n";
    }
}

# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub read_tsv_headers {
    my ($file) = @_;
    my $line = read_first_line ($file);
    return [ split (m{\t}, $line) ];
}

# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub read_first_line {
    my ($file) = @_;
    my $fh;
    open_bom ($fh, $file); # or open ($fh, '<', $file)
        # or die "[ER00] Cannot read input file $file: $!\n";
    my $line = <$fh>;          # read just one line
    close $fh;
    $line =~ s{(\r|\n)+$}{};   # remove newlines of any kind
    return $line;
}

# ----------------------------------------------------------------
# Stringify a hashref
# ----------------------------------------------------------------
sub ph {
    my $hashref = shift;
    my $result = '';
    my ($key, $value);
    while (($key, $value) = each (%$hashref)) {
        $result .= "$key => $value,";
    }
    return substr ($result, 0, -1);
}

# ----------------------------------------------------------------
# Read contents...
# ----------------------------------------------------------------
sub read_content {
    my ($input) = @_;
    my $content;
    if ($input->{INPUT_TYPE()} eq TYPE_CSV) {
        $content = read_csv_content ($input->{INPUT_FILE()}, $input->{INPUT_MATCHED_BY_INDEX()});
    } else {
        $content = read_tsv_content ($input->{INPUT_FILE()}, $input->{INPUT_MATCHED_BY_INDEX()});
    }
    return $content;
}

# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub read_tsv_content {
    my ($file, $matched_index) = @_;
    my $fh;
    open_bom ($fh, $file); # or open ($fh, '<', $file)
        # or die "[ER00] Cannot read input file $file: $!\n";
    my $content = {};
    my $line_count = 0;
    while (my $line = <$fh>) {
        next if $line_count++ == 0;  # skip header line
        next if $line =~ m{^\s*$};   # ignore empty lines
        $line =~ s{(\r|\n)+$}{};     # remove newlines of any kind
        my @data = split (m{\t}, $line);
        $content->{ $data[$matched_index] } = [] unless $content->{ $data[$matched_index] };
        push (@{ $content->{ $data[$matched_index] } }, [@data]);
    }
    close $fh;
    return $content;
}

# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub read_csv_content {
    my ($file, $matched_index) = @_;
    my $count_lines = 0;
    my $content = {};

    # create a CSV parser; any error in reading input will be fatal
    my $csv = Text::CSV_XS->new ({
        allow_loose_quotes => 1,
        escape_char        => "\\",
        auto_diag          => 1,
                                 });

    # read the CSV input
    open_bom (my $fh, $file);
    while (<$fh>) {
        if ($csv->parse ($_)) {
            next if $count_lines++ == 0;   # headers are ignored
            my @data = $csv->fields;
            if (@data) {
                push (@{ $content->{ $data[$matched_index] } }, \@data);
            }
        } else {
            my $err = $csv->error_input;
            warn "[WR09] Possible a wrong or not-readable input file '$file': $err\n";
            exit (1);
        }
    }

    # $parser->add_trigger (after_parse => sub {
    #     my ($self, $data) = @_;
    #     return if $count_lines++ == 0;   # headers are ignored



( run in 0.642 second using v1.01-cache-2.11-cpan-13bb782fe5a )