App-combinesheets
view release on metacpan or search on metacpan
lib/App/combinesheets.pm view on Meta::CPAN
#-----------------------------------------------------------------
# App::combinesheets
# Author: Martin Senger <martin.senger@gmail.com>
# For copyright and disclaimer se below.
#
# ABSTRACT: command-line tool merging CSV and TSV spreadsheets
# PODNAME: App::combinesheets
#-----------------------------------------------------------------
use warnings;
use strict;
package App::combinesheets;
our $VERSION = '0.2.14'; # VERSION
use base 'App::Cmd::Simple';
use Pod::Usage;
use Pod::Find qw(pod_where);
use Text::CSV::Simple;
use Text::CSV_XS;
use File::Spec;
use File::Temp;
use File::Which;
use File::BOM qw( :all );
use Algorithm::Loops qw( NestedLoops );
use autouse 'IO::CaptureOutput' => qw(capture_exec);
# reserved keywords in the configuration
use constant {
CFG_MATCH => 'MATCH',
CFG_PROG => 'PROG',
CFG_PROGS => 'PROGS',
CFG_PERL => 'PERL',
};
# types of input files
use constant {
TYPE_CSV => 'csv',
TYPE_TSV => 'tsv',
# TYPE_XSL => 'xsl', # not-yet-supported
};
# hash keys describing an input ($inputs)
use constant {
INPUT_FILE => 'file',
INPUT_TYPE => 'type',
INPUT_MATCHED_BY => 'matched_by',
INPUT_MATCHED_BY_INDEX => 'matched_by_index',
INPUT_HEADERS => 'headers',
INPUT_CONTENT => 'content',
};
# hash keys describing wanted fields ($wanted_columns)
use constant {
CFG_TYPE => 'type', # what kind of input (MATCH, PROG, PROGS or PERL)
CFG_OUT_COL => 'ocol', # a name for this column used in the output
# keys used for the normal (MATCH) columns
CFG_ID => 'id', # which input
CFG_IN_COL => 'icol', # a column in such input
# keys used for the calculated columns (i.e. of type PROG, PROGS or PERL)
CFG_EXT => 'id', # name of the external program or Perl external subroutine
PERL_DETAILS => '_perl_details_', # added during the config processing
};
# ----------------------------------------------------------------
# Command-line arguments and script usage
# ----------------------------------------------------------------
sub usage_desc {
my $self = shift;
return "%c -config <config-file> -inputs <inputs> [other otions...]";
}
sub opt_spec {
return (
[ 'h' => "display a short usage message" ],
[ 'help' => "display a full usage message" ],
[ 'man|m' => "display a full manual page" ],
[ 'version|v' => "display a version" ],
[],
[ 'config|cfg=s' => "<configuration file>" ],
[ 'inputs|i=s@{1,}' => "<input files> in the form: <input-ID>=<filename>[,<input-ID>=<filename>...] (e.g. PERSON=<persons.tsv>,CAR=<cars.csv>)" ],
[ 'outfile|o=s' => "<output file>" ],
[ 'check|c' => "only check the configuration" ],
{ getopt_conf => ['no_bundling', 'no_ignore_case', 'auto_abbrev'] }
);
}
sub validate_args {
my ($self, $opt, $args) = @_;
# show various levels of help and exit
my $pod_where = pod_where ({-inc => 1}, __PACKAGE__);
if ($opt->h) {
print "Usage: " . $self->usage();
if ($^S) { die "Okay\n" } else { exit (0) };
}
pod2usage (-input => $pod_where, -verbose => 1, -exitval => 0) if $opt->help;
pod2usage (-input => $pod_where, -verbose => 2, -exitval => 0) if $opt->man;
# show version and exit
if ($opt->version) {
## no critic
no strict; # because the $VERSION will be added only when
no warnings; # the distribution is fully built up
print "$VERSION\n";
if ($^S) { die "Okay\n" } else { exit (0) };
}
# check required command-line arguments
$self->usage_error ("Parameter '-config' is required.")
unless $opt->config;
lib/App/combinesheets.pm view on Meta::CPAN
# locate the external program
$col->{CFG_EXT()} = find_prog ($col->{CFG_EXT()});
} elsif ($col->{CFG_TYPE()} eq CFG_PERL) {
# load the wanted Perl module
my $call = $col->{CFG_EXT()};
$call =~ m{^(.+)((::)|(->))(.*)$};
my $module = $1;
my $subroutine = $5;
my $how_to_call = $2; # can be '::' or '->'
unless ($module) {
warn "[WR11] Missing module name in '[PERL] " . $col->{CFG_OUT_COL()} . "'. Column ignored.\n";
$col->{ignored} = 1;
next;
}
unless ($subroutine) {
warn "[WR12] Missing subroutine name in '[PERL] " . $col->{CFG_OUT_COL()} . "'. Column ignored.\n";
$col->{ignored} = 1;
next;
}
if ($module =~ m{^:+}) {
warn "[WR13] Uncomplete module name in '[PERL] " . $col->{CFG_OUT_COL()} . "'. Column ignored.\n";
$col->{ignored} = 1;
next;
}
eval "require $module"; ## no critic
if ($@) {
warn "[WR14] Cannot load module '$module': $@. Column '" . $col->{CFG_OUT_COL()} . " ignored\n";
$col->{ignored} = 1;
next;
}
$module->import();
# remember what we just parsed and checked
$col->{PERL_DETAILS()} = {};
$col->{PERL_DETAILS()}->{what_to_call} = $module . $how_to_call . $subroutine;
$col->{PERL_DETAILS()}->{module} = $module;
$col->{PERL_DETAILS()}->{subroutine} = $subroutine;
$col->{PERL_DETAILS()}->{how_to_call} = $how_to_call;
}
}
$wanted_cols = [ grep { not $_->{ignored} } @$wanted_cols ];
# locate expected inputs
my $primary_input; # ID of the first input
foreach my $opt_input (@opt_inputs) {
my ($key, $value) = split (m{\s*=\s*}, $opt_input, 2);
next unless $key;
next unless $value;
$key = uc ($key);
unless (exists $known_inputs->{$key}) {
warn "[WR03] Configuration does not recognize the input named '$key'. Input ignored.\n";
next;
}
unless (exists $matches->{$key}) {
warn "[WR04] Input named '$key' does not have any MATCH column defined in configuration. Input ignored.\n";
next;
}
$primary_input = $key unless $primary_input; # remember which input came first
my $input = { INPUT_FILE() => $value,
INPUT_MATCHED_BY() => $matches->{$key} };
if ($value =~ m{\.csv$}i) {
$input->{INPUT_TYPE()} = TYPE_CSV;
} else {
$input->{INPUT_TYPE()} = TYPE_TSV;
}
$inputs->{$key} = $input;
}
die "[ER01] No valid inputs specified. Exiting.\n"
unless scalar keys (%$inputs) > 0;
# read headers from all inputs
my $headers_by_id = {}; # used for re-using the same headers once read, and for some checks
foreach my $input_id (keys %$inputs) {
my $input = $inputs->{$input_id};
my $headers;
if (exists $headers_by_id->{$input_id}) {
$headers = $headers_by_id->{$input_id}; # copy already known headers
} else {
$headers = read_headers ($input);
}
# add new properties to $input
unless (exists $headers->{ $input->{INPUT_MATCHED_BY()} }) {
warn ("[WR05] Input '$input_id' does not contain the matching header '" . $input->{INPUT_MATCHED_BY()} .
"'. Input ignored\n");
delete $inputs->{$input_id};
next;
}
$headers_by_id->{$input_id} = $headers
unless exists $headers_by_id->{$input_id};
$input->{INPUT_HEADERS()} = $headers;
$input->{INPUT_MATCHED_BY_INDEX()} = $headers->{ $input->{INPUT_MATCHED_BY()} };
}
# check real headers vs. headers as defined in configuration
my $already_reported = {};
foreach my $col (@$wanted_cols) {
next if $col->{CFG_TYPE()} ne CFG_MATCH; # check is done only for normal columns
my $input_id = $col->{CFG_ID()};
if (exists $headers_by_id->{$input_id}) {
# does the requested column exist in this input's headers?
unless (column_exists ($input_id, $col->{CFG_IN_COL()})) {
warn "[WR06] Column '$col->{CFG_IN_COL()}' not found in the input '$input_id'. Column will be ignored.\n";
$col->{ignored} = 1;
}
next;
} elsif (!exists $already_reported->{$input_id}) {
$already_reported->{$input_id} = 1;
warn "[WR07] Configuration defines columns from an input '$input_id' but no such input given (or was ignored). These columns will be ignored.\n";
}
$col->{ignored} = 1;
}
$wanted_cols = [ grep { not $_->{ignored} } @$wanted_cols ];
foreach my $input_id (keys %$matches) {
next unless exists $inputs->{$input_id}; # ignoring matches whose inputs are already ignored
# does the matching column exist in this input's headers?
unless (column_exists ($input_id, $matches->{$input_id})) {
die "[ER02] Matching column '$matches->{$input_id}' not found in the input '$input_id'. Must exit.\n";
}
}
# do we still have a primary input?
unless (exists $inputs->{$primary_input}) {
die "[ER03] Due to errors, the primary input '$primary_input' is now ignored. Must exit.\n";
}
# end of checking
exit (0) if $opt->check;
# read all inputs into memory
foreach my $input_id (keys %$inputs) {
my $input = $inputs->{$input_id};
my $content = read_content ($input);
$input->{INPUT_CONTENT()} = $content;
}
# output combined headers
my @header_line = ();
foreach my $col (@$wanted_cols) {
push (@header_line, $col->{CFG_OUT_COL()});
}
print $combined join ("\t", @header_line) . "\n"
unless scalar @header_line == 0;
# combine all inputs and make output lines
foreach my $matching_content (sort keys %{ $inputs->{$primary_input}->{INPUT_CONTENT()} }) {
# $matching_content is, for example, a publication title ("An Atlas of....")
# inputs may have more lines with the same value in the matching columns
# therefore, extract first the matching lines from all inputs
my $lines_to_combine = [];
my $inputs_to_combine = {}; # keys are inputs' CFG_IDs, values are indeces into $lines_to_combine
foreach my $col (@$wanted_cols) {
if ($col->{CFG_TYPE()} eq CFG_MATCH) {
unless (exists $inputs_to_combine->{ $col->{CFG_ID()} }) {
# remember the same lines (from the same input) only once
my $input = $inputs->{ $col->{CFG_ID()} };
push (@$lines_to_combine, $input->{INPUT_CONTENT()}->{$matching_content} || [undef]);
$inputs_to_combine->{ $col->{CFG_ID()} } = $#$lines_to_combine;
}
}
}
# make all combinantions of matching lines
# let's have 3 inputs, identified by K, L and M
# there are three matching lines in K, two in L and one in M:
# my $lines_to_combine = [
# [ "line1", "line2", "line3", ], # from input K
# [ "lineX", "lineY", ], # from input L
# [ "lineQ", ], # from input M
# );
# my $inputs_to_combine = { K => 0, L => 1, M => 2 };
#
# the subroutine create_output_line() will be called 6 times
# with the following arguments:
# line1, lineX, lineQ
# line1, lineY, lineQ
# line2, lineX, lineQ
# line2, lineY, lineQ
# line3, lineX, lineQ
# line3, lineY, lineQ
NestedLoops ($lines_to_combine,
sub {
my @input_lines = @_;
my @output_line = ();
my @calculated = (); # indeces of the yet-to-be-calculated elements
my $column_count = -1;
foreach my $col (@$wanted_cols) { # $col defines what data to push into @output_line
$column_count++;
if ($col->{CFG_TYPE()} eq CFG_MATCH) {
my $input = $inputs->{ $col->{CFG_ID()} };
my $input_line = @input_lines[$inputs_to_combine->{ $col->{CFG_ID()} }];
# use Data::Dumper;
# print Dumper (\@input_lines);
# print Dumper ($inputs_to_combine);
my $idx = $input->{INPUT_HEADERS()}->{ $col->{CFG_IN_COL()} };
my $value = $input_line->[$idx] || '';
push (@output_line, $value);
} else {
push (@calculated, $column_count);
push (@output_line, '');
}
}
# insert the calculated columns
foreach my $idx (@calculated) {
if ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PROG) {
$output_line[$idx] = call_prog ($wanted_cols->[$idx], \@header_line, \@output_line);
} elsif ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PROGS) {
$output_line[$idx] = call_prog_simple ($wanted_cols->[$idx]);
} elsif ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PERL) {
$output_line[$idx] = call_perl ($wanted_cols->[$idx], \@header_line, \@output_line);
}
}
print $combined join ("\t", @output_line) . "\n"
unless scalar @output_line == 0;
});
}
close $combined if $opt_outfile;
}
# ----------------------------------------------------------------
# Call a Perl subroutine (from any module) in order to get a value for
# a "calculated" column. $column defines which column to fill,
# $header_line is an arra is an arrayref with column headers and the
# $data_line is another arrayref with the values for the current row.
#
# $column->{PERL_DETAILS} contains all details needed for the call
# ----------------------------------------------------------------
sub call_perl {
my ($column, $header_line, $data_line) = @_;
no strict; ## no critic
my $what_to_call = $column->{PERL_DETAILS()}->{what_to_call};
my $how_to_call = $column->{PERL_DETAILS()}->{how_to_call};
my $module = $column->{PERL_DETAILS()}->{module};
my $subroutine = $column->{PERL_DETAILS()}->{subroutine};
if ($how_to_call eq '->') {
return $module->$subroutine ($column, $header_line, $data_line);
} else {
return &$what_to_call ($column, $header_line, $data_line);
}
}
# ----------------------------------------------------------------
# Call an external program in order to get a value for a "calculated"
# column. $column defines which column to fill, $header_line is an
# arra is an arrayref with column headers and the $data_line is
# another arrayref with the values for the current row.
#
# $column->{CFG_EXT} contains a program name to call
# ----------------------------------------------------------------
sub call_prog {
my ($column, $header_line, $data_line) = @_;
lib/App/combinesheets.pm view on Meta::CPAN
# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub _call_it {
my @command = @_;
my ($stdout, $stderr, $success, $exit_code) = capture_exec (@command);
if ($exit_code != 0 or $stderr) {
my $errmsg = '[ER05] Failed command: ' . join (' ', map {"'$_'"} @command) . "\n";
$errmsg .= "STDERR: $stderr\n" if $stderr;
$errmsg .= "EXIT CODE: $exit_code\n";
die $errmsg;
}
chomp $stdout; # remove the last newline
$stdout =~ s{\n}{ }g; # better to replace newlines
return $stdout;
}
# ----------------------------------------------------------------
# Locate given $prgname and return it, usually with an added path. Or
# die if such program cannot be found or it is not executable.
# ----------------------------------------------------------------
sub find_prog {
my $prgname = shift;
my $full_name;
# 1) try the name as it is (e.g. the ones with an absolute path)
if (-e $prgname and -x $prgname and
File::Spec->file_name_is_absolute ($prgname)) {
return $prgname;
}
# 2) try to find it on system PATH
$full_name = which ($prgname);
if ($full_name ) {
chomp $full_name;
return $full_name;
}
# 3) try the environment variable with a path
if (exists $ENV{COMBINE_SHEETS_EXT_PATH}) {
$full_name = File::Spec->catfile ($ENV{COMBINE_SHEETS_EXT_PATH}, $prgname);
return maybe_die ($full_name);
}
# 4) try to find it in the current directory
$full_name = File::Spec->catfile ('./', $prgname);
return maybe_die ($full_name);
}
sub maybe_die {
my $prg = shift;
die "[ER04] '$prg' not found or is not executable.\n"
unless -e $prg and -x $prg;
return $prg;
}
# ----------------------------------------------------------------
# Does the requested $column exist in the given input's headers?
# ----------------------------------------------------------------
sub column_exists {
my ($input_id, $column) = @_;
return exists $inputs->{$input_id}->{INPUT_HEADERS()}->{$column};
}
# ----------------------------------------------------------------
# Read the headers (the first line) form an input file (given in
# hashref $input) and store them in the hashref $headers, each od them
# with its index as it appears in the read file. Do nothing if
# $headers already contains headers from the same input identifier.
# ----------------------------------------------------------------
sub read_headers {
my ($input) = @_;
my $headers;
if ($input->{INPUT_TYPE()} eq TYPE_CSV) {
$headers = read_csv_headers ($input->{INPUT_FILE()});
} else {
$headers = read_tsv_headers ($input->{INPUT_FILE()});
}
my $new_headers = {};
my $column_index = 0;
foreach my $column (@$headers) {
$new_headers->{$column} = $column_index++;
}
return $new_headers;
}
# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub read_csv_headers {
my ($file) = @_;
my $line = read_first_line ($file);
my $parser = Text::CSV_XS->new ({
allow_loose_quotes => 1,
escape_char => "\\",
});
if ($parser->parse ($line)) {
return [ $parser->fields ];
} else {
die "[ER04] Parsing CSV file $file failed: " .
$parser->error_input . "\n" .
$parser->error_diag() . "\n";
}
}
# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub read_tsv_headers {
my ($file) = @_;
my $line = read_first_line ($file);
return [ split (m{\t}, $line) ];
}
# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub read_first_line {
my ($file) = @_;
my $fh;
open_bom ($fh, $file); # or open ($fh, '<', $file)
# or die "[ER00] Cannot read input file $file: $!\n";
my $line = <$fh>; # read just one line
close $fh;
$line =~ s{(\r|\n)+$}{}; # remove newlines of any kind
return $line;
}
# ----------------------------------------------------------------
# Stringify a hashref
# ----------------------------------------------------------------
sub ph {
my $hashref = shift;
my $result = '';
my ($key, $value);
while (($key, $value) = each (%$hashref)) {
$result .= "$key => $value,";
}
return substr ($result, 0, -1);
}
# ----------------------------------------------------------------
# Read contents...
# ----------------------------------------------------------------
sub read_content {
my ($input) = @_;
my $content;
if ($input->{INPUT_TYPE()} eq TYPE_CSV) {
$content = read_csv_content ($input->{INPUT_FILE()}, $input->{INPUT_MATCHED_BY_INDEX()});
} else {
$content = read_tsv_content ($input->{INPUT_FILE()}, $input->{INPUT_MATCHED_BY_INDEX()});
}
return $content;
}
# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub read_tsv_content {
my ($file, $matched_index) = @_;
my $fh;
open_bom ($fh, $file); # or open ($fh, '<', $file)
# or die "[ER00] Cannot read input file $file: $!\n";
my $content = {};
my $line_count = 0;
while (my $line = <$fh>) {
next if $line_count++ == 0; # skip header line
next if $line =~ m{^\s*$}; # ignore empty lines
$line =~ s{(\r|\n)+$}{}; # remove newlines of any kind
my @data = split (m{\t}, $line);
$content->{ $data[$matched_index] } = [] unless $content->{ $data[$matched_index] };
push (@{ $content->{ $data[$matched_index] } }, [@data]);
}
close $fh;
return $content;
}
# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub read_csv_content {
my ($file, $matched_index) = @_;
my $count_lines = 0;
my $content = {};
# create a CSV parser; any error in reading input will be fatal
my $csv = Text::CSV_XS->new ({
allow_loose_quotes => 1,
escape_char => "\\",
auto_diag => 1,
});
# read the CSV input
open_bom (my $fh, $file);
while (<$fh>) {
if ($csv->parse ($_)) {
next if $count_lines++ == 0; # headers are ignored
my @data = $csv->fields;
if (@data) {
push (@{ $content->{ $data[$matched_index] } }, \@data);
}
} else {
my $err = $csv->error_input;
warn "[WR09] Possible a wrong or not-readable input file '$file': $err\n";
exit (1);
}
}
# $parser->add_trigger (after_parse => sub {
# my ($self, $data) = @_;
# return if $count_lines++ == 0; # headers are ignored
( run in 0.642 second using v1.01-cache-2.11-cpan-13bb782fe5a )