App-combinesheets
view release on metacpan or search on metacpan
lib/App/combinesheets.pm view on Meta::CPAN
# types of input files
use constant {
TYPE_CSV => 'csv',
TYPE_TSV => 'tsv',
# TYPE_XSL => 'xsl', # not-yet-supported
};
# hash keys describing an input ($inputs)
use constant {
INPUT_FILE => 'file',
INPUT_TYPE => 'type',
INPUT_MATCHED_BY => 'matched_by',
INPUT_MATCHED_BY_INDEX => 'matched_by_index',
INPUT_HEADERS => 'headers',
INPUT_CONTENT => 'content',
};
# hash keys describing wanted fields ($wanted_columns)
use constant {
CFG_TYPE => 'type', # what kind of input (MATCH, PROG, PROGS or PERL)
CFG_OUT_COL => 'ocol', # a name for this column used in the output
# keys used for the normal (MATCH) columns
CFG_ID => 'id', # which input
CFG_IN_COL => 'icol', # a column in such input
# keys used for the calculated columns (i.e. of type PROG, PROGS or PERL)
lib/App/combinesheets.pm view on Meta::CPAN
$key = uc ($key);
unless (exists $known_inputs->{$key}) {
warn "[WR03] Configuration does not recognize the input named '$key'. Input ignored.\n";
next;
}
unless (exists $matches->{$key}) {
warn "[WR04] Input named '$key' does not have any MATCH column defined in configuration. Input ignored.\n";
next;
}
$primary_input = $key unless $primary_input; # remember which input came first
my $input = { INPUT_FILE() => $value,
INPUT_MATCHED_BY() => $matches->{$key} };
if ($value =~ m{\.csv$}i) {
$input->{INPUT_TYPE()} = TYPE_CSV;
} else {
$input->{INPUT_TYPE()} = TYPE_TSV;
}
$inputs->{$key} = $input;
}
die "[ER01] No valid inputs specified. Exiting.\n"
unless scalar keys (%$inputs) > 0;
# read headers from all inputs
my $headers_by_id = {}; # used for re-using the same headers once read, and for some checks
foreach my $input_id (keys %$inputs) {
my $input = $inputs->{$input_id};
my $headers;
if (exists $headers_by_id->{$input_id}) {
$headers = $headers_by_id->{$input_id}; # copy already known headers
} else {
$headers = read_headers ($input);
}
# add new properties to $input
unless (exists $headers->{ $input->{INPUT_MATCHED_BY()} }) {
warn ("[WR05] Input '$input_id' does not contain the matching header '" . $input->{INPUT_MATCHED_BY()} .
"'. Input ignored\n");
delete $inputs->{$input_id};
next;
}
$headers_by_id->{$input_id} = $headers
unless exists $headers_by_id->{$input_id};
$input->{INPUT_HEADERS()} = $headers;
$input->{INPUT_MATCHED_BY_INDEX()} = $headers->{ $input->{INPUT_MATCHED_BY()} };
}
# check real headers vs. headers as defined in configuration
my $already_reported = {};
foreach my $col (@$wanted_cols) {
next if $col->{CFG_TYPE()} ne CFG_MATCH; # check is done only for normal columns
my $input_id = $col->{CFG_ID()};
if (exists $headers_by_id->{$input_id}) {
# does the requested column exist in this input's headers?
unless (column_exists ($input_id, $col->{CFG_IN_COL()})) {
lib/App/combinesheets.pm view on Meta::CPAN
die "[ER03] Due to errors, the primary input '$primary_input' is now ignored. Must exit.\n";
}
# end of checking
exit (0) if $opt->check;
# read all inputs into memory
foreach my $input_id (keys %$inputs) {
my $input = $inputs->{$input_id};
my $content = read_content ($input);
$input->{INPUT_CONTENT()} = $content;
}
# output combined headers
my @header_line = ();
foreach my $col (@$wanted_cols) {
push (@header_line, $col->{CFG_OUT_COL()});
}
print $combined join ("\t", @header_line) . "\n"
unless scalar @header_line == 0;
# combine all inputs and make output lines
foreach my $matching_content (sort keys %{ $inputs->{$primary_input}->{INPUT_CONTENT()} }) {
# $matching_content is, for example, a publication title ("An Atlas of....")
# inputs may have more lines with the same value in the matching columns
# therefore, extract first the matching lines from all inputs
my $lines_to_combine = [];
my $inputs_to_combine = {}; # keys are inputs' CFG_IDs, values are indeces into $lines_to_combine
foreach my $col (@$wanted_cols) {
if ($col->{CFG_TYPE()} eq CFG_MATCH) {
unless (exists $inputs_to_combine->{ $col->{CFG_ID()} }) {
# remember the same lines (from the same input) only once
my $input = $inputs->{ $col->{CFG_ID()} };
push (@$lines_to_combine, $input->{INPUT_CONTENT()}->{$matching_content} || [undef]);
$inputs_to_combine->{ $col->{CFG_ID()} } = $#$lines_to_combine;
}
}
}
# make all combinantions of matching lines
# let's have 3 inputs, identified by K, L and M
# there are three matching lines in K, two in L and one in M:
# my $lines_to_combine = [
lib/App/combinesheets.pm view on Meta::CPAN
my @calculated = (); # indeces of the yet-to-be-calculated elements
my $column_count = -1;
foreach my $col (@$wanted_cols) { # $col defines what data to push into @output_line
$column_count++;
if ($col->{CFG_TYPE()} eq CFG_MATCH) {
my $input = $inputs->{ $col->{CFG_ID()} };
my $input_line = @input_lines[$inputs_to_combine->{ $col->{CFG_ID()} }];
# use Data::Dumper;
# print Dumper (\@input_lines);
# print Dumper ($inputs_to_combine);
my $idx = $input->{INPUT_HEADERS()}->{ $col->{CFG_IN_COL()} };
my $value = $input_line->[$idx] || '';
push (@output_line, $value);
} else {
push (@calculated, $column_count);
push (@output_line, '');
}
}
# insert the calculated columns
foreach my $idx (@calculated) {
if ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PROG) {
lib/App/combinesheets.pm view on Meta::CPAN
die "[ER04] '$prg' not found or is not executable.\n"
unless -e $prg and -x $prg;
return $prg;
}
# ----------------------------------------------------------------
# Does the requested $column exist in the given input's headers?
# ----------------------------------------------------------------
sub column_exists {
my ($input_id, $column) = @_;
return exists $inputs->{$input_id}->{INPUT_HEADERS()}->{$column};
}
# ----------------------------------------------------------------
# Read the headers (the first line) form an input file (given in
# hashref $input) and store them in the hashref $headers, each od them
# with its index as it appears in the read file. Do nothing if
# $headers already contains headers from the same input identifier.
# ----------------------------------------------------------------
sub read_headers {
my ($input) = @_;
my $headers;
if ($input->{INPUT_TYPE()} eq TYPE_CSV) {
$headers = read_csv_headers ($input->{INPUT_FILE()});
} else {
$headers = read_tsv_headers ($input->{INPUT_FILE()});
}
my $new_headers = {};
my $column_index = 0;
foreach my $column (@$headers) {
$new_headers->{$column} = $column_index++;
}
return $new_headers;
}
# ----------------------------------------------------------------
lib/App/combinesheets.pm view on Meta::CPAN
}
return substr ($result, 0, -1);
}
# ----------------------------------------------------------------
# Read contents...
# ----------------------------------------------------------------
sub read_content {
my ($input) = @_;
my $content;
if ($input->{INPUT_TYPE()} eq TYPE_CSV) {
$content = read_csv_content ($input->{INPUT_FILE()}, $input->{INPUT_MATCHED_BY_INDEX()});
} else {
$content = read_tsv_content ($input->{INPUT_FILE()}, $input->{INPUT_MATCHED_BY_INDEX()});
}
return $content;
}
# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub read_tsv_content {
my ($file, $matched_index) = @_;
my $fh;
( run in 0.478 second using v1.01-cache-2.11-cpan-64827b87656 )