App-combinesheets

 view release on metacpan or  search on metacpan

lib/App/combinesheets.pm  view on Meta::CPAN


# types of input files
use constant {
    TYPE_CSV => 'csv',
    TYPE_TSV => 'tsv',
#    TYPE_XSL => 'xsl',    # not-yet-supported
};

# hash keys describing an input ($inputs)
use constant {
    INPUT_FILE              => 'file',
    INPUT_TYPE              => 'type',
    INPUT_MATCHED_BY        => 'matched_by',
    INPUT_MATCHED_BY_INDEX  => 'matched_by_index',
    INPUT_HEADERS           => 'headers',
    INPUT_CONTENT           => 'content',
};

# hash keys describing wanted fields ($wanted_columns)
use constant {
    CFG_TYPE     => 'type',   # what kind of input (MATCH, PROG, PROGS or PERL)
    CFG_OUT_COL  => 'ocol',   # a name for this column used in the output
    # keys used for the normal (MATCH) columns
    CFG_ID       => 'id',     # which input
    CFG_IN_COL   => 'icol',   # a column in such input
    # keys used for the calculated columns (i.e. of type PROG, PROGS or PERL)

lib/App/combinesheets.pm  view on Meta::CPAN

        $key = uc ($key);
        unless (exists $known_inputs->{$key}) {
            warn "[WR03] Configuration does not recognize the input named '$key'. Input ignored.\n";
            next;
        }
        unless (exists $matches->{$key}) {
            warn "[WR04] Input named '$key' does not have any MATCH column defined in configuration. Input ignored.\n";
            next;
        }
        $primary_input = $key unless $primary_input;   # remember which input came first
        my $input = { INPUT_FILE()       => $value,
                      INPUT_MATCHED_BY() => $matches->{$key} };
        if ($value =~ m{\.csv$}i) {
            $input->{INPUT_TYPE()} = TYPE_CSV;
        } else {
            $input->{INPUT_TYPE()} = TYPE_TSV;
        }
        $inputs->{$key} = $input;
    }
    die "[ER01] No valid inputs specified. Exiting.\n"
        unless scalar keys (%$inputs) > 0;

    # read headers from all inputs
    my $headers_by_id = {};  # used for re-using the same headers once read, and for some checks
    foreach my $input_id (keys %$inputs) {
        my $input = $inputs->{$input_id};
        my $headers;
        if (exists $headers_by_id->{$input_id}) {
            $headers = $headers_by_id->{$input_id};   # copy already known headers
        } else {
            $headers = read_headers ($input);
        }

        # add new properties to $input
        unless (exists $headers->{ $input->{INPUT_MATCHED_BY()} }) {
            warn ("[WR05] Input '$input_id' does not contain the matching header '" . $input->{INPUT_MATCHED_BY()} .
                  "'. Input ignored\n");
            delete $inputs->{$input_id};
            next;
        }
        $headers_by_id->{$input_id} = $headers
            unless exists $headers_by_id->{$input_id};
        $input->{INPUT_HEADERS()} = $headers;
        $input->{INPUT_MATCHED_BY_INDEX()} = $headers->{ $input->{INPUT_MATCHED_BY()} };
    }

    # check real headers vs. headers as defined in configuration
    my $already_reported = {};
    foreach my $col (@$wanted_cols) {
        next if $col->{CFG_TYPE()} ne CFG_MATCH;   # check is done only for normal columns
        my $input_id = $col->{CFG_ID()};
        if (exists $headers_by_id->{$input_id}) {
            # does the requested column exist in this input's headers?
            unless (column_exists ($input_id, $col->{CFG_IN_COL()})) {

lib/App/combinesheets.pm  view on Meta::CPAN

        die "[ER03] Due to errors, the primary input '$primary_input' is now ignored. Must exit.\n";
    }

    # end of checking
    exit (0) if $opt->check;

    # read all inputs into memory
    foreach my $input_id (keys %$inputs) {
        my $input = $inputs->{$input_id};
        my $content = read_content ($input);
        $input->{INPUT_CONTENT()} = $content;
    }

    # output combined headers
    my @header_line = ();
    foreach my $col (@$wanted_cols) {
        push (@header_line, $col->{CFG_OUT_COL()});
    }
    print $combined join ("\t", @header_line) . "\n"
        unless scalar @header_line == 0;

    # combine all inputs and make output lines
    foreach my $matching_content (sort keys %{ $inputs->{$primary_input}->{INPUT_CONTENT()} }) {
        # $matching_content is, for example, a publication title ("An Atlas of....")

        # inputs may have more lines with the same value in the matching columns
        # therefore, extract first the matching lines from all inputs
        my $lines_to_combine = [];
        my $inputs_to_combine = {};  # keys are inputs' CFG_IDs, values are indeces into $lines_to_combine

        foreach my $col (@$wanted_cols) {
            if ($col->{CFG_TYPE()} eq CFG_MATCH) {
                unless (exists $inputs_to_combine->{ $col->{CFG_ID()} }) {
                    # remember the same lines (from the same input) only once
                    my $input = $inputs->{ $col->{CFG_ID()} };
                    push (@$lines_to_combine, $input->{INPUT_CONTENT()}->{$matching_content} || [undef]);
                    $inputs_to_combine->{ $col->{CFG_ID()} } = $#$lines_to_combine;
                }
            }
        }

        # make all combinantions of matching lines

        # let's have 3 inputs, identified by K, L and M
        # there are three matching lines in K, two in L and one in M:
        # my $lines_to_combine = [

lib/App/combinesheets.pm  view on Meta::CPAN

                         my @calculated = ();   # indeces of the yet-to-be-calculated elements
                         my $column_count = -1;
                         foreach my $col (@$wanted_cols) { # $col defines what data to push into @output_line
                             $column_count++;
                             if ($col->{CFG_TYPE()} eq CFG_MATCH) {
                                 my $input = $inputs->{ $col->{CFG_ID()} };
                                 my $input_line = @input_lines[$inputs_to_combine->{ $col->{CFG_ID()} }];
                                 # use Data::Dumper;
                                 # print Dumper (\@input_lines);
                                 # print Dumper ($inputs_to_combine);
                                 my $idx = $input->{INPUT_HEADERS()}->{ $col->{CFG_IN_COL()} };
                                 my $value = $input_line->[$idx] || '';
                                 push (@output_line, $value);
                             } else {
                                 push (@calculated, $column_count);
                                 push (@output_line, '');
                             }
                         }
                         # insert the calculated columns
                         foreach my $idx (@calculated) {
                             if ($wanted_cols->[$idx]->{CFG_TYPE()} eq CFG_PROG) {

lib/App/combinesheets.pm  view on Meta::CPAN

    die "[ER04] '$prg' not found or is not executable.\n"
        unless -e $prg and -x $prg;
    return $prg;
}

# ----------------------------------------------------------------
# Does the requested $column exist in the given input's headers?
# ----------------------------------------------------------------
sub column_exists {
    my ($input_id, $column) = @_;
    return exists $inputs->{$input_id}->{INPUT_HEADERS()}->{$column};
}

# ----------------------------------------------------------------
# Read the headers (the first line) form an input file (given in
# hashref $input) and store them in the hashref $headers, each od them
# with its index as it appears in the read file. Do nothing if
# $headers already contains headers from the same input identifier.
# ----------------------------------------------------------------
sub read_headers {
    my ($input) = @_;

    my $headers;
    if ($input->{INPUT_TYPE()} eq TYPE_CSV) {
        $headers = read_csv_headers ($input->{INPUT_FILE()});
    } else {
        $headers = read_tsv_headers ($input->{INPUT_FILE()});
    }
    my $new_headers = {};
    my $column_index = 0;
    foreach my $column (@$headers) {
        $new_headers->{$column} = $column_index++;
    }
    return $new_headers;
}

# ----------------------------------------------------------------

lib/App/combinesheets.pm  view on Meta::CPAN

    }
    return substr ($result, 0, -1);
}

# ----------------------------------------------------------------
# Read contents...
# ----------------------------------------------------------------
sub read_content {
    my ($input) = @_;
    my $content;
    if ($input->{INPUT_TYPE()} eq TYPE_CSV) {
        $content = read_csv_content ($input->{INPUT_FILE()}, $input->{INPUT_MATCHED_BY_INDEX()});
    } else {
        $content = read_tsv_content ($input->{INPUT_FILE()}, $input->{INPUT_MATCHED_BY_INDEX()});
    }
    return $content;
}

# ----------------------------------------------------------------
#
# ----------------------------------------------------------------
sub read_tsv_content {
    my ($file, $matched_index) = @_;
    my $fh;



( run in 0.478 second using v1.01-cache-2.11-cpan-64827b87656 )