Bio-ToolBox
view release on metacpan or search on metacpan
scripts/manipulate_datasets.pl view on Meta::CPAN
if ( defined $opt_target ) {
# use the command line specified target
$target = $opt_target;
}
else {
# request target from user
my $p = ' Enter the new median target: ';
$target = prompt($p);
chomp $target;
}
# Work through the requested datasets
my @datasets_modified; # a list of which datasets were modified
INDEX_LOOP: foreach my $index (@indices) {
# Retrieve values and calculate median
my @cv = $Data->column_values($index);
shift @cv; # skip header
my @values = grep { looks_like_number($_) and $_ != 0 } @cv;
# I had historically always skipped zero values before, so continue to do so?
unless (@values) {
printf " WARNING: no numeric values dataset %s, index %d!\n",
$Data->name($index), $index;
next INDEX_LOOP;
}
my $median = median(@values);
printf " The median value for dataset %s is %s\n", $Data->name($index), $median;
# Calculate correction value
my $correction_value = $target / $median;
# Replace values
my $name = $Data->name($index);
$index = _prepare_new_destination( $index, '_scaled' ) if $placement =~ /^n/i;
$Data->iterate(
sub {
my $row = shift;
my $v = $row->value($index);
next unless looks_like_number($v);
$v *= $correction_value;
$row->value( $index, $v );
}
);
# annotate metadata
$Data->metadata( $index, 'median_scaled', $target );
# results
push @datasets_modified, $name;
}
# report results
if (@datasets_modified) {
printf " %s were median scaled to $target\n", join( ", ", @datasets_modified );
}
return scalar(@datasets_modified);
}
sub percentile_rank_function {

    # Convert the numeric values of one or more columns into percentile ranks.
    #
    # Arguments: an optional list of column indices. When no arguments are
    #            passed, the indices are obtained through _request_indices().
    # Returns:   the number of columns that were converted; 0 when no
    #            indices were obtained.
    #
    # Each numeric value is replaced by (rank / count), where rank is its
    # 1-based position among the column's numeric values sorted ascending.
    # Non-numeric values are left untouched.

    # request datasets
    my @indices;
    if (@_) {

        # provided from an internal subroutine
        @indices = @_;
    }
    else {
        # otherwise request from user
        @indices = _request_indices(
            " Enter one or more column index numbers to convert to percentile rank "
        );
    }
    unless (@indices) {
        print " WARNING: unknown index number(s). nothing done\n";
        return 0;
    }

    # Where to put new values?
    my $placement = _request_placement();

    # Process each index request
    my @datasets_modified;    # a list of which datasets were modified
    foreach my $index (@indices) {

        # Collect the numeric values of the column
        my @cv = $Data->column_values($index);
        shift @cv;            # skip header
        my @values = grep { looks_like_number($_) } @cv;
        unless (@values) {
            printf " WARNING: no numeric values in dataset %d, %s! Skipping\n",
                $index, $Data->name($index);
            next;
        }

        # Build a lookup of value => percentile rank.
        # The lookup is keyed by the value itself, so tied values overwrite
        # one another and all end up with the rank of their last (highest)
        # position in the sorted list.
        my $total = scalar @values;
        my %percentrank;
        my $n = 1;
        foreach my $value ( sort { $a <=> $b } @values ) {

            # percentile rank is the 1-based sorted position divided by total
            $percentrank{$value} = $n / $total;
            $n++;
        }

        # Replace the contents with the calculated percent rank.
        # The name is recorded before $index is possibly redirected to a
        # newly prepared destination column.
        my $name = $Data->name($index);
        $index = _prepare_new_destination( $index, '_pr' ) if $placement =~ /^n/i;
        $Data->iterate(
            sub {
                my $row = shift;
                my $v   = $row->value($index);

                # Leave non-numeric cells alone. Use return, not next: this
                # is a subroutine body, and loop control here would exit the
                # sub with an "Exiting subroutine via next" warning.
                return unless looks_like_number($v);
                $row->value( $index, $percentrank{$v} );
            }
        );

        # update metadata
        $Data->metadata( $index, 'converted', 'percent_rank' );

        # done
        push @datasets_modified, $name;
    }

    # report results
    if (@datasets_modified) {
        printf " %s were converted to percent rank\n", join( ", ", @datasets_modified );
    }
    return scalar(@datasets_modified);
}
sub zscore_function {
# this subroutine will generate a z-score for each value in a dataset
# identify the datasets to convert
my @indices;
if (@_) {
# provided from an internal subroutine
@indices = @_;
}
else {
# otherwise request from user
@indices = _request_indices(
" Enter one or more column index numbers to convert to z-scores ");
}
unless (@indices) {
print " WARNING: Unknown columns. Nothing done.\n";
return 0;
}
# Where to put new values?
my $placement = _request_placement();
# Process each index request
my @datasets_modified; # a list of which datasets were modified
foreach my $index (@indices) {
# generate statistics on the dataset
my @cv = $Data->column_values($index);
shift @cv; # skip header
my @values = grep { looks_like_number($_) } @cv;
unless (@values) {
printf " WARNING: no numeric values for index %d, %s! skipping\n",
$index, $Data->name($index);
next;
}
my $mean = sum0(@values) / scalar(@values);
my $std = stddevp(@values);
printf " Column %d is %.6f ± %.6f\n", $index, $mean, $std;
# Replace the current values
my $name = $Data->name($index);
$index = _prepare_new_destination( $index, '_Zscore' ) if $placement =~ /^n/i;
$Data->iterate(
sub {
my $row = shift;
my $v = $row->value($index);
next unless looks_like_number($v);
$v = ( $v - $mean ) / $std;
$row->value( $index, $v );
}
);
# update metadata
$Data->metadata( $index, 'converted', 'Z-score' );
scripts/manipulate_datasets.pl view on Meta::CPAN
'a' => 'add',
'u' => 'subtract',
'y' => 'multiply',
'v' => 'divide',
's' => 'scale',
'p' => 'pr',
'Z' => 'zscore',
'l' => 'log',
'L' => 'delog',
'f' => 'format',
'c' => 'combine',
'r' => 'ratio',
'd' => 'diff',
'e' => 'center',
'w' => 'new',
'Y' => 'summary',
'x' => 'export',
'W' => 'rewrite',
'h' => 'help',
'V' => 'view',
'q' => 'write_quit',
'Q' => 'quit',
'm' => 'menu',
);
}
sub _get_function_to_subroutine_hash {

    # Return the dispatch table that maps a function name to the subroutine
    # implementing it.
    #
    # Arguments: none.
    # Returns:   a flat key/value list suitable for assigning to a hash;
    #            each key is a function name and each value is a code
    #            reference to the corresponding subroutine.
    #
    # 'log2' and 'delog2' are kept as aliases of 'log' and 'delog'.
    return (
        'stat'        => \&print_statistics_function,
        'lengthstat'  => \&print_length_statistics_function,
        'reorder'     => \&reorder_function,
        'delete'      => \&delete_function,
        'rename'      => \&rename_function,
        'number'      => \&number_function,
        'concatenate' => \&concatenate_function,
        'split'       => \&split_function,
        'coordinate'  => \&coordinate_function,
        'sort'        => \&sort_function,
        'gsort'       => \&genomic_sort_function,
        'null'        => \&toss_nulls_function,
        'duplicate'   => \&toss_duplicates_function,
        'above'       => \&toss_above_threshold_function,
        'below'       => \&toss_below_threshold_function,
        'specific'    => \&toss_specific_values_function,
        'keep'        => \&keep_specific_values_function,
        'lengthfilt'  => \&filter_length_function,
        'addname'     => \&addname_function,
        'cnull'       => \&convert_nulls_function,
        'absolute'    => \&convert_absolute_function,
        'minimum'     => \&minimum_function,
        'maximum'     => \&maximum_function,
        'add'         => \&add_function,
        'subtract'    => \&subtract_function,
        'multiply'    => \&multiply_function,
        'divide'      => \&divide_function,
        'scale'       => \&median_scale_function,
        'pr'          => \&percentile_rank_function,
        'zscore'      => \&zscore_function,
        'log'         => \&log_function,
        'log2'        => \&log_function,      # holdover from previous
        'delog'       => \&delog_function,
        'delog2'      => \&delog_function,    # holdover from previous
        'format'      => \&format_function,
        'combine'     => \&combine_function,
        'ratio'       => \&ratio_function,
        'diff'        => \&difference_function,
        'center'      => \&center_function,
        'new'         => \&new_column_function,
        'summary'     => \&write_summary_function,
        'export'      => \&export_function,
        'rewrite'     => \&rewrite_function,
        'view'        => \&view_function,
        'help'        => \&print_online_help,
        'menu'        => \&print_menu,
        'write_quit'  => \&write_and_quit_function,
        'quit'        => \&quit_function,
    );
}
sub _request_index {

    # Resolve the single column index that a function should operate on.
    #
    # Arguments: the prompt text to show if the user has to be asked.
    # Returns:   the selected index, or -1 as an error value.
    #
    # Indices given on the command line take priority: the first element of
    # @opt_indices is used, provided it passes _validate_index_list().
    # Otherwise the user is asked interactively.
    my $line = shift;

    # Nothing on the command line: ask the user
    unless (@opt_indices) {
        my $answer = ask_user_for_index( $Data, $line );
        return defined $answer ? $answer : -1;
    }

    # Command line indices were supplied: only the first one is considered
    my $candidate = $opt_indices[0];
    return _validate_index_list($candidate) ? $candidate : -1;
}
sub _request_indices {
# this subroutine will determine which datasets are to be used
# if the indices are specified on the command line, those will be used
# alternatively, it will ask the user for the indices interactively
my $line = shift; # the custom request line to give the user
# get list of indices
if (@opt_indices) {
scripts/manipulate_datasets.pl view on Meta::CPAN
Convert signed values to their absolute value equivalents. One or
more columns may be selected to convert.
=item B<minimum> (menu option B<I>)
Reset datapoints whose values are less than a specified minimum
value to the minimum value. One or more columns may be selected
to reset values to the minimum. The minimum value may be requested
interactively or specified with the C<--target> option.
=item B<maximum> (menu option B<X>)
Reset datapoints whose values are greater than a specified maximum
value to the maximum value. One or more columns may be selected
to reset values to the maximum. The maximum value may be requested
interactively or specified with the C<--target> option.
=item B<add> (menu option B<a>)
Add a value to a column. A real number may be supplied, or the words
'mean', 'median', or 'sum' may be entered as a proxy for those statistical
values of the column. The column may either be replaced or added
as a new one. For automatic execution, specify the number using the
C<--target> option.
=item B<subtract> (menu option B<u>)
Subtract a value from a column. A real number may be supplied, or the words
'mean', 'median', or 'sum' may be entered as a proxy for those statistical
values of the column. The column may either be replaced or added
as a new one. For automatic execution, specify the number using the
C<--target> option.
=item B<multiply> (menu option B<y>)
Multiply a column by a value. A real number may be supplied, or the words
'mean', 'median', or 'sum' may be entered as a proxy for those statistical
values of the column. The column may either be replaced or added
as a new one. For automatic execution, specify the number using the
C<--target> option.
=item B<divide> (menu option B<v>)
Divide a column by a value. A real number may be supplied, or the words
'mean', 'median', or 'sum' may be entered as a proxy for those statistical
values of the column. The column may either be replaced or added
as a new one. For automatic execution, specify the number using the
C<--target> option.
=item B<scale> (menu option B<s>)
A column may be median scaled as a means of normalization
with other columns. The current median of the column requested is
presented, and a new median target is requested. The column may
either be replaced with the median scaled values or added as a new
column. For automatic execution, specify the new median target
with the C<--target> option.
=item B<pr> (menu option B<p>)
A column may be converted to a percentile rank, whereby the
data values are sorted in ascending order and assigned a new value
from 0..1 based on its rank in the sorted order. The column may
either be replaced with the percentile rank or added as a new
column. The original order of the column is maintained.
=item B<zscore> (menu option B<Z>)
Generate a Z-score or standard score for each value in a column. The
Z-score is the number of standard deviations the value is away from
the column's mean, such that the new mean is 0 and the standard
deviation is 1. Provides a simple method of normalizing columns
with disparate dynamic ranges.
=item B<log> (menu option B<l>)
A column may be converted to log values. The column may either
be replaced with the log values or added as a new column. Use
the C<--target> option to specify the base (usually 2 or 10).
=item B<delog> (menu option B<L>)
A column that is currently in log space may be converted back to
normal numbers. The column may either be replaced with the
new values or added as a new column. Use the C<--target> option to
specify the base (usually 2 or 10). The base may be obtained from the
metadata.
=item B<format> (menu option B<f>)
Format the numbers of a column to a given number of decimal places.
An integer must be provided. The column may either be replaced or
added as a new column. For automatic execution, use the C<--target>
option to specify the number of decimal places.
=item B<combine> (menu option B<c>)
Mathematically combine the data values in two or more columns. The
methods for combining the values include mean, median, min, max,
stdev, or sum. The method may be specified on the command line
using the C<--target> option. The combined data values are added as a
new column.
=item B<ratio> (menu option B<r>)
A ratio may be generated between two columns. The experiment and
control columns are requested and the ratio is added as a new
column. For log2 numbers, the control is subtracted from the
experiment. The log2 status is checked in the metadata for the
specified columns, or may be specified as a command line option, or
asked of the user.
=item B<diff> (menu option B<d>)
A simple difference is generated between two existing columns. The
values in the 'control' column are simply subtracted from the
values in the 'experimental' column and recorded as a new column.
For enumerated columns (e.g. tag counts from Next Generation
Sequencing), the columns should be subsampled to equalize the sums
of the two columns. The indices for the experimental and control columns
may either be requested from the user or supplied by the C<--exp> and
C<--con> command line options.
=item B<center> (menu option B<e>)
( run in 0.967 second using v1.01-cache-2.11-cpan-483215c6ad5 )