Bio-ToolBox

 view release on metacpan or  search on metacpan

scripts/manipulate_datasets.pl  view on Meta::CPAN

	if ( defined $opt_target ) {

		# use the command line specified target
		$target = $opt_target;
	}
	else {
		# request target from user
		my $p = ' Enter the new median target:  ';
		$target = prompt($p);
		chomp $target;
	}

	# Work through the requested datasets
	my @datasets_modified;    # a list of which datasets were modified
INDEX_LOOP: foreach my $index (@indices) {

		# Retrieve values and calculate median
		my @cv = $Data->column_values($index);
		shift @cv;            # skip header
		my @values = grep { looks_like_number($_) and $_ != 0 } @cv;

		# I had historically always skipped zero values before, so continue to do so?
		unless (@values) {
			printf " WARNING: no numeric values dataset %s, index %d!\n",
				$Data->name($index), $index;
			next INDEX_LOOP;
		}
		my $median = median(@values);
		printf " The median value for dataset %s is %s\n", $Data->name($index), $median;

		# Calculate correction value
		my $correction_value = $target / $median;

		# Replace values
		my $name = $Data->name($index);
		$index = _prepare_new_destination( $index, '_scaled' ) if $placement =~ /^n/i;
		$Data->iterate(
			sub {
				my $row = shift;
				my $v   = $row->value($index);
				next unless looks_like_number($v);
				$v *= $correction_value;
				$row->value( $index, $v );
			}
		);

		# annotate metadata
		$Data->metadata( $index, 'median_scaled', $target );

		# results
		push @datasets_modified, $name;
	}

	# report results
	if (@datasets_modified) {
		printf " %s were median scaled to $target\n", join( ", ", @datasets_modified );
	}
	return scalar(@datasets_modified);
}

sub percentile_rank_function {

	# Convert one or more columns into percentile ranks.
	#
	# Arguments: an optional list of column indices. When none are passed,
	#   the indices are obtained through _request_indices().
	# Returns:   the number of columns that were converted; 0 when no index
	#   was obtained or no requested column held numeric values.
	#
	# Every numeric value in a column is replaced by rank / total, where rank
	# is the 1-based position of the value in the ascending numeric sort of
	# the column's numeric values and total is the count of those values.
	# Non-numeric entries are left untouched.

	# request datasets
	my @indices;
	if (@_) {

		# provided from an internal subroutine
		@indices = @_;
	}
	else {
		# otherwise request from user
		@indices = _request_indices(
			" Enter one or more column index numbers to convert to percentile rank  "
		);
	}
	unless (@indices) {
		print " WARNING: unknown index number(s). nothing done\n";
		return 0;
	}

	# Where to put new values?
	my $placement = _request_placement();

	# Process each index request
	my @datasets_modified;    # a list of which datasets were modified
	foreach my $index (@indices) {

		# Collect the numeric values of the column
		my @cv = $Data->column_values($index);
		shift @cv;            # skip header
		my @values = grep { looks_like_number($_) } @cv;
		unless (@values) {
			printf " WARNING: no numeric values in dataset %d, %s! Skipping\n",
				$index, $Data->name($index);
			next;
		}

		# Build a lookup of value => percent rank.
		# Values are visited in ascending numeric order; identical values
		# share one hash key, so the last (highest) rank among them wins.
		my $total = scalar @values;
		my %percentrank;
		my $n = 1;
		foreach ( sort { $a <=> $b } @values ) {
			$percentrank{$_} = $n / $total;
			$n++;
		}

		# Replace the contents with the calculated percent rank.
		# The name is captured first because $index is re-pointed at the
		# index returned by _prepare_new_destination() when the placement
		# answer begins with 'n' (presumably a new column - confirm there).
		my $name = $Data->name($index);
		$index = _prepare_new_destination( $index, '_pr' ) if $placement =~ /^n/i;
		$Data->iterate(
			sub {
				my $row = shift;
				my $v   = $row->value($index);

				# leave the callback with return: a 'next' here would exit
				# the anonymous sub through loop control and raise an
				# "Exiting subroutine via next" warning
				return unless looks_like_number($v);
				$row->value( $index, $percentrank{$v} );
			}
		);

		# update metadata
		$Data->metadata( $index, 'converted', 'percent_rank' );

		# done
		push @datasets_modified, $name;
	}

	# report results
	if (@datasets_modified) {
		printf " %s were converted to percent rank\n", join( ", ", @datasets_modified );
	}
	return scalar(@datasets_modified);
}

sub zscore_function {

	# this subroutine will generate a z-score for each value in a dataset

	# identify the datasets to convert
	my @indices;
	if (@_) {

		# provided from an internal subroutine
		@indices = @_;
	}
	else {
		# otherwise request from user
		@indices = _request_indices(
			" Enter one or more column index numbers to convert to z-scores  ");
	}
	unless (@indices) {
		print " WARNING: Unknown columns. Nothing done.\n";
		return 0;
	}

	# Where to put new values?
	my $placement = _request_placement();

	# Process each index request
	my @datasets_modified;    # a list of which datasets were modified
	foreach my $index (@indices) {

		# generate statistics on the dataset
		my @cv = $Data->column_values($index);
		shift @cv;            # skip header
		my @values = grep { looks_like_number($_) } @cv;
		unless (@values) {
			printf " WARNING: no numeric values for index %d, %s! skipping\n",
				$index, $Data->name($index);
			next;
		}
		my $mean = sum0(@values) / scalar(@values);
		my $std  = stddevp(@values);
		printf "   Column %d is %.6f ± %.6f\n", $index, $mean, $std;

		# Replace the current values
		my $name = $Data->name($index);
		$index = _prepare_new_destination( $index, '_Zscore' ) if $placement =~ /^n/i;
		$Data->iterate(
			sub {
				my $row = shift;
				my $v   = $row->value($index);
				next unless looks_like_number($v);
				$v = ( $v - $mean ) / $std;
				$row->value( $index, $v );
			}
		);

		# update metadata
		$Data->metadata( $index, 'converted', 'Z-score' );

scripts/manipulate_datasets.pl  view on Meta::CPAN

		'a' => 'add',
		'u' => 'subtract',
		'y' => 'multiply',
		'v' => 'divide',
		's' => 'scale',
		'p' => 'pr',
		'Z' => 'zscore',
		'l' => 'log',
		'L' => 'delog',
		'f' => 'format',
		'c' => 'combine',
		'r' => 'ratio',
		'd' => 'diff',
		'e' => 'center',
		'w' => 'new',
		'Y' => 'summary',
		'x' => 'export',
		'W' => 'rewrite',
		'h' => 'help',
		'V' => 'view',
		'q' => 'write_quit',
		'Q' => 'quit',
		'm' => 'menu',
	);
}

sub _get_function_to_subroutine_hash {

	# Dispatch table for the program's functions.
	# Returns a flat key => value list in which each key is a function name
	# and each value is a code reference to the subroutine that carries out
	# that function. Two names may point at the same subroutine.
	return (
		stat        => \&print_statistics_function,
		lengthstat  => \&print_length_statistics_function,
		reorder     => \&reorder_function,
		delete      => \&delete_function,
		rename      => \&rename_function,
		number      => \&number_function,
		concatenate => \&concatenate_function,
		split       => \&split_function,
		coordinate  => \&coordinate_function,
		sort        => \&sort_function,
		gsort       => \&genomic_sort_function,
		null        => \&toss_nulls_function,
		duplicate   => \&toss_duplicates_function,
		above       => \&toss_above_threshold_function,
		below       => \&toss_below_threshold_function,
		specific    => \&toss_specific_values_function,
		keep        => \&keep_specific_values_function,
		lengthfilt  => \&filter_length_function,
		addname     => \&addname_function,
		cnull       => \&convert_nulls_function,
		absolute    => \&convert_absolute_function,
		minimum     => \&minimum_function,
		maximum     => \&maximum_function,
		add         => \&add_function,
		subtract    => \&subtract_function,
		multiply    => \&multiply_function,
		divide      => \&divide_function,
		scale       => \&median_scale_function,
		pr          => \&percentile_rank_function,
		zscore      => \&zscore_function,
		log         => \&log_function,

		# older name retained as a holdover from previous versions
		log2        => \&log_function,
		delog       => \&delog_function,
		delog2      => \&delog_function,
		format      => \&format_function,
		combine     => \&combine_function,
		ratio       => \&ratio_function,
		diff        => \&difference_function,
		center      => \&center_function,
		new         => \&new_column_function,
		summary     => \&write_summary_function,
		export      => \&export_function,
		rewrite     => \&rewrite_function,
		view        => \&view_function,
		help        => \&print_online_help,
		menu        => \&print_menu,
		write_quit  => \&write_and_quit_function,
		quit        => \&quit_function,
	);
}

sub _request_index {

	# Resolve the single dataset index to operate on.
	# Argument: the custom request line to show the user when asking.
	# Returns:  the index number, or -1 as an error value.
	my $request_line = shift;

	# indices given on the command line take precedence; only the first
	# element of the global index array is used, and it must validate
	if (@opt_indices) {
		my $first = $opt_indices[0];
		return _validate_index_list($first) ? $first : -1;
	}

	# nothing on the command line, so request interactively from the user;
	# an undefined answer is reported with the error value
	my $answer = ask_user_for_index( $Data, $request_line );
	return defined $answer ? $answer : -1;
}

sub _request_indices {

	# this subroutine will determine which datasets are to be used
	# if the indices are specified on the command line, those will be used
	# alternatively, it will ask the user for the indices interactively
	my $line = shift;    # the custom request line to give the user

	# get list of indices
	if (@opt_indices) {

scripts/manipulate_datasets.pl  view on Meta::CPAN

Convert signed values to their absolute value equivalents. One or 
more columns may be selected to convert.

=item B<minimum> (menu option B<I>)

Reset datapoints whose values are less than a specified minimum 
value to the minimum value. One or more columns may be selected 
to reset values to the minimum. The minimum value may be requested 
interactively or specified with the C<--target> option. 

=item B<maximum> (menu option B<X>)

Reset datapoints whose values are greater than a specified maximum 
value to the maximum value. One or more columns may be selected 
to reset values to the maximum. The maximum value may be requested 
interactively or specified with the C<--target> option. 

=item B<add> (menu option B<a>)

Add a value to a column. A real number may be supplied, or the words
'mean', 'median', or 'sum' may be entered as a proxy for those statistical
values of the column. The column may either be replaced or added
as a new one. For automatic execution, specify the number using the
C<--target> option.

=item B<subtract> (menu option B<u>)

Subtract a value from a column. A real number may be supplied, or the words
'mean', 'median', or 'sum' may be entered as a proxy for those statistical
values of the column. The column may either be replaced or added
as a new one. For automatic execution, specify the number using the
C<--target> option.

=item B<multiply> (menu option B<y>)

Multiply a column by a value. A real number may be supplied, or the words
'mean', 'median', or 'sum' may be entered as a proxy for those statistical
values of the column. The column may either be replaced or added
as a new one. For automatic execution, specify the number using the
C<--target> option.

=item B<divide> (menu option B<v>)

Divide a column by a value. A real number may be supplied, or the words
'mean', 'median', or 'sum' may be entered as a proxy for those statistical
values of the column. The column may either be replaced or added
as a new one. For automatic execution, specify the number using the
C<--target> option.

=item B<scale> (menu option B<s>)

A column may be median scaled as a means of normalization 
with other columns. The current median of the column requested is
presented, and a new median target is requested. The column may 
either be replaced with the median scaled values or added as a new 
column. For automatic execution, specify the new median target 
with the C<--target> option.

=item B<pr> (menu option B<p>)

A column may be converted to a percentile rank, whereby the
data values are sorted in ascending order and assigned a new value 
from 0..1 based on its rank in the sorted order. The column may 
either be replaced with the percentile rank or added as a new
column. The original order of the column is maintained.

=item B<zscore> (menu option B<Z>)

Generate a Z-score or standard score for each value in a column. The
Z-score is the number of standard deviations the value is away from
the column's mean, such that the new mean is 0 and the standard 
deviation is 1. Provides a simple method of normalizing columns
with disparate dynamic ranges.

=item B<log> (menu option B<l>)

A column may be converted to log values. The column may either 
be replaced with the log values or added as a new column. Use 
the C<--target> option to specify the base (usually 2 or 10).

=item B<delog> (menu option B<L>)

A column that is currently in log space may be converted back to
normal numbers. The column may either be replaced with the 
new values or added as a new column. Use the C<--target> option to 
specify the base (usually 2 or 10). The base may be obtained from the 
metadata.

=item B<format> (menu option B<f>)

Format the numbers of a column to a given number of decimal places. 
An integer must be provided. The column may either be replaced or 
added as a new column. For automatic execution, use the C<--target> 
option to specify the number of decimal places.

=item B<combine> (menu option B<c>)

Mathematically combine the data values in two or more columns. The 
methods for combining the values include mean, median, min, max, 
stdev, or sum. The method may be specified on the command line 
using the C<--target> option. The combined data values are added as a 
new column.

=item B<ratio> (menu option B<r>)

A ratio may be generated between two columns. The experiment and 
control columns are requested and the ratio is added as a new
column. For log2 numbers, the control is subtracted from the
experiment. The log2 status is checked in the metadata for the 
specified columns, or may be specified as a command line option, or
asked of the user.

=item B<diff> (menu option B<d>)

A simple difference is generated between two existing columns. The 
values in the 'control' column are simply subtracted from the 
values in the 'experimental' column and recorded as a new column.
For enumerated columns (e.g. tag counts from Next Generation 
Sequencing), the columns should be subsampled to equalize the sums 
of the two columns. The indices for the experimental and control columns 
may either be requested from the user or supplied by the C<--exp> and 
C<--con> command line options. 

=item B<center> (menu option B<e>)



( run in 0.967 second using v1.01-cache-2.11-cpan-483215c6ad5 )