percent results from the CPAN

CLIPSeqTools
=head1 NAME

CLIPSeqTools::App::nmer_enrichment_over_shuffled - Measure Nmer enrichment over shuffled reads.

=head1 SYNOPSIS

clipseqtools nmer_enrichment_over_shuffled [options/parameters]

=head1 DESCRIPTION

Measure words of length N (Nmer) enrichment and compare to shuffled reads. Shuffling is done at the nucleotide level and p-values are calculated using permutations.

=head1 OPTIONS

  Input options for library.
    --driver <Str>         driver for database connection (eg. mysql,
                           SQLite).
    --database <Str>       database name or path to database file for file
                           based databases (eg. SQLite).
    --table <Str>          database table.
    --host <Str>           hostname for database connection.
    --user <Str>           username for database connection.
    --password <Str>       password for database connection.
    --records_class <Str>  type of records stored in database.
    --filter <Filter>      filter library. May be used multiple times.
                           Syntax: column_name="pattern"
                           e.g. keep reads with deletions AND not repeat
                                masked AND longer than 31
                                --filter deletion="def" 
                                --filter rmsk="undef" .
                                --filter query_length=">31".
                           Operators: >, >=, <, <=, =, !=, def, undef

  Output
    --o_prefix <Str>       output path prefix. Script will create and add
                           extension to path. [Default: ./]

  Other options.
    --nmer_length <Int>    length N of the Nmer. Default: 6
    --permutations <Int>   number of permutation to be performed. 
                           Use more than 100 to get p-values < 0.01.
    --subset <Num>         run analysis on random subset. Option specifies
                           number (if integer) or percent (if % is used)
                           of data to be used.
    -v --verbose           print progress lines and extra information.
    -h -? --usage --help   print help message

=cut

package CLIPSeqTools::App::nmer_enrichment_over_shuffled;
$CLIPSeqTools::App::nmer_enrichment_over_shuffled::VERSION = '1.0.0';

# Make it an app command
use MooseX::App::Command;
extends 'CLIPSeqTools::App';


#######################################################################
#######################   Load External modules   #####################
#######################################################################
use Modern::Perl;
use autodie;
use namespace::autoclean;
use List::Util qw(sum max);
use List::Util qw(shuffle);


#######################################################################
#######################   Command line options   ######################
#######################################################################
option 'nmer_length' => (
	is            => 'rw',
	isa           => 'Int',
	default       => 6,
	documentation => 'the length N of the Nmer.',
);

option 'permutations' => (
	is            => 'rw',
	isa           => 'Int',
	default       => 100,
	documentation => 'the number of permutation to be performed. Use more than 100 to get p-values < 0.01.',
);

option 'subset' => (
	is            => 'rw',
	isa           => 'Str',
	default       => '100%',
	documentation => 'run analysis on random subset. Option specifies the number (if integer) or percent (if % is used) of data to be used.',
);


#######################################################################
##########################   Consume Roles   ##########################
#######################################################################
with 
	"CLIPSeqTools::Role::Option::Library" => {
		-alias    => { validate_args => '_validate_args_for_library' },
		-excludes => 'validate_args',
	},
	"CLIPSeqTools::Role::Option::OutputPrefix" => {
		-alias    => { validate_args => '_validate_args_for_output_prefix' },
		-excludes => 'validate_args',
	};


#######################################################################
########################   Interface Methods   ########################
#######################################################################
sub validate_args {
	my ($self) = @_;
	
	$self->_validate_args_for_library;
	$self->_validate_args_for_output_prefix;
}

sub run {
	my ($self) = @_;
	
	warn "Starting analysis: nmer_enrichment_over_shuffled\n";
	
	warn "Validating arguments\n" if $self->verbose;
	$self->validate_args();

	warn "Creating reads collection\n" if $self->verbose;
	my $reads_collection = $self->reads_collection;

	my $subset_factor = 1;
	if ($self->subset ne '100%') {
		warn "Calculating subset parameters\n" if $self->verbose;
		my $records_count = $reads_collection->records_count;
		if ($self->subset =~ /(.+)%/) {
			$subset_factor = $1 / 100;
		}
		else {
			$subset_factor = $self->subset / $records_count;
		}
		warn "Program will run on approximatelly ".sprintf('%.0f', $subset_factor*100)."% of library\n" if $self->verbose;
	}

	warn "Counting Nmer occurrences in original and shuffled reads\n" if $self->verbose;
	my %nmer_stats;
	$reads_collection->foreach_record_do(sub{
		my ($record) = @_;
		
		return 0 if rand > $subset_factor;
		
		my $seq = $record->sequence;
		for my $i (0..length($seq)-$self->nmer_length) {
( run in 2.787 seconds using v1.01-cache-2.11-cpan-0bb4e1dffa6 )