CLIPSeqTools
view release on metacpan or search on metacpan
lib/CLIPSeqTools/App/nmer_enrichment_over_shuffled.pm view on Meta::CPAN
=head1 NAME
CLIPSeqTools::App::nmer_enrichment_over_shuffled - Measure Nmer enrichment over shuffled reads.
=head1 SYNOPSIS
clipseqtools nmer_enrichment_over_shuffled [options/parameters]
=head1 DESCRIPTION
Measure words of length N (Nmer) enrichment and compare to shuffled reads. Shuffling is done at the nucleotide level and p-values are calculated using permutations.
=head1 OPTIONS
Input options for library.
--driver <Str> driver for database connection (eg. mysql,
SQLite).
--database <Str> database name or path to database file for file
based databases (eg. SQLite).
--table <Str> database table.
--host <Str> hostname for database connection.
--user <Str> username for database connection.
--password <Str> password for database connection.
--records_class <Str> type of records stored in database.
--filter <Filter> filter library. May be used multiple times.
Syntax: column_name="pattern"
e.g. keep reads with deletions AND not repeat
masked AND longer than 31
--filter deletion="def"
--filter rmsk="undef" .
--filter query_length=">31".
Operators: >, >=, <, <=, =, !=, def, undef
Output
--o_prefix <Str> output path prefix. Script will create and add
extension to path. [Default: ./]
Other options.
--nmer_length <Int> length N of the Nmer. Default: 6
--permutations <Int> number of permutation to be performed.
Use more than 100 to get p-values < 0.01.
--subset <Num> run analysis on random subset. Option specifies
number (if integer) or percent (if % is used)
of data to be used.
-v --verbose print progress lines and extra information.
-h -? --usage --help print help message
=cut
package CLIPSeqTools::App::nmer_enrichment_over_shuffled;
$CLIPSeqTools::App::nmer_enrichment_over_shuffled::VERSION = '1.0.0';
# Make it an app command
use MooseX::App::Command;
extends 'CLIPSeqTools::App';
#######################################################################
####################### Load External modules #####################
#######################################################################
use Modern::Perl;
use autodie;
use namespace::autoclean;
use List::Util qw(sum max);
use List::Util qw(shuffle);
#######################################################################
####################### Command line options ######################
#######################################################################
option 'nmer_length' => (
is => 'rw',
isa => 'Int',
default => 6,
documentation => 'the length N of the Nmer.',
);
option 'permutations' => (
is => 'rw',
isa => 'Int',
default => 100,
documentation => 'the number of permutation to be performed. Use more than 100 to get p-values < 0.01.',
);
option 'subset' => (
is => 'rw',
isa => 'Str',
default => '100%',
documentation => 'run analysis on random subset. Option specifies the number (if integer) or percent (if % is used) of data to be used.',
);
#######################################################################
########################## Consume Roles ##########################
#######################################################################
with
"CLIPSeqTools::Role::Option::Library" => {
-alias => { validate_args => '_validate_args_for_library' },
-excludes => 'validate_args',
},
"CLIPSeqTools::Role::Option::OutputPrefix" => {
-alias => { validate_args => '_validate_args_for_output_prefix' },
-excludes => 'validate_args',
};
#######################################################################
######################## Interface Methods ########################
#######################################################################
sub validate_args {
my ($self) = @_;
$self->_validate_args_for_library;
$self->_validate_args_for_output_prefix;
}
sub run {
my ($self) = @_;
warn "Starting analysis: nmer_enrichment_over_shuffled\n";
warn "Validating arguments\n" if $self->verbose;
$self->validate_args();
warn "Creating reads collection\n" if $self->verbose;
my $reads_collection = $self->reads_collection;
my $subset_factor = 1;
if ($self->subset ne '100%') {
warn "Calculating subset parameters\n" if $self->verbose;
my $records_count = $reads_collection->records_count;
if ($self->subset =~ /(.+)%/) {
$subset_factor = $1 / 100;
}
else {
$subset_factor = $self->subset / $records_count;
}
warn "Program will run on approximatelly ".sprintf('%.0f', $subset_factor*100)."% of library\n" if $self->verbose;
}
warn "Counting Nmer occurrences in original and shuffled reads\n" if $self->verbose;
my %nmer_stats;
$reads_collection->foreach_record_do(sub{
my ($record) = @_;
return 0 if rand > $subset_factor;
my $seq = $record->sequence;
for my $i (0..length($seq)-$self->nmer_length) {
( run in 2.787 seconds using v1.01-cache-2.11-cpan-0bb4e1dffa6 )