App-Sandy
view release on metacpan or search on metacpan
lib/App/Sandy/Read.pm view on Meta::CPAN
package App::Sandy::Read;
# ABSTRACT: Base class to simulate reads
use App::Sandy::Base 'class';
use List::Util 'first';
use constant NUM_TRIES => 1000;
with 'App::Sandy::Role::BSearch';
our $VERSION = '0.25'; # VERSION
has 'sequencing_error' => (
is => 'ro',
isa => 'My:NumHS',
required => 1
);
has '_count_base' => (
is => 'rw',
isa => 'Int',
default => 0
);
has '_base' => (
is => 'rw',
isa => 'Int',
builder => '_build_base',
lazy_build => 1
);
has '_not_base' => (
is => 'ro',
isa => 'HashRef',
builder => '_build_not_base',
lazy_build => 1
);
sub _build_not_base {
my %not_base = (
A => ['T', 'C', 'G'],
a => ['t', 'c', 'g'],
T => ['A', 'C', 'G'],
t => ['a', 'c', 'g'],
C => ['A', 'T', 'G'],
c => ['a', 't', 'g'],
G => ['A', 'T', 'C'],
g => ['a', 't', 'c']
);
return \%not_base;
}
sub _build_base {
my $self = shift;
# If sequencing_error equal to zero, set _base to zero
return $self->sequencing_error && int(1 / $self->sequencing_error);
}
sub subseq {
my ($self, $seq_ref, $seq_len, $slice_len, $pos) = @_;
my $read = substr $$seq_ref, $pos, $slice_len;
return \$read;
}
sub subseq_rand {
my ($self, $seq_ref, $seq_len, $slice_len, $rng) = @_;
my $usable_len = $seq_len - $slice_len;
# Use App::Sandy::Rand
my $pos = $rng->get_n($usable_len + 1);
my $read = substr $$seq_ref, $pos, $slice_len;
return (\$read, $pos);
}
sub subseq_rand_ptable {
my ($self, $ptable, $ptable_size, $slice_len, $sub_slice_len, $rng, $blacklist) = @_;
my $usable_len = $ptable_size - $slice_len;
state $cmp_func = sub {
my ($key1, $key2) = @_;
if ($key1->[1] >= $key2->[0] && $key1->[0] <= $key2->[1]) {
return 0;
}
elsif ($key1->[0] > $key2->[0]) {
return 1;
} else {
return -1;
}
};
my $is_inside_blacklist;
my $pos = 0;
my $random_tries = 0;
do {
if (++$random_tries > NUM_TRIES) {
croak sprintf
"Too many tries to calculate a valid random position\n" .
"Your FASTA file may be full of NNN regions\n"
}
# Use App::Sandy::Rand
$pos = $rng->get_n($usable_len + 1);
$is_inside_blacklist = $self->with_bsearch([$pos, $pos + $slice_len - 1],
$blacklist, scalar @$blacklist, $cmp_func);
} while (defined $is_inside_blacklist);
my $pieces = $ptable->lookup($pos, $slice_len);
return $self->_build_subseq($pieces, $pos, $slice_len, $sub_slice_len);
lib/App/Sandy/Read.pm view on Meta::CPAN
my $slice_len = $len < $usable_len
? $len
: $usable_len;
my $read = substr ${ $pieces->[0]{ref} }, $pieces->[0]{start} + $offset, $slice_len;
my $miss_len = $len - $slice_len;
for (my $i = 1; $i < @$pieces; $i++) {
push @annot => @{ $pieces->[$i]{annot2} } if @{ $pieces->[$i]{annot2} };
push @annot => $pieces->[$i]{annot1} if $pieces->[$i]{annot1};
$slice_len = $miss_len < $pieces->[$i]{len}
? $miss_len
: $pieces->[$i]{len};
$read .= substr ${ $pieces->[$i]{ref} }, $pieces->[$i]{start}, $slice_len;
$miss_len -= $slice_len;
}
# I must to make this mess in order to annotate paired-end reads :(
my $start_ref = $pieces->[0]{pos} + $offset;
my $end_ref = $start_ref + $len - 1;
my $read_end_ref = $start_ref + $sub_len - 1;
my $read_end_piece = first { $self->_is_pos_inside_piece($read_end_ref, $_) } reverse @$pieces;
my $read_start_ref = $end_ref - $sub_len + 1;
my $read_start_piece = first { $self->_is_pos_inside_piece($read_start_ref, $_) } @$pieces;
my $attr = {
'start' => $pos + 1,
'end' => $pos + $len,
'start_ref' => $pieces->[0]{is_orig} ? $start_ref + 1 : 'NA',
'end_ref' => $pieces->[-1]{is_orig} ? $end_ref + 1 : 'NA',
'read_end_ref' => $read_end_piece->{is_orig} ? $read_end_ref + 1 : 'NA',
'read_start_ref' => $read_start_piece->{is_orig} ? $read_start_ref + 1 : 'NA',
'annot' => \@annot
};
return (\$read, $attr);
}
sub _is_pos_inside_piece {
my ($self, $pos, $piece) = @_;
my $end = $piece->{pos} + $piece->{len} - 1;
return $pos >= $piece->{pos} && $pos <= $end;
}
sub insert_sequencing_error {
my ($self, $seq_ref, $read_size, $rng) = @_;
my @errors;
if ($self->sequencing_error) {
my $acm_base = $read_size + $self->_count_base;
my $num_err = int($acm_base / $self->_base);
my $left_count = $acm_base % $self->_base;
for (my $i = 0; $i < $num_err; $i++) {
my $pos = $i * $self->_base + $self->_base - $self->_count_base - 1;
my $b = substr($$seq_ref, $pos, 1);
my $not_b = $self->_randb($b, $rng);
substr($$seq_ref, $pos, 1) = $not_b;
push @errors => sprintf("%d:%s/%s", $pos + 1, $b, $not_b);
}
$self->_count_base($left_count);
}
return \@errors;
}
sub reverse_complement {
my ($self, $seq_ref) = @_;
$$seq_ref = reverse $$seq_ref;
$$seq_ref =~ tr/atcgATCG/tagcTAGC/;
}
sub _randb {
my ($self, $base, $rng) = @_;
return $self->_not_base->{$base}[$rng->get_n(3)] || $base;
}
__END__
=pod
=encoding UTF-8
=head1 NAME
App::Sandy::Read - Base class to simulate reads
=head1 VERSION
version 0.25
=head1 AUTHORS
=over 4
=item *
Thiago L. A. Miller <tmiller@mochsl.org.br>
=item *
J. Leonel Buzzo <lbuzzo@mochsl.org.br>
=item *
Felipe R. C. dos Santos <fsantos@mochsl.org.br>
=item *
Helena B. Conceição <hconceicao@mochsl.org.br>
=item *
Rodrigo Barreiro <rbarreiro@mochsl.org.br>
=item *
Gabriela Guardia <gguardia@mochsl.org.br>
=item *
Fernanda Orpinelli <forpinelli@mochsl.org.br>
=item *
Rafael Mercuri <rmercuri@mochsl.org.br>
=item *
Rodrigo Barreiro <rbarreiro@mochsl.org.br>
=item *
Pedro A. F. Galante <pgalante@mochsl.org.br>
( run in 0.590 second using v1.01-cache-2.11-cpan-39bf76dae61 )