App-Sandy

 view release on metacpan or  search on metacpan

lib/App/Sandy/Simulator.pm  view on Meta::CPAN


			# Populate alternative seq_id
			$self->_populate_piece_table($piece_table{$seq_id}{alt}{table}, $snvs);
		}
	}

	# Initialize the logical offsets and valodate the
	# new size due to the genomic variation

	my @blacklist;

	for my $seq_id (keys %piece_table) {
		my $type_h = delete $piece_table{$seq_id};

		for my $type (keys %$type_h) {
			my $table_h = delete $type_h->{$type};
			my $table = $table_h->{table};

			# Initialize the logical offset
			$table->calculate_logical_offset;

			# Get the new size
			my $new_size = $table->logical_len;

			unless ($self->truncate) {
				my $class = ref $self->seq;

				if ($class eq 'App::Sandy::Seq::SingleEnd') {
					if ($new_size < $self->seq->read_mean) {
						log_msg ":: Skip '$seq_id:$type': So many deletions resulted in a sequence lesser than the required read-mean";
						next;
					}
				} elsif ($class eq 'App::Sandy::Seq::PairedEnd') {
					if ($new_size < $self->seq->fragment_mean) {
						log_msg ":: Skip '$seq_id:$type': So many deletions resulted in a sequence lesser than the required fragment mean";
						next;
					}
				} else {
					die "No valid options for 'seq'";
				}
			}

			# If all's right
			$table_h->{size} = $new_size;
			$type_h->{$type} = $table_h;
		}

		# if there is at least one type,
		# then return it to the piece_table
		if (%$type_h) {
			$piece_table{$seq_id} = $type_h;

		# else, just remove it!
		} else {
			push @blacklist => $seq_id;
		}
	}

	unless (%piece_table) {
		die "All fasta entries were removed due to deletions. ",
			"Please, verify the genomic variation '$genomic_variation'\n";
	}

	# If fasta_rtree has entries
	unless ($self->_has_no_fasta_rtree) {
		# Remove no valid entries from id -> pid relation
		$self->_delete_fasta_rtree(@blacklist) if @blacklist;
	}

	# Make the id -> pid relationship
	$self->_populate_fasta_tree;

	# HASH -> SEQ_ID -> @(REF @ALT) -> @(TABLE SIZE)
	return \%piece_table;
}

sub _populate_piece_table {
	my ($self, $table, $snvs) = @_;

	for my $snv (@$snvs) {
		# If there is an ID, make sure that it is not a comma, colon
		# separated list. Else, make sure to keep the ref/alt length
		# to max 25+25+1=51
		my $annot = defined $snv->{id} && $snv->{id} ne '.'
			? sprintf "%d:%s" => $snv->{pos} + 1, (split(/[,;]/, $snv->{id}))[0]
			: sprintf "%d:%.25s/%.25s" => $snv->{pos} + 1, $snv->{ref}, $snv->{alt};

		# Insertion
		if ($snv->{ref} eq '-') {
			$table->insert(\$snv->{alt}, $snv->{pos}, $annot);

		# Deletion
		} elsif ($snv->{alt} eq '-') {
			$table->delete($snv->{pos}, length $snv->{ref}, $annot);

		# Change
		} else {
			$table->change(\$snv->{alt}, $snv->{pos}, length $snv->{ref}, $annot);
		}
	}
}

sub _validate_indexed_snv_against_fasta {
	my ($self, $indexed_snv) = @_;

	my $indexed_fasta = $self->_fasta;
	my $genomic_variation = $self->_genomic_variation_names;

	for my $std_seq_id (keys %$indexed_snv) {
		my $snvs = delete $indexed_snv->{$std_seq_id};
		my $seq_id = $self->_get_seqname($std_seq_id);

		unless (defined $seq_id && exists $indexed_fasta->{$seq_id}) {
			next;
		}

		my $seq = \$indexed_fasta->{$seq_id}{seq};
		my $size = $indexed_fasta->{$seq_id}{size};

		my @saved_snvs;

		for my $snv (@$snvs) {
			# Insertions may accur until one base after the
			# end of the sequence, not more
			if (($snv->{ref} eq '-' && $snv->{pos} > $size) || ($snv->{ref} ne '-' && $snv->{pos} >= $size)) {
				log_msg sprintf ":: In validating '%s': Position, %s/%s at %s:%d, outside fasta sequence",
					$genomic_variation, $snv->{ref}, $snv->{alt}, $seq_id, $snv->{pos} + 1;

				# Next snv
				next;
			# Deletions and changes. Just verify if the reference exists
			} elsif ($snv->{ref} ne '-') {
				my $ref = substr $$seq, $snv->{pos}, length($snv->{ref});

				if (uc($ref) ne uc($snv->{ref})) {
					log_msg sprintf ":: In validating '%s': Not found reference '%s' at fasta position %s:%d",
						$genomic_variation, $snv->{ref}, $seq_id, $snv->{pos} + 1;

					# Next snv
					next;
				}
			}

			push @saved_snvs  => $snv;
		}

		if (@saved_snvs) {
			$indexed_snv->{$std_seq_id} = [@saved_snvs];
		}
	}
}

sub _calculate_number_of_reads {
	my $self = shift;
	my $number_of_reads;

	if ($self->count_loops_by eq 'coverage') {
		# It is needed to calculate the genome size
		my $fasta = $self->_fasta;
		my $fasta_size = 0;
		$fasta_size += $fasta->{$_}{size} for keys %{ $fasta };
		$number_of_reads = int(($fasta_size * $self->coverage) / $self->seq->read_mean);
		# In case it is paired-end read, divide the number of reads by 2 because
		# App::Sandy::Seq::PairedEnd class returns 2 reads at time
		$number_of_reads = int($number_of_reads / 2)
			if ref($self->seq) eq 'App::Sandy::Seq::PairedEnd';
	} elsif ($self->count_loops_by eq 'number-of-reads') {
		$number_of_reads = $self->number_of_reads;
	} else {
		croak sprintf "Unknown option '%s' for calculating the number of reads\n",
			$self->count_loops_by;
	}

	# Maybe the number_of_reads is zero. It may occur due to the low coverage and/or fasta_file size
	if ($number_of_reads <= 0) {
		die "The computed number of reads is equal to zero.\n" .
			"It may occur due to the low coverage, fasta-file sequence size or number of reads directly passed by the user\n";
	}

	return $number_of_reads;
}

sub _calculate_parent_count {
	my ($self, $counter_ref) = @_;
	return if $self->_has_no_fasta_rtree;

	my %parent_count;

	while (my ($id, $count) = each %$counter_ref) {
		my $pid = $self->_get_fasta_rtree($id);
		$parent_count{$pid} += $count if defined $pid;



( run in 3.462 seconds using v1.01-cache-2.11-cpan-13bb782fe5a )