BioPerl
view release on metacpan or search on metacpan
Bio/Assembly/Tools/ContigSpectrum.pm view on Meta::CPAN
=cut
sub _get_contig_seq_stats {
my ($self, $contigobj, $seq_hash) = @_;
my @contig_stats = (0, 0);
# contig_stats = (avg_length, nof_seq)
for my $seqobj ($contigobj->each_seq) {
next if defined $seq_hash && !defined $$seq_hash{$seqobj->id};
my $seq_string;
if ($contigobj->isa('Bio::Assembly::Singlet')) { # a singlet
$seq_string = $contigobj->seqref->seq;
} else { # a contig
$seq_string = $seqobj->seq;
}
# Number of non-gap characters in the sequence
my $seq_len = $seqobj->_ungapped_len;
my @seq_stats = ($seq_len);
@contig_stats = $self->_update_seq_stats(@contig_stats, @seq_stats);
}
return @contig_stats;
}
=head2 _update_seq_stats
Title : _update_seq_stats
Usage :
Function: Update the number of sequences and their average length 1
average identity 1
minimum length 1
minimum identity 1
number of overlaps 1 average sequence length
Returns : average sequence length
number of sequences
Args : average sequence length 1
number of sequences 1
average sequence length 2
number of sequences 2
=cut
sub _update_seq_stats {
my ($self, $p_avg_length, $p_nof_seq, $n_avg_length, $n_nof_seq) = @_;
# Defaults
if (not defined $n_nof_seq) {
$n_nof_seq = 1;
}
# Update overlap statistics
my $avg_length = 0;
my $nof_seq = $p_nof_seq + $n_nof_seq;
if ($nof_seq != 0) {
$avg_length = ($p_avg_length * $p_nof_seq + $n_avg_length * $n_nof_seq) / $nof_seq;
}
return $avg_length, $nof_seq;
}
=head2 _get_assembly_overlap_stats
Title : _get_assembly_overlap_stats
Usage : my ($avglength, $avgidentity, $minlength, $min_identity, $nof_overlaps)
= $csp->_get_assembly_overlap_stats($assemblyobj);
Function: Get statistics about pairwise overlaps in contigs of an assembly
Returns : average overlap length
average identity percent
minimum overlap length
minimum identity percent
number of overlaps
Args : Bio::Assembly::Scaffold, Contig or Singlet object
hash reference with the IDs of the sequences to consider [optional]
=cut
sub _get_assembly_overlap_stats {
my ($self, $assembly_obj, $seq_hash) = @_;
# Sanity check
if ( !defined $assembly_obj ||
( !$assembly_obj->isa('Bio::Assembly::ScaffoldI') &&
!$assembly_obj->isa('Bio::Assembly::Contig') ) ) {
$self->throw("Must provide a Bio::Assembly::ScaffoldI, Contig or Singlet object");
}
$self->throw("Expecting a hash reference. Got [".ref($seq_hash)."]")
if (defined $seq_hash && ! ref($seq_hash) eq 'HASH');
# Look at all the contigs (no singlets!)
my @asm_stats = (0, 0, undef, undef, 0);
# asm_stats = (avg_length, avg_identity, min_length, min_identity, nof_overlaps)
for my $contig_obj ( $self->_get_contig_like($assembly_obj) ) {
@asm_stats = $self->_update_overlap_stats( @asm_stats,
$self->_get_contig_overlap_stats($contig_obj, $seq_hash) );
}
return @asm_stats;
}
=head2 _get_contig_overlap_stats
Title : _get_contig_overlap_stats
Usage : my ($avglength, $avgidentity, $minlength, $min_identity, $nof_overlaps)
= $csp->_get_contig_overlap_stats($contigobj);
Function: Get statistics about pairwise overlaps in a contig or singlet. The
statistics are obtained using graph theory: each read is a node
and the edges between 2 reads are weighted by minus the number of
conserved residues in the alignment between the 2 reads. The
minimum spanning tree of this graph represents the overlaps that
form the contig. Overlaps that do not satisfy the minimum overlap
length and similarity get a malus on their score.
Note: This function requires the optional BioPerl dependency
module called 'Graph'
Returns : average overlap length
average identity percent
minimum overlap length
minimum identity percent
number of overlaps
Args : Bio::Assembly::Contig or Singlet object
hash reference with the IDs of the sequences to consider [optional]
=cut
sub _get_contig_overlap_stats {
my ($self, $contig_obj, $seq_hash) = @_;
# Sanity check
$self->throw("Must provide a Bio::Assembly::Contig object")
if (!defined $contig_obj || !$contig_obj->isa("Bio::Assembly::Contig"));
$self->throw("Expecting a hash reference. Got [".ref($seq_hash)."]")
if (defined $seq_hash && ! ref($seq_hash) eq 'HASH');
my @contig_stats = (0, 0, undef, undef, 0);
# contig_stats = (avg_length, avg_identity, min_length, min_identity, nof_overlaps)
# Build contig graph
### consider providing the minima to _contig_graph here too?
my ($g, $overlaps) = $self->_contig_graph($contig_obj, $seq_hash);
if ( defined $g ) {
# Graph minimum spanning tree (tree that goes through strongest overlaps)
$g = $g->MST_Kruskal();
# Calculate minimum overlap length and identity for this contig
for my $edge ( $g->edges ) {
# Retrieve overlap information
my ($id1, $id2) = @$edge;
if (not exists $$overlaps{$id1}{$id2}) {
($id2, $id1) = @$edge;
}
my ($score, $length, $identity) = @{$$overlaps{$id1}{$id2}};
# Update contig stats
my @overlap_stats = ($length, $identity);
@contig_stats = $self->_update_overlap_stats(@contig_stats, @overlap_stats);
}
}
return @contig_stats;
}
=head2 _update_overlap_stats
( run in 1.231 second using v1.01-cache-2.11-cpan-5735350b133 )