BioPerl

 view release on metacpan or  search on metacpan

Bio/Assembly/Tools/ContigSpectrum.pm  view on Meta::CPAN

=cut

sub _get_contig_seq_stats {
  my ($self, $contigobj, $seq_hash) = @_;
  my @contig_stats = (0, 0);
  # contig_stats = (avg_length, nof_seq)
  for my $seqobj ($contigobj->each_seq) {
    next if defined $seq_hash && !defined $$seq_hash{$seqobj->id};
    my $seq_string;
    if ($contigobj->isa('Bio::Assembly::Singlet')) { # a singlet
      $seq_string = $contigobj->seqref->seq;
    } else { # a contig
      $seq_string = $seqobj->seq;
    }
    # Number of non-gap characters in the sequence
    my $seq_len = $seqobj->_ungapped_len;
    my @seq_stats = ($seq_len);
    @contig_stats = $self->_update_seq_stats(@contig_stats, @seq_stats);
  }
  return @contig_stats;
}


=head2 _update_seq_stats

  Title   : _update_seq_stats
  Usage   : 
  Function: Update the number of sequences and their average length 1
            average identity 1
            minimum length 1
            minimum identity 1
            number of overlaps 1 average sequence length
  Returns : average sequence length
            number of sequences
  Args    : average sequence length 1
            number of sequences 1
            average sequence length 2
            number of sequences 2           

=cut

sub _update_seq_stats {
  my ($self, $p_avg_length, $p_nof_seq, $n_avg_length, $n_nof_seq) = @_;
  # Defaults
  if (not defined $n_nof_seq) {
    $n_nof_seq = 1;
  }
  # Update overlap statistics
  my $avg_length = 0;
  my $nof_seq = $p_nof_seq + $n_nof_seq;
  if ($nof_seq != 0) {
    $avg_length = ($p_avg_length * $p_nof_seq + $n_avg_length * $n_nof_seq) / $nof_seq;
  }
  return $avg_length, $nof_seq;
}


=head2 _get_assembly_overlap_stats

  Title   : _get_assembly_overlap_stats
  Usage   : my ($avglength, $avgidentity, $minlength, $min_identity, $nof_overlaps)
              = $csp->_get_assembly_overlap_stats($assemblyobj);
  Function: Get statistics about pairwise overlaps in contigs of an assembly
  Returns : average overlap length
            average identity percent
            minimum overlap length
            minimum identity percent
            number of overlaps
  Args    : Bio::Assembly::Scaffold, Contig or Singlet object
            hash reference with the IDs of the sequences to consider [optional]

=cut

sub _get_assembly_overlap_stats {
  my ($self, $assembly_obj, $seq_hash) = @_;

  # Sanity check
  if ( !defined $assembly_obj ||
       ( !$assembly_obj->isa('Bio::Assembly::ScaffoldI') &&
         !$assembly_obj->isa('Bio::Assembly::Contig')       ) ) {
    $self->throw("Must provide a Bio::Assembly::ScaffoldI, Contig or Singlet object");
  }
  $self->throw("Expecting a hash reference. Got [".ref($seq_hash)."]")
    if (defined $seq_hash && ! ref($seq_hash) eq 'HASH');

  # Look at all the contigs (no singlets!)
  my @asm_stats = (0, 0, undef, undef, 0);
  # asm_stats = (avg_length, avg_identity, min_length, min_identity, nof_overlaps)
  for my $contig_obj ( $self->_get_contig_like($assembly_obj) ) {
    @asm_stats = $self->_update_overlap_stats(  @asm_stats,
      $self->_get_contig_overlap_stats($contig_obj, $seq_hash) );
  }

  return @asm_stats;
}


=head2 _get_contig_overlap_stats

  Title   : _get_contig_overlap_stats
  Usage   : my ($avglength, $avgidentity, $minlength, $min_identity, $nof_overlaps)
              = $csp->_get_contig_overlap_stats($contigobj);
  Function: Get statistics about pairwise overlaps in a contig or singlet. The
              statistics are obtained using graph theory: each read is a node
              and the edges between 2 reads are weighted by minus the number of
              conserved residues in the alignment between the 2 reads. The
              minimum spanning tree of this graph represents the overlaps that
              form the contig. Overlaps that do not satisfy the minimum overlap
              length and similarity get a malus on their score.
              Note: This function requires the optional BioPerl dependency
              module called 'Graph'
  Returns : average overlap length
            average identity percent
            minimum overlap length
            minimum identity percent
            number of overlaps
  Args    : Bio::Assembly::Contig or Singlet object
            hash reference with the IDs of the sequences to consider [optional]

=cut

sub _get_contig_overlap_stats {
  my ($self, $contig_obj, $seq_hash) = @_;

  # Sanity check
  $self->throw("Must provide a Bio::Assembly::Contig object")
    if (!defined $contig_obj || !$contig_obj->isa("Bio::Assembly::Contig"));
  $self->throw("Expecting a hash reference. Got [".ref($seq_hash)."]")
    if (defined $seq_hash && ! ref($seq_hash) eq 'HASH');   

  my @contig_stats = (0, 0, undef, undef, 0);
  # contig_stats = (avg_length, avg_identity, min_length, min_identity, nof_overlaps)

  # Build contig graph
  ### consider providing the minima to _contig_graph here too?
  my ($g, $overlaps) = $self->_contig_graph($contig_obj, $seq_hash);

  if ( defined $g ) {
    # Graph minimum spanning tree (tree that goes through strongest overlaps)
    $g = $g->MST_Kruskal();

    # Calculate minimum overlap length and identity for this contig
    for my $edge ( $g->edges ) {
      # Retrieve overlap information
      my ($id1, $id2) = @$edge;
      if (not exists $$overlaps{$id1}{$id2}) {
        ($id2, $id1) = @$edge;
      }
      my ($score, $length, $identity) = @{$$overlaps{$id1}{$id2}};
      # Update contig stats
      my @overlap_stats = ($length, $identity);
      @contig_stats = $self->_update_overlap_stats(@contig_stats, @overlap_stats);
    }
  }

  return @contig_stats;
}


=head2 _update_overlap_stats



( run in 1.231 second using v1.01-cache-2.11-cpan-5735350b133 )