Bio-MLST-Check

 view release on metacpan or  search on metacpan

lib/Bio/MLST/CompareAlleles.pm  view on Meta::CPAN

  # return len of top blast hit allele, otherwise return len of first seq in allele file
  my ($word_size, $first_seq);
  if( defined $blast_hit->{allele_name} ){
    $word_size = $word_sizes->{$blast_hit->{allele_name}};
  }
  else{
    my $seqio = Bio::SeqIO->new( -file => $allele_filename, -format => 'Fasta' );
    $word_size = $seqio->next_seq->length;
  }

  return $word_size;
}

sub _build_matching_sequences
{
  my ($self) = @_;
  my %matching_sequence_names;
  my %non_matching_sequence_names;
  my %missing_locus_names;
  
  for my $allele_filename (@{$self->allele_filenames})
  {
    my $word_sizes = $self->_word_sizes_for_given_allele_file($allele_filename);
    # TODO: You'll never get matches or contamination noted if there is a SNP
    # near the end of the allele.  This is because we filter all matches which
    # are shorter than the length of the allele in the profiles.  This could
    # mean that we're missing contamination of falsly noting matches against
    # truncated alleles.
    my $blast_results = Bio::MLST::Blast::BlastN->new(
      blast_database => $self->_blast_db_location,
      query_file     => $allele_filename,
      word_sizes     => $word_sizes,
      exec           => $self->blastn_exec
    );
    my %top_blast_hit = %{$blast_results->top_hit()};
   
    # possible missing locus
    if(! %top_blast_hit)
    {
      my %absent_loci_type = %{$self->_absent_loci};
      my $allele = $self->_get_base_filename($allele_filename);
      $missing_locus_names{$allele} = $absent_loci_type{$allele} if exists $absent_loci_type{$allele};
    }

    my $word_size = $self->_get_word_size_from_blast_hit($word_sizes, \%top_blast_hit, $allele_filename);

    # unknown allele
    if(! %top_blast_hit)
    {
      $non_matching_sequence_names{$self->_get_base_filename($allele_filename)} = $self->_pad_out_sequence("", $word_size);
      next;
    }
    
    # more than 1 allele has a good match
    if(defined($top_blast_hit{contamination}))
    {
      $self->contamination(1);
      my $contaminants = $top_blast_hit{contamination};
      my @contaminant_names = map { $_->{allele_name} } @$contaminants;
      # Add tilde to matches which are not 100%
      my @contaminant_names_with_tilde = map { $_->{percentage_identity} == 100 ? $_->{allele_name} : "$_->{allele_name}~" } @$contaminants;
      my $contamination_alleles = join( ',', sort @contaminant_names_with_tilde );
      $self->contamination_alleles( $contamination_alleles );
      $self->_translate_contamination_names_into_sequence_types(\@contaminant_names, $top_blast_hit{allele_name});
    }
    
    $top_blast_hit{allele_name} =~ s![-_]+!-!g;
    
    if($top_blast_hit{percentage_identity} == 100 )
    {
      $matching_sequence_names{$top_blast_hit{allele_name}} = $self->_get_blast_hit_sequence($top_blast_hit{source_name}, $top_blast_hit{source_start},$top_blast_hit{source_end},$word_size,$top_blast_hit{reverse});
    }
    else
    {
      # If the top hit isn't 100%, add a tilde to the allele_name
      my $name_with_tilde = "$top_blast_hit{allele_name}~";
      $non_matching_sequence_names{$name_with_tilde} = $self->_get_blast_hit_sequence($top_blast_hit{source_name}, $top_blast_hit{source_start},$top_blast_hit{source_end},$word_size,$top_blast_hit{reverse});
      $self->new_st(1);
    }
  }

  # deal with missing loci
  if(%matching_sequence_names && %missing_locus_names)
  {
    for my $allele (keys %missing_locus_names)
    {
      delete $non_matching_sequence_names{$allele};
      $matching_sequence_names{$allele.'-'.$missing_locus_names{$allele}} = '';
    }
  }

  # set new ST flag
  $self->new_st(1) if %non_matching_sequence_names;

  $self->non_matching_sequences(\%non_matching_sequence_names);
  return \%matching_sequence_names;
}

sub _translate_contamination_names_into_sequence_types
{
  my ($self, $contamination_names, $main_allele_name) = @_;
  my @contamination_sequence_types;
  
  for my $allele_number (@{ $contamination_names})
  {
    next if($main_allele_name eq $allele_number);
    my $st = Bio::MLST::SequenceType->new(
      profiles_filename  => $self->profiles_filename,
      matching_names     => [$allele_number],
      non_matching_names => []
    );
    
    if(defined($st->sequence_type()) )
    {
      push(@contamination_sequence_types, $st->sequence_type());
    }
  }
  
  $self->contamination_sequence_names(\@contamination_sequence_types);
}


sub _get_blast_hit_sequence
{
   my ($self, $contig_name, $start, $end, $word_size, $reverse_complement) = @_;
   seek($self->_sequence_handle->_fh, 0,0);
   while( my $input_sequence_obj = $self->_sequence_handle->next_seq() ) 
   {
     next if( $input_sequence_obj->id ne $contig_name);



( run in 2.506 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )