Bio-MLST-Check
view release on metacpan or search on metacpan
lib/Bio/MLST/CompareAlleles.pm view on Meta::CPAN
# return len of top blast hit allele, otherwise return len of first seq in allele file
my ($word_size, $first_seq);
if( defined $blast_hit->{allele_name} ){
$word_size = $word_sizes->{$blast_hit->{allele_name}};
}
else{
my $seqio = Bio::SeqIO->new( -file => $allele_filename, -format => 'Fasta' );
$word_size = $seqio->next_seq->length;
}
return $word_size;
}
sub _build_matching_sequences
{
my ($self) = @_;
my %matching_sequence_names;
my %non_matching_sequence_names;
my %missing_locus_names;
for my $allele_filename (@{$self->allele_filenames})
{
my $word_sizes = $self->_word_sizes_for_given_allele_file($allele_filename);
# TODO: You'll never get matches or contamination noted if there is a SNP
# near the end of the allele. This is because we filter all matches which
# are shorter than the length of the allele in the profiles. This could
# mean that we're missing contamination of falsly noting matches against
# truncated alleles.
my $blast_results = Bio::MLST::Blast::BlastN->new(
blast_database => $self->_blast_db_location,
query_file => $allele_filename,
word_sizes => $word_sizes,
exec => $self->blastn_exec
);
my %top_blast_hit = %{$blast_results->top_hit()};
# possible missing locus
if(! %top_blast_hit)
{
my %absent_loci_type = %{$self->_absent_loci};
my $allele = $self->_get_base_filename($allele_filename);
$missing_locus_names{$allele} = $absent_loci_type{$allele} if exists $absent_loci_type{$allele};
}
my $word_size = $self->_get_word_size_from_blast_hit($word_sizes, \%top_blast_hit, $allele_filename);
# unknown allele
if(! %top_blast_hit)
{
$non_matching_sequence_names{$self->_get_base_filename($allele_filename)} = $self->_pad_out_sequence("", $word_size);
next;
}
# more than 1 allele has a good match
if(defined($top_blast_hit{contamination}))
{
$self->contamination(1);
my $contaminants = $top_blast_hit{contamination};
my @contaminant_names = map { $_->{allele_name} } @$contaminants;
# Add tilde to matches which are not 100%
my @contaminant_names_with_tilde = map { $_->{percentage_identity} == 100 ? $_->{allele_name} : "$_->{allele_name}~" } @$contaminants;
my $contamination_alleles = join( ',', sort @contaminant_names_with_tilde );
$self->contamination_alleles( $contamination_alleles );
$self->_translate_contamination_names_into_sequence_types(\@contaminant_names, $top_blast_hit{allele_name});
}
$top_blast_hit{allele_name} =~ s![-_]+!-!g;
if($top_blast_hit{percentage_identity} == 100 )
{
$matching_sequence_names{$top_blast_hit{allele_name}} = $self->_get_blast_hit_sequence($top_blast_hit{source_name}, $top_blast_hit{source_start},$top_blast_hit{source_end},$word_size,$top_blast_hit{reverse});
}
else
{
# If the top hit isn't 100%, add a tilde to the allele_name
my $name_with_tilde = "$top_blast_hit{allele_name}~";
$non_matching_sequence_names{$name_with_tilde} = $self->_get_blast_hit_sequence($top_blast_hit{source_name}, $top_blast_hit{source_start},$top_blast_hit{source_end},$word_size,$top_blast_hit{reverse});
$self->new_st(1);
}
}
# deal with missing loci
if(%matching_sequence_names && %missing_locus_names)
{
for my $allele (keys %missing_locus_names)
{
delete $non_matching_sequence_names{$allele};
$matching_sequence_names{$allele.'-'.$missing_locus_names{$allele}} = '';
}
}
# set new ST flag
$self->new_st(1) if %non_matching_sequence_names;
$self->non_matching_sequences(\%non_matching_sequence_names);
return \%matching_sequence_names;
}
sub _translate_contamination_names_into_sequence_types
{
my ($self, $contamination_names, $main_allele_name) = @_;
my @contamination_sequence_types;
for my $allele_number (@{ $contamination_names})
{
next if($main_allele_name eq $allele_number);
my $st = Bio::MLST::SequenceType->new(
profiles_filename => $self->profiles_filename,
matching_names => [$allele_number],
non_matching_names => []
);
if(defined($st->sequence_type()) )
{
push(@contamination_sequence_types, $st->sequence_type());
}
}
$self->contamination_sequence_names(\@contamination_sequence_types);
}
sub _get_blast_hit_sequence
{
my ($self, $contig_name, $start, $end, $word_size, $reverse_complement) = @_;
seek($self->_sequence_handle->_fh, 0,0);
while( my $input_sequence_obj = $self->_sequence_handle->next_seq() )
{
next if( $input_sequence_obj->id ne $contig_name);
( run in 2.506 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )