Bio-MLST-Check

 view release on metacpan or  search on metacpan

MANIFEST  view on Meta::CPAN

t/SequenceTypes/SearchForFiles.t
t/SequenceTypes/SequenceType.t
t/Settings/DatabaseSettings.t
t/Validate/Executable.t
t/data/CDC_emmST_partial.tfa
t/data/Pediococcus_pentosaceus_filtering.xml
t/data/Streptococcus_pyogenes_emmST_contigs.fa
t/data/Streptococcus_pyogenes_emmST_unknown.fa
t/data/abc.fas
t/data/adk.tfa
t/data/adk_99_percent.tfa
t/data/adk_contamination.tfa
t/data/adk_imperfect.tfa
t/data/adk_imperfect_contamination.tfa
t/data/adk_less_than_95_percent.tfa
t/data/adk_top_hit_low_hit.tfa
t/data/adk_truncation.tfa
t/data/adk_two_imperfect_contamination.tfa
t/data/bbb.fas
t/data/bordetella.txt
t/data/ccc.fas
t/data/contigs.fa
t/data/contigs_check_concat_allele_order.fa
t/data/contigs_missing_locus.fa
t/data/contigs_near_match.fa

README.md  view on Meta::CPAN

## mlst_results.genomic.csv
This spreadsheet is similar to the mlst_results.allele.csv spreadsheet, however it gives the full sequences of each allele instead of the allele number.

## *unknown.fa
You can choose to output any new alleles (-c) which are not contained in the MLST database. These can then be fed back to the curators maintaining the MLST databases, where they can be assigned allele numbers and profiles.

## concatenated_alleles.fa and concatenated_alleles.phylip
You can choose to output a multiple FASTA/Phylip alignment of all of the MLST genes concatenated together, where each sample is represented by a single sequence. This file can then be used as input to a phylogenetic tree building application (such as...

# Method
The user can decide to use a specific MLST scheme or search all of them. The first step is to generate a blastn database using makeblastdb from the alleles.  The input sequences are then blasted against the database using blastn.  If there is a 100% ...

# Installation
Instructions are given for installing the software via Docker (can be run on all operating systems), for Debian/Ubuntu distributions, and via HomeBrew/LinuxBrew.

## Docker
We have a Docker container which is set up and ready to go. It includes a snapshot of the MLST databases from the day it was built.  To install it:

```
docker pull sangerpathogens/mlst_check
```

lib/Bio/MLST/Blast/BlastN.pm  view on Meta::CPAN

sub _build_hit
{
  ###
  # Turns one tab separated line of blastn output into a hit hash.
  # Fields are picked by position: 0 allele name, 1 source name,
  # 2 percentage identity, 3 alignment length, 8 and 9 the coordinates on
  # the source sequence and 12 the number of matching bases.
  # The coordinates are always returned in ascending order; 'reverse' is 1
  # when they had to be swapped to achieve that and 0 otherwise.
  ###
  my($self, $line) = @_;
  chomp($line);
  my ($allele, $source, $identity, $length, $first, $second, $matches) =
    (split(/\t/, $line))[0, 1, 2, 3, 8, 9, 12];

  # When the first coordinate is larger than the second the hit is flagged
  # as reversed and the two coordinates are exchanged.
  my $is_reversed = ($first <= $second) ? 0 : 1;
  ($first, $second) = ($second, $first) if $is_reversed;

  my %hit = (
    'allele_name'             => $allele,
    'source_name'             => $source,
    'percentage_identity'     => $identity,
    'sample_alignment_length' => $length,
    'matches'                 => $matches,
    'source_start'            => $first,
    'source_end'              => $second,
    'reverse'                 => $is_reversed,
  );
  return \%hit;
}

sub _build_hits
{

lib/Bio/MLST/Blast/BlastN.pm  view on Meta::CPAN

  ###
  my ($self, $hits, $word_sizes) = @_;
  my @long_hits = grep { $_->{'sample_alignment_length'} >= $word_sizes->{$_->{'allele_name'}} } @$hits;
  return \@long_hits;
}

sub _filter_best_hits
{
  ###
  # Keeps only the hits whose percentage identity lies within a tolerance
  # of the highest percentage identity found in the input.  The tolerance
  # is 2.0 unless a defined value is passed as the second argument.
  # Returns a reference to a new array with the surviving hits in their
  # original order.
  ###
  my($self, $hits, $tolerance) = @_;
  $tolerance = 2.0 unless defined($tolerance);

  my @kept;
  if (@$hits)
  {
    # Lowest percentage identity a hit may have and still be kept.
    my $cutoff = max(map { $_->{'percentage_identity'} } @$hits) - $tolerance;
    @kept = grep { $_->{'percentage_identity'} >= $cutoff } @$hits;
  }
  return \@kept;
}

sub _group_overlapping_hits
{
  ###
  # Hits can overlap, this groups hits which overlap and returns a reference to
  # an array of references to these groups.
  ###
  my($self, $hits) = @_;

lib/Bio/MLST/Blast/BlastN.pm  view on Meta::CPAN

  my($self, $bins) = @_;
  my @groups = map { $_->{hits} } @$bins;
  return \@groups;
}

sub _best_hit_in_group
{
  ###
  # Picks one hit out of a group.  Only hits with the highest number of
  # matching bases are considered; amongst those the one with the greater
  # percentage identity wins.  When the percentage identity is equal too,
  # the hit appearing later in the list is returned.
  # Returns undef when the group is empty.
  ###
  my($self, $hits) = @_;
  my $most_matches = max(map { $_->{'matches'} } @$hits);

  my $winner;
  foreach my $candidate (@$hits)
  {
    next unless $candidate->{'matches'} == $most_matches;

    # The current winner only survives if its identity is strictly greater.
    next if defined($winner)
      && $winner->{'percentage_identity'} > $candidate->{'percentage_identity'};

    $winner = $candidate;
  }
  return $winner;
}

sub _blastn_cmd
{
  my($self) = @_;
  my $word_size = int(100/(100 - $self->perc_identity ));
  $word_size = 11 if($word_size < 11);
  my $outfmt = "\"6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident\""; # standard format + n. identical base matches
  

lib/Bio/MLST/Blast/BlastN.pm  view on Meta::CPAN

  # Find all of the best non-overlapping matches
  my $hits = $self->_build_hits($blast_output_fh);
  $hits = $self->_filter_by_alignment_length($hits, $self->word_sizes);
  my $best_hits = $self->_filter_best_hits($hits);
  my $bins = $self->_group_overlapping_hits($best_hits);
  $bins = $self->_merge_similar_bins($bins);
  my $groups = $self->_bins_to_groups($bins);

  # Find the best match
  my @best_in_groups = map { $self->_best_hit_in_group($_) } @$groups;
  $top_hit = reduce { $a->{'percentage_identity'} > $b->{'percentage_identity'} ? $a : $b } @best_in_groups;

  if (defined $top_hit)
  {
    $top_hit->{'percentage_identity'} = int($top_hit->{'percentage_identity'});
    delete $top_hit->{'sample_alignment_length'};
    delete $top_hit->{'matches'};
  }
  else {
    $top_hit = {};
  }
  if ( scalar @best_in_groups > 1 )
  {
    $top_hit->{contamination} = \@best_in_groups;
  }

lib/Bio/MLST/Blast/BlastN.pm  view on Meta::CPAN

   $blast_database->top_hit();

=head1 METHODS

=head2 top_hit

Returns a hash containing details about the top blast result.

The attributes returned in the hash are:
  allele_name
  percentage_identity
  source_name
  source_start
  source_end
  reverse
  contamination

=head1 SEE ALSO

=over 4

lib/Bio/MLST/CompareAlleles.pm  view on Meta::CPAN

      next;
    }
    
    # more than 1 allele has a good match
    if(defined($top_blast_hit{contamination}))
    {
      $self->contamination(1);
      my $contaminants = $top_blast_hit{contamination};
      my @contaminant_names = map { $_->{allele_name} } @$contaminants;
      # Add tilde to matches which are not 100%
      my @contaminant_names_with_tilde = map { $_->{percentage_identity} == 100 ? $_->{allele_name} : "$_->{allele_name}~" } @$contaminants;
      my $contamination_alleles = join( ',', sort @contaminant_names_with_tilde );
      $self->contamination_alleles( $contamination_alleles );
      $self->_translate_contamination_names_into_sequence_types(\@contaminant_names, $top_blast_hit{allele_name});
    }
    
    $top_blast_hit{allele_name} =~ s![-_]+!-!g;
    
    if($top_blast_hit{percentage_identity} == 100 )
    {
      $matching_sequence_names{$top_blast_hit{allele_name}} = $self->_get_blast_hit_sequence($top_blast_hit{source_name}, $top_blast_hit{source_start},$top_blast_hit{source_end},$word_size,$top_blast_hit{reverse});
    }
    else
    {
      # If the top hit isn't 100%, add a tilde to the allele_name
      my $name_with_tilde = "$top_blast_hit{allele_name}~";
      $non_matching_sequence_names{$name_with_tilde} = $self->_get_blast_hit_sequence($top_blast_hit{source_name}, $top_blast_hit{source_start},$top_blast_hit{source_end},$word_size,$top_blast_hit{reverse});
      $self->new_st(1);
    }

t/Blast/BlastN.t  view on Meta::CPAN

adk-1	SomeSequenceName	98.13	536	10	0	1	536	178	713	0.0	922	527
adk-2	SomeSequenceName	100.00	536	0	0	1	536	178	713	0.0	967	536
adk-3	SomeSequenceName	97.76	536	12	0	1	536	713	178	0.0	913	526
adk-4	SomeSequenceName	98.88	536	6	0	1	536	178	713	0.0	940	532
END_OUTPUT

my $blastn_line	= "adk-1	SomeSequenceName	98.13	536	10	0	1	536	178	713	0.0	922	527\n";
my %expected_hit = (
  'allele_name' => 'adk-1',
  'source_name' => 'SomeSequenceName',
  'percentage_identity' => '98.13',
  'sample_alignment_length' => '536',
  'matches' => '527',
  'source_start' => '178',
  'source_end' => '713',
  'reverse' => 0,
);
is_deeply($blastn_result->_build_hit($blastn_line), \%expected_hit, "Given a fake hit, check that its parsed into the hash correctly.");

$blastn_line	= "adk-1	SomeSequenceName	98.13	536	10	0	1	536	713	178	0.0	922	527\n";
%expected_hit = (
  'allele_name' => 'adk-1',
  'source_name' => 'SomeSequenceName',
  'percentage_identity' => '98.13',
  'sample_alignment_length' => '536',
  'matches' => '527',
  'source_start' => '178',
  'source_end' => '713',
  'reverse' => 1,
);
is_deeply($blastn_result->_build_hit($blastn_line), \%expected_hit, "Given a fake hit thats reversed, make sure the coordinates are correct.");

my $expected_hits = [
  {
    'allele_name' => 'adk-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.13',
    'sample_alignment_length' => '536',
    'matches' => '527',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '536',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-3',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '97.76',
    'sample_alignment_length' => '536',
    'matches' => '526',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 1,
  },
  {
    'allele_name' => 'adk-4',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.88',
    'sample_alignment_length' => '536',
    'matches' => '532',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
];
my $fake_blast_output_fh = new IO::Scalar \$fake_blast_output;
is_deeply($blastn_result->_build_hits($fake_blast_output_fh), $expected_hits, "Given a set of hits, extract all into a hash correctly.");

my $input_hits = [
  {
    'allele_name' => 'adk-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.13',
    'sample_alignment_length' => '536',
    'matches' => '527',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '436',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '613',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-3',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.88',
    'sample_alignment_length' => '336',
    'matches' => '532',
    'source_start' => '178',
    'source_end' => '513',
    'reverse' => 0,
  },
];
$expected_hits = [
  {
    'allele_name' => 'adk-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.13',
    'sample_alignment_length' => '536',
    'matches' => '527',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '436',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '613',
    'reverse' => 0,
  },
];
my $word_sizes = {
  'adk-1' => 500,
  'adk-2' => 436,
  'adk-3' => 400
};
is_deeply($blastn_result->_filter_by_alignment_length($input_hits, $word_sizes), $expected_hits, "Given a set of hits, filter them by alignment length to remove lower quality hits.");

$input_hits = [
  {
    'allele_name' => 'adk-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.13',
    'sample_alignment_length' => '536',
    'matches' => '527',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '536',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-3',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '97.76',
    'sample_alignment_length' => '536',
    'matches' => '526',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 1,
  },
  {
    'allele_name' => 'adk-4',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.88',
    'sample_alignment_length' => '536',
    'matches' => '532',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
];
$expected_hits = [
  {
    'allele_name' => 'adk-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.13',
    'sample_alignment_length' => '536',
    'matches' => '527',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '536',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-4',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.88',
    'sample_alignment_length' => '536',
    'matches' => '532',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
];
is_deeply($blastn_result->_filter_best_hits($input_hits), $expected_hits, "Given fake blast hits, filter out the low quality results to leave the best ones.");

$expected_hits = [
  {
    'allele_name' => 'adk-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '536',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-4',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.88',
    'sample_alignment_length' => '536',
    'matches' => '532',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
];
is_deeply($blastn_result->_filter_best_hits($input_hits, 1.5), $expected_hits, "Given fake hits, filter out low quality results.");

my $overlapping_hits = [
  {
    'allele_name' => 'allele-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '536',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-1-truncation-end',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '336',
    'matches' => '336',
    'source_start' => '178',
    'source_end' => '513',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-1-truncation-start',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '336',
    'matches' => '336',
    'source_start' => '378',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-1-truncation-middle',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '336',
    'matches' => '336',
    'source_start' => '278',
    'source_end' => '613',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-spill-over-end',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '336',
    'matches' => '336',
    'source_start' => '478',
    'source_end' => '813',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-completely-different-truncation',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '336',
    'matches' => '336',
    'source_start' => '1278',
    'source_end' => '1613',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-completely-different',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '536',
    'matches' => '536',
    'source_start' => '1178',
    'source_end' => '1713',
    'reverse' => 0,
  },
];
$expected_hits = [
  {
    'start' => 178,
    'end' => 713,
    'hits' => [
      {
        'allele_name' => 'allele-1',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '178',
        'source_end' => '713',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-1-truncation-end',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '178',
        'source_end' => '513',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-1-truncation-start',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '378',
        'source_end' => '713',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-1-truncation-middle',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '278',
        'source_end' => '613',
        'reverse' => 0,
      },
    ],
  },
  {
    'start' => 478,
    'end' => 813,
    'hits' => [
      {
        'allele_name' => 'allele-spill-over-end',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '478',
        'source_end' => '813',
        'reverse' => 0,
      },
    ],
  },
  {
    'start' => 1178,
    'end' => 1713,
    'hits' => [
      {
        'allele_name' => 'allele-completely-different-truncation',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '1278',
        'source_end' => '1613',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-completely-different',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '1178',
        'source_end' => '1713',
        'reverse' => 0,
      },
    ],
  },
];
is_deeply($blastn_result->_group_overlapping_hits($overlapping_hits), $expected_hits, "Group overlapping blast hits because they are often split up over the same gene.");

my $bins = [
  {
    'start' => 178,
    'end' => 713,
    'hits' => [
      {
        'allele_name' => 'allele-1-truncation-middle',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '278',
        'source_end' => '613',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-1',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '178',
        'source_end' => '713',
        'reverse' => 0,
      },
    ],
  },
  {
    'start' => 478,
    'end' => 1013,
    'hits' => [
      {
        'allele_name' => 'allele-some-overlap',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '478',
        'source_end' => '1013',
        'reverse' => 0,
      },
    ],
  },
  {
    'start' => 180,
    'end' => 715,
    'hits' => [
      {
        'allele_name' => 'allele-lots-of-overlap',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '180',
        'source_end' => '715',
        'reverse' => 0,
      },
    ],
  },
];
my $merged_bins = [
  {
    'start' => 178,
    'end' => 715,
    'hits' => [
      {
        'allele_name' => 'allele-1-truncation-middle',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '278',
        'source_end' => '613',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-1',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '178',
        'source_end' => '713',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-lots-of-overlap',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '180',
        'source_end' => '715',
        'reverse' => 0,
      },
    ],
  },
  {
    'start' => 478,
    'end' => 1013,
    'hits' => [
      {
        'allele_name' => 'allele-some-overlap',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '478',
        'source_end' => '1013',
        'reverse' => 0,
      },
    ],
  },
];
is_deeply($blastn_result->_merge_similar_bins($bins), $merged_bins, "Merge hits on a the same genes so that they form bigger hits.");

$bins = [
  {
    'start' => 178,
    'end' => 715,
    'hits' => [
      {
        'allele_name' => 'allele-1-truncation-middle',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '278',
        'source_end' => '613',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-1',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '178',
        'source_end' => '713',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-lots-of-overlap',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '180',
        'source_end' => '715',
        'reverse' => 0,
      },
    ],
  },
  {
    'start' => 478,
    'end' => 1013,
    'hits' => [
      {
        'allele_name' => 'allele-some-overlap',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '478',
        'source_end' => '1013',
        'reverse' => 0,
      },
    ],
  },
];
my $groups = [
  [
    {
      'allele_name' => 'allele-1-truncation-middle',
      'source_name' => 'SomeSequenceName',
      'percentage_identity' => '100.00',
      'sample_alignment_length' => '336',
      'matches' => '336',
      'source_start' => '278',
      'source_end' => '613',
      'reverse' => 0,
    },
    {
      'allele_name' => 'allele-1',
      'source_name' => 'SomeSequenceName',
      'percentage_identity' => '100.00',
      'sample_alignment_length' => '536',
      'matches' => '536',
      'source_start' => '178',
      'source_end' => '713',
      'reverse' => 0,
    },
    {
      'allele_name' => 'allele-lots-of-overlap',
      'source_name' => 'SomeSequenceName',
      'percentage_identity' => '100.00',
      'sample_alignment_length' => '536',
      'matches' => '536',
      'source_start' => '180',
      'source_end' => '715',
      'reverse' => 0,
    },
  ],
  [
    {
      'allele_name' => 'allele-some-overlap',
      'source_name' => 'SomeSequenceName',
      'percentage_identity' => '100.00',
      'sample_alignment_length' => '336',
      'matches' => '336',
      'source_start' => '478',
      'source_end' => '1013',
      'reverse' => 0,
    },
  ],
];
is_deeply($blastn_result->_bins_to_groups($bins), $groups, "Convert sets of hits into summerised groups of hits over an allele.");

$input_hits = [
  {
    'allele_name' => 'allele-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '90.00',
    'sample_alignment_length' => '536',
    'matches' => '484',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-1-truncation-end',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '336',
    'matches' => '336',
    'source_start' => '178',
    'source_end' => '513',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '95.00',
    'sample_alignment_length' => '536',
    'matches' => '511',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
];
my $expected_hit = {
  'allele_name' => 'allele-2',
  'source_name' => 'SomeSequenceName',
  'percentage_identity' => '95.00',
  'sample_alignment_length' => '536',
  'matches' => '511',
  'source_start' => '178',
  'source_end' => '713',
  'reverse' => 0,
};
is_deeply($blastn_result->_best_hit_in_group($input_hits), $expected_hit, "Report the best match in the group based.");

ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),
   query_file     => 't/data/adk.tfa',
   word_sizes     => word_sizes('t/data/adk.tfa')
 )), 'Prepare the blast hits with perfect data.');
is_deeply($blastn_result->top_hit, {allele_name => 'adk-2', percentage_identity => 100, source_name => 'SomeSequenceName', source_start => 178, source_end => 713, reverse => 0 }, 'An exact match to an allele of full length should be the best hit.');

ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),
   query_file     => 't/data/adk_contamination.tfa',
   word_sizes     => word_sizes('t/data/adk_contamination.tfa')
 )), 'Prepare the blast hits with some contamination.');
ok(defined($blastn_result->top_hit->{contamination}), 'Contamination should be flagged');

ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),

t/Blast/BlastN.t  view on Meta::CPAN

is($blastn_result->top_hit->{allele_name}, 'adk-3', 'Picks longer allele if one allele is a truncation of another');

my $blast_database_near_match= Bio::MLST::Blast::Database->new(fasta_file => 't/data/contigs_near_match.fa');
ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database_near_match->location(),
   query_file     => 't/data/adk_top_hit_low_hit.tfa',
   word_sizes     => word_sizes('t/data/adk_top_hit_low_hit.tfa')
 )), 'Prepare the blast hits where there are multiple close matches');

is($blastn_result->top_hit->{allele_name}, 'adk-2', 'Correct allele found out of multiple hits');
is($blastn_result->top_hit->{percentage_identity}, 100,'Correct allele found out of multiple hits');

ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),
   query_file     => 't/data/adk_99_percent.tfa',
   word_sizes     => word_sizes('t/data/adk_99_percent.tfa')
 )), 'Prepare the blast hits when there is a 99% match');

is($blastn_result->top_hit->{allele_name}, 'adk-2', 'Correct allele close match');
is($blastn_result->top_hit->{percentage_identity}, 99,'Correct allele close match');

ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),
   query_file     => 't/data/adk_less_than_95_percent.tfa',
   word_sizes     => word_sizes('t/data/adk_less_than_95_percent.tfa')
 )), 'Prepare the blast hits where the match is less than 95% of any existing allele.');

is_deeply($blastn_result->top_hit, {}, 'Report no hits found if the hits are less than 95%.');

ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),
   query_file     => 't/data/adk.tfa', # << ignore this, not used
   word_sizes     => {
                       'gdh_18'  => 460,
                       'gdh_9'   => 460,

t/Output/SpreadsheetRow.t  view on Meta::CPAN

is_deeply($spreadsheet_row_obj->allele_numbers_row, ['contigs', 4,'','test_contamination',2,3,1], 'Construct the row for the allele number spreadsheet where there is contamination detected.');
is_deeply($spreadsheet_row_obj->genomic_row, ['contigs', 4,'','test_contamination','GGGGAAAGGGACTCAGGCTCAGTTCATCATGGAGAAATATGGTATTCCGCAAATCTCCACTGGCGATATGCTGCGTGCTGCGGTCAAATCTGGCTCCGAGCTGGGTAAACAAGCAAAAGACATTATGGATGCTGGCAAACTGGTTACCGACGAACTGGTGATCGCG...
'ATAACGCGCGTGAGAAAGCGCGTGGCGCGAAAGCGATCGGCACCACCGGTCGTGGTATCGGGCCTGCTTATGAAGATAAAGTGGCACGTCGCGGTCTGCGTGTTGGCGACCTTTTCGACAAAGAAACCTTCGCTGAAAAACTGAAAGAAGTGATGGAATATCACAACTTCCAGTTGGTTAACTACTACAAAGCTGAAGCGGTTGATTACCAGAAAGTTCTGGATGATACGATGGCTGTTGCCGACATCC...
'CGCACGTAAACTGGGCGTCGATATCGACAACCTGCTGTGCTCCCAGCCGGACACCGGCGAGCAGGCACTGGAAATCTGTGACGCCCTGGCGCGTTCTGGTGCAGTAGACGTTATCGTCGTTGACTCCGTGGCGGCACTGACGCCGAAAGCGGAAATCGAAGGCGAAATCGGCGACTCTCACATGGGCCTTGCGGCACGTATGATGAGCCAGGCGATGCGTAAGCTGGCGGGTAACCTGAAGCAGTCCAA...
 'Construct the row for the genomic data spreadsheet where there is contamination detected.');
$compare_alleles->contamination(0);

# no match for adk
$compare_alleles = Bio::MLST::CompareAlleles->new(
  sequence_filename => 't/data/contigs.fa',
  allele_filenames  => ['t/data/adk_less_than_95_percent.tfa','t/data/purA.tfa','t/data/recA.tfa'],
  profiles_filename => 't/data/databases/Escherichia_coli_1/profiles/escherichia_coli.txt',
);
$sequence_type_obj = Bio::MLST::SequenceType->new(
  profiles_filename  => 't/data/databases/Escherichia_coli_1/profiles/escherichia_coli.txt',
  matching_names     => $compare_alleles->found_sequence_names,
  non_matching_names => [],
  report_lowest_st   => 1
);
ok(($spreadsheet_row_obj = Bio::MLST::Spreadsheet::Row->new(sequence_type_obj => $sequence_type_obj, compare_alleles => $compare_alleles)),'Spreadsheet row constructor where there is no hit for one of the alleles.');

t/SequenceTypes/CompareAlleles.t  view on Meta::CPAN

is_deeply( $compare_alleles->non_matching_sequences, {}, 'No non-matching alleles were found as expected.');
is($compare_alleles->new_st, 0, 'As all the alleles were found, it is not a New ST.');
is($compare_alleles->contamination, 0, 'No contamination found since there is only a single allele for each gene.');

# Scenario: one allele file (adk_top_hit_low_hit.tfa) is compared against the
# contigs; the assertions expect adk-2 as the only exact match, nothing in the
# non-matching set, and neither the new_st nor the contamination flag set.
# NOTE(review): sort(['adk-2']) sorts a one-element list holding an array
# reference, so it hands back that same reference and does not order the names
# inside it -- confirm whether [sort ...] was intended.
# NOTE(review): the description on the new_st assertion mentions a novel
# allele although the expected value is 0 -- confirm the wording.
note('A contamination free assembly containing 1 alelle with multiple close matches to an allele in the MLST scheme.');
ok(($compare_alleles = Bio::MLST::CompareAlleles->new(
  sequence_filename => 't/data/contigs.fa',
  allele_filenames => ['t/data/adk_top_hit_low_hit.tfa'],
    profiles_filename => 't/data/databases/Escherichia_coli_1/profiles/escherichia_coli.txt',
)), 'Compare assembly containing 1 previously unseen allele where there are multiple close matches to a previously seen allele.');
is_deeply( $compare_alleles->found_sequence_names,sort(['adk-2']), 'Identified the nearest match to an allele in the database, based match with most bases in common and highest percentage identity.');
is_deeply( $compare_alleles->non_matching_sequences, {}, 'No non-matching alleles were found as expected');
is($compare_alleles->new_st, 0, 'This isnt a New ST combination, because it contains a Novel allele.');
is($compare_alleles->contamination, 0, 'No contamination found since there is only a single allele for each gene.');

note('A contaminated assembly where there is an exact match more than once to the same allele');
ok(($compare_alleles = Bio::MLST::CompareAlleles->new(
  sequence_filename => 't/data/contigs.fa',
  allele_filenames => ['t/data/adk_contamination.tfa'],
    profiles_filename => 't/data/databases/Escherichia_coli_1/profiles/escherichia_coli.txt',
)), 'Pass in a contaminated assembly');

t/SequenceTypes/CompareAlleles.t  view on Meta::CPAN

)), 'Pass in an assembly containing a single gene which is an imperfect match.');
is_deeply( $compare_alleles->found_sequence_names, [], 'No perfect hit were found as expected.');
is_deeply( $compare_alleles->found_non_matching_sequence_names, ['adk-3~'], 'The nearest allele is to the imperfect one is returned.');
is($compare_alleles->new_st, 1, 'This is a new ST as novel sequences have been found.');
is($compare_alleles->contamination, 0, 'The contamination flag should not be set since theres no contamination.');
is($compare_alleles->contamination_alleles, undef, 'No contamination alleles should be listed since there no contamination.');

note('An assembly where there are no hits to any of the alles in the database.');
ok(($compare_alleles = Bio::MLST::CompareAlleles->new(
  sequence_filename => 't/data/contigs.fa',
  allele_filenames => ['t/data/adk_less_than_95_percent.tfa'],
    profiles_filename => 't/data/databases/Escherichia_coli_1/profiles/escherichia_coli.txt',
)), 'Pass in an assembly where there are less than 95% hits to an existing database.');
is_deeply( $compare_alleles->found_sequence_names, [], 'No matching sequences found as expected.');
is_deeply( $compare_alleles->non_matching_sequences, {'adk_less_than_95_percent' => 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
is($compare_alleles->new_st, 1, 'As there no exact matches this could be a new ST.');
is($compare_alleles->contamination, 0, 'No contamination or duplication of alleles is found.');

# Scenario: Helicobacter pylori allele files are combined with the
# Escherichia coli profile file, and the test checks which allele names are
# reported as found.
# NOTE(review): the second allele filename starts with a space inside the
# quotes (' t/data/...efp.tfa') -- confirm whether that is deliberate.
# NOTE(review): sort([...]) sorts a one-element list holding an array
# reference, so the names inside are compared in the order written here --
# confirm whether [sort ...] was intended.
note('An assembly where there is a mismatch between the allele names and the profile.');
ok(($compare_alleles = Bio::MLST::CompareAlleles->new(
  sequence_filename => 't/data/contigs_missing_locus.fa',
  allele_filenames => ['t/data/databases/Helicobacter_pylori/alleles/atpA.tfa',' t/data/databases/Helicobacter_pylori/alleles/efp.tfa','t/data/databases/Helicobacter_pylori/alleles/mutY.tfa'],
    profiles_filename => 't/data/databases/Escherichia_coli_1/profiles/escherichia_coli.txt',
)), 'An assembly where there is a mismatch between the allele names and the profile.');
is_deeply( $compare_alleles->found_sequence_names,sort(['atpA-3','efp-9999','mutY-3']), 'The correct alleles are found in the assembly.');



( run in 0.413 second using v1.01-cache-2.11-cpan-709fd43a63f )