Bio-MLST-Check

 view release on metacpan or  search on metacpan

t/Blast/BlastN.t  view on Meta::CPAN

#!/usr/bin/env perl
use strict;
use warnings;
use Bio::SeqIO;
use File::Temp;
use IO::Scalar;

BEGIN { unshift(@INC, './lib') }
BEGIN {
    use Test::Most;
    use Bio::MLST::Blast::Database;
    use_ok('Bio::MLST::Blast::BlastN');
}

note('A wrapper around NCBI blastn which allows for a sequence to be queried against a database and the results returned in a hash.');

my $blast_database= Bio::MLST::Blast::Database->new(fasta_file => 't/data/contigs.fa');

ok((my $blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),
   query_file     => 't/data/adk.tfa',
   word_sizes     => word_sizes('t/data/adk.tfa')
 )), 'Setup a blast runner with a valid database and allele.');

my $fake_blast_output = <<'END_OUTPUT';
adk-1	SomeSequenceName	98.13	536	10	0	1	536	178	713	0.0	922	527
adk-2	SomeSequenceName	100.00	536	0	0	1	536	178	713	0.0	967	536
adk-3	SomeSequenceName	97.76	536	12	0	1	536	713	178	0.0	913	526
adk-4	SomeSequenceName	98.88	536	6	0	1	536	178	713	0.0	940	532
END_OUTPUT

my $blastn_line	= "adk-1	SomeSequenceName	98.13	536	10	0	1	536	178	713	0.0	922	527\n";
my %expected_hit = (
  'allele_name' => 'adk-1',
  'source_name' => 'SomeSequenceName',
  'percentage_identity' => '98.13',
  'sample_alignment_length' => '536',
  'matches' => '527',
  'source_start' => '178',
  'source_end' => '713',
  'reverse' => 0,
);
is_deeply($blastn_result->_build_hit($blastn_line), \%expected_hit, "Given a fake hit, check that its parsed into the hash correctly.");

$blastn_line	= "adk-1	SomeSequenceName	98.13	536	10	0	1	536	713	178	0.0	922	527\n";
%expected_hit = (
  'allele_name' => 'adk-1',
  'source_name' => 'SomeSequenceName',
  'percentage_identity' => '98.13',
  'sample_alignment_length' => '536',
  'matches' => '527',
  'source_start' => '178',
  'source_end' => '713',
  'reverse' => 1,
);
is_deeply($blastn_result->_build_hit($blastn_line), \%expected_hit, "Given a fake hit thats reversed, make sure the coordinates are correct.");

my $expected_hits = [
  {
    'allele_name' => 'adk-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.13',
    'sample_alignment_length' => '536',
    'matches' => '527',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '536',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-3',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '97.76',
    'sample_alignment_length' => '536',
    'matches' => '526',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 1,
  },
  {
    'allele_name' => 'adk-4',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.88',
    'sample_alignment_length' => '536',
    'matches' => '532',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
];
my $fake_blast_output_fh = new IO::Scalar \$fake_blast_output;
is_deeply($blastn_result->_build_hits($fake_blast_output_fh), $expected_hits, "Given a set of hits, extract all into a hash correctly.");

my $input_hits = [
  {
    'allele_name' => 'adk-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.13',
    'sample_alignment_length' => '536',
    'matches' => '527',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '436',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '613',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-3',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.88',
    'sample_alignment_length' => '336',
    'matches' => '532',
    'source_start' => '178',
    'source_end' => '513',
    'reverse' => 0,
  },
];
$expected_hits = [
  {
    'allele_name' => 'adk-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.13',
    'sample_alignment_length' => '536',
    'matches' => '527',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '436',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '613',
    'reverse' => 0,
  },
];
my $word_sizes = {
  'adk-1' => 500,
  'adk-2' => 436,
  'adk-3' => 400
};
is_deeply($blastn_result->_filter_by_alignment_length($input_hits, $word_sizes), $expected_hits, "Given a set of hits, filter them by alignment length to remove lower quality hits.");

$input_hits = [
  {
    'allele_name' => 'adk-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.13',
    'sample_alignment_length' => '536',
    'matches' => '527',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '536',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-3',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '97.76',
    'sample_alignment_length' => '536',
    'matches' => '526',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 1,
  },
  {
    'allele_name' => 'adk-4',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.88',
    'sample_alignment_length' => '536',
    'matches' => '532',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
];
$expected_hits = [
  {
    'allele_name' => 'adk-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.13',
    'sample_alignment_length' => '536',
    'matches' => '527',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '536',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-4',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.88',
    'sample_alignment_length' => '536',
    'matches' => '532',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
];
is_deeply($blastn_result->_filter_best_hits($input_hits), $expected_hits, "Given fake blast hits, filter out the low quality results to leave the best ones.");

$expected_hits = [
  {
    'allele_name' => 'adk-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '536',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'adk-4',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '98.88',
    'sample_alignment_length' => '536',
    'matches' => '532',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
];
is_deeply($blastn_result->_filter_best_hits($input_hits, 1.5), $expected_hits, "Given fake hits, filter out low quality results.");

my $overlapping_hits = [
  {
    'allele_name' => 'allele-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '536',
    'matches' => '536',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-1-truncation-end',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '336',
    'matches' => '336',
    'source_start' => '178',
    'source_end' => '513',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-1-truncation-start',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '336',
    'matches' => '336',
    'source_start' => '378',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-1-truncation-middle',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '336',
    'matches' => '336',
    'source_start' => '278',
    'source_end' => '613',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-spill-over-end',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '336',
    'matches' => '336',
    'source_start' => '478',
    'source_end' => '813',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-completely-different-truncation',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '336',
    'matches' => '336',
    'source_start' => '1278',
    'source_end' => '1613',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-completely-different',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '536',
    'matches' => '536',
    'source_start' => '1178',
    'source_end' => '1713',
    'reverse' => 0,
  },
];
$expected_hits = [
  {
    'start' => 178,
    'end' => 713,
    'hits' => [
      {
        'allele_name' => 'allele-1',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '178',
        'source_end' => '713',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-1-truncation-end',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '178',
        'source_end' => '513',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-1-truncation-start',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '378',
        'source_end' => '713',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-1-truncation-middle',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '278',
        'source_end' => '613',
        'reverse' => 0,
      },
    ],
  },
  {
    'start' => 478,
    'end' => 813,
    'hits' => [
      {
        'allele_name' => 'allele-spill-over-end',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '478',
        'source_end' => '813',
        'reverse' => 0,
      },
    ],
  },
  {
    'start' => 1178,
    'end' => 1713,
    'hits' => [
      {
        'allele_name' => 'allele-completely-different-truncation',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '1278',
        'source_end' => '1613',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-completely-different',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '1178',
        'source_end' => '1713',
        'reverse' => 0,
      },
    ],
  },
];
is_deeply($blastn_result->_group_overlapping_hits($overlapping_hits), $expected_hits, "Group overlapping blast hits because they are often split up over the same gene.");

my $bins = [
  {
    'start' => 178,
    'end' => 713,
    'hits' => [
      {
        'allele_name' => 'allele-1-truncation-middle',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '278',
        'source_end' => '613',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-1',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '178',
        'source_end' => '713',
        'reverse' => 0,
      },
    ],
  },
  {
    'start' => 478,
    'end' => 1013,
    'hits' => [
      {
        'allele_name' => 'allele-some-overlap',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '478',
        'source_end' => '1013',
        'reverse' => 0,
      },
    ],
  },
  {
    'start' => 180,
    'end' => 715,
    'hits' => [
      {
        'allele_name' => 'allele-lots-of-overlap',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '180',
        'source_end' => '715',
        'reverse' => 0,
      },
    ],
  },
];
my $merged_bins = [
  {
    'start' => 178,
    'end' => 715,
    'hits' => [
      {
        'allele_name' => 'allele-1-truncation-middle',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '278',
        'source_end' => '613',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-1',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '178',
        'source_end' => '713',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-lots-of-overlap',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '180',
        'source_end' => '715',
        'reverse' => 0,
      },
    ],
  },
  {
    'start' => 478,
    'end' => 1013,
    'hits' => [
      {
        'allele_name' => 'allele-some-overlap',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '478',
        'source_end' => '1013',
        'reverse' => 0,
      },
    ],
  },
];
is_deeply($blastn_result->_merge_similar_bins($bins), $merged_bins, "Merge hits on a the same genes so that they form bigger hits.");

$bins = [
  {
    'start' => 178,
    'end' => 715,
    'hits' => [
      {
        'allele_name' => 'allele-1-truncation-middle',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '278',
        'source_end' => '613',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-1',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '178',
        'source_end' => '713',
        'reverse' => 0,
      },
      {
        'allele_name' => 'allele-lots-of-overlap',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '536',
        'matches' => '536',
        'source_start' => '180',
        'source_end' => '715',
        'reverse' => 0,
      },
    ],
  },
  {
    'start' => 478,
    'end' => 1013,
    'hits' => [
      {
        'allele_name' => 'allele-some-overlap',
        'source_name' => 'SomeSequenceName',
        'percentage_identity' => '100.00',
        'sample_alignment_length' => '336',
        'matches' => '336',
        'source_start' => '478',
        'source_end' => '1013',
        'reverse' => 0,
      },
    ],
  },
];
my $groups = [
  [
    {
      'allele_name' => 'allele-1-truncation-middle',
      'source_name' => 'SomeSequenceName',
      'percentage_identity' => '100.00',
      'sample_alignment_length' => '336',
      'matches' => '336',
      'source_start' => '278',
      'source_end' => '613',
      'reverse' => 0,
    },
    {
      'allele_name' => 'allele-1',
      'source_name' => 'SomeSequenceName',
      'percentage_identity' => '100.00',
      'sample_alignment_length' => '536',
      'matches' => '536',
      'source_start' => '178',
      'source_end' => '713',
      'reverse' => 0,
    },
    {
      'allele_name' => 'allele-lots-of-overlap',
      'source_name' => 'SomeSequenceName',
      'percentage_identity' => '100.00',
      'sample_alignment_length' => '536',
      'matches' => '536',
      'source_start' => '180',
      'source_end' => '715',
      'reverse' => 0,
    },
  ],
  [
    {
      'allele_name' => 'allele-some-overlap',
      'source_name' => 'SomeSequenceName',
      'percentage_identity' => '100.00',
      'sample_alignment_length' => '336',
      'matches' => '336',
      'source_start' => '478',
      'source_end' => '1013',
      'reverse' => 0,
    },
  ],
];
is_deeply($blastn_result->_bins_to_groups($bins), $groups, "Convert sets of hits into summerised groups of hits over an allele.");

$input_hits = [
  {
    'allele_name' => 'allele-1',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '90.00',
    'sample_alignment_length' => '536',
    'matches' => '484',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-1-truncation-end',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '100.00',
    'sample_alignment_length' => '336',
    'matches' => '336',
    'source_start' => '178',
    'source_end' => '513',
    'reverse' => 0,
  },
  {
    'allele_name' => 'allele-2',
    'source_name' => 'SomeSequenceName',
    'percentage_identity' => '95.00',
    'sample_alignment_length' => '536',
    'matches' => '511',
    'source_start' => '178',
    'source_end' => '713',
    'reverse' => 0,
  },
];
my $expected_hit = {
  'allele_name' => 'allele-2',
  'source_name' => 'SomeSequenceName',
  'percentage_identity' => '95.00',
  'sample_alignment_length' => '536',
  'matches' => '511',
  'source_start' => '178',
  'source_end' => '713',
  'reverse' => 0,
};
is_deeply($blastn_result->_best_hit_in_group($input_hits), $expected_hit, "Report the best match in the group based.");

ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),
   query_file     => 't/data/adk.tfa',
   word_sizes     => word_sizes('t/data/adk.tfa')
 )), 'Prepare the blast hits with perfect data.');
is_deeply($blastn_result->top_hit, {allele_name => 'adk-2', percentage_identity => 100, source_name => 'SomeSequenceName', source_start => 178, source_end => 713, reverse => 0 }, 'An exact match to an allele of full length should be the best hit.');

ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),
   query_file     => 't/data/adk_contamination.tfa',
   word_sizes     => word_sizes('t/data/adk_contamination.tfa')
 )), 'Prepare the blast hits with some contamination.');
ok(defined($blastn_result->top_hit->{contamination}), 'Contamination should be flagged');

ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),
   query_file     => 't/data/adk_truncation.tfa',
   word_sizes     => word_sizes('t/data/adk_truncation.tfa')
 )), 'Prepare the blast hits with a truncated gene.');
ok((! defined($blastn_result->top_hit->{contamination})), 'Contamination not detected where one allele is a truncation of another');
is($blastn_result->top_hit->{allele_name}, 'adk-3', 'Picks longer allele if one allele is a truncation of another');

my $blast_database_near_match= Bio::MLST::Blast::Database->new(fasta_file => 't/data/contigs_near_match.fa');
ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database_near_match->location(),
   query_file     => 't/data/adk_top_hit_low_hit.tfa',
   word_sizes     => word_sizes('t/data/adk_top_hit_low_hit.tfa')
 )), 'Prepare the blast hits where there are multiple close matches');

is($blastn_result->top_hit->{allele_name}, 'adk-2', 'Correct allele found out of multiple hits');
is($blastn_result->top_hit->{percentage_identity}, 100,'Correct allele found out of multiple hits');

ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),
   query_file     => 't/data/adk_99_percent.tfa',
   word_sizes     => word_sizes('t/data/adk_99_percent.tfa')
 )), 'Prepare the blast hits when there is a 99% match');

is($blastn_result->top_hit->{allele_name}, 'adk-2', 'Correct allele close match');
is($blastn_result->top_hit->{percentage_identity}, 99,'Correct allele close match');

ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),
   query_file     => 't/data/adk_less_than_95_percent.tfa',
   word_sizes     => word_sizes('t/data/adk_less_than_95_percent.tfa')
 )), 'Prepare the blast hits where the match is less than 95% of any existing allele.');

is_deeply($blastn_result->top_hit, {}, 'Report no hits found if the hits are less than 95%.');

ok(($blastn_result = Bio::MLST::Blast::BlastN->new(
   blast_database => $blast_database->location(),
   query_file     => 't/data/adk.tfa', # << ignore this, not used
   word_sizes     => {
                       'gdh_18'  => 460,
                       'gdh_9'   => 460,
                       'gdh_325' => 460,
                       'gdh_32'  => 460,
                     },
   'exec'         => 't/data/gdh_fake_blast_output.sh' # ignores arguments and outputs some fake output to stdout
 )), 'Check overlapping reads.');
ok(!defined($blastn_result->top_hit->{contamination}), 'Contamination not detected for mostly overlapping alleles');


# Exec not available
dies_ok( sub {
  Bio::MLST::Blast::BlastN->new(
    blast_database => $blast_database->location(),
    query_file     => 't/data/adk.tfa',
    word_sizes     => {},
    exec           => 'non_existant_executable'
  );
}, 'Validate if the executable is available');

sub word_sizes {
  my $filename = shift;

  my %seq_lens;
  my $seqio = Bio::SeqIO->new( -file => $filename , -format => 'Fasta');
  while( my $seq = $seqio->next_seq() ){
    $seq_lens{$seq->primary_id} = $seq->length;
  }
  return \%seq_lens;
}

done_testing();



( run in 2.228 seconds using v1.01-cache-2.11-cpan-98e64b0badf )