Bio-MLST-Check

 view release on metacpan or  search on metacpan

t/Output/MultipleFastas.t  view on Meta::CPAN


  while( my $line = <$FILE> ) {
    my $trimmed_line = trim($line);
    if ($number_of_known_sequences == 0) {
      # We don't know how many sequences there are so create a new one
      push( @sequences, [$trimmed_line]);
      # The first time we find a blank 'sequence' we now know the number of sequences
      if ($trimmed_line eq '') {
        $number_of_known_sequences = $line_number + 1;
      }
    } else {
      # Now that we know the number of sequences, append this line to it's corresponding sequence
      my $sequence_number = $line_number % $number_of_known_sequences;
      push( @{$sequences[$sequence_number]}, $trimmed_line);
    }
    $line_number++;
  }

  return @sequences;

}

# Compare a generated PHYLIP file with a reference file, ignoring sequence order.
#
# Two test results are emitted when both files can be read:
#   1. the first (header) line of each file must be identical;
#   2. the remaining content, as parsed by get_sequences_from_file() and
#      sorted on the first line of each sequence, must be deeply equal.
#
# Arguments:
#   $calculated_file - path of the file produced by the code under test
#   $expected_file   - path of the reference file
# Any further arguments are ignored.
#
# Returns the result of the final is_deeply() check, or the (false) result of
# a failing ok() when one of the files cannot be opened.
sub compare_phylip_files {
  my($calculated_file, $expected_file) = @_;

  # Three-argument open with an explicit check. With unchecked opens, two
  # unreadable files would yield undef headers and empty sequence lists on
  # both sides, and the comparison would pass without comparing anything.
  my $CALC_FILE;
  unless (open($CALC_FILE, '<', $calculated_file)) {
    return ok(0, "Could not open calculated file $calculated_file: $!");
  }

  my $EXPECTED_FILE;
  unless (open($EXPECTED_FILE, '<', $expected_file)) {
    my $open_error = $!;
    close($CALC_FILE);
    return ok(0, "Could not open expected file $expected_file: $open_error");
  }

  # The header line is compared verbatim, before any sequence parsing.
  my $calculated_file_header = <$CALC_FILE>;
  my $expected_file_header = <$EXPECTED_FILE>;

  is($calculated_file_header, $expected_file_header, "Header matches expected value in ".$expected_file);

  # Each sequence is an array ref of lines; sorting on the first line makes
  # the comparison independent of the order sequences were written in.
  my @calculated_file_sequences = sort({ $a->[0] cmp $b->[0] } get_sequences_from_file($CALC_FILE));
  my @expected_file_sequences = sort({ $a->[0] cmp $b->[0] } get_sequences_from_file($EXPECTED_FILE));

  close($CALC_FILE);
  close($EXPECTED_FILE);

  return is_deeply(\@calculated_file_sequences, \@expected_file_sequences, "Sequences match ".$expected_file);

}

# Scenario: three assemblies are processed in a single run, with FASTA and
# PHYLIP output enabled, and the resulting files are compared with fixtures.
note('Check it can handle multiple assemblies where some have partial allele matches.');

# Fresh scratch directory beneath the current working directory.
$tmpdirectory_obj = File::Temp->newdir(DIR => getcwd, CLEANUP => 1);
$tmpdirectory     = $tmpdirectory_obj->dirname();

# Constructor arguments are kept as an ordered key/value list so that they
# reach new() in exactly the order written here.
my @three_assembly_args = (
  species               => "E.coli",
  base_directory        => 't/data/databases',
  raw_input_fasta_files => ['t/data/contigs.fa','t/data/contigs_pipe_character_in_seq_name.fa','t/data/contigs_one_unknown.tfa'],
  makeblastdb_exec      => 'makeblastdb',
  blastn_exec           => 'blastn',
  output_directory      => $tmpdirectory,
  output_fasta_files    => 1,
  output_phylip_files   => 1,
  spreadsheet_basename  => 'mlst_results',
  parallel_processes    => 3,
  report_lowest_st      => 1
);
$multiple_fastas = Bio::MLST::Check->new(@three_assembly_args);
ok($multiple_fastas,'Pass in 3 assemblies, 2 perfect and where 1 has partial matches.');
ok($multiple_fastas->create_result_files(),'Create all the results files for three assemblies.');

# Each entry is (generated file, expected fixture, description); the files are
# checked in the order listed.
my @three_assembly_outputs = (
  [ $tmpdirectory.'/mlst_results.genomic.csv', 't/data/expected_three_mlst_results.genomic.csv', 'Create a spreadsheet with the 3 sets of assemblies combined and the sequences, and give one best guess ST.' ],
  [ $tmpdirectory.'/mlst_results.allele.csv',  't/data/expected_three_mlst_results.allele.csv', 'Create a spreadsheet with the 3 sets of assemblies combined and the allele numbers, and give one best guess ST.' ],
  [ $tmpdirectory.'/concatenated_alleles.fa',  't/data/expected_three_concatenated_alleles.fa', 'Create a multi-FASTA file containing the concatenated sequences.' ],
);
for my $output_case (@three_assembly_outputs) {
  compare_files( @{$output_case} );
}
###
compare_phylip_files( $tmpdirectory.'/concatenated_alleles.phylip', 't/data/expected_three_concatenated_alleles.phylip', 'Output the alignment of the concatenated gene sequences in phylip format, which is used as input to some tree building applicati...
# FASTA files written for alleles that are not in the database. These must be
# checked before $tmpdirectory is pointed at a new directory below.
# Each entry is (generated file, expected fixture, description).
for my $unknown_allele_case (
  [
    $tmpdirectory.'/contigs_one_unknown.unknown_allele.adk-2~.fa',
    't/data/expected_three_contigs_one_unknown.unknown_allele.adk-2~.fa',
    'Create FASTA files for alleles which are not in the database, so that they can be added later.',
  ],
  [
    $tmpdirectory.'/contigs_one_unknown.unknown_allele.recA-1~.fa',
    't/data/expected_three_contigs_one_unknown.unknown_allele.recA-1~.fa',
    'Create FASTA files for alleles which are not in the database, so that they can be added later.',
  ],
) {
  compare_files( @{$unknown_allele_case} );
}


# Scenario: a single assembly, used only to check that the input file is found.
$tmpdirectory_obj = File::Temp->newdir(DIR => getcwd, CLEANUP => 1);
$tmpdirectory     = $tmpdirectory_obj->dirname();

# Ordered key/value list so the arguments reach new() in the order written.
my @single_assembly_args = (
  species               => "E.coli",
  base_directory        => 't/data/databases',
  raw_input_fasta_files => ['t/data/contigs.fa'],
  makeblastdb_exec      => 'makeblastdb',
  blastn_exec           => 'blastn',
  output_directory      => $tmpdirectory,
  output_fasta_files    => 1,
  spreadsheet_basename  => 'mlst_results',
  parallel_processes    => 1
);
$multiple_fastas = Bio::MLST::Check->new(@single_assembly_args);
ok($multiple_fastas,'Make sure the input files exist.');
ok($multiple_fastas->input_fasta_files_exist,'Check the input FASTA file exists.');

done_testing();

# Compare a generated text file with an expected fixture, ignoring line order.
#
# Arguments:
#   $actual_file   - path of the file produced by the code under test
#   $expected_file - path of the reference file
#   $description   - optional text describing what is being verified; when
#                    given it is used in the name of the content check
#
# Emits one existence check per file and, when both files exist, one content
# check. Returns the result of the content check, or 0 when either file is
# missing (the content check is then skipped, since the failure has already
# been recorded and there is nothing to read).
sub compare_files
{
  my( $actual_file, $expected_file, $description ) = @_;

  my $actual_exists   = ok((-e $actual_file)," $actual_file results file exist");
  my $expected_exists = ok((-e $expected_file)," $expected_file expected file exist");
  return 0 unless $actual_exists && $expected_exists;

  my $expected_line =  read_file($expected_file);
  my $actual_line = read_file($actual_file);

  # Remove every "space + newline" pair, so a line that ends in a space is
  # joined to the line after it. The same transformation is applied to both
  # sides of the comparison.
  $expected_line =~ s/ \n//g;
  $actual_line   =~ s/ \n//g;

  # parallel processes mean the order isnt guaranteed.
  my @sorted_expected = sort(split(/\n/,$expected_line));
  my @sorted_actual   = sort(split(/\n/,$actual_line));

  # Callers pass a description of the check; use it in the test name while
  # still naming the fixture so a failure can be traced to a file.
  my $test_name = defined $description
    ? "$description (content matches expected $expected_file)"
    : "Content matches expected $expected_file";

  return is_deeply(\@sorted_actual, \@sorted_expected, $test_name);
}



( run in 2.001 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )