Bio-Roary
view release on metacpan or search on metacpan
lib/Bio/Roary/SortFasta.pm view on Meta::CPAN
has 'similarity' => ( is => 'rw', isa => 'Num', default => 1 );
has 'sequences_unaligned' => ( is => 'rw', isa => 'Bool', default => 0 );
has '_input_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' );
has '_output_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio' );
sub _build_output_filename {
my ($self) = @_;
return $self->input_filename . ".sorted.fa";
}
sub _build__input_seqio {
my ($self) = @_;
return Bio::SeqIO->new( -file => $self->input_filename, -format => 'Fasta' );
}
sub _build__output_seqio {
my ($self) = @_;
return Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' );
}
sub _add_padding_to_make_sequence_length_multiple_of_three {
my ( $self, $input_seq ) = @_;
my $seq_length = $input_seq->length();
if ( $seq_length % 3 == 1 ) {
$input_seq->seq( $input_seq->seq() . "NN" );
}
elsif ( $seq_length % 3 == 2 ) {
$input_seq->seq( $input_seq->seq() . "N" );
}
return $input_seq;
}
sub _remove_nnn_from_all_sequences {
my ( $self, $input_sequences ) = @_;
for my $sequence_name ( sort keys %{$input_sequences} ) {
my $sequence = $input_sequences->{$sequence_name}->seq();
$sequence =~ s/NNN$//i;
$input_sequences->{$sequence_name}->seq($sequence);
}
return $input_sequences;
}
sub sort_fasta {
my ($self) = @_;
my %input_sequences;
my $nnn_at_end_of_all_sequences = 1;
my $sequence;
my $variation_detected = 0;
while ( my $input_seq = $self->_input_seqio->next_seq() ) {
$sequence = $input_seq->seq if(!defined($sequence));
$self->_add_padding_to_make_sequence_length_multiple_of_three($input_seq) if ( $self->make_multiple_of_three );
$nnn_at_end_of_all_sequences = 0 if ( $nnn_at_end_of_all_sequences == 1 && !( $input_seq->seq() =~ /NNN$/i ) );
$input_sequences{ $input_seq->display_id } = $input_seq;
my $factor = $self->_percentage_similarity($sequence, $input_seq->seq);
if($factor < $self->similarity)
{
$self->similarity($factor);
}
}
$self->_remove_nnn_from_all_sequences( \%input_sequences ) if ( $self->remove_nnn_from_end && $nnn_at_end_of_all_sequences );
my $sequence_length = 0;
my $sequences_unaligned = 0;
for my $sequence_name ( sort keys %input_sequences ) {
$sequence_length = $input_sequences{$sequence_name}->length if($sequence_length == 0);
$self->sequences_unaligned(1) if($input_sequences{$sequence_name}->length != $sequence_length);
$self->_output_seqio->write_seq( $input_sequences{$sequence_name} );
}
return $self;
}
sub replace_input_with_output_file {
my ($self) = @_;
move( $self->output_filename, $self->input_filename );
return $self;
}
sub _percentage_similarity
{
my ($self, $string1, $string2) = @_;
my $num_differences = 0;
my $string1_length = length($string1);
for(my $i = 0; $i < $string1_length && $i< length($string2); $i++)
{
$num_differences++ if( substr($string1, $i, 1) ne substr($string2, $i, 1));
}
return 1 if($num_differences == 0);
return 0 if($string1_length == 0);
return (1.0 - ($num_differences/$string1_length));
}
no Moose;
__PACKAGE__->meta->make_immutable;
1;
__END__
=pod
=encoding UTF-8
=head1 NAME
Bio::Roary::SortFasta - sort a fasta file by name
=head1 VERSION
version 3.13.0
=head1 SYNOPSIS
sort a fasta file by name
use Bio::Roary::SortFasta;
my $obj = Bio::Roary::SortFasta->new(
input_filename => 'infasta.fa',
);
$obj->sort_fasta->replace_input_with_output_file;
=head1 AUTHOR
Andrew J. Page <ap13@sanger.ac.uk>
=head1 COPYRIGHT AND LICENSE
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
This is free software, licensed under:
The GNU General Public License, Version 3, June 2007
=cut
( run in 2.348 seconds using v1.01-cache-2.11-cpan-8f98c5d2c55 )