Bio-Gonzales
view release on metacpan or search on metacpan
bin/faeq-seq view on Meta::CPAN
$dup_seq->namespace($file) if ( @files > 1 );
my $fit = faiterate($file);
while ( my $so = $fit->() ) {
$dup_seq->add_seq($so);
print STDERR "." if $counter++ % 100 == 0;
}
}
print STDERR "\n";
#filter out unique seqs
#my @deleted_seqs = map { delete $eq{$_} if ( @{ $eq{$_} } < 2 ) } keys %eq;
my @deleted_seqs = @{ $dup_seq->seq_groups_wo_duplicates };
print STDERR "Writing to stdout";
#print unique ids if filter is active
if ($filter) {
for my $del_seq_objs (@deleted_seqs) {
print_filtered($del_seq_objs);
print STDERR "." if $counter++ % 100 == 0;
}
}
#calculate also some statistics
my $num_identical_seqs = $dup_seq->num_duplicate_seqs;
my $num_unique_seqs = $dup_seq->num_groups_w_duplicates;
#my %stats;
#update_stats($seq_objs, ...)
for my $seq_objs ( @{ $dup_seq->seq_groups_w_duplicates } ) {
if ($filter) {
print_filtered($seq_objs);
} else {
print_ids($seq_objs);
}
print STDERR "." if $counter++ % 100 == 0;
}
print STDERR "\n";
#init only if needed
our $seq_writer;
sub print_filtered {
my ($seq_objs) = @_;
$seq_writer = Bio::SeqIO->new(
-format => 'fasta',
-fh => \*STDOUT,
) unless ($seq_writer);
return unless ($seq_objs);
$seq_writer->write_seq($seq_objs->[0] );
}
#FIXME better statistics
#sub update_stats {
#my ($seq_objs, $stats, $is_duplicated) = @_;
#my %tmp_stat;
#if($is_duplicated) {
## count number of duplicated sequences per namespace
#$tmp_stat
#}
#for my $so ( @{$seq_objs} ) {
#$stats{$so->info->{namespace}} //= { };
#if($is_duplicated) {
#}
#print "\t" if ( not $first );
#say stringify_seq_obj($seq_obj);
#undef $first;
#}
#}
sub print_ids {
my ($seq_objs) = @_;
my $first = 1;
for my $seq_obj ( @{$seq_objs} ) {
print "\t" if ( not $first );
say stringify_seq_obj($seq_obj);
undef $first;
}
}
sub stringify_seq_obj {
my ($so) = @_;
return join "\t", ( $so->id, ( $so->desc // "" ), ( $so->info->{namespace} // "" ) );
}
say STDERR
"Found a total number of $num_identical_seqs identical sequences, which can be reduced to $num_unique_seqs. ";
say STDERR ( $num_identical_seqs - $num_unique_seqs ) . " sequences redundant.";
__END__
=head1 NAME
fafind-eq-seq - find equal sequences
=head1 SYNOPSIS
./fafind-eq-seq [--help] [--eval 'perlcode'] <file1> [<file2> ... <fileN>] >file_with_results.txt
./fafind-eq-seq [--help] [--eval 'perlcode'] --filter <file1> [<file2> ... <fileN>] >file_with_only_unique_seqs.fasta
=head1 DESCRIPTION
Find identical / equal sequences in a given set of fasta files. Info messages
go to standard error (stderr), results to standard output (stdout).
The result output of F<file_with_results.txt> consists of lines following the pattern
( run in 1.023 second using v1.01-cache-2.11-cpan-ceb78f64989 )