Bio-Gonzales

 view release on metacpan or  search on metacpan

bin/faeq-seq  view on Meta::CPAN

  $dup_seq->namespace($file) if ( @files > 1 );
  my $fit = faiterate($file);

  while ( my $so = $fit->() ) {
    $dup_seq->add_seq($so);

    print STDERR "." if $counter++ % 100 == 0;
  }
}

print STDERR "\n";

#filter out unique seqs
#my @deleted_seqs = map { delete $eq{$_} if ( @{ $eq{$_} } < 2 ) } keys %eq;
my @deleted_seqs = @{ $dup_seq->seq_groups_wo_duplicates };

print STDERR "Writing to stdout";

#print unique ids if filter is active
if ($filter) {
  for my $del_seq_objs (@deleted_seqs) {
    print_filtered($del_seq_objs);
    print STDERR "." if $counter++ % 100 == 0;
  }
}

#calculate also some statistics
my $num_identical_seqs = $dup_seq->num_duplicate_seqs;
my $num_unique_seqs    = $dup_seq->num_groups_w_duplicates;

#my %stats;
#update_stats($seq_objs, ...)
for my $seq_objs ( @{ $dup_seq->seq_groups_w_duplicates } ) {
  if ($filter) {
    print_filtered($seq_objs);
  } else {
    print_ids($seq_objs);
  }
  print STDERR "." if $counter++ % 100 == 0;
}

print STDERR "\n";

#init only if needed
our $seq_writer;

sub print_filtered {
  my ($seq_objs) = @_;

  $seq_writer = Bio::SeqIO->new(
    -format => 'fasta',
    -fh     => \*STDOUT,
  ) unless ($seq_writer);

  return unless ($seq_objs);

  $seq_writer->write_seq($seq_objs->[0] );
}

#FIXME better statistics
#sub update_stats {
  #my ($seq_objs, $stats, $is_duplicated) = @_;

  #my %tmp_stat;
  #if($is_duplicated) {
    ## count number of duplicated sequences per namespace
    #$tmp_stat
  #}

  #for my $so ( @{$seq_objs} ) {
    #$stats{$so->info->{namespace}} //= { };
    #if($is_duplicated) {

    #}
    #print "\t" if ( not $first );
    #say stringify_seq_obj($seq_obj);
    #undef $first;
  #}

#}

sub print_ids {
  my ($seq_objs) = @_;

  my $first = 1;
  for my $seq_obj ( @{$seq_objs} ) {
    print "\t" if ( not $first );
    say stringify_seq_obj($seq_obj);
    undef $first;
  }
}

sub stringify_seq_obj {
  my ($so) = @_;

  return join "\t", ( $so->id, ( $so->desc // "" ), ( $so->info->{namespace} // "" ) );
}

say STDERR
  "Found a total number of $num_identical_seqs identical sequences, which can be reduced to $num_unique_seqs. ";
say STDERR ( $num_identical_seqs - $num_unique_seqs ) . " sequences redundant.";

__END__

=head1 NAME

fafind-eq-seq - find equal sequences

=head1 SYNOPSIS

    ./fafind-eq-seq [--help] [--eval 'perlcode'] <file1> [<file2> ... <fileN>] >file_with_results.txt

    ./fafind-eq-seq [--help] [--eval 'perlcode'] --filter <file1> [<file2> ... <fileN>] >file_with_only_unique_seqs.fasta

=head1 DESCRIPTION

Find identical / equal sequences in a given set of fasta files. Info messages
go to standard error (stderr), results to standard output (stdout).

The result output of F<file_with_results.txt> consists of lines following the pattern



( run in 1.023 second using v1.01-cache-2.11-cpan-ceb78f64989 )