Bio-MUST-Core

 view release on metacpan or  search on metacpan

bin/inst-abbr-ids.pl  view on Meta::CPAN

#!/usr/bin/env perl
# PODNAME: inst-abbr-ids.pl
# ABSTRACT: Abbreviate seq ids in FASTA files (optimized)
# CONTRIBUTOR: Valerian LUPO <valerian.lupo@doct.uliege.be>

use Modern::Perl '2011';
use autodie;

use File::Basename;
use Getopt::Euclid qw(:vars);
use Path::Class qw(dir file);
use Smart::Comments;

use Bio::MUST::Core;
use Bio::MUST::Core::Constants qw(:seqids);
use Bio::MUST::Core::Utils qw(secure_outfile);
use aliased 'Bio::MUST::Core::Ali';
use aliased 'Bio::MUST::Core::IdMapper';

# TODO add optional %seen hash, either global or filewise and either .1, .2 etc or .2, .3 etc
# TODO: make things more flexible
# perl -nle 'if ( ($prot,$gca) = m/^>(\S+).*:(GC[AF]_[^:]+)/ ) { print q{>} . $gca . q{|} . $prot } else { print }' hexa-900-p-a_prot_cplt.fa > hexa-900-p-a_prot_cplt_abbr2.fa

# Lookup table from predefined shortcut names to the regexes (imported from
# Bio::MUST::Core::Constants) that capture the unique identifier component.
my %regex_for;
@regex_for{ qw(:DEF    :GI    :GNL    :JGI    :PAC) }
        = ( $DEF_ID, $GI_ID, $GNL_ID, $JGI_ID, $PAC_ID );

# A value matching a shortcut name selects the predefined regex; any other
# value is used as the regex itself.
my $regex = $regex_for{$ARGV_id_regex};
$regex //= $ARGV_id_regex;
### Using seq id regex: $regex

# Output goes to the current directory unless an outdir was requested, in
# which case the (relative) directory is created on the spot.
my $dir = $ARGV_outdir ? dir($ARGV_outdir)->relative : q{.};
$dir->mkpath() if $ARGV_outdir;

# load optional prefix mapper file
# $prefix_mapper stays undefined unless $ARGV_id_prefix_mapper is true;
# otherwise it holds whatever IdMapper->load returns for that file.
# NOTE: Smart::Comments is loaded above, so the triple-hash line inside the
# conditional is an active statement, not a plain comment.
my $prefix_mapper;
if ($ARGV_id_prefix_mapper) {
    ### Taking prefixes from: $ARGV_id_prefix_mapper
    $prefix_mapper = IdMapper->load($ARGV_id_prefix_mapper);
}

# global variables that will be updated for each infile
# (declared at file scope so that the $abbrid_filter closure below can
# read and modify them)
my $prefix;     # passed as first argument to abbr_with_regex by the filter
my @long_ids;   # one $seq->full_id pushed per seq seen by the filter
my @abbr_ids;   # one abbreviated id pushed per seq seen by the filter

my $abbrid_filter = sub {
    my $seq = shift;

    # get long id
    my $long_id = $seq->full_id;
    push @long_ids, $long_id;

    # abbreviate seq_id
    my $abbr_id = $seq->seq_id->abbr_with_regex($prefix, $regex);
    push @abbr_ids, $abbr_id;

    # store allowed seqs

 view all matches for this distribution
 view release on metacpan -  search on metacpan

( run in 0.415 second using v1.00-cache-2.02-grep-82fe00e-cpan-2c419f77a38b )