Bio-MUST-Core
view release on metacpan - search on metacpan
view release on metacpan or search on metacpan
bin/inst-abbr-ids.pl view on Meta::CPAN
#!/usr/bin/env perl
# PODNAME: inst-abbr-ids.pl
# ABSTRACT: Abbreviate seq ids in FASTA files (optimized)
# CONTRIBUTOR: Valerian LUPO <valerian.lupo@doct.uliege.be>
use Modern::Perl '2011';
use autodie;
use File::Basename;
use Getopt::Euclid qw(:vars);
use Path::Class qw(dir file);
use Smart::Comments;
use Bio::MUST::Core;
use Bio::MUST::Core::Constants qw(:seqids);
use Bio::MUST::Core::Utils qw(secure_outfile);
use aliased 'Bio::MUST::Core::Ali';
use aliased 'Bio::MUST::Core::IdMapper';
# TODO add optional %seen hash, either global or filewise and either .1, .2 etc or .2, .3 etc
# TODO: make things more flexible
# perl -nle 'if ( ($prot,$gca) = m/^>(\S+).*:(GC[AF]_[^:]+)/ ) { print q{>} . $gca . q{|} . $prot } else { print }' hexa-900-p-a_prot_cplt.fa > hexa-900-p-a_prot_cplt_abbr2.fa
# Named shortcuts for the predefined seq id regexes; the $*_ID variables are
# not declared in this file (presumably imported through the :seqids tag of
# Bio::MUST::Core::Constants -- confirm).
my %regex_for;
@regex_for{ qw(:DEF :GI :GNL :JGI :PAC) }
    = ($DEF_ID, $GI_ID, $GNL_ID, $JGI_ID, $PAC_ID);

# A recognized shortcut selects its predefined regex; any other value is
# used verbatim as the regex itself.
my $regex = $regex_for{$ARGV_id_regex};
$regex //= $ARGV_id_regex;
### Using seq id regex: $regex
# Default to the current directory; when an output dir is requested, use its
# relative form and build the directory tree.
my $dir = $ARGV_outdir ? dir($ARGV_outdir)->relative : q{.};
$dir->mkpath() if $ARGV_outdir;
# load optional prefix mapper file
# $prefix_mapper stays undef when no mapper file is given.
# NOTE: the triple-hash line inside the conditional is an active
# Smart::Comments statement (the module is loaded above), not a plain
# comment; keep its three leading hashes and its position intact.
my $prefix_mapper;
if ($ARGV_id_prefix_mapper) {
### Taking prefixes from: $ARGV_id_prefix_mapper
$prefix_mapper = IdMapper->load($ARGV_id_prefix_mapper);
}
# Shared state for the per-sequence filter closure defined below: the closure
# reads $prefix and appends to both id lists, so all three are meant to be
# updated for each infile.
my ($prefix, @long_ids, @abbr_ids);
my $abbrid_filter = sub {
my $seq = shift;
# get long id
my $long_id = $seq->full_id;
push @long_ids, $long_id;
# abbreviate seq_id
my $abbr_id = $seq->seq_id->abbr_with_regex($prefix, $regex);
push @abbr_ids, $abbr_id;
# store allowed seqs
view all matches for this distributionview release on metacpan - search on metacpan
( run in 0.415 second using v1.00-cache-2.02-grep-82fe00e-cpan-2c419f77a38b )