Bio-EUtilities
view release on metacpan or search on metacpan
bin/bp_genbank_ref_extractor view on Meta::CPAN
if (ref($seq) eq 'ARRAY') {
foreach (@{$seq}) {
$val = find_in_entrezgene ($_, $keys);
## value may be false, but will be undefined if just not found
last if defined $val;
}
} elsif (ref($seq) eq 'HASH') {
my $key = shift (@{$keys});
$val = find_in_entrezgene ($seq->{$key}, $keys) if (exists $seq->{$key});
} elsif (!ref($seq)) {
## not a reference, must be the value we are looking for
$val = $seq;
} else {
die "error when transversing entrezgene structure.\n";
}
last if defined $val;
}
return $val;
}
## Removes repeated elements from an array. Does not respect original order
sub clean_array {
my %hash;
foreach (@{$_[0]}) {
if ($hash{$_}) {
log_it (9, "DEBUG: value '$_' removed from array " . by_caller_and_location('here') . " called " . by_caller_and_location('before') );
} else {
$hash{$_} = 1;
}
}
@{$_[0]} = keys %hash;
}
## Returns a pretty string about current time
sub get_time {
my ($second, $minute, $hour, $day, $month, $year) = (localtime)[0,1,2,3,4,5];
return sprintf ("[%04d-%02d-%02d %02d:%02d:%02d]", $year+1900, $month+1, $day, $hour, $minute, $second);
}
## Tries to sanitize a filename
sub fix_filename {
my $file = $_[0];
$file =~ s/[^a-z0-9\-\+ \.,\(\){}\[\]']/_/ig;
log_it (9, "DEBUG: filepath '$_[0]' was converted to '$file' " . by_caller_and_location('here') . " called " . by_caller_and_location('before') );
return $file;
}
sub by_caller_and_location {
my $level;
if (!@_ || $_[0] eq 'here') {
$level = 1;
} elsif ($_[0] eq 'before'){
$level = 2;
} elsif ($_[0] =~ /^[0-9]+$/){
$level = 1 + $_[0];
} else {
die "Bug found when calculating level for caller function. Please report.";
}
my $deeper = shift;
return "by " . (caller($level))[3] . " at line " . (caller($level))[2];
}
sub file_extension_for {
## TODO in some cases, extension changes whether it's protein or DNA or whatever
## and this should be supported
## XXX there must be a more elegant to handle the formats on this scripts
## to update this list, look in the _guess_format method, inside SeqIO.pm of bioperl
for ($_[0]) {
if (/embl/i) {return '.embl';}
elsif (/entrezgene/i) {return '.asn';}
elsif (/fasta/i) {return '.fasta';} # fasta|fast|fas|seq|fa|fsa|nt|aa|fna|faa
elsif (/fastq/i) {return '.fastq';}
elsif (/gcg/i) {return '.gcg';}
elsif (/genbank/i) {return '.gb';} # gb|gbank|genbank|gbk|gbs
elsif (/swiss/i) {return '.swiss';} # swiss|sp
else {
log_it (9, "DEBUG: couldn't find the right extension for the requested format. Using '.seq' as default.");
return ".seq";
}
}
}
sub save_structure {
if ($save_data eq 'csv') { create_csv($_[0]); }
}
sub create_csv {
my $struct = shift;
my $csv = Text::CSV->new ({
binary => 1,
eol => $/,
}) or die "Cannot use Text::CSV: ". Text::CSV->error_diag ();
my $csv_file = File::Spec->catfile ($save, 'data.csv');
open (my $fh, ">", $csv_file) or die "Couldn't open file $csv_file for writing: $!";
$csv->print ($fh, ['gene symbol', 'species', 'gene UID', 'EnsEMBL ID', 'gene name', 'pseudo', 'transcript accession','protein accession', 'locus', 'chromosome accession', 'chromosome start coordinates', 'chromosome stop coordinates', 'assembly'] );
my @uids = $struct->get_list('gene');
foreach my $uid(@uids) {
my @lines;
my @mRNA_acc = $struct->get_product_list('transcript', $uid);
if (!@mRNA_acc) { @mRNA_acc = (''); } # this allows the next loop to run once for pseudo genes
foreach my $mRNA_acc (@mRNA_acc) {
push(@lines, [
$struct->get_info('gene', $uid, 'symbol'),
$struct->get_info('gene', $uid, 'species'),
$uid,
$struct->get_info('gene', $uid, 'ensembl'),
$struct->get_info('gene', $uid, 'name'),
$struct->get_info('gene', $uid, 'pseudo'),
$mRNA_acc,
$struct->get_info('transcript', $mRNA_acc, 'protein'),
$struct->get_info('gene', $uid, 'locus'),
$struct->get_info('gene', $uid, 'ChrAccVer'),
$struct->get_info('gene', $uid, 'ChrStart'),
$struct->get_info('gene', $uid, 'ChrStop'),
$struct->get_info('gene', $uid, 'assembly'),
( run in 2.037 seconds using v1.01-cache-2.11-cpan-ceb78f64989 )