Bio-EUtilities

 view release on metacpan or  search on metacpan

bin/bp_genbank_ref_extractor  view on Meta::CPAN

    if (ref($seq) eq 'ARRAY') {
      foreach (@{$seq}) {
        $val = find_in_entrezgene ($_, $keys);
        ## value may be false, but will be undefined if just not found
        last if defined $val;
      }
    } elsif (ref($seq) eq 'HASH') {
      my $key = shift (@{$keys});
      $val = find_in_entrezgene ($seq->{$key}, $keys) if (exists $seq->{$key});
    } elsif (!ref($seq)) {
      ## not a reference, must be the value we are looking for
      $val = $seq;
    } else {
      die "error when transversing entrezgene structure.\n";
    }
    last if defined $val;
  }
  return $val;
}

## Removes repeated elements from an array. Does not respect original order
sub clean_array {
  my %hash;
  foreach (@{$_[0]}) {
    if ($hash{$_}) {
      log_it (9, "DEBUG: value '$_' removed from array " . by_caller_and_location('here') . " called " . by_caller_and_location('before') );
    } else {
      $hash{$_} = 1;
    }
  }
  @{$_[0]} = keys %hash;
}

## Returns a pretty string about current time
sub get_time {
  my ($second, $minute, $hour, $day, $month, $year) = (localtime)[0,1,2,3,4,5];
  return sprintf ("[%04d-%02d-%02d %02d:%02d:%02d]", $year+1900, $month+1, $day, $hour, $minute, $second);
}


## Tries to sanitize a filename
sub fix_filename {
  my $file = $_[0];
  $file =~ s/[^a-z0-9\-\+ \.,\(\){}\[\]']/_/ig;
  log_it (9, "DEBUG: filepath '$_[0]' was converted to '$file' " . by_caller_and_location('here') . " called " . by_caller_and_location('before') );
  return $file;
}

sub by_caller_and_location {
  my $level;
  if (!@_ || $_[0] eq 'here') {
    $level = 1;
  } elsif ($_[0] eq 'before'){
    $level = 2;
  } elsif ($_[0] =~ /^[0-9]+$/){
    $level = 1 + $_[0];
  } else {
    die "Bug found when calculating level for caller function. Please report.";
  }
  my $deeper = shift;
  return "by " . (caller($level))[3] . " at line " . (caller($level))[2];
}


sub file_extension_for {
  ## TODO in some cases, extension changes whether it's protein or DNA or whatever
  ## and this should be supported
  ## XXX there must be a more elegant to handle the formats on this scripts

  ## to update this list, look in the _guess_format method, inside SeqIO.pm of bioperl
  for ($_[0]) {
    if    (/embl/i)       {return '.embl';}
    elsif (/entrezgene/i) {return '.asn';}
    elsif (/fasta/i)      {return '.fasta';} # fasta|fast|fas|seq|fa|fsa|nt|aa|fna|faa
    elsif (/fastq/i)      {return '.fastq';}
    elsif (/gcg/i)        {return '.gcg';}
    elsif (/genbank/i)    {return '.gb';} # gb|gbank|genbank|gbk|gbs
    elsif (/swiss/i)      {return '.swiss';} # swiss|sp
    else {
      log_it (9, "DEBUG: couldn't find the right extension for the requested format. Using '.seq' as default.");
      return ".seq";
    }
  }
}

sub save_structure {
  if ($save_data eq 'csv') { create_csv($_[0]); }
}

sub create_csv {
  my $struct = shift;
  my $csv = Text::CSV->new ({
                              binary => 1,
                              eol => $/,
                              }) or die "Cannot use Text::CSV: ". Text::CSV->error_diag ();

  my $csv_file  = File::Spec->catfile ($save, 'data.csv');
  open (my $fh, ">", $csv_file) or die "Couldn't open file $csv_file for writing: $!";

  $csv->print ($fh, ['gene symbol', 'species', 'gene UID', 'EnsEMBL ID', 'gene name', 'pseudo', 'transcript accession','protein accession', 'locus', 'chromosome accession', 'chromosome start coordinates', 'chromosome stop coordinates', 'assembly'] );

  my @uids = $struct->get_list('gene');
  foreach my $uid(@uids) {
    my @lines;
    my @mRNA_acc = $struct->get_product_list('transcript', $uid);
    if (!@mRNA_acc) { @mRNA_acc = (''); }   # this allows the next loop to run once for pseudo genes
    foreach my $mRNA_acc (@mRNA_acc) {
      push(@lines, [
                    $struct->get_info('gene', $uid, 'symbol'),
                    $struct->get_info('gene', $uid, 'species'),
                    $uid,
                    $struct->get_info('gene', $uid, 'ensembl'),
                    $struct->get_info('gene', $uid, 'name'),
                    $struct->get_info('gene', $uid, 'pseudo'),
                    $mRNA_acc,
                    $struct->get_info('transcript', $mRNA_acc, 'protein'),
                    $struct->get_info('gene', $uid, 'locus'),
                    $struct->get_info('gene', $uid, 'ChrAccVer'),
                    $struct->get_info('gene', $uid, 'ChrStart'),
                    $struct->get_info('gene', $uid, 'ChrStop'),
                    $struct->get_info('gene', $uid, 'assembly'),



( run in 2.037 seconds using v1.01-cache-2.11-cpan-ceb78f64989 )