Bioinf
view release on metacpan or search on metacpan
$R_start='';
$sequence='';
$seq_found1=''; ## reset $R_start, $seq_found1,,
next F0;
}
}
}
}else{
print "\n# Error, the sequence pos for $NAME (from $Keys[$e]) in DB doesnt exist in xxxx.idx file?\n";
}
}
close DB_FASTA;
}
#print "\n# (6) fetch_sequence_from_db: counted fetched seqs: $found_seq_count, $acquired_seq_count";
#print "\n# (7) fetch_sequence_from_db: Fetching seq has finished \n";
return(\%sequence);
}
#______________________________________________________________
# Title : fetch_seq
# Usage : &fetch_seq(@ARGV);
# Function : fetches swissprot entry or fasta format seq with
# given seq name(like SAA_HORSE, SA*HORSE, SAA,..)
# you can give multi files(SAA*, SAU*) at the same
# time. This uses ENV setting of 'SWDIR'
# Example : &fetch_swiss_seq(@ARGV);
# Keywords : fetch_swissprot_sequence, fetch_sequence,
# find_swiss_sequence, find_sequence
# Options : _ for debugging.
# # for debugging.
# -f for fasta format file output
# -a is for ALL matched seq. (same as using glob=> *YEAST)
# -c is for Creating seq.idx file
# -h is for HELP!
# -g is for GDF file format output
# -l is for list of match entries(in 1 column)
# -s is for species option (input name mst be species (YEAST, RAT, HUMAN..)
# n= is for Number of seq you want to get from swissprot
# s= is for Size limit. Min seq size in swiss, s=10 -> minimum 11 aa seq.
# S= is for Size limit. Max seq size in swiss, s=1000 -> get less than 1000
#
# Argument : swissprot seqname
# Category :
# Version : 1.7
#--------------------------------------------------------------
sub fetch_seq{
my @in=@_;
my ($FASTA_index, $FASTA, $where_index, %index, $question, $i,
$s,$t,$fasta,$index_file, $all, $species,$target, $matched, $seq, $gdf, $list, $count, $create);
my $SEQ_size_max=100000000;
if(@_ < 1){ &HELP_fetch_seq;
}else{
F: for($t=0; $t<@in; $t++){ #'''''''''''' PROMPT ARGV processing ''''''''''''''''''
if($in[$t]=~/^\-c$/i){
$create=1; splice(@in, $t, 1); $t--;
print "\n You should provide database\(e.g, seq.dat\) file with this opt, I guess you did\n";
print "\n If you wanted to make an index with any fasta db, you also have to\n";
print " give the file name. e.g:\n $0 -c /DB/swiss/seq.dat\n";
print " or $0 -c my_db.fa\n\n";
next; }
if($in[$t]=~/^\-af$/){ $fasta=$all=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-afs$/){ $species=$fasta=$all=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-ag$/){ $gdf=$all=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-g$/){ $gdf=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-f$/i){ $fasta=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-a$/i){ $all=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-l$/i){ $list=$all=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-s$/i){ $species=$all=1; splice(@in, $t, 1); $t--; next; }
if( ($in[$t]=~/seq\.dat/)&&(-f $in[$t])){ ## if the path for swiss prot is given
$DB=$in[$t]; splice(@in, $t, 1); $t--; next; }
if( ($in[$t]=~/seq\.idx/)&&(-e $in[$t])){ ## if the path for swiss index is given
$index_file=$in[$t]; splice(@in, $t, 1); $t--; next; }
#''''''' SWiss prompt input file check ''''''''''''''''''
if( -f $in[$t]){
open(TEMP, "$in[$t]");
while(<TEMP>){
if(/^ID[\t ]+\w+/){$DB=$in[$t]; splice(@in, $t, 1);$t--;next F;}}
close TEMP;
}
#'''''''' FASTA prmpt input file check '''''''''''''''''''
if( (-f $in[$t]) && !(defined($FASTA))){
open(TEMP, "$in[$t]");
while(<TEMP>){
if(/^\> {0,4}\S+/){$FASTA=$in[$t]; $fasta=1;
if(-s "$FASTA\.idx"){ $FASTA_index="$FASTA\.idx"; }
splice(@in, $t, 1);$t--;next F;}}
close TEMP;
}
#'''''''' INDEX file automatic check ''''''''''''''''''
if( -f $in[$t]){
open(TEMP2, "$in[$t]");
my ($first_pos, $Count, @splited);
while(<TEMP2>){
$Count++;
if( $Count>3 ){
if(/^ {0,2}\S+ +(\d+)/){
if(defined($first_pos) && ($1-$first_pos ) > 1000 ){
$index_file=$in[$t];
splice(@in, $t, 1);$t--;next F;
}elsif( defined($first_pos) && ($1-$first_pos)<1000 ){
$FASTA_index=$in[$t]; $fasta=1;
if($FASTA_index=~/^(\S+)\.\w+$/){
if(-s $1){ $FASTA= $1; }
}
splice(@in, $t, 1);$t--;next F;
}
$first_pos=$1;
}
}
}
close TEMP2;
}
if($in[$t]=~/^\-h$/i){ &HELP_fetch_seq; exit;}
# %hash4 = ('s4', '-TESH-CHEEE-XX-EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
#
# s1_s2_s3_s4 --E-H-CH-EE----E-H--HHEE----E-H--HHEE----E-H-CHHEE--
#
# Warning : This gets more than 2 hashes. Not more than that!
#
# Class : get_common_column, get_common_column_in_seq, get common column in sequence
# for secondary structure only representation.
# Keywords : Overlap, superpose hash, overlay identical chars, superpose_seq_hash
# get_common_column, get_com_column, get_common_sequence,
# get_common_seq_region, multiply_seq_hash, get_common_column_in_sequence
# Options :
# Reference :
# Returns : one hash ref. of the combined key name (i.e., name1_name2). Combined by '_'
# Tips :
# Argument : 2 or more ref for hash of identical keys and value length.
# One optional arg for replacing space char to the given one.
# Author : jong@salt2.med.harvard.edu
# Category :
# Version : 1.6
#--------------------------------------------------------------------
sub get_common_column{
my($i, $k,$j, $name1, $name2, $sorted_name, @in, %out, @out_chars,
$gap_chr, @str1, @str2);
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Sub argument handling $gap_chr here can be 'HE' etc.
#_______________________________________________________
for($k=0; $k< @_ ;$k++){
if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.)$/) ){
$gap_chr .= $1; }
elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)$/) ){
$gap_chr .= $1; }
elsif(ref($_[$k]) eq "HASH") { push(@in, $_[$k]); } }
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Checking if the input hashes were right
#_______________________________________________________
if( (@in < 2) && ( ref($in[0]) eq 'HASH') ){
print "\n", __LINE__, " # get_common_column usually needs 2 hashes. Error \n";
print "\n", __LINE__, " # get_common_column : Anyway, I will just return the single input hash: @in. \n";
%out=%{$in[0]}; # <--- This is essential to return the single input hash!!
goto FINAL;
}
if(@in ==2){ print "\n# (INFO) \@in has 2 elements for ref. of hash, good!\n\n"; }
%out = %{$in[0]}; ## Initializing %out
print "\n",__LINE__, " # get_common_column hashes given are: @in \n" if $debug eq 1;
for( $k=1; $k < @in; $k++){
my(@out_chars); ## <-- Necessary to prevent concatenation.
my(%hash1)=%out;
my(%hash2)=%{$in[$k]};
my(@names1)= sort keys %hash1;
my(@names2)= sort keys %hash2;
$name1 = $names1[0];
$name2 = $names2[0];
@str1=split(/\||\,/, $hash1{$names1[0]});
@str2=split(/\||\,/, $hash2{$names2[0]});
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Trying to guess the correct gap char
#_____________________________________________________
if($hash2{$names2[0]}=~/_/){
$gap_chr='_';
}elsif($hash2{$names2[0]}=~/(\W)\W/){
$gap_chr=$1;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If I fail to split string by ',', then split by ''
#_____________________________________________________
if(@str1 == 1){ @str1=split(//, $hash1{$names1[0]}); }
if(@str2 == 1){ @str2=split(//, $hash2{$names2[0]}); }
for($i=0; $i < @str1; $i++){
if($str1[$i] eq $str2[$i] ){
push(@out_chars, $str1[$i]);
}else{
if( defined($gap_chr) ){
push(@out_chars, $gap_chr);
}else{
push(@out_chars, ' ');
}
}
}
$sorted_name=join('', ($name1, $name2));
%out='';
$out{"$sorted_name"}= join("", @out_chars);
}
FINAL:
if ($debug eq 1){
print "\n",__LINE__, " # get_common_column Final res. \%out :\n",
&show_hash(%out);
}
return(\%out);
}
#________________________________________________________________________
# Title : overlay_seq_for_identical_chars
# Usage : %out =%{&overlay_seq_for_identical_chars(\%hash1, \%hash2, '-')};
# Function : (name1 --EHH--HHEE-- )
# (name2 --HHH--EEEE-- ) ==> result is;
#
# (name1_name2 -- HH-- EE-- )
# to get the identical chars in hash strings of sequences.
#
# Example : %out =%{&overlay_seq_for_identical_chars(\%hash1, \%hash2, '-')};
# output> with 'E' option >>> "name1 --HHH--1232-"
# Warning : Works only for 2 sequence hashes.
# Keywords : Overlap, superpose hash, overlay identical chars, superpose_seq_hash
# Options :
# Returns : one hash ref. of the combined key name (i.e., name1_name2). Combined by '_'
# Argument : 2 ref for hash of identical keys and value length. One optional arg for
# replacing space char to the given one.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub overlay_seq_for_identical_chars{
my($i, $k,$j, $name1, $name2, @in, %out, @out_chars, $gap_chr, @str1, @str2);
######################################
####### Sub argument handling ######## $gap_chr here can be 'HE' etc.
$answer_char='';
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print "\n# (3) write_nhco_files: chaning all the file input extension to 'parf'";
#_______________________________________________________________________________
for($i=0; $i< @file; $i++){
$base=${&get_base_names($file[$i])};
$file[$i]="$base\.$file_ext_wanted";
}
}else{
$answer_char='';
print "\n\n\n# check_input_file_extension: You rudely put rubbish, I am dying. $0\n\n\n";
print chr(7);
exit;
}
}
return(\@file);
}
#________________________________________________________________________________
# Title : geanfammer_main
# Usage : &geanfammer_main;
# Function : The main sub of geanfammer
# Example :
# Keywords : main_geanfammer, geanfammer
# Options :
# Version : 1.9
#--------------------------------------------------------------------------------
sub geanfammer_main{
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# All the defaults, Evalues are determined in geanfammer sub
#__________________________________________________________________
$|=1;
unless($algorithm){
$algorithm='fasta'; # default search algorithm setting(will be overridden
# by 'a=xxx' prompt argument
}
$sub_dir_size=2; # this is the subdirectory name char size
$machine_readable='M'; # this is to invoke FASTA's m=10 option
$make_msp_in_sub_dir_opt='m';
$make_subdir_gzipped='d';
$make_subdir_out='D';
$Evalue_cut_single_link=0.01;
$Evalue_cut_divclus =0.01;
$length_thresh=30; # default
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# preprocessing the inputs, parse_arguments reads in the options in the headerbox
#__________________________________________________________________________________
@your_genome_or_db_to_analyse_file=@{&parse_arguments(1)};
print "\n\n\n# (2) geanfammer_main (with $algorithm): \@your_genome_or_db_to_analyse_file are(is)\n";
print "\n => @your_genome_or_db_to_analyse_file with $algorithm. Min domain size is \"$length_thresh\"\n\n\n";
if(@your_genome_or_db_to_analyse_file < 1){
print "\n# (E) geanfammer_main: ERROR!\n";
print "\n# Dear $ENV{'USER'}, $0: failed to find input file!\n
Did you put FASTA format DB file as input?\n
Or I guess your INPUT file DOES NOT exist in PWD.\n\n";
print " As like: $0 MG.fa \n\n\n";
print chr(7);
exit;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Checking if they have 'fasta' or 'ssearch' in the path
#__________________________________________________________________________________
if($algorithm=~/\/([^\/]+) *$/){
print "\n# Checking \$algorithm ($algorithm) name\n";
$algorithm_name=$1;
if(-s $algorithm){
print "\n# (i) $0 will use \"$algorithm\"\n";
}else{
$result_of_which_run=`which $algorithm_name`;
if($result_of_which_run=~/^ *(\/\S+\/)[fastassearch]+\d* *$/){ ## after Lily Fu's suggestion
print "\n# $0: Your $algorithm_name is in $1, good, I am running it\n";
}else{
print "\n# (E) \$algorithm value $algorithm is not found\n";
print "\n# (E) $0 ran \'which\' Linux command and the result is:\n";
print " $result_of_which_run\n";
print "\n# Please check your path for $algorithm\n\n"; print chr(7);
exit;
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Running the actual big sub
#______________________________
@final_clu_files=@{&geanfammer(\@your_genome_or_db_to_analyse_file,
"a=$algorithm",
"T=$length_thresh",
$verbose,
"d=$sub_dir_size",
$over_write,
$dynamic_factor,
$create_sso_file,
$reverse_sequence,
$machine_readable,
$make_msp_in_sub_dir_opt,
"E=$Evalue_cut_single_link",
"e=$Evalue_cut_divclus",
$make_subdir_gzipped,
$Lean_output,
)};
if($verbose){
print "\n\n\n#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~";
print "\n#";
print "\n# $0 is finished \n\n";
}
print "\n# $0 : the final output \'@final_clu_files\' is created " if $make_separate_summary;
print "\n#__________________________________________________________________\n\n" if $verbose;
return(\@final_clu_files);
}
#______________________________________________________________________________
# Title : encrypt_passwd
( run in 1.555 second using v1.01-cache-2.11-cpan-39bf76dae61 )