Bioinf
view release on metacpan or search on metacpan
# Returns : [$av_correct, $num_enq_seq]
# Tips :
# Argument :
# Todo :
# Author : A Scientist
# Category :
# Version : 2.2
#-----------------------------------------------------------------------------
sub get_isearch_result_stat{
my (@keys, $num_enq_seq, @pdbg_seqs_ori, $c, $d, $i, %correct_pairs,
$sum_correct, $sum_false, $match_seq, $percent_correct, $correct, @correct,
$av_correct, $av_false, $actual_e_value, $correct_matched,
%correcting_pairs, @correcting_pairs, %correct);
my %seqs=%{$_[0]};
my @pdbg_seqs=@{$_[1]};
my $evalue=${$_[2]};
my $pdbg_base=${$_[3]} || $ARGV[3];
my $E_mult_factor1=${$_[4]};
my $E_mult_factor2=${$_[4]};
if(ref($_[5])){ $leng_thresh =${$_[5]} }else{ $leng_thresh=$_[5]; }
}
}
if($correct_matched !=1){
$false_positive++;
unless( $correct{$sorted} ){
$correct_group{$base} .="Nomolog: $sorted $base $msp_0{$sorted}\n";
}
$correct{$sorted} = "Nomolog: $base $msp_0{$sorted}";
}
}
if(@match_seqs == 0){ @match_seqs=1; $percent_correct=0; }
$sum_correct += $correct;
$sum_false += $false_positive;
}
$av_correct = $sum_correct/$num_enq_seq;
$av_false = $sum_false /($num_enq_seq);
#### $actual_e_value becomes whatever $E_mult_factor1 defined ~~~~~~~~~~~~
if($E_mult_factor1 != 1){
$actual_e_value=$evalue * $E_mult_factor1;
}elsif($E_mult_factor2 != 1){
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($merge, $verbose, $sat_file, $length_thresh, $factor, $indup, $indup_percent,
$score, @temp_show_sub, $optimize, $file, $evalue, $over_write, $din_dom,
$sum_seq_num, $base_1, $output_clu_file, $short_region, $large_region,
$average_region, $dynamic_factor, @sub_clustering_clu_files);
$factor=7; # default factor is 7 for 70%
$length_thresh=30;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Dealing with options
#_________________________________________
}if($char_opt=~/L/){ $large_region ='L';
}if($char_opt=~/A/){ $average_region='A';
}if($vars{'T'}=~/\d+/){ $length_thresh= $vars{'T'};
}if($vars{'l'}=~/\d+/){ $length_thresh= $vars{'l'}; ## synonym of 't'
}if($vars{'f'}=~/\S+/){ $factor= $vars{'f'};
}if($vars{'s'}=~/\d+/){ $score = $vars{'s'};
}if($vars{'e'}=~/\d+/){ $evalue= $vars{'e'};
}if($vars{'E'}=~/\d+/){ $evalue= $vars{'E'}; # synonym of e
}
$percent_fac=$factor*10;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (0) When one file input was given (yes, divclus can handle multiple files, Sarah!)
#________________________________________________________________________________
if(@file == 1){ #<=== @file has xxxx.msp, yyyy.msp zzzz.msp ....,
print "\n# (1) divide_clusters: One single file was given=> \"@file\"\n" if $verbose;
$file=$file[0];
$base_1=${&get_base_names($file)};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (4) This is critical seqlet merging step
#________________________________________________________________________________
@out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, $dynamic_factor,
"f=$factor", $short_region, $large_region, $average_region, $optimize)};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (5) This is showing the result in clu file format
#________________________________________________________________________________
@temp_show_sub=&show_subclusterings(\@out, $file, $sat_file, $dindom, $indup,
"e=$evalue", "p=$percent_fac", "f=$factor" );
$good_bad = $temp_show_sub[0];
$indup_c = $temp_show_sub[1];
$sum_seq_num += $temp_show_sub[2];
push(@sub_clustering_out_files, @{$temp_show_sub[3]});
if($good_bad==1){ push(@good, $file);
}else{ push(@bad, $file); }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (6) Final write up stage (unecessary)
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (4) Clustering the sets of merged seqlets => CORE algorithm
#____________________________________________________________
@out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, "f=$factor", $optimize, $dynamic_factor,
$short_region, $large_region, $average_region)};
@temp_show_sub=&show_subclusterings(\@out, $big_msp_file, $sat_file, $dindom, $indup,
"e=$evalue", "p=$percent_fac", "f=$factor");
$good_bad = $temp_show_sub[0];
$indup_c = $temp_show_sub[1];
$sum_seq_num += $temp_show_sub[2];
push(@sub_clustering_out_files, @{$temp_show_sub[3]});
if($good_bad==1){ push(@good, $big_msp_file);
}else{ push(@bad, $big_msp_file); }
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($max_size, $sat_file_name, $clu_file_name,
$ori_cluster_size, $ori_cluster_num, $good_bad, @keys, $percentage_fac,
$indup, @sizes, $sum_seq_num, $indup_percent, $indup_count, %tem4,
@sub_clustering_out_files); # clusall_1e-5_clu_14-324_ss.sat
my @out=@{$array[0]};
$indup_count=0;
if($char_opt=~/d/){ $dindom=1; }
if($char_opt=~/i/){ $indup=1; }
if($vars{'f'}=~/\S+/){ $factor= $vars{'f'}; }
if($vars{'p'}=~/\d+/){ $percentage_fac= int($vars{'p'}); }
if($vars{'s'}=~/\d+/){ $score = $vars{'s'}; }
if($vars{'e'}=~/\d+/){ $evalue= $vars{'e'}; }
#print "\n# (1) show_subclusterings : \@file has : @file\n";
if( $file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.msp/ or
$file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.sat/ ){
$ori_cluster_size=$2;
$ori_cluster_num =$3;
$base=$1;
$sat_file_name="$base\.sat";
$ori_cluster_size,
$dindom,
$clu_file_name,
\%tem3, \%tem4,
$indup, );
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Local subroutine
#_______________________________________________________________
sub print_summary_for_divclus{ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
my(@keys, $indup_count, $x, $m, $percentage_fac);
my $count=$_[0]; # count of cluster
my %tem2=%{$_[1]}; my $num_seq=@keys=sort keys %tem2;
my %tem=%{$_[2]}; my $ori_cluster_num=$_[3];
my $new_clus_NAME=$ori_cluster_num.'0'.$count.'0'.$num_seq;
my $ori_cluster_size=$_[4];
my $dindom=$_[5]; my %tem3=%{$_[7]};
my $indup=$_[9]; my (%internal_dup);
my %tem4=%{$_[8]};
#~~~~~~~~~~ Domain Inside Domain ~~~~~~~~~~~~~~~~~
if($dindom){
for($m=0; $m< @dup_keys; $m++){
printf ("%-19s=> %s\n", $dup_keys[$m], $internal_dup{ $dup_keys[$m] } );
}
}
}
}
#~~~~~~~~~~ Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~
print CLU "Cluster size $num_seq\n";
printf CLU ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
$new_clus_NAME, $evalue, $factor, $percentage_fac,
$ori_cluster_size, $num_seq, $ori_cluster_num);
print "Cluster size $num_seq\n";
printf ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
$new_clus_NAME, $evalue, $factor, $percentage_fac,
$ori_cluster_size, $num_seq, $ori_cluster_num);
for($x=0; $x <@keys; $x++){
printf CLU (" %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
$num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
printf (" %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
$num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
}
return($indup_count);
}
}
# 1cdg_6taa -------442---------------2222-----------------000-
# 1cdg_2aaa -------222---------------2222-----------------000-
#
# <Final output>
# 2aaa_6taa 0%
# 1cdg_6taa 67%
# 1cdg_2aaa 67%
#
# Warning :
# Keywords :
# Options : 'p' or 'P' for percentage term(default)
# 'r' or 'R' for ratio term (0.0 - 1.0), where 1 means all the
# segments were wrongly aligned.
# 's' or 'S' for Shift rate (it actually caculates the position shift
# rate for the secondary structure segment.
# 'h' or 'H' for position Shift rate (it actually caculates the position
# shift rate for helical segments). If this is the only option, it
# will show the default percentage term rate for helical segments.
# If used with 'r', it will give you ratio (0.0 - 1.0) for helical
# segment. If used with 's' option, it will give you position shift
# rate for only helical segments.
# 'e' or 'E' for position Shift rate (it actually caculates the position
# shift rate for beta segments). If this is the only option, it will
# show the default percentage term rate for beta segments. If used
# with 'r', it will give you ratio (0.0 - 1.0) for beta. If used
# with 's' option, it will give you position shift rate for only
# beta segments.
# Returns :
# Argument : Two references of hashes. One for error rate the other for sec.
# assignment.
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub get_segment_shift_rate{
# Title : get_wrong_segment_rate
# Usage : print_seq_in_block( &get_wrong_segment_rate(\%superposed_hash) );
# Function : Treats the segment as one single big error.
# calculates the wrong segment number compared to the correct ones.
# Example : <input example> hash of 3 keys and values.
# 2aaa_6taa -------00000---------00000000----0000-------00000-
# 1cdg_6taa -------442---------------2222-----------------000-
# 1cdg_2aaa -------222---------------2222-----------------000-
#
# In the above there are two segments wrong in 3 segment blocks = 2/3
# <output example> hash of 3 percentage rates.
#
# 2aaa_6taa 0 %
# 1cdg_6taa 66.6666666666667 %
# 1cdg_2aaa 66.6666666666667 %
#
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# by 9(occurance) in the above
# for '0' of the first hash, so (0, 2.222, 1, 2.1666, 2, 2.4 )
# Average is the average of numbers
# average value in 0-9 scale (or 0-100 with 'p' option)
# So, if there are
# seq1 00111110000, The 'a' value of 0 and 1 as in the seq2
# seq2 33000040000 is 0-> 6/6, 1-> 4/5, while the 'n'
# calc would be, 0-> 6 (60%), 1-> 4(40%)
#
# Argument : (\%hash1, \%hash2) or optionally (\%hash1, \%hash2, ['n', 'i', 'p', 'a'])
# 'n' => normalizing, 'p' => percentage out, 'i' => make int out, 'a'=> averaged
# Category :
# Version : 1.2
#--------------------------------------------------------------------
sub tally_2_hashes{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if( $char_opt=~/m/i){ $minus_whole_cs = 'm'}
@input = @{$array[0]};
if(defined(${$_[2]})){ $base_level =${$_[2]}; }
if(defined(${$_[3]})){ $scale =${$_[3]}; }
for ($t=0; $t< @input; $t++){ $length=length($input[$t]) if(length($input[$t])>$length);}
if ($length < $window_size){ $window_size = $length; }
#___________ getting ratio for the whole sequence ___________
$compos_whole_seq=${&main::compos_id_percent_array(\@input)}; ## for whole composition rate
$seq_id_whole_seq=${&main::seq_id_percent_array(\@input)};
print "\nComposition ID of the alignment: $compos_whole_seq\%\n";
print "Sequence ID of the alignment: $seq_id_whole_seq\%\n";
if ($seq_id_whole_seq == 0){ $ratio_whole_seq =0; }
else{ $ratio_whole_seq = $compos_whole_seq/$seq_id_whole_seq; }
print "Composition and Sequ. ID Ratio: $ratio_whole_seq\n";
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
########## Initial Window size setting ##############################
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if ( $ratio_whole_seq >9 ){ $window_size= 28; } # $ratio_whole_seq is
## Go back if the window size is too small due to gaps ##
#####################################################################
if( (length($win_no_gap1) < $ori_win_size)&&($variable_win_size ==1) )
{
$window_size+=1; $w--; next;
}
#####################################################################
## Getting Compos and Seq ids ##
#####################################################################
$compos_id=${&main::compos_id_percent_array(\@array_of_2_seq)};
$seq_id =${&main::seq_id_percent_array(\@array_of_2_seq)};
#####################################################################
#### Go back if the Seq id is bigger than Compos id #######
#####################################################################
if(($variable_win_size eq 'v') && ($seq_id > $compos_id))
{
$window_size+=1; $w--; next;
}
my(@ratio_array, @array_of_2_seq, $seq_id, $offset, $half_of_w_size, $t, $length, $w,
$compos_id, $seq_id, $window_2, $window_1, $compos_whole_seq, $seq_id_whole_seq,
$ratio_whole_seq, $win_rate_div_by_whole_rate, $normalizing_factor, $lowest_rate );
for ($t=0; $t < @input; $t++){
$length=length($input[$t]) if (length($input[$t])>$length); }
if ($length < $window_size){ $window_size = $length; }
#___________ getting ratio for the whole sequence ___________
$compos_whole_seq=${&compos_id_percent_array(\@input)};
$seq_id_whole_seq=${&seq_id_percent_array(\@input)};
if ($seq_id_whole_seq == 0){ $ratio_whole_seq =$compos_whole_seq/10; }
else{ $ratio_whole_seq =$compos_whole_seq/$seq_id_whole_seq; }
#___________ getting ratio for each window sequence ___________
for ($w=0; $w < $length; $w++){
$offset = $w - int($window_size/2); # $offset starts from -5 when window_size is 10.
$offset=0 if ($offset < 0);
$window_1=substr($input[0], $offset, $window_size); # window_1 is one segment
$window_2=substr($input[1], $offset, $window_size); # of defined length(size)
@array_of_2_seq=($window_1, $window_2); # making an array like this = ('ABCDE', 'BDESA')
$compos_id=${&compos_id_percent_array(\@array_of_2_seq)};
$seq_id =${&seq_id_percent_array(\@array_of_2_seq)};
#print "\n offset = $offset Wind1 = $window_1 Wind2 = $window_2 ";
#print " Compos1 = $compos_id Seqid = $seq_id \n";
#______ Handle special case when $seqid is zero > the rate becomes $compos_id/10 ______
if (($seq_id == 0) && ($compos_id != 0)){ $ratio_compos_vs_seqid = $compos_id/10; }
elsif(($seq_id == $compos_id)&&($seq_id == 0)){ $ratio_compos_vs_seqid = 0;}
elsif(($seq_id == $compos_id)&&($seq_id == 100)){ $ratio_compos_vs_seqid = 0;}
else{ $ratio_compos_vs_seqid=($compos_id/$seq_id); }
push(@ratio_array, $ratio_compos_vs_seqid); }
#
# Function : gets the numbers of occurances for 1, 2, 3 ... position shifts.
# If hash is given, it only looks at the values.
# If multiple string, array, hash or combinations of these
# are given, it will add up to one single result
# Example :
# Warning :
# Keywords : composition of chars, composition table making,
# make_composition, make composition table
# occurances_of_char, get_char_occurances, occurances
# get_percentage_occurances_of_char, percentage_occurances_of_char
# Options : 'p' for percentage output of the char among others
# 'n' for NO name option when HASH input is given
# Returns : one ref. of hash (a =>5, b=>6, c=>4,,,,,)
# Argument : one ref. of hash (seq1 alsdfjlsj
# seq2 asldfjsld
# seq3 owiurouou);
# Category :
# Version : 1.4
#--------------------------------------------------------------------
sub get_occurances_of_char{
my ($i, %H, $no_name, %out, $N,@splited, $val,$percentage_out,
$split, $sum);
for($i=0; $i< @_; $i++){
if($_[$i]=~/^[\-]?p$/i){
$percentage_out=1; splice(@_, $i, 1); $i--;
}elsif($_[$i]=~/^[\-]?n$/i){
$no_name=1; splice(@_, $i, 1); $i--;
}
}
for($i=0; $i< @_; $i++){
if( ref($_[$i]) eq 'HASH'){
my %H=%{$_[$i]};
my @names=keys %H;
for $key (@names){
@splited=@{$_[$i]};
for $split (@splited){ $out{$split}++; $sum++ }
}elsif(ref($_[$i]) eq 'SCALAR'){
@splited = split(//, ${$_[$i]});
for $split (@splited){ $out{$split}++; $sum++ }
}elsif( !(ref($_[$i])) ){
@splited = split(//, $_[$i]);
for $split (@splited){ $out{$split}++; $sum++ }
}
}
if($percentage_out==1){
my @keys=keys %out;
my %percent;
for($i=0; $i< @keys; $i++){
$percent{$keys[$i]} = $out{$keys[$i]}/$sum*100;
}
return(\%percent);
}else{
return(\%out);
}
}
#________________________________________________________________________
# Title : make_composition_table
# Usage : %occurances=%{&make_compos_table(\%key_and_value_for_seq)};
# Function : gets the numbers of occurances for 1, 2, 3 ... position shifts.
$aa{"c"} = "cys"; $aa{"n"} = "asn"; $aa{"e"} = "glu"; $aa{"q"} = "gln";
$aa{"f"} = "phe"; $aa{"r"} = "arg"; $aa{"g"} = "gly"; $aa{"s"} = "ser";
$aa{"h"} = "his"; $aa{"t"} = "thr"; $aa{"i"} = "ile"; $aa{"v"} = "val";
$aa{"k"} = "lys"; $aa{"w"} = "trp"; $aa{"l"} = "leu"; $aa{"y"} = "tyr";
return(\%aa);
}
#________________________________________________________________________
# Title : amino_acid_compos_id_percent
# Usage : $percent = &amino_acid_compos_id_percent (%any_hash_with_sequences);
# The way identity(composition) is derived is;
#
# Function : gets amino acid composition identity of any given
# number of sequences(at least 2).
# Example :
# Warning :
# Keywords : get_amino_acid_composition, get_protein_composition, composition
# Options :
# Argument : hash of at least 2 sequences.
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub amino_acid_compos_id_percent{
my(%input)= %{$_[0]};
my(@names)=keys (%input);
my(@temp, $i, $j, $iden, @all_pairs_id, $iden_sum);
my(%compos_table1, %compos_table2, $final_iden, $larger);
for ($i=0; $i < @names; $i ++){ # putting seqs in arrays.
$input{$names[$i]}=~ tr/a-z/A-Z/;
$input{$names[$i]}=~ s/\W//g;
@{"string$i"}= split('', $input{$names[$i]});
$larger = @{"string$i"} if @{"string$i"} > $larger;
}
push (@all_pairs_id, $iden );
}
}
for $iden (@all_pairs_id){ $iden_sum+= $iden; }
if(@all_pairs_id == 0){ @all_pairs_id =1; }
$final_iden=$iden_sum/@all_pairs_id;
\$final_iden;
}
#________________________________________________________________________
# Title : seq_id_percent_array (more than 2 elements array)
# Usage : $percent = &seq_id_percent_array(@any_array_sequences);
# The way identity(pairwise) is derived is;
#
# Function : produces amino acid composition identity of any given number of sequences.
# Example :
# Warning : This can handle 'common gaps' in the sequences
# Keywords : get_percent_composition_identity, seq_composition_identity,
# percent_sequence_composition_id
#
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub seq_id_percent_array{
my(@input, $denominator,@all_pairs_id, $percent_id);
my($largest,$p,$i,$j,$k,$iden_residue_num,$iden,@temp,$iden_sum,$gap_num,$final_iden);
for($d=0; $d<@_; $d++){
if(ref($_[$d]) eq 'ARRAY'){ @input=@{$_[$d]}; }
elsif( (ref($_[$d]) eq 'SCALAR') &&( ${$_[$d]}=~/^[aA]/) ){ $average_len_opt =1 }
elsif( !(ref($_[$d])) && ( $_[$d] =~/^[aA]/) ){ $average_len_opt =1 } }
if ((@input== 1)||( @input== 0)){
print "\n\n \" $0 \" requires at least 2 sequences\n\n";
print "\n Abnormally dying at seq_id_percent_array in $0 \n\n";
print chr(7); exit;}
$shortest=length($input[0]);
my($sans_gap_seq, $length_sum, $average_seq_len);
for($p=0; $p < @input; $p++){
$input[$p]=~ tr/a-z/A-Z/;
$sans_gap_seq=$input[$p];
$sans_gap_seq=~s/\W//g;
$input[$p]=~ s/\W/./g;
(@{"string$p"})=split('', $input[$p]);
$largest = length($input[$p]) if length($input[$p]) > $largest;
$average_seq_len = $length_sum/@input;
for($i=0; $i< @input; $i++){
for($j=$i+1; $j< @input; $j++){
for ($k=0; $k < $largest; $k ++){ # getting composition tables for two seqs.
if ((${"string$i"}[$k] !~ /\W/)&&(${"string$i"}[$k] eq ${"string$j"}[$k])){
$iden_residue_num++; }
elsif((${"string$i"}[$k] =~ /\W/)&&(${"string$i"}[$k]=~ /\W/)){ $gap_num++; }}
if( $average_len_opt == 1){ $denominator = $average_seq_len; }
else{ $denominator = $shortest; }
if($denominator == 0){ $denominator=1; } # in the above it is 50% rather than 0.07%
$percent_id=($iden_residue_num/($denominator))*100;
push(@all_pairs_id, $percent_id);
undef ($iden_residue_num, $gap_num);
}
}
for (@all_pairs_id){ $iden_sum+=$_; }
$final_iden=$iden_sum/($#all_pairs_id+1);
return( \$final_iden );
}
#________________________________________________________________________
# Title : compos_id_percent_array (more than 2 elements array)
# Usage : $percent = &compos_id_percent_array(@any_array_sequences);
# The way identity(composition) is derived is;
# Function : produces amino acid composition identity of any given number of sequences.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub compos_id_percent_array{
my(@input)=@{$_[0]};
my($largest,$iden,@temp,$iden_sum,$final_iden, @all_pairs_id);
for($i=0; $i<=$#input; $i++){ $input[$i]=~ tr/a-z/A-Z/; $input[$i]=~ s/[\.\-\s]//g;
@temp = split('', $input[$i]); (@{"string$i"})= @temp;
$largest = @{"string$i"} if @{"string$i"} > $largest; }
for($i=0; $i< @input; $i++){ #_________ permutating ___________
for($j=$i+1; $j<=$#input; $j++){
for ($k=0; $k <= $largest; $k ++){ # getting composition tables for two seqs.
$compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);
$compos_table2{${"string$j"}[$k]}++ if (${"string$j"}[$k] =~ /\w/); }
sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
$sum_of_the_common_residue_no += $common; }
$compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100; }
\$compos_id;
}
#-----------------------------------------------------
return ( \$final_iden ); # final identity for any given set of strings(seq).
}
#________________________________________________________________________________
# Title : compos_id_percent_hash (synonym of amino_acid_compos_id_percent)
# Usage : $percent = &compos_id_percent_hash(%any_hash_with_sequences);
# The way identity(composition) is derived is;
#
# Function : gets amino acid composition identity of any given number of sequences.
# Example :
# Warning :
# Keywords : get_amino_acid_composiiton
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub compos_id_percent_hash{ my(%input, @names);
if(ref($_[0]) eq 'HASH'){ %input= %{$_[0]}; @names= keys %input; }
else{ print "\n hash ref was not passed to compos_id_percent_hash\n"; exit; }
my(@temp, $iden, @all_pairs_id, $i, $j, $k,$iden_sum);
my(%compos_table1, %compos_table2, $final_iden, $larger);
for ($i=0; $i < @names; $i ++){ $input{$names[$i]}=~ tr/a-z/A-Z/;
$input{$names[$i]}=~ s/\W//g; @temp = split('', $input{$names[$i]});
(@{"string$i"})=@temp; $larger = @{"string$i"} if @{"string$i"}>$larger;}
for ($i=0; $i < @names; $i++){
for ($j=$i; $j < @names; $j ++){
if ($j == $i){ next; }
for ($k=0; $k < $larger; $k ++){
$compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);
#________________________________________________________________________
# Title : common_compos_id_hash (BUG free)
# Usage : %hash = &common_compos_hash(\%any_hash1, \%any_hash1);
# Function : actual calculation of identity
# Example : ('A', 200, 'C', 191, D, 99)
# ('A', 290, 'C', 199, D, 100)
# uses only two sequences.
# Warning :
# Keywords :
# Options :
# Returns : ref. of a scaler (in percent) eg) 95
# Argument : two references of hash of seqeunces.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub common_compos_id_hash{
my(%hash1)=%{$_[0]};
my(%hash2)=%{$_[1]};
my(%common_of_the_2)=(); my($common, $compos_id, $sum_of_the_common_residue_no);
my(@values1) = values (%hash1); my(@values2) = values (%hash2);
my(@combined_values)=(@values1, @values2);
#________________________________________________________________________
# Title : calc_compos_id_hash (the same as 'common_compos_hash')
# Usage : %hash = &calc_compos_hash(\%any_hash1, \%any_hash1);
# Function : actual calculation of identity
# Example : ('A', 200, 'C', 191, D, 99)
# ('A', 290, 'C', 199, D, 100)
# uses only two sequences.
# Warning :
# Keywords :
# Options :
# Returns : ref. of a scaler (in percent) eg) 95
# Argument : two references of hash of seqeunces.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub calc_compos_id_hash{ my(%hash1)=%{$_[0]}; my(%hash2)=%{$_[1]}; my(%common_of_the_2)=();
my($common, $compos_id, $sum_residues, $sum_of_the_common_residue_no);
my(@values1) = values (%hash1); my(@values2) = values (%hash2);
my(@combined_values)=(@values1,@values2);
for $elem (@combined_values){
$sum_residues += $elem; }
if($sum_residues == 0){ $compos_id =0; } # to prevent Illegal division error.
else{
for $key1(keys %hash1){
$common=&smaller_one($hash1{$key1}, $hash2{$key1});
sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
$sum_of_the_common_residue_no += $common; }
$compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100; }
\$compos_id;
}
#________________________________________________________________________
# Title : get_percentage
# Usage : %out= %{&get_percentage(\%result, '1')};
# Function : calculates the percentage content of any single char over the whole
# length of strings in it.
# Example : if the string is 'seq ABCDEEEEEFFEFE' given in a hash
# if you put 'A' as one argument, it counts the occurances of 'A'
# and gets the percentage of it.
# Warning : This converts array and string input as ref. into arbitrary hash and
# returns hash
# programmed by A Biomatic
# Keywords : get_percentage_of_char
# Options : None yet.
# Returns : Numerical Percentage
# Argument : ref. for Scalar string or Array of chars or Hash AND 'the target char'
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_percentage{
my(@in, $k, $sort, $numerator, $residue, @out_hash_ref, %hash_out );
for($k=0; $k< @_ ;$k++){
if( !ref($_[$k])&& (length($_[$k]) == 1 )){
$numerator = $_[$k];
}
elsif( (ref($_[$k]) eq 'SCALAR') && (length(${$_[$k]}) == 1 )){
$numerator = ${$_[$k]};
}
elsif(ref($_[$k]) eq "HASH") { push(@in, $_[$k]); }
elsif(ref($_[$k]) eq "ARRAY") { push(@in, &convert_arr_and_str_2_hash($_[$k], $k));} #<-- conv array to hash.
elsif(ref($_[$k]) eq "SCALAR"){ push(@in, &convert_arr_and_str_2_hash($_[$k], $k));} #<-- conv array to hash.
}
####### final output is => @in of hash ref elements #######
for (@in){ my(%H) = %{$_}; my(@keys)= sort keys %H;
for $name(@keys){
my($numerator_count);
my($seq_len) = length($H{$name}); print "\n $name Sequence length: ", $seq_len, "\n";
my(@string) = split(//, $H{$name});
for $residue (@string){ if($residue =~/^$numerator$/){ $numerator_count ++; }}
my($percent) = $numerator_count/$seq_len *100;
$hash_out{$name}=$percent; }
push(@out_hash_ref, \%hash_out); }
if(@out_hash_ref ==1){ return($out_hash_ref[0]); }
elsif( @out_hash_ref > 1){ return(@out_hash_ref); }
}
#________________________________________________________________________
# Title : pairwise_percent_id (pairwise sequence identity in percentage)
# Usage : $identity = ${&pairwise_percent_id(%arrayinput)};
#
# Function : takes a ref. of a hash of names and sequences, returns
# percent identity.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub pairwise_percent_id{
my($i,$j,$k, @iden_array_ref);
for($i=0; $i< @_; $i++){ my %input= %{$_[$i]}; my @names= sort keys %input;
my(@temp, $iden, @all_pairs_id, $whole_seq_len, $residue_sum1,$residue_sum2);
my($final_av_iden, $larger, $percent_for_pair,@percent_for_pair, $iden_sum);
for ($i=0; $i < @names; $i ++){ $input{$names[$i]}=~ tr/a-z/A-Z/;
@temp = split('', $input{$names[$i]}); (@{"string$i"})=@temp;
$larger = @{"string$i"} if @{"string$i"} > $larger; }
for ($i=0; $i < @names; $i++){ # to make permutated pairs.
for ($j=$i+1; $j < @names; $j ++){
for ($k=0; $k < $larger; $k ++){ # getting composition tables for two seqs.
$iden+=2 if ((${"string$i"}[$k] eq ${"string$j"}[$k])&&(${"string$i"}[$k] =~ /\w/));
$residue_sum1++ if (${"string$i"}[$k] =~ /\w/);
$residue_sum2++ if (${"string$j"}[$k] =~ /\w/); }
$whole_seq_len =($residue_sum1+$residue_sum2);
$percent_for_pair = $iden/$whole_seq_len*100;
push(@percent_for_pair,$percent_for_pair);
$residue_sum1=0; $residue_sum2=0; $iden=0; } }
for $iden (@percent_for_pair){ $iden_sum+=$iden;}
$final_av_iden=$iden_sum/( @percent_for_pair );
push(@iden_array_ref, \$final_av_iden); }
if(@iden_array_ref ==1){ return($iden_array_ref[0]);}else{ return(@iden_array_ref);}
}
#________________________________________________________________________
# Title : get_seq_identity
# Usage : $identity = ${&get_seq_identity(%arrayinput)};
#
# Function : takes a ref. of a hash of names and sequences, returns
# percent identity. NOT composition identity.
# Example :
# Warning :
# Keywords : get_sequence_identity
# Options :
# Returns :
# Argument : hash(es) of sequences.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_seq_identity{
my($i,$j,$k, $c, @iden_array_ref);
for($c=0; $c< @_; $c++){
my %input= %{$_[$c]};
my @names= sort keys %input;
my(@temp, $iden, @all_pairs_id, $whole_seq_len, $residue_sum1,$residue_sum2);
my($final_av_iden, $larger, $percent_for_pair,@percent_for_pair, $iden_sum);
for ($i=0; $i < @names; $i ++){
$input{$names[$i]}=~ tr/a-z/A-Z/;
@temp = split(//, $input{$names[$i]});
@{"string$i"}=@temp;
$larger = @{"string$i"} if @{"string$i"} > $larger; }
for ($i=0; $i < @names; $i++){ # to make permutated pairs.
for ($j=$i+1; $j < @names; $j ++){
for ($k=0; $k < $larger; $k ++){
if ( ${"string$i"}[$k] eq ${"string$j"}[$k] and ${"string$i"}[$k] =~ /\w/){
$iden+=2 ;
}
$residue_sum1++ if (${"string$i"}[$k] =~ /\w/);
$residue_sum2++ if (${"string$j"}[$k] =~ /\w/);
}
$whole_seq_len =($residue_sum1+$residue_sum2);
$percent_for_pair = $iden/$whole_seq_len*100;
push(@percent_for_pair,$percent_for_pair);
$residue_sum1=0;
$residue_sum2=0;
$iden=0;
}
}
for $iden (@percent_for_pair){
$iden_sum+=$iden;
}
if(@percent_for_pair <1){ @percent_for_pair=(1); }
$final_av_iden=$iden_sum/( @percent_for_pair );
push(@iden_array_ref, \$final_av_iden);
}
if(@iden_array_ref ==1){
return($iden_array_ref[0]);
}else{
return(@iden_array_ref);
}
}
#________________________________________________________________________
# Title : get_correct_percent_alignment_rate (made for Bissan)
# Usage : &get_correct_percent_alignment_rate(\$file1, \$file2);
# Function : accepts two files and prints out the sequence identities of the alignment.
# Example :
# Warning : Alpha version, A Biomatic , made for Bissan
# Keywords :
# Options : h # for help
# v # for verbose printouts(prints actual sequences)
# Returns : reference of Scalar for percentage correct alignment(for already
# aligned sequences)
# Argument : two sequence files which have identical sequence names.
# Category :
# Version :
#--------------------------------------------------------------------
sub get_correct_percent_alignment_rate{
my($i, $j, $k, $verbose, @string1, @string2, $larger, $seq_pair_id, @seq_pair_ids );
my(%inputhash1) = %{&read_any_seq_files($_[0])};
my(%inputhash2) = %{&read_any_seq_files($_[1])};
my(@names)= sort keys %inputhash1;
######################################
####### Sub argument handling ########
######################################
for($k=0; $k< @_ ;$k++){
if( !ref($_[$k])&& (length(${$_[$k]}) < 5)){ # when inputs are not ref.
if($_[$k]=~ /^[\-vV]$/){ $verbose = 1; next;}
}
print "\n", "_"x88, "\n";
my($whole_average_of_the_id)=${&array_average(\@seq_pair_ids)};
print "The whole average is; $whole_average_of_the_id\n";
if(@seq_pair_ids == 1){ return( \$seq_pair_ids[0] ); }
elsif(@seq_pair_ids > 1){ return( \@seq_pair_ids ); }
}
#________________________________________________________________________
# Title : amino_acid_compos_id_percent_trend
# Usage :
# Function :
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub amino_acid_compos_id_percent_trend{
my(%input) = %{$_[0]};
my(@common, @string,@accumu_percent_iden)=(); my(%common_so_far, %compos_table);
my($percent_id_so_far, $length_of_one_seq,$length_of_all_seq, $seq_no)=0;
for $key(keys %input){
$input{$key}=~s/[. \d-]//g;
@string= split(//, $input{$key});
print @string; print "\n";
$length_of_one_seq = $#string+1;
$length_of_all_seq +=$length_of_one_seq;
$seq_no += 1;
%compos_table = &composition_table(@string);
@check = keys (%common_so_far);
if ($#check < 0){ %common_so_far = %compos_table; }
else{ %common_so_far= %{&common_compos_2_hash(\%common_so_far,\%compos_table)};}
for $value(values %common_so_far){ $common_residue_sum +=$value; }
$final_percent_id = $common_residue_sum/($length_of_all_seq/$seq_no)*100;
$common_residue_sum =0; }
for $value(values %common_so_far){ $common_residue_sum +=$value; }
$final_percent_id = $common_residue_sum/($length_of_all_seq/$seq_no)*100;
return(\$final_percent_id);
}
#________________________________________________________________________
# Title : composition_table (can handle both nucleic and protein seq)
# Usage : %output = %{&compos_table(@input_array1, @input_array2,,,,)};
# example input
#
# Function : returns a table of alphabet with occurances.
# can handle any char, this converts char to upper case.
# Example :
my(%common_of_the_2)=(); my($common)=0;
for $key1(keys %hash1){
$common=&smaller_one($hash1{$key1}, $hash2{$key1});
if ($common =~ /\d+/){
$common_of_the_2{$key1}=$common; } }
\%common_of_the_2;
}
#________________________________________________________________________
# Title : pair_percent_id_trend
# Usage : @array = &pair_percent_id_trend (%arrayinput);
# Function :
# Example : common gaps means only '.' (dots, not alphabets!!)
# AAA....BBCB
# AAAB..B.BCC --> A.A.....BC. (as in an array)
# A.AAA...BCA
# The resulting array XXXXX..XXXX is literally like so.
# This is to detect absurd gaps in the above.
#
#
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version :
#--------------------------------------------------------------------
sub pair_percent_id_trend{
my(%input) = %{$_[0]};
my(@common, @string,@accumu_percent_iden)=();
my($percent_id_so_far)=0;
for $key(keys %input){
my($len) = &smaller_one($#common, $#string) unless $#common < 0;
$input{$key}=~s/ //g;
@string= split(//, $input{$key});
$length_of_one_seq = $#string+1;
$length_of_all_seq +=$length_of_one_seq;
$seq_no += 1;
for ($k=0; $k <= $len;$k++){
if($#common == -1){
@common = @string;
}
elsif ( $string[$k] eq $common[$k] ){
$common[$k] = $string[$k];
$identical_count +=1;
}elsif( $string[$k] ne $common[$k]){
$common[$k]='.';
}
}
$num_of_iden_char = &count_num_of_char(\@common);
$av_seq_no = $length_of_all_seq/$seq_no;
$percent_id_so_far = $num_of_iden_char/$av_seq_no*100;
print "\n percent_id so far = $percent_id_so_far \n";
push(@accumu_percent_iden,$percent_id_so_far);
} # end of for (after all sequences have been run).
$num_of_iden_char = &count_num_of_char(@common);
$av_seq_no = $length_of_all_seq/$seq_no;
$percent_id_so_far = $num_of_iden_char/$av_seq_no*100;
print "\n percent_id so far = $percent_id_so_far \n";
\@accumu_percent_iden; # final ids array.
}
#________________________________________________________________________
# Title : smaller_one
# Usage : $smaller = & smaller_one($var, $var2);
#
# Function : gets smaller value of the two inputs
# Example : will return 5 with &smaller_one(5, 50);
# Warning : gets only digits!!
# Keywords :
# Options :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version :
#--------------------------------------------------------------------
sub count_num_of_char{
my(@input)={$_[0]};
my($num_of_char)=0;
for $elem(@input){ # this is for the percentage of TWO seqs.
if ($elem =~ /\w/){
$num_of_char +=1;
}
}
$num_of_char;
}
#________________________________________________________________________
# Title : remov_com_column2 (this is the older and slower version)
# Usage : %new_string = %{&remov_com_column2(\%input_hash)};
# Function :
# ((-l option for long output )
# NAME MOZ_HUMAN_part
# HEADER |- Residue -| Pred Rel NAli Asn
# PRED 1 MET M c 0.000 0 ?
# PRED 2 ALA A c 0.000 0 ?
#
# Warning :
# Keywords : open_prd_files, open_pred_files, predator, open_prdl_files
# open_pre_files, secondary structure prediction file
# Options : 's' for sequence output as well (\%sec_str, \%seq)
# 'p' for percentage of the sec. str.
# 'a' for accumulated percentage. This will
# set 'p' automatically
# 'n' for NO name when outputing Percentage of chars with
# HASH input to get_occurances_of_char sub.
# $reverse_residue_order=r by r
# Returns :
# Argument :
# Category :
# Version : 1.8
#-----------------------------------------------------------
sub open_predator_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my( @out_ref, $seq_out, %sec_str, %seq, $percent_out, $NO_name_out,
$short_form_out_detected, $long_form_out_detected, $accumulate,
$reverse_residue_order, %rev_sec_str);
if($char_opt=~/s/i){ $seq_out=1 }
if($char_opt=~/a/i){ $accumulate=1 }
if($char_opt=~/p/i){ $percent_out=1 }
if($char_opt=~/n/i){ $NO_name_out='n' }
if($char_opt=~/r/){ $reverse_residue_order='r' }
for($i=0; $i< @file; $i++){
my (%sec_str, %seq) if($accumulate !=1);
open(PREDATOR_FILE, "$file[$i]");
print "\n# (INFO) open_predator_files: opening $file[$i]";
while(<PREDATOR_FILE>){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
$residue_3_letter=$2;
$residue_1_letter=$3;
$sec_str_predicted=$4;
$reliability=$5;
$sec_str{$position}=[$residue_1_letter, $sec_str_predicted, $reliability];
}
}
close (PREDATOR_FILE);
print "\n \%sec_str is: ", %sec_str, "\n" if ($debug == 1);
if($seq_out==1){ push(@out_ref, \%sec_str, \%seq);
}elsif($percent_out==1 ){
push(@out_ref, [%{&get_occurances_of_char(\%sec_str, $NO_name_out, 'p')}] );
}elsif($percent_out !=1){ push(@out_ref, \%sec_str) }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# If -r option is set (for long form, this does not affect
#____________________________________________________________
if($short_form_out_detected and $reverse_residue_order){
@keys=keys %sec_str;
for($r=0; $r<@keys; $r++){
$rev_sec_str{$keys[$r]}=reverse($sec_str{$keys[$r]});
}
%sec_str=%rev_sec_str;
# >P1;1cdg
# secondary structure and phi angle
# CCCCCCCCCCCCCCCCEEECCHHHHCCCCHHHCCCPHHCCCCPCC-CCCCCPCCHHHHHHHHHCPHHHHHPCCEE
# >P1;1cdg
# solvent accessibility
# TTTTTTTTTTTFFFFFFFFFFFFFFTTTTTTTTTTTTTTTTTFTT-TTTTFFFFFTFFTTTFTTTFFTTFTFTFF
# >P1;1cdg
# DSSP
# CCCCCCCCCCCCCCCCEEECCHHHHCCCCGGGCCCGGGCCCCCCC-CCCCCCCCHHHHHHHHHCCHHHHHCCCEE
# >P1;1cdg
# percentage accessibility
# 67523272360000000000000002213792129b722248085-14110000030015105660028040200
# 2ltn ----TETTSFLITKFSPDQQNLIFQGDGYTT-KEKLTLTK------AVKNTVGRALYSSP
# 1loe ----TETTSFSITKFGPDQQNLIFQGDGYTT-KERLTLTK------AVRNTVGRALYSSP
#
# 2ltn ----CEEEEEEECCCCCCCCCEEEEPCCEEP-PPCEEEEC------CCCPCEEEEEECCC
# 1loe ----CEEEEEEECCCCCCCCCEEEEPCCEEE-PPEEEEEC------CCCPCEEEEEECCC
#
# 2ltn ----TTTTTTTTTTFTTTTTTFTTTTTFTFT-TTTFTFFT------TTTTTTFFFFTTTT
# 1loe ----TTTTTTTTTTFTTTTTTFTTTTTFTFT-TTTFFFFT------TTTTTTFFFFTTTT
#
# Version : 1.0
#--------------------------------------------------------------------
sub open_tem_files{
my($a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, $q, $r,
$s, $t, $u, $v, $w, $x, $y, $z, $pwd, $file, $dir, $output, $in_dir,
%hash, @keys, @array, @hash, $option_string, $string, @in, $line,
$name, %out, $gap_chr, @str1, @str2, $num_opt, @file, @dir,
$char_opt, $char_opt_given, $num_opt_given,
@char_options, @file, $original_dir, @read_files, %array_msf, %array_jp,
$jp_file, $error_rate, $id_compos, @dir, @names, $name, $name_found,
@outref, %sequence, %secondary,%solvent_access, %DSSP, %percent_accessibility,
$name_found,$type_seq, $type_secon, $type_sol, $type_DSSP, $type_acc
);
##################################################
##### Start of general argument handling ######
##################################################
for($k=0; $k< @_ ;$k++){
if( !ref($_[$k]) ){
if($_[$k]=~ /^[\-]*(\w)$/){
$char_opt .= "\,$1";
$char_opt_given =1;
} #from below ============== SECONDARY
}elsif(/^secondary structure and phi angle/){ $type_secon = 1;
}elsif(($type_secon ==1)&&(/^([\w\-]+)[\*]*$/)){
$secondary{$name}.=$1; #from below============= SOLVENT ACCESSIBILITY
}elsif(/^solvent accessibility/){ $type_sol = 1;
}elsif(($type_sol ==1)&&(/^([\w\-]+)[\*]*$/)){
$solvent_access{$name}.=$1; #from below========= DSSP
}elsif(/^DSSP/){ $type_DSSP = 1;
}elsif(($type_DSSP ==1)&&(/^([\w\-]+)[\*]*$/)){
$DSSP{$name}.=$1; #from below=================== PERCENTAGE ACCESSIBILITY
}elsif(/^percentage accessibility/){ $type_acc = 1;
}elsif(($type_acc ==1)&&(/^([\w\-]+)[\*]*$/)){
$percent_accessibility{$name}.=$1; } }
push(@outref,\%sequence,\%secondary,\%solvent_access,\%DSSP,\%percent_accessibility);
} }
if( ($char_opt =~ /s/i) || ( @outref == 1 ) ){
return(\%sequence); }
elsif( @outref > 1){ return(@outref); } # <-- contains (\%sequence,\%secondary,....)
}
#________________________________________________________________________
# Title : open_hlx_files
# Usage :
# Function :
chdir($original_dir);
if($realfile1 =~/(\d+\-$no\.msf)$/){
@dir=split(/\//, $realfile1);
$dir=$dir[($#dir-1)]; # where am I ?
# $jp_file = $original_dir.'/'.$dir.'.jp';
# %array_msf =&open_msf_files($realfile1);
# %array_jp =&open_jp_files ($jp_file);
# $array_ref_msf = \%array_msf;
# $array_ref_jp = \%array_jp;
# $error_rate =&get_posi_shift_hash($array_ref_msf, $array_ref_jp);
# $id_compos =&amino_acid_compos_id_percent($array_ref_jp);
# push(@rates_accumu,$error_rate);
# push(@compos_id,$id_compos);
}
}
else
{
next;
}
}
}
}
}
$sum = ($id_counter + $gap_counter + $non_equal_counter);
if ($sum != ($larger+1)){
print "\n There is something wrong in getting homology in get_pair_homol \n";
&caller_info;
}
\$id_counter; # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title : get_percent_homol_arr
# Usage : $homology_out = ${&get_pair_homol(\@any_array_of_2_elem)};= @ar=(ABCDE..., CDEGA..)
# Function : get pair wise seq. identity of any two strings, outputs a scalar (%)
# Example :
# Warning : reliable, but input seq. strings shouldn't contain spaces.
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_percent_homol_arr{
my(@input)=@{$_[0]};
$input[0] =~ tr/a-z/A-Z/; # capitalizing.
$input[1] =~ tr/a-z/A-Z/; # capitalizing.
my(@string1)= split(//,$input[0]);
my(@string2)= split(//,$input[1]);
if (($#string1 == -1) || ($#string2 == -1)){
print "\n One of the string is empty O.K. ? \n";
}
my($larger)= &max($#string1, $#string2);
my($id_counter, $gap_counter, $non_equal_counter, $sum,$percent_homol)=0;
for ($i = 0; $i<=$larger; $i++){
if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
$gap_counter+=1;
}elsif ($string1[$i] eq $string2[$i]){
$id_counter +=1;
}else{
$non_equal_counter += 1;
}
}
$sum = ($id_counter + $gap_counter + $non_equal_counter);
if ($sum != ($larger+1)){
print "\n There is something wrong in getting homology in get_pair_homol \n";
&caller_info;
}else{
$percent_homol=($id_counter/$sum)*100;
}
return(\$percent_homol); # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title : get_pair_homol_hash
# Usage : $homology_out = & get_pair_homol (%any_hash); , eg) %hash = (name1, ABCDE..., name2, CDEGA..)
# Function : get pair wise seq. identity as a scalar count
# Example :
# Warning : reliable, but input seq. strings shouldn't contain spaces.
# Keywords :
# Options :
}
}
my($sum) = ($id_counter + $gap_counter + $non_equal_counter);
if ($sum != ($larger+1)){
print "\n There is something wrong in getting homology in get_pair_homol \n";
&caller_info;
}
return ($id_counter); # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title : get_percent_homo_hash
# Usage : $homology_out = &get_pair_homol_hash(%any_hash); , eg) %hash = (name1, ABCDE..., name2, CDEGA..)
# Function : get pair wise seq. identity(%) of any two strings put in as a hash
# Example :
# Warning : reliable, but input seq. strings shouldn't contain spaces.
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_percent_homo_hash{
my(%input)=@_;
&hash_chk(\%input);
my(@keys_input)= keys (%input);
my(@values_input) = values (%input);
$values_input[0] =~ tr/a-z/A-Z/; # capitalizing.
$values_input[1] =~ tr/a-z/A-Z/; # capitalizing.
my(@string1)= split(//,$values_input[0]);
my(@string2)= split(//,$values_input[1]);
if (($#string1 == -1) || ($#string2 == -1)){
print "\n One of the string is empty O.K. ? \n";
}
my($larger)= &max($#string1, $#string2);
my($id_counter, $gap_counter, $non_equal_counter,$percent_homol,)=0;
for ($i = 0; $i<=$larger; $i++){
if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
$gap_counter+=1;
}elsif ($string1[$i] eq $string2[$i]){
$id_counter +=1;
}else{
$non_equal_counter += 1;
}
}
my($sum) = ($id_counter + $gap_counter + $non_equal_counter);
if ($sum != ($larger+1)){
print "\n There is something wrong in getting homology in get_pair_homol \n";
&caller_info;
}else{
$percent_homol=($id_counter/$sum)*100;
}
return ($percent_homol);
}
#________________________________________________________________________
# Title : file_size
# Usage : $outputfilesize = &file_size($input_file_name);
# Function : returns the size of any single testing file
# Example :
# Warning : Q is for quality of this sub. This can't be wrong.
# Keywords :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub file_size { my($infile)=$_[0];
if ( $size=(-s "$infile")){ return $size; }
}
#________________________________________________________________________
# Title : seq_comp_percent2
# Usage : @outarray = &seq_comp_percent2(@any_input_string_array);
# Function : get string seq COMPOSITION identities(a to z). gets array
# of strings and outs array of % numbers
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub seq_comp_percent2{ # simple and basic seq. id. eg. ABC on ABCABC is 50 % identical.
my(@input)=@_;
my(@array_of_ids2, $id2, @char1, @char2);
&array_chk(sort @input);
my($longest_str_size) = &get_long_str_size (@input), "\n";
my($shortest_str_size) = &get_short_str_size(@input), "\n";
print "longest_str_size",$longest_str_size;
print "shortest_str_size",$shortest_str_size;
if (($longest_str_size/$shortest_str_size) > 4){
print "\n The shortest string is less than 1/4 of the longest\n";
print " This is quite meaningless, but will go on\n";
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub cls{ my($cls) = `clear`;
print $cls;
}
#________________________________________________________________________
# Title : seq_comp_percent1
# Usage : @outarray = &seq_comp_percent1(@any_input_string_array);
# Function : get string seq identities(a to z). gets array of strings and outs array of % numbers
# Example :
# Warning :
# Keywords :
# Options :
# Returns : one ref. of an array
# Argument : one ref. of an array
# Category :
# Version :
#--------------------------------------------------------------------
sub seq_comp_percent1{ # this is affected by seq. length
my(@input)=@{$_[0]};
my(@array_of_ids1, $id1, @char1, @char2);
&array_chk(\@input);
@input = sort (@input);
$longest_str_size = &get_long_str_size (@input), "\n";
$shortest_str_size = &get_short_str_size(@input), "\n";
if (($longest_str_size/$shortest_str_size) > 4){
print "\n The shortest string is less than 1/4 of the longest\n";
print " This is quite meaningless, but will go on\n";
}
for $key1 (sort keys %hash1){
for $key2 (sort keys %hash2){
if ($key1 eq $key2){
$no_of_same = &min($hash1{$key1},$hash2{$key2});
$sum_of_same += $no_of_same;
last;
}
}
}
$identity = $sum_of_same*2/(&sum_array(@num_char1,@num_char2))*100;
# print "percent iden = ", $identity, "\n";
}
#________________________________________________________________________
# Title : get_id_among_2_2
# Usage : $id = &get_id_among_2(*charcount1, *charcount2) <- hashes
# Function : gets the % id of any two sequences, returns in 100.0% format.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
$no_of_same = &min($hash1{$key1},$hash2{$key2});
$sum_of_same += $no_of_same;
last;
}
}
}
$seq1=&sum_array(@num_char1);
$seq2=&sum_array(@num_char2);
$longer_seq = &max($seq1, $seq2);
$identity = $sum_of_same/$longer_seq*100;
#print "percent iden = ", $identity, "\n";
}
#________________________________________________________________________
# Title : array_average
# Usage : $output = &array_average(\@any_array);
# Function : (the same as average_array)
# Example :
# Warning : If divided by 0, it will automatically replace it with 1
# Keywords : get_array_average, av_array, average_array, get_average_array
# average_of_array, average_array
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(%hash, @pdbd_seqs, @superfamily, @members, @files, $over_write, $msp_file_gz,
$msp_seq_file_dir, $msp_file, $msp_file_long, $msp_file_gz_long,
$pdbd_seq_long, $final_merged_ISL_fasta,$pdbg_file, $pwd , $range_start,
$range_stop, @files_NOT_processed);
my $source_db_fasta=$ENV{'NRDB_FASTA'}; ## general default
my $evalue_thresh=0.001; # default
my $score_thresh=70; # default
my $range_thresh=10;
my $percent_id_thresh=0.95;
print "\n# (i) Running sub of: make_intermediate_sequence_library\n";
if($vars{'FASTA_DB'}=~/\S+/){ $source_db_fasta=$vars{'FASTA_DB'} }
if($vars{'MSP_DIR'}=~/\S+/){ $msp_seq_file_dir=$vars{'MSP_DIR'} }
if($vars{'p'}=~/\S+/){ $pdbg_file=$vars{'p'}; print "\n# (i) $vars{'p'} is given \n"; } ## PDBG file input
if($vars{'m'}=~/\S+/){ $msp_seq_file_dir=$vars{'m'} }
if($vars{'DB'}=~/\S+/){ $source_db_fasta=$vars{'DB'} }
if($vars{'E'}=~/\S+/){ $evalue_thresh=$vars{'E'} }
if($vars{'e'}=~/\S+/){ $evalue_thresh=$vars{'e'} }
if($vars{'s'}=~/\S+/){ $score_thresh=$vars{'s'} }
next;
}
if(@pdbd_seqs){ print "\n# (i) \@pdbd_seqs for $superfamily is: @pdbd_seqs\n";
}else{ print "\n# (E) \@pdbd_seqs is empty, strange, error in making $out_fasta_file \n\n";
}
for($j=0; $j< @pdbd_seqs; $j++){
$pdbd_seq=$pdbd_seqs[$j];
$pdbd_seq_long="pdb\_$pdbd_seqs[$j]";
my (@msp_content, $evalue, $sub_dir, $percentage_id, $range_length);
if($pdbd_seq=~/^ *$/){ next; }
$msp_file="$pdbd_seq\.msp";
$msp_file_gz="$pdbd_seq\.msp\.gz";
$msp_file_long="pdb\_$pdbd_seq\.msp"; ## this is to handle Sarah's pdb_ prefixed pdbd files
$msp_file_gz_long="pdb\_$pdbd_seq\.msp\.gz";
print "# (i) I am processing $msp_file <- subroutine:make_intermediate_sequence_library\n";
if($msp_file_gz=~/^([dec])\S/ or $msp_file_long=~/pdb_([dec])\S/){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`
print "\n# (i) Now I have finished reading in ONE msp file for $pdbd_seq ($superfamily) into \@msp_content";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`
# Processing each MSP line
#_______________________________________________________________________________________
my ($identical_one_added, $interm_seq);
for($k=0; $k< @msp_content; $k++){ ## NEW MSP format
if($msp_content[$k]=~/^ *(\S+) +(\S+) +(\S+) +\d+ +\d+ +[pdb_]*$pdbd_seq[_\d+\-\d+]* +(\d+) +(\d+) +(\S+)/){ # [pdb_]* is for Sarah's pdb_ prefix
#if($pdbd_seq eq $6){ next } ## this is to exclude the same seq match
$evalue=$2; $percentage_id=$3;
$range_length=$5-$4; $score =$1; $interm_seq=$6; $range_start=$4, $range_stop=$5;
if($interm_seq=~/(\S+)_\d+\-\d+/){ $interm_seq=$1; }
if($evalue < $evalue_thresh and $range_length > $range_thresh and $score > $score_thresh){
if($percentage_id <= $percent_id_thresh){ ## to keep one pdb seq ($percentage_id=1)
unless($interm_seq=~/^d\d\S/){ # to remove entries like: d1abc_10-20_d2acb__ (two pdb seqs)
$interm_hash{$superfamily} .=" $interm_seq\_$range_start\-$range_stop\_$pdbd_seq";
}
}elsif($percentage_id ==1 and $identical_one_added < 1){ ## this is to prevent more than 1 100% id seq
$interm_hash{$superfamily} .=" $interm_seq\_$range_start\-$range_stop\_$pdbd_seq";
$identical_one_added++;
}
}
}elsif($msp_content[$k]=~/^ *(\d+) +(\S+) +\d+ +\d+ +$pdbd_seq +(\d+) +(\d+) +(\S+)/){
$evalue=$2; $score=$1;
$range_length=$4-$3; $interm_seq=$5;
if($evalue < $evalue_thresh and $range_length > $range_thresh
and $score > $score_thresh){
$interm_hash{$superfamily} .=" $interm_seq\_$3\-$4\_$pdbd_seq";
package Bioinf;
require 5.000;
require Exporter;
use Carp;
@ISA = qw(Exporter);
@EXPORT= qw( ISS_server ONE_TO_THREE_LETTER One_To_Three_Letter Richardson_alpha_matrix
Roman Sheraga_alpha_matrix abs_numerically add_columns
add_ranges_in_msp_line amino_acid_compos_id_percent amino_acid_compos_id_percent_trend amino_acid_homology_matrix
arabic array_average array_chk array_least_occur
array_median array_most_occur array_sum ask_for_ENV_vars
assign_options_to_variables attach_classification_to_pdb_seq average_array average_of_array
beep bla_to_msf break_down_clu_file by_values
calc_compos_id_hash calc_factorial calculate_protein_volume capitalize_sentence
capitalize_word cc check_common_elements_in_array check_file_exists_in_path
check_homology_of_seq_pair check_if_defined check_if_files_exist check_if_sec_str_form_hash
check_input_file_extension check_linkage_of_2_similar_seqlet_sets check_parf_files chop_word
cls clu_to_sso_to_msp cluster_merged_seqlet_sets com_gap_pos_hash
common_compos_2_hash common_compos_id_hash compare_sec_template_with_db compos_id_percent_array
compos_id_percent_hash composition_table compress_files_by_gzip condense_number_string
condense_script convert_1_to_3_letter convert_3_to_1_letter convert_arr_and_str_2_hash
convert_array_to_hash convert_bla_multaln_to_msf convert_bla_to_msf convert_bla_to_msp
convert_char_to_0_or_1_hash convert_clu_to_msp convert_clu_to_sso_to_msp convert_dna_to_protein
convert_hmmls_to_msp_files convert_mmp_to_mrg convert_msp_line_to_mmp_line convert_num_0_or_1_hash_opposite
convert_num_to_0_or_1_hash convert_rna_to_protein convert_sso_to_msp convert_string_to_hash
convert_to_anti_sense corelation_coefficient correct_head_box count_num_of_char
cp create_sorted_cluster ctime default_help
define_secondary_structure_segments delbut detect_file_format_type die_if_file_not_present
diff_dates digitize_char dir_name dir_path
dir_search dir_search_single divide_array divide_clusters
extract_words fasta_append fasta_kt1_search fasta_out_seq_no
fasta_output fasta_permute_array_write fasta_permute_hash_write fetch_seq
fetch_sequence_from_db fetch_subroutines fetch_swiss_seq file_size
fill_ending_space filter_by_string_length filter_hash_by_num_value filter_intermediates_by_E_value
filter_seq_DB_by_seq_length find_central_seq_msp_chunk find_central_sequence find_low_complexity_region
find_program_in_path find_seq_file_old find_seq_files find_source_perl_library
follow_seqlet_link fromJulian full_pwd_path geanfammer
geanfammer_main get_added_matched_regions_in_msp get_all_dirs_from_ENV get_all_msp_files
get_av_and_sd_seq_length get_av_seq_length get_average_sequence_size get_averaged_prediction
get_base_names get_column get_common_array_entry get_common_column
get_common_hash_keys get_correct_percent_alignment_rate get_date get_dir_names_only
get_domain_inside_domain get_each_posi_diff_hash get_extension_names get_false_positive_seq_matches
get_file_dir_names get_file_extensions get_first_seq_in_alignment get_full_dir_names
get_full_file_name get_full_path_dir_names get_full_pwd_path get_gap_positions
get_hash_value_average get_high_score_blocks get_homology_info_of_seq_pairs get_host_by_addr
get_host_by_name get_id_among_2 get_id_among_2_1 get_id_among_2_2
get_internal_dup_in_a_cluster get_isearch_result_stat get_largest_element get_largest_file
get_linked_sequence get_linux_kernel_version get_longest_str_size get_max_hash_by_value
get_median get_median get_msp_enquiry_sequence get_msp_matched_sequence
get_msp_range get_multiple_array_entry get_occurances_of_char get_occurances_of_shift_type_hash
get_occurances_of_shift_type_hash_all get_overlapping_range get_overlapping_seq_match_size get_pair_homol_array
get_pair_homol_hash get_path_dirs_from_ENV get_pdb_file_start_number get_peptide_occurance
get_percent_homo_hash get_percent_homol_arr get_percentage get_perl_keywords
get_posi_diff get_posi_diff_abs get_posi_diff_and_rms_hash get_posi_diff_hash
get_posi_rates_hash_out get_posi_rates_hash_out get_posi_rates_hash_out_compact get_posi_rates_hash_out_jp
get_posi_rates_hash_out_msf get_posi_sans_gaps get_posi_shift_hash get_posi_shift_hash_rms
get_posi_shift_rate get_posi_shift_rms_hash get_posi_shift_rms_whole get_position_shift_rate
get_probable_half get_pwd_dir get_pwd_dir_name get_residue_error_rate
get_scop_correcting_pairs get_sd_of_length_diff get_segment_shift_rate get_seq_fragments
get_seq_hash_sans_gaps get_seq_identity get_seqblock get_sequence_complexity
get_sequence_number get_shortest_str_size get_smallest_file get_stat_FASTA_search_result_in_msp_0_files
get_sub_hash get_subdir_names get_subroutine_calls get_time
get_total_memory_size_in_linux get_unix_shell_name get_whole_pwd_path get_windows_cs_rate_array
open_dna_files open_dssp_files open_embl_files open_fasta_files
open_fil_file open_hlx_files open_hmmfs_files open_hmmls_files
open_jp_files open_lottery_file open_msf_files open_msf_jp_files
open_msp_files open_out_files open_pdb_files open_pdbg_files
open_phd_files open_pir_files open_predator_files open_rms_files
open_rms_files2 open_sdb_files open_self open_seq_alignment_files
open_seq_files open_sequence_index_files open_slx_files open_sso_files
open_sst_files open_sst_files_with_gap open_stride_dat_files open_stride_dat_files
open_subdir_and_go_in_and_do open_swissprot_seq_files open_tem_files opendir_and_go
opendir_and_go_in_and_do_something opendir_and_go_rand_fasta opendir_and_go_rand_fasta_and_clustal overlay_seq_by_certain_chars
overlay_seq_for_identical_chars overlay_seq_hash pair_percent_id_trend pairwise_iden_pos
pairwise_percent_id parse_arguments permute permute_binary
pick_random_files pick_random_hash_pairs pir_permute_array_write pir_permute_hash_write
pir_write pir_write plot_histogram_horizontally plot_vertically
print_clusfile_from_hash print_in_block print_seq_in_block print_seq_in_block_old
print_seq_in_block_with_print print_seq_in_columns produce_random_numbers push_if_not_already
push_if_not_already put_gaps_every_x_position_in_string put_gaps_every_x_position_in_string_special put_gaps_in_hash
put_msp_lines_to_hash_from_bla put_position_back_to_str_seq put_slash_before_special_chars pwd_dir_name
pwd_path rand_DNA_seq_generate rand_RNA_seq_generate rand_sequence_mul_array
rand_sequence_one_array rand_sequence_one_string rand_word randomise_file_contents
randomise_lines read_all_head_boxes read_any_dir read_any_dir2
read_any_dir_for_dir read_any_dir_simple read_any_seq_files read_blast_hits
remove_file_extension remove_mail_header_in_files remove_non_char remove_repetitives_in_array
remove_similar_seqlets remove_small_files remove_text replace_lines
replace_subroutines replace_text reset_all_the_vars reset_shell_environment
rev_abs_numerically rev_lines_pdb rev_sequence_mul_array reverse_hash
reverse_sequences roman rotate_seq round
round_number round_numbers run_fasta_sequence_search scale_for_horizontal_histogram
scan_win_and_get_cs_rate_pairs scan_win_and_get_sc_rate_pairs scan_win_get_average scan_window_and_calc_average
scan_window_and_calc_something scan_windows_and_get_compos_seqid_rate scoring scramble_array
scramble_sequences sd se search_files_in_subdir
search_palindromes search_self self_self_search send_mail
sep seq_comp_percent1 seq_comp_percent2 seq_id_percent_array
seq_to_regexp set_debug set_debug_option set_special_options
shift_word shift_word_recursively show_array show_default_help
show_hash show_in_fasta show_options show_subclusterings
smaller_one sort_by_E_values sort_by_cluster_size sort_by_column
sort_by_column_bigger_first sort_by_digits_in_string sort_by_hash_values sort_by_keys
sort_files_by_size sort_files_by_time sort_hash_by_keys sort_hash_by_value
sort_hash_by_value_and_make_array sort_hash_value_by_column sort_string_by_length sort_words_in_string
split_fasta_files split_file_by_string split_files split_sequence
sqrt_array square_array sso_to_msp ssp_permute_array_write
ssp_permute_hash_write ssp_write steve_permute_array strip_rotated_seq
# Returns : [$av_correct, $num_enq_seq]
# Tips :
# Argument :
# Todo :
# Author : A Scientist
# Category :
# Version : 2.2
#-----------------------------------------------------------------------------
sub get_isearch_result_stat{
my (@keys, $num_enq_seq, @pdbg_seqs_ori, $c, $d, $i, %correct_pairs,
$sum_correct, $sum_false, $match_seq, $percent_correct, $correct, @correct,
$av_correct, $av_false, $actual_e_value, $correct_matched,
%correcting_pairs, @correcting_pairs, %correct);
my %seqs=%{$_[0]};
my @pdbg_seqs=@{$_[1]};
my $evalue=${$_[2]};
my $pdbg_base=${$_[3]} || $ARGV[3];
my $E_mult_factor1=${$_[4]};
my $E_mult_factor2=${$_[4]};
if(ref($_[5])){ $leng_thresh =${$_[5]} }else{ $leng_thresh=$_[5]; }
}
}
if($correct_matched !=1){
$false_positive++;
unless( $correct{$sorted} ){
$correct_group{$base} .="Nomolog: $sorted $base $msp_0{$sorted}\n";
}
$correct{$sorted} = "Nomolog: $base $msp_0{$sorted}";
}
}
if(@match_seqs == 0){ @match_seqs=1; $percent_correct=0; }
$sum_correct += $correct;
$sum_false += $false_positive;
}
$av_correct = $sum_correct/$num_enq_seq;
$av_false = $sum_false /($num_enq_seq);
#### $actual_e_value becomes whatever $E_mult_factor1 defined ~~~~~~~~~~~~
if($E_mult_factor1 != 1){
$actual_e_value=$evalue * $E_mult_factor1;
}elsif($E_mult_factor2 != 1){
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($merge, $verbose, $sat_file, $length_thresh, $factor, $indup, $indup_percent,
$score, @temp_show_sub, $optimize, $file, $evalue, $over_write, $din_dom,
$sum_seq_num, $base_1, $output_clu_file, $short_region, $large_region,
$average_region, $dynamic_factor, @sub_clustering_clu_files);
$factor=7; # default factor is 7 for 70%
$length_thresh=30;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Dealing with options
#_________________________________________
}if($char_opt=~/L/){ $large_region ='L';
}if($char_opt=~/A/){ $average_region='A';
}if($vars{'T'}=~/\d+/){ $length_thresh= $vars{'T'};
}if($vars{'l'}=~/\d+/){ $length_thresh= $vars{'l'}; ## synonym of 't'
}if($vars{'f'}=~/\S+/){ $factor= $vars{'f'};
}if($vars{'s'}=~/\d+/){ $score = $vars{'s'};
}if($vars{'e'}=~/\d+/){ $evalue= $vars{'e'};
}if($vars{'E'}=~/\d+/){ $evalue= $vars{'E'}; # synonym of e
}
$percent_fac=$factor*10;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (0) When one file input was given (yes, divclus can handle multiple files, Sarah!)
#________________________________________________________________________________
if(@file == 1){ #<=== @file has xxxx.msp, yyyy.msp zzzz.msp ....,
print "\n# (1) divide_clusters: One single file was given=> \"@file\"\n" if $verbose;
$file=$file[0];
$base_1=${&get_base_names($file)};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (4) This is critical seqlet merging step
#________________________________________________________________________________
@out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, $dynamic_factor,
"f=$factor", $short_region, $large_region, $average_region, $optimize)};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (5) This is showing the result in clu file format
#________________________________________________________________________________
@temp_show_sub=&show_subclusterings(\@out, $file, $sat_file, $dindom, $indup,
"e=$evalue", "p=$percent_fac", "f=$factor" );
$good_bad = $temp_show_sub[0];
$indup_c = $temp_show_sub[1];
$sum_seq_num += $temp_show_sub[2];
push(@sub_clustering_out_files, @{$temp_show_sub[3]});
if($good_bad==1){ push(@good, $file);
}else{ push(@bad, $file); }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (6) Final write up stage (unecessary)
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (4) Clustering the sets of merged seqlets => CORE algorithm
#____________________________________________________________
@out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, "f=$factor", $optimize, $dynamic_factor,
$short_region, $large_region, $average_region)};
@temp_show_sub=&show_subclusterings(\@out, $big_msp_file, $sat_file, $dindom, $indup,
"e=$evalue", "p=$percent_fac", "f=$factor");
$good_bad = $temp_show_sub[0];
$indup_c = $temp_show_sub[1];
$sum_seq_num += $temp_show_sub[2];
push(@sub_clustering_out_files, @{$temp_show_sub[3]});
if($good_bad==1){ push(@good, $big_msp_file);
}else{ push(@bad, $big_msp_file); }
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($max_size, $sat_file_name, $clu_file_name,
$ori_cluster_size, $ori_cluster_num, $good_bad, @keys, $percentage_fac,
$indup, @sizes, $sum_seq_num, $indup_percent, $indup_count, %tem4,
@sub_clustering_out_files); # clusall_1e-5_clu_14-324_ss.sat
my @out=@{$array[0]};
$indup_count=0;
if($char_opt=~/d/){ $dindom=1; }
if($char_opt=~/i/){ $indup=1; }
if($vars{'f'}=~/\S+/){ $factor= $vars{'f'}; }
if($vars{'p'}=~/\d+/){ $percentage_fac= int($vars{'p'}); }
if($vars{'s'}=~/\d+/){ $score = $vars{'s'}; }
if($vars{'e'}=~/\d+/){ $evalue= $vars{'e'}; }
#print "\n# (1) show_subclusterings : \@file has : @file\n";
if( $file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.msp/ or
$file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.sat/ ){
$ori_cluster_size=$2;
$ori_cluster_num =$3;
$base=$1;
$sat_file_name="$base\.sat";
$ori_cluster_size,
$dindom,
$clu_file_name,
\%tem3, \%tem4,
$indup, );
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Local subroutine
#_______________________________________________________________
sub print_summary_for_divclus{ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
my(@keys, $indup_count, $x, $m, $percentage_fac);
my $count=$_[0]; # count of cluster
my %tem2=%{$_[1]}; my $num_seq=@keys=sort keys %tem2;
my %tem=%{$_[2]}; my $ori_cluster_num=$_[3];
my $new_clus_NAME=$ori_cluster_num.'0'.$count.'0'.$num_seq;
my $ori_cluster_size=$_[4];
my $dindom=$_[5]; my %tem3=%{$_[7]};
my $indup=$_[9]; my (%internal_dup);
my %tem4=%{$_[8]};
#~~~~~~~~~~ Domain Inside Domain ~~~~~~~~~~~~~~~~~
if($dindom){
for($m=0; $m< @dup_keys; $m++){
printf ("%-19s=> %s\n", $dup_keys[$m], $internal_dup{ $dup_keys[$m] } );
}
}
}
}
#~~~~~~~~~~ Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~
print CLU "Cluster size $num_seq\n";
printf CLU ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
$new_clus_NAME, $evalue, $factor, $percentage_fac,
$ori_cluster_size, $num_seq, $ori_cluster_num);
print "Cluster size $num_seq\n";
printf ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
$new_clus_NAME, $evalue, $factor, $percentage_fac,
$ori_cluster_size, $num_seq, $ori_cluster_num);
for($x=0; $x <@keys; $x++){
printf CLU (" %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
$num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
printf (" %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
$num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
}
return($indup_count);
}
}
# 1cdg_6taa -------442---------------2222-----------------000-
# 1cdg_2aaa -------222---------------2222-----------------000-
#
# <Final output>
# 2aaa_6taa 0%
# 1cdg_6taa 67%
# 1cdg_2aaa 67%
#
# Warning :
# Keywords :
# Options : 'p' or 'P' for percentage term(default)
# 'r' or 'R' for ratio term (0.0 - 1.0), where 1 means all the
# segments were wrongly aligned.
# 's' or 'S' for Shift rate (it actually caculates the position shift
# rate for the secondary structure segment.
# 'h' or 'H' for position Shift rate (it actually caculates the position
# shift rate for helical segments). If this is the only option, it
# will show the default percentage term rate for helical segments.
# If used with 'r', it will give you ratio (0.0 - 1.0) for helical
# segment. If used with 's' option, it will give you position shift
# rate for only helical segments.
# 'e' or 'E' for position Shift rate (it actually caculates the position
# shift rate for beta segments). If this is the only option, it will
# show the default percentage term rate for beta segments. If used
# with 'r', it will give you ratio (0.0 - 1.0) for beta. If used
# with 's' option, it will give you position shift rate for only
# beta segments.
# Returns :
# Argument : Two references of hashes. One for error rate the other for sec.
# assignment.
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub get_segment_shift_rate{
# Title : get_wrong_segment_rate
# Usage : print_seq_in_block( &get_wrong_segment_rate(\%superposed_hash) );
# Function : Treats the segment as one single big error.
# calculates the wrong segment number compared to the correct ones.
# Example : <input example> hash of 3 keys and values.
# 2aaa_6taa -------00000---------00000000----0000-------00000-
# 1cdg_6taa -------442---------------2222-----------------000-
# 1cdg_2aaa -------222---------------2222-----------------000-
#
# In the above there are two segments wrong in 3 segment blocks = 2/3
# <output example> hash of 3 percentage rates.
#
# 2aaa_6taa 0 %
# 1cdg_6taa 66.6666666666667 %
# 1cdg_2aaa 66.6666666666667 %
#
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# by 9(occurance) in the above
# for '0' of the first hash, so (0, 2.222, 1, 2.1666, 2, 2.4 )
# Average is the average of numbers
# average value in 0-9 scale (or 0-100 with 'p' option)
# So, if there are
# seq1 00111110000, The 'a' value of 0 and 1 as in the seq2
# seq2 33000040000 is 0-> 6/6, 1-> 4/5, while the 'n'
# calc would be, 0-> 6 (60%), 1-> 4(40%)
#
# Argument : (\%hash1, \%hash2) or optionally (\%hash1, \%hash2, ['n', 'i', 'p', 'a'])
# 'n' => normalizing, 'p' => percentage out, 'i' => make int out, 'a'=> averaged
# Category :
# Version : 1.2
#--------------------------------------------------------------------
sub tally_2_hashes{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if( $char_opt=~/m/i){ $minus_whole_cs = 'm'}
@input = @{$array[0]};
if(defined(${$_[2]})){ $base_level =${$_[2]}; }
if(defined(${$_[3]})){ $scale =${$_[3]}; }
for ($t=0; $t< @input; $t++){ $length=length($input[$t]) if(length($input[$t])>$length);}
if ($length < $window_size){ $window_size = $length; }
#___________ getting ratio for the whole sequence ___________
$compos_whole_seq=${&main::compos_id_percent_array(\@input)}; ## for whole composition rate
$seq_id_whole_seq=${&main::seq_id_percent_array(\@input)};
print "\nComposition ID of the alignment: $compos_whole_seq\%\n";
print "Sequence ID of the alignment: $seq_id_whole_seq\%\n";
if ($seq_id_whole_seq == 0){ $ratio_whole_seq =0; }
else{ $ratio_whole_seq = $compos_whole_seq/$seq_id_whole_seq; }
print "Composition and Sequ. ID Ratio: $ratio_whole_seq\n";
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
########## Initial Window size setting ##############################
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if ( $ratio_whole_seq >9 ){ $window_size= 28; } # $ratio_whole_seq is
## Go back if the window size is too small due to gaps ##
#####################################################################
if( (length($win_no_gap1) < $ori_win_size)&&($variable_win_size ==1) )
{
$window_size+=1; $w--; next;
}
#####################################################################
## Getting Compos and Seq ids ##
#####################################################################
$compos_id=${&main::compos_id_percent_array(\@array_of_2_seq)};
$seq_id =${&main::seq_id_percent_array(\@array_of_2_seq)};
#####################################################################
#### Go back if the Seq id is bigger than Compos id #######
#####################################################################
if(($variable_win_size eq 'v') && ($seq_id > $compos_id))
{
$window_size+=1; $w--; next;
}
my(@ratio_array, @array_of_2_seq, $seq_id, $offset, $half_of_w_size, $t, $length, $w,
$compos_id, $seq_id, $window_2, $window_1, $compos_whole_seq, $seq_id_whole_seq,
$ratio_whole_seq, $win_rate_div_by_whole_rate, $normalizing_factor, $lowest_rate );
for ($t=0; $t < @input; $t++){
$length=length($input[$t]) if (length($input[$t])>$length); }
if ($length < $window_size){ $window_size = $length; }
#___________ getting ratio for the whole sequence ___________
$compos_whole_seq=${&compos_id_percent_array(\@input)};
$seq_id_whole_seq=${&seq_id_percent_array(\@input)};
if ($seq_id_whole_seq == 0){ $ratio_whole_seq =$compos_whole_seq/10; }
else{ $ratio_whole_seq =$compos_whole_seq/$seq_id_whole_seq; }
#___________ getting ratio for each window sequence ___________
for ($w=0; $w < $length; $w++){
$offset = $w - int($window_size/2); # $offset starts from -5 when window_size is 10.
$offset=0 if ($offset < 0);
$window_1=substr($input[0], $offset, $window_size); # window_1 is one segment
$window_2=substr($input[1], $offset, $window_size); # of defined length(size)
@array_of_2_seq=($window_1, $window_2); # making an array like this = ('ABCDE', 'BDESA')
$compos_id=${&compos_id_percent_array(\@array_of_2_seq)};
$seq_id =${&seq_id_percent_array(\@array_of_2_seq)};
#print "\n offset = $offset Wind1 = $window_1 Wind2 = $window_2 ";
#print " Compos1 = $compos_id Seqid = $seq_id \n";
#______ Handle special case when $seqid is zero > the rate becomes $compos_id/10 ______
if (($seq_id == 0) && ($compos_id != 0)){ $ratio_compos_vs_seqid = $compos_id/10; }
elsif(($seq_id == $compos_id)&&($seq_id == 0)){ $ratio_compos_vs_seqid = 0;}
elsif(($seq_id == $compos_id)&&($seq_id == 100)){ $ratio_compos_vs_seqid = 0;}
else{ $ratio_compos_vs_seqid=($compos_id/$seq_id); }
push(@ratio_array, $ratio_compos_vs_seqid); }
#
# Function : gets the numbers of occurances for 1, 2, 3 ... position shifts.
# If hash is given, it only looks at the values.
# If multiple string, array, hash or combinations of these
# are given, it will add up to one single result
# Example :
# Warning :
# Keywords : composition of chars, composition table making,
# make_composition, make composition table
# occurances_of_char, get_char_occurances, occurances
# get_percentage_occurances_of_char, percentage_occurances_of_char
# Options : 'p' for percentage output of the char among others
# 'n' for NO name option when HASH input is given
# Returns : one ref. of hash (a =>5, b=>6, c=>4,,,,,)
# Argument : one ref. of hash (seq1 alsdfjlsj
# seq2 asldfjsld
# seq3 owiurouou);
# Category :
# Version : 1.4
#--------------------------------------------------------------------
sub get_occurances_of_char{
my ($i, %H, $no_name, %out, $N,@splited, $val,$percentage_out,
$split, $sum);
for($i=0; $i< @_; $i++){
if($_[$i]=~/^[\-]?p$/i){
$percentage_out=1; splice(@_, $i, 1); $i--;
}elsif($_[$i]=~/^[\-]?n$/i){
$no_name=1; splice(@_, $i, 1); $i--;
}
}
for($i=0; $i< @_; $i++){
if( ref($_[$i]) eq 'HASH'){
my %H=%{$_[$i]};
my @names=keys %H;
for $key (@names){
@splited=@{$_[$i]};
for $split (@splited){ $out{$split}++; $sum++ }
}elsif(ref($_[$i]) eq 'SCALAR'){
@splited = split(//, ${$_[$i]});
for $split (@splited){ $out{$split}++; $sum++ }
}elsif( !(ref($_[$i])) ){
@splited = split(//, $_[$i]);
for $split (@splited){ $out{$split}++; $sum++ }
}
}
if($percentage_out==1){
my @keys=keys %out;
my %percent;
for($i=0; $i< @keys; $i++){
$percent{$keys[$i]} = $out{$keys[$i]}/$sum*100;
}
return(\%percent);
}else{
return(\%out);
}
}
#________________________________________________________________________
# Title : make_composition_table
# Usage : %occurances=%{&make_compos_table(\%key_and_value_for_seq)};
# Function : gets the numbers of occurances for 1, 2, 3 ... position shifts.
$aa{"c"} = "cys"; $aa{"n"} = "asn"; $aa{"e"} = "glu"; $aa{"q"} = "gln";
$aa{"f"} = "phe"; $aa{"r"} = "arg"; $aa{"g"} = "gly"; $aa{"s"} = "ser";
$aa{"h"} = "his"; $aa{"t"} = "thr"; $aa{"i"} = "ile"; $aa{"v"} = "val";
$aa{"k"} = "lys"; $aa{"w"} = "trp"; $aa{"l"} = "leu"; $aa{"y"} = "tyr";
return(\%aa);
}
#________________________________________________________________________
# Title : amino_acid_compos_id_percent
# Usage : $percent = &amino_acid_compos_id_percent (%any_hash_with_sequences);
# The way identity(composition) is derived is;
#
# Function : gets amino acid composition identity of any given
# number of sequences(at least 2).
# Example :
# Warning :
# Keywords : get_amino_acid_composition, get_protein_composition, composition
# Options :
# Argument : hash of at least 2 sequences.
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub amino_acid_compos_id_percent{
my(%input)= %{$_[0]};
my(@names)=keys (%input);
my(@temp, $i, $j, $iden, @all_pairs_id, $iden_sum);
my(%compos_table1, %compos_table2, $final_iden, $larger);
for ($i=0; $i < @names; $i ++){ # putting seqs in arrays.
$input{$names[$i]}=~ tr/a-z/A-Z/;
$input{$names[$i]}=~ s/\W//g;
@{"string$i"}= split('', $input{$names[$i]});
$larger = @{"string$i"} if @{"string$i"} > $larger;
}
push (@all_pairs_id, $iden );
}
}
for $iden (@all_pairs_id){ $iden_sum+= $iden; }
if(@all_pairs_id == 0){ @all_pairs_id =1; }
$final_iden=$iden_sum/@all_pairs_id;
\$final_iden;
}
#________________________________________________________________________
# Title : seq_id_percent_array (more than 2 elements array)
# Usage : $percent = &seq_id_percent_array(@any_array_sequences);
# The way identity(pairwise) is derived is;
#
# Function : produces amino acid composition identity of any given number of sequences.
# Example :
# Warning : This can handle 'common gaps' in the sequences
# Keywords : get_percent_composition_identity, seq_composition_identity,
# percent_sequence_composition_id
#
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub seq_id_percent_array{
my(@input, $denominator,@all_pairs_id, $percent_id);
my($largest,$p,$i,$j,$k,$iden_residue_num,$iden,@temp,$iden_sum,$gap_num,$final_iden);
for($d=0; $d<@_; $d++){
if(ref($_[$d]) eq 'ARRAY'){ @input=@{$_[$d]}; }
elsif( (ref($_[$d]) eq 'SCALAR') &&( ${$_[$d]}=~/^[aA]/) ){ $average_len_opt =1 }
elsif( !(ref($_[$d])) && ( $_[$d] =~/^[aA]/) ){ $average_len_opt =1 } }
if ((@input== 1)||( @input== 0)){
print "\n\n \" $0 \" requires at least 2 sequences\n\n";
print "\n Abnormally dying at seq_id_percent_array in $0 \n\n";
print chr(7); exit;}
$shortest=length($input[0]);
my($sans_gap_seq, $length_sum, $average_seq_len);
for($p=0; $p < @input; $p++){
$input[$p]=~ tr/a-z/A-Z/;
$sans_gap_seq=$input[$p];
$sans_gap_seq=~s/\W//g;
$input[$p]=~ s/\W/./g;
(@{"string$p"})=split('', $input[$p]);
$largest = length($input[$p]) if length($input[$p]) > $largest;
$average_seq_len = $length_sum/@input;
for($i=0; $i< @input; $i++){
for($j=$i+1; $j< @input; $j++){
for ($k=0; $k < $largest; $k ++){ # getting composition tables for two seqs.
if ((${"string$i"}[$k] !~ /\W/)&&(${"string$i"}[$k] eq ${"string$j"}[$k])){
$iden_residue_num++; }
elsif((${"string$i"}[$k] =~ /\W/)&&(${"string$i"}[$k]=~ /\W/)){ $gap_num++; }}
if( $average_len_opt == 1){ $denominator = $average_seq_len; }
else{ $denominator = $shortest; }
if($denominator == 0){ $denominator=1; } # in the above it is 50% rather than 0.07%
$percent_id=($iden_residue_num/($denominator))*100;
push(@all_pairs_id, $percent_id);
undef ($iden_residue_num, $gap_num);
}
}
for (@all_pairs_id){ $iden_sum+=$_; }
$final_iden=$iden_sum/($#all_pairs_id+1);
return( \$final_iden );
}
#________________________________________________________________________
# Title : compos_id_percent_array (more than 2 elements array)
# Usage : $percent = &compos_id_percent_array(@any_array_sequences);
# The way identity(composition) is derived is;
# Function : produces amino acid composition identity of any given number of sequences.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub compos_id_percent_array{
my(@input)=@{$_[0]};
my($largest,$iden,@temp,$iden_sum,$final_iden, @all_pairs_id);
for($i=0; $i<=$#input; $i++){ $input[$i]=~ tr/a-z/A-Z/; $input[$i]=~ s/[\.\-\s]//g;
@temp = split('', $input[$i]); (@{"string$i"})= @temp;
$largest = @{"string$i"} if @{"string$i"} > $largest; }
for($i=0; $i< @input; $i++){ #_________ permutating ___________
for($j=$i+1; $j<=$#input; $j++){
for ($k=0; $k <= $largest; $k ++){ # getting composition tables for two seqs.
$compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);
$compos_table2{${"string$j"}[$k]}++ if (${"string$j"}[$k] =~ /\w/); }
sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
$sum_of_the_common_residue_no += $common; }
$compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100; }
\$compos_id;
}
#-----------------------------------------------------
return ( \$final_iden ); # final identity for any given set of strings(seq).
}
#________________________________________________________________________________
# Title : compos_id_percent_hash (synonym of amino_acid_compos_id_percent)
# Usage : $percent = &compos_id_percent_hash(%any_hash_with_sequences);
# The way identity(composition) is derived is;
#
# Function : gets amino acid composition identity of any given number of sequences.
# Example :
# Warning :
# Keywords : get_amino_acid_composiiton
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub compos_id_percent_hash{ my(%input, @names);
if(ref($_[0]) eq 'HASH'){ %input= %{$_[0]}; @names= keys %input; }
else{ print "\n hash ref was not passed to compos_id_percent_hash\n"; exit; }
my(@temp, $iden, @all_pairs_id, $i, $j, $k,$iden_sum);
my(%compos_table1, %compos_table2, $final_iden, $larger);
for ($i=0; $i < @names; $i ++){ $input{$names[$i]}=~ tr/a-z/A-Z/;
$input{$names[$i]}=~ s/\W//g; @temp = split('', $input{$names[$i]});
(@{"string$i"})=@temp; $larger = @{"string$i"} if @{"string$i"}>$larger;}
for ($i=0; $i < @names; $i++){
for ($j=$i; $j < @names; $j ++){
if ($j == $i){ next; }
for ($k=0; $k < $larger; $k ++){
$compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);
#________________________________________________________________________
# Title : common_compos_id_hash (BUG free)
# Usage : %hash = &common_compos_hash(\%any_hash1, \%any_hash1);
# Function : actual calculation of identity
# Example : ('A', 200, 'C', 191, D, 99)
# ('A', 290, 'C', 199, D, 100)
# uses only two sequences.
# Warning :
# Keywords :
# Options :
# Returns : ref. of a scaler (in percent) eg) 95
# Argument : two references of hash of seqeunces.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub common_compos_id_hash{
my(%hash1)=%{$_[0]};
my(%hash2)=%{$_[1]};
my(%common_of_the_2)=(); my($common, $compos_id, $sum_of_the_common_residue_no);
my(@values1) = values (%hash1); my(@values2) = values (%hash2);
my(@combined_values)=(@values1, @values2);
#________________________________________________________________________
# Title : calc_compos_id_hash (the same as 'common_compos_hash')
# Usage : %hash = &calc_compos_hash(\%any_hash1, \%any_hash1);
# Function : actual calculation of identity
# Example : ('A', 200, 'C', 191, D, 99)
# ('A', 290, 'C', 199, D, 100)
# uses only two sequences.
# Warning :
# Keywords :
# Options :
# Returns : ref. of a scaler (in percent) eg) 95
# Argument : two references of hash of seqeunces.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub calc_compos_id_hash{ my(%hash1)=%{$_[0]}; my(%hash2)=%{$_[1]}; my(%common_of_the_2)=();
my($common, $compos_id, $sum_residues, $sum_of_the_common_residue_no);
my(@values1) = values (%hash1); my(@values2) = values (%hash2);
my(@combined_values)=(@values1,@values2);
for $elem (@combined_values){
$sum_residues += $elem; }
if($sum_residues == 0){ $compos_id =0; } # to prevent Illegal division error.
else{
for $key1(keys %hash1){
$common=&smaller_one($hash1{$key1}, $hash2{$key1});
sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
$sum_of_the_common_residue_no += $common; }
$compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100; }
\$compos_id;
}
#________________________________________________________________________
# Title : get_percentage
# Usage : %out= %{&get_percentage(\%result, '1')};
# Function : calculates the percentage content of any single char over the whole
# length of strings in it.
# Example : if the string is 'seq ABCDEEEEEFFEFE' given in a hash
# if you put 'A' as one argument, it counts the occurances of 'A'
# and gets the percentage of it.
# Warning : This converts array and string input as ref. into arbitrary hash and
# returns hash
# programmed by A Biomatic
# Keywords : get_percentage_of_char
# Options : None yet.
# Returns : Numerical Percentage
# Argument : ref. for Scalar string or Array of chars or Hash AND 'the target char'
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_percentage{
my(@in, $k, $sort, $numerator, $residue, @out_hash_ref, %hash_out );
for($k=0; $k< @_ ;$k++){
if( !ref($_[$k])&& (length($_[$k]) == 1 )){
$numerator = $_[$k];
}
elsif( (ref($_[$k]) eq 'SCALAR') && (length(${$_[$k]}) == 1 )){
$numerator = ${$_[$k]};
}
elsif(ref($_[$k]) eq "HASH") { push(@in, $_[$k]); }
elsif(ref($_[$k]) eq "ARRAY") { push(@in, &convert_arr_and_str_2_hash($_[$k], $k));} #<-- conv array to hash.
elsif(ref($_[$k]) eq "SCALAR"){ push(@in, &convert_arr_and_str_2_hash($_[$k], $k));} #<-- conv array to hash.
}
####### final output is => @in of hash ref elements #######
for (@in){ my(%H) = %{$_}; my(@keys)= sort keys %H;
for $name(@keys){
my($numerator_count);
my($seq_len) = length($H{$name}); print "\n $name Sequence length: ", $seq_len, "\n";
my(@string) = split(//, $H{$name});
for $residue (@string){ if($residue =~/^$numerator$/){ $numerator_count ++; }}
my($percent) = $numerator_count/$seq_len *100;
$hash_out{$name}=$percent; }
push(@out_hash_ref, \%hash_out); }
if(@out_hash_ref ==1){ return($out_hash_ref[0]); }
elsif( @out_hash_ref > 1){ return(@out_hash_ref); }
}
#________________________________________________________________________
# Title : pairwise_percent_id (pairwise sequence identity in percentage)
# Usage : $identity = ${&pairwise_percent_id(%arrayinput)};
#
# Function : takes a ref. of a hash of names and sequences, returns
# percent identity.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub pairwise_percent_id{
my($i,$j,$k, @iden_array_ref);
for($i=0; $i< @_; $i++){ my %input= %{$_[$i]}; my @names= sort keys %input;
my(@temp, $iden, @all_pairs_id, $whole_seq_len, $residue_sum1,$residue_sum2);
my($final_av_iden, $larger, $percent_for_pair,@percent_for_pair, $iden_sum);
for ($i=0; $i < @names; $i ++){ $input{$names[$i]}=~ tr/a-z/A-Z/;
@temp = split('', $input{$names[$i]}); (@{"string$i"})=@temp;
$larger = @{"string$i"} if @{"string$i"} > $larger; }
for ($i=0; $i < @names; $i++){ # to make permutated pairs.
for ($j=$i+1; $j < @names; $j ++){
for ($k=0; $k < $larger; $k ++){ # getting composition tables for two seqs.
$iden+=2 if ((${"string$i"}[$k] eq ${"string$j"}[$k])&&(${"string$i"}[$k] =~ /\w/));
$residue_sum1++ if (${"string$i"}[$k] =~ /\w/);
$residue_sum2++ if (${"string$j"}[$k] =~ /\w/); }
$whole_seq_len =($residue_sum1+$residue_sum2);
$percent_for_pair = $iden/$whole_seq_len*100;
push(@percent_for_pair,$percent_for_pair);
$residue_sum1=0; $residue_sum2=0; $iden=0; } }
for $iden (@percent_for_pair){ $iden_sum+=$iden;}
$final_av_iden=$iden_sum/( @percent_for_pair );
push(@iden_array_ref, \$final_av_iden); }
if(@iden_array_ref ==1){ return($iden_array_ref[0]);}else{ return(@iden_array_ref);}
}
#________________________________________________________________________
# Title : get_seq_identity
# Usage : $identity = ${&get_seq_identity(%arrayinput)};
#
# Function : takes a ref. of a hash of names and sequences, returns
# percent identity. NOT composition identity.
# Example :
# Warning :
# Keywords : get_sequence_identity
# Options :
# Returns :
# Argument : hash(es) of sequences.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_seq_identity{
my($i,$j,$k, $c, @iden_array_ref);
for($c=0; $c< @_; $c++){
my %input= %{$_[$c]};
my @names= sort keys %input;
my(@temp, $iden, @all_pairs_id, $whole_seq_len, $residue_sum1,$residue_sum2);
my($final_av_iden, $larger, $percent_for_pair,@percent_for_pair, $iden_sum);
for ($i=0; $i < @names; $i ++){
$input{$names[$i]}=~ tr/a-z/A-Z/;
@temp = split(//, $input{$names[$i]});
@{"string$i"}=@temp;
$larger = @{"string$i"} if @{"string$i"} > $larger; }
for ($i=0; $i < @names; $i++){ # to make permutated pairs.
for ($j=$i+1; $j < @names; $j ++){
for ($k=0; $k < $larger; $k ++){
if ( ${"string$i"}[$k] eq ${"string$j"}[$k] and ${"string$i"}[$k] =~ /\w/){
$iden+=2 ;
}
$residue_sum1++ if (${"string$i"}[$k] =~ /\w/);
$residue_sum2++ if (${"string$j"}[$k] =~ /\w/);
}
$whole_seq_len =($residue_sum1+$residue_sum2);
$percent_for_pair = $iden/$whole_seq_len*100;
push(@percent_for_pair,$percent_for_pair);
$residue_sum1=0;
$residue_sum2=0;
$iden=0;
}
}
for $iden (@percent_for_pair){
$iden_sum+=$iden;
}
if(@percent_for_pair <1){ @percent_for_pair=(1); }
$final_av_iden=$iden_sum/( @percent_for_pair );
push(@iden_array_ref, \$final_av_iden);
}
if(@iden_array_ref ==1){
return($iden_array_ref[0]);
}else{
return(@iden_array_ref);
}
}
#________________________________________________________________________
# Title : get_correct_percent_alignment_rate (made for Bissan)
# Usage : &get_correct_percent_alignment_rate(\$file1, \$file2);
# Function : accepts two files and prints out the sequence identities of the alignment.
# Example :
# Warning : Alpha version, A Biomatic , made for Bissan
# Keywords :
# Options : h # for help
# v # for verbose printouts(prints actual sequences)
# Returns : reference of Scalar for percentage correct alignment(for already
# aligned sequences)
# Argument : two sequence files which have identical sequence names.
# Category :
# Version :
#--------------------------------------------------------------------
sub get_correct_percent_alignment_rate{
my($i, $j, $k, $verbose, @string1, @string2, $larger, $seq_pair_id, @seq_pair_ids );
my(%inputhash1) = %{&read_any_seq_files($_[0])};
my(%inputhash2) = %{&read_any_seq_files($_[1])};
my(@names)= sort keys %inputhash1;
######################################
####### Sub argument handling ########
######################################
for($k=0; $k< @_ ;$k++){
if( !ref($_[$k])&& (length(${$_[$k]}) < 5)){ # when inputs are not ref.
if($_[$k]=~ /^[\-vV]$/){ $verbose = 1; next;}
}
print "\n", "_"x88, "\n";
my($whole_average_of_the_id)=${&array_average(\@seq_pair_ids)};
print "The whole average is; $whole_average_of_the_id\n";
if(@seq_pair_ids == 1){ return( \$seq_pair_ids[0] ); }
elsif(@seq_pair_ids > 1){ return( \@seq_pair_ids ); }
}
#________________________________________________________________________
# Title : amino_acid_compos_id_percent_trend
# Usage :
# Function :
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub amino_acid_compos_id_percent_trend{
my(%input) = %{$_[0]};
my(@common, @string,@accumu_percent_iden)=(); my(%common_so_far, %compos_table);
my($percent_id_so_far, $length_of_one_seq,$length_of_all_seq, $seq_no)=0;
for $key(keys %input){
$input{$key}=~s/[. \d-]//g;
@string= split(//, $input{$key});
print @string; print "\n";
$length_of_one_seq = $#string+1;
$length_of_all_seq +=$length_of_one_seq;
$seq_no += 1;
%compos_table = &composition_table(@string);
@check = keys (%common_so_far);
if ($#check < 0){ %common_so_far = %compos_table; }
else{ %common_so_far= %{&common_compos_2_hash(\%common_so_far,\%compos_table)};}
for $value(values %common_so_far){ $common_residue_sum +=$value; }
$final_percent_id = $common_residue_sum/($length_of_all_seq/$seq_no)*100;
$common_residue_sum =0; }
for $value(values %common_so_far){ $common_residue_sum +=$value; }
$final_percent_id = $common_residue_sum/($length_of_all_seq/$seq_no)*100;
return(\$final_percent_id);
}
#________________________________________________________________________
# Title : composition_table (can handle both nucleic and protein seq)
# Usage : %output = %{&compos_table(@input_array1, @input_array2,,,,)};
# example input
#
# Function : returns a table of alphabet with occurances.
# can handle any char, this converts char to upper case.
# Example :
my(%common_of_the_2)=(); my($common)=0;
for $key1(keys %hash1){
$common=&smaller_one($hash1{$key1}, $hash2{$key1});
if ($common =~ /\d+/){
$common_of_the_2{$key1}=$common; } }
\%common_of_the_2;
}
#________________________________________________________________________
# Title : pair_percent_id_trend
# Usage : @array = &pair_percent_id_trend (%arrayinput);
# Function :
# Example : common gaps means only '.' (dots, not alphabets!!)
# AAA....BBCB
# AAAB..B.BCC --> A.A.....BC. (as in an array)
# A.AAA...BCA
# The resulting array XXXXX..XXXX is literally like so.
# This is to detect absurd gaps in the above.
#
#
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version :
#--------------------------------------------------------------------
sub pair_percent_id_trend{
my(%input) = %{$_[0]};
my(@common, @string,@accumu_percent_iden)=();
my($percent_id_so_far)=0;
for $key(keys %input){
my($len) = &smaller_one($#common, $#string) unless $#common < 0;
$input{$key}=~s/ //g;
@string= split(//, $input{$key});
$length_of_one_seq = $#string+1;
$length_of_all_seq +=$length_of_one_seq;
$seq_no += 1;
for ($k=0; $k <= $len;$k++){
if($#common == -1){
@common = @string;
}
elsif ( $string[$k] eq $common[$k] ){
$common[$k] = $string[$k];
$identical_count +=1;
}elsif( $string[$k] ne $common[$k]){
$common[$k]='.';
}
}
$num_of_iden_char = &count_num_of_char(\@common);
$av_seq_no = $length_of_all_seq/$seq_no;
$percent_id_so_far = $num_of_iden_char/$av_seq_no*100;
print "\n percent_id so far = $percent_id_so_far \n";
push(@accumu_percent_iden,$percent_id_so_far);
} # end of for (after all sequences have been run).
$num_of_iden_char = &count_num_of_char(@common);
$av_seq_no = $length_of_all_seq/$seq_no;
$percent_id_so_far = $num_of_iden_char/$av_seq_no*100;
print "\n percent_id so far = $percent_id_so_far \n";
\@accumu_percent_iden; # final ids array.
}
#________________________________________________________________________
# Title : smaller_one
# Usage : $smaller = & smaller_one($var, $var2);
#
# Function : gets smaller value of the two inputs
# Example : will return 5 with &smaller_one(5, 50);
# Warning : gets only digits!!
# Keywords :
# Options :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version :
#--------------------------------------------------------------------
sub count_num_of_char{
my(@input)={$_[0]};
my($num_of_char)=0;
for $elem(@input){ # this is for the percentage of TWO seqs.
if ($elem =~ /\w/){
$num_of_char +=1;
}
}
$num_of_char;
}
#________________________________________________________________________
# Title : remov_com_column2 (this is the older and slower version)
# Usage : %new_string = %{&remov_com_column2(\%input_hash)};
# Function :
# ((-l option for long output )
# NAME MOZ_HUMAN_part
# HEADER |- Residue -| Pred Rel NAli Asn
# PRED 1 MET M c 0.000 0 ?
# PRED 2 ALA A c 0.000 0 ?
#
# Warning :
# Keywords : open_prd_files, open_pred_files, predator, open_prdl_files
# open_pre_files, secondary structure prediction file
# Options : 's' for sequence output as well (\%sec_str, \%seq)
# 'p' for percentage of the sec. str.
# 'a' for accumulated percentage. This will
# set 'p' automatically
# 'n' for NO name when outputing Percentage of chars with
# HASH input to get_occurances_of_char sub.
# $reverse_residue_order=r by r
# Returns :
# Argument :
# Category :
# Version : 1.8
#-----------------------------------------------------------
sub open_predator_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my( @out_ref, $seq_out, %sec_str, %seq, $percent_out, $NO_name_out,
$short_form_out_detected, $long_form_out_detected, $accumulate,
$reverse_residue_order, %rev_sec_str);
if($char_opt=~/s/i){ $seq_out=1 }
if($char_opt=~/a/i){ $accumulate=1 }
if($char_opt=~/p/i){ $percent_out=1 }
if($char_opt=~/n/i){ $NO_name_out='n' }
if($char_opt=~/r/){ $reverse_residue_order='r' }
for($i=0; $i< @file; $i++){
my (%sec_str, %seq) if($accumulate !=1);
open(PREDATOR_FILE, "$file[$i]");
print "\n# (INFO) open_predator_files: opening $file[$i]";
while(<PREDATOR_FILE>){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
$residue_3_letter=$2;
$residue_1_letter=$3;
$sec_str_predicted=$4;
$reliability=$5;
$sec_str{$position}=[$residue_1_letter, $sec_str_predicted, $reliability];
}
}
close (PREDATOR_FILE);
print "\n \%sec_str is: ", %sec_str, "\n" if ($debug == 1);
if($seq_out==1){ push(@out_ref, \%sec_str, \%seq);
}elsif($percent_out==1 ){
push(@out_ref, [%{&get_occurances_of_char(\%sec_str, $NO_name_out, 'p')}] );
}elsif($percent_out !=1){ push(@out_ref, \%sec_str) }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# If -r option is set (for long form, this does not affect
#____________________________________________________________
if($short_form_out_detected and $reverse_residue_order){
@keys=keys %sec_str;
for($r=0; $r<@keys; $r++){
$rev_sec_str{$keys[$r]}=reverse($sec_str{$keys[$r]});
}
%sec_str=%rev_sec_str;
# >P1;1cdg
# secondary structure and phi angle
# CCCCCCCCCCCCCCCCEEECCHHHHCCCCHHHCCCPHHCCCCPCC-CCCCCPCCHHHHHHHHHCPHHHHHPCCEE
# >P1;1cdg
# solvent accessibility
# TTTTTTTTTTTFFFFFFFFFFFFFFTTTTTTTTTTTTTTTTTFTT-TTTTFFFFFTFFTTTFTTTFFTTFTFTFF
# >P1;1cdg
# DSSP
# CCCCCCCCCCCCCCCCEEECCHHHHCCCCGGGCCCGGGCCCCCCC-CCCCCCCCHHHHHHHHHCCHHHHHCCCEE
# >P1;1cdg
# percentage accessibility
# 67523272360000000000000002213792129b722248085-14110000030015105660028040200
# 2ltn ----TETTSFLITKFSPDQQNLIFQGDGYTT-KEKLTLTK------AVKNTVGRALYSSP
# 1loe ----TETTSFSITKFGPDQQNLIFQGDGYTT-KERLTLTK------AVRNTVGRALYSSP
#
# 2ltn ----CEEEEEEECCCCCCCCCEEEEPCCEEP-PPCEEEEC------CCCPCEEEEEECCC
# 1loe ----CEEEEEEECCCCCCCCCEEEEPCCEEE-PPEEEEEC------CCCPCEEEEEECCC
#
# 2ltn ----TTTTTTTTTTFTTTTTTFTTTTTFTFT-TTTFTFFT------TTTTTTFFFFTTTT
# 1loe ----TTTTTTTTTTFTTTTTTFTTTTTFTFT-TTTFFFFT------TTTTTTFFFFTTTT
#
# Version : 1.0
#--------------------------------------------------------------------
sub open_tem_files{
my($a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, $q, $r,
$s, $t, $u, $v, $w, $x, $y, $z, $pwd, $file, $dir, $output, $in_dir,
%hash, @keys, @array, @hash, $option_string, $string, @in, $line,
$name, %out, $gap_chr, @str1, @str2, $num_opt, @file, @dir,
$char_opt, $char_opt_given, $num_opt_given,
@char_options, @file, $original_dir, @read_files, %array_msf, %array_jp,
$jp_file, $error_rate, $id_compos, @dir, @names, $name, $name_found,
@outref, %sequence, %secondary,%solvent_access, %DSSP, %percent_accessibility,
$name_found,$type_seq, $type_secon, $type_sol, $type_DSSP, $type_acc
);
##################################################
##### Start of general argument handling ######
##################################################
for($k=0; $k< @_ ;$k++){
if( !ref($_[$k]) ){
if($_[$k]=~ /^[\-]*(\w)$/){
$char_opt .= "\,$1";
$char_opt_given =1;
} #from below ============== SECONDARY
}elsif(/^secondary structure and phi angle/){ $type_secon = 1;
}elsif(($type_secon ==1)&&(/^([\w\-]+)[\*]*$/)){
$secondary{$name}.=$1; #from below============= SOLVENT ACCESSIBILITY
}elsif(/^solvent accessibility/){ $type_sol = 1;
}elsif(($type_sol ==1)&&(/^([\w\-]+)[\*]*$/)){
$solvent_access{$name}.=$1; #from below========= DSSP
}elsif(/^DSSP/){ $type_DSSP = 1;
}elsif(($type_DSSP ==1)&&(/^([\w\-]+)[\*]*$/)){
$DSSP{$name}.=$1; #from below=================== PERCENTAGE ACCESSIBILITY
}elsif(/^percentage accessibility/){ $type_acc = 1;
}elsif(($type_acc ==1)&&(/^([\w\-]+)[\*]*$/)){
$percent_accessibility{$name}.=$1; } }
push(@outref,\%sequence,\%secondary,\%solvent_access,\%DSSP,\%percent_accessibility);
} }
if( ($char_opt =~ /s/i) || ( @outref == 1 ) ){
return(\%sequence); }
elsif( @outref > 1){ return(@outref); } # <-- contains (\%sequence,\%secondary,....)
}
#________________________________________________________________________
# Title : open_hlx_files
# Usage :
# Function :
chdir($original_dir);
if($realfile1 =~/(\d+\-$no\.msf)$/){
@dir=split(/\//, $realfile1);
$dir=$dir[($#dir-1)]; # where am I ?
# $jp_file = $original_dir.'/'.$dir.'.jp';
# %array_msf =&open_msf_files($realfile1);
# %array_jp =&open_jp_files ($jp_file);
# $array_ref_msf = \%array_msf;
# $array_ref_jp = \%array_jp;
# $error_rate =&get_posi_shift_hash($array_ref_msf, $array_ref_jp);
# $id_compos =&amino_acid_compos_id_percent($array_ref_jp);
# push(@rates_accumu,$error_rate);
# push(@compos_id,$id_compos);
}
}
else
{
next;
}
}
}
}
}
$sum = ($id_counter + $gap_counter + $non_equal_counter);
if ($sum != ($larger+1)){
print "\n There is something wrong in getting homology in get_pair_homol \n";
&caller_info;
}
\$id_counter; # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title : get_percent_homol_arr
# Usage : $homology_out = ${&get_pair_homol(\@any_array_of_2_elem)};= @ar=(ABCDE..., CDEGA..)
# Function : get pair wise seq. identity of any two strings, outputs a scalar (%)
# Example :
# Warning : reliable, but input seq. strings shouldn't contain spaces.
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_percent_homol_arr{
my(@input)=@{$_[0]};
$input[0] =~ tr/a-z/A-Z/; # capitalizing.
$input[1] =~ tr/a-z/A-Z/; # capitalizing.
my(@string1)= split(//,$input[0]);
my(@string2)= split(//,$input[1]);
if (($#string1 == -1) || ($#string2 == -1)){
print "\n One of the string is empty O.K. ? \n";
}
my($larger)= &max($#string1, $#string2);
my($id_counter, $gap_counter, $non_equal_counter, $sum,$percent_homol)=0;
for ($i = 0; $i<=$larger; $i++){
if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
$gap_counter+=1;
}elsif ($string1[$i] eq $string2[$i]){
$id_counter +=1;
}else{
$non_equal_counter += 1;
}
}
$sum = ($id_counter + $gap_counter + $non_equal_counter);
if ($sum != ($larger+1)){
print "\n There is something wrong in getting homology in get_pair_homol \n";
&caller_info;
}else{
$percent_homol=($id_counter/$sum)*100;
}
return(\$percent_homol); # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title : get_pair_homol_hash
# Usage : $homology_out = & get_pair_homol (%any_hash); , eg) %hash = (name1, ABCDE..., name2, CDEGA..)
# Function : get pair wise seq. identity as a scalar count
# Example :
# Warning : reliable, but input seq. strings shouldn't contain spaces.
# Keywords :
# Options :
}
}
my($sum) = ($id_counter + $gap_counter + $non_equal_counter);
if ($sum != ($larger+1)){
print "\n There is something wrong in getting homology in get_pair_homol \n";
&caller_info;
}
return ($id_counter); # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title : get_percent_homo_hash
# Usage : $homology_out = &get_pair_homol_hash(%any_hash); , eg) %hash = (name1, ABCDE..., name2, CDEGA..)
# Function : get pair wise seq. identity(%) of any two strings put in as a hash
# Example :
# Warning : reliable, but input seq. strings shouldn't contain spaces.
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_percent_homo_hash{
my(%input)=@_;
&hash_chk(\%input);
my(@keys_input)= keys (%input);
my(@values_input) = values (%input);
$values_input[0] =~ tr/a-z/A-Z/; # capitalizing.
$values_input[1] =~ tr/a-z/A-Z/; # capitalizing.
my(@string1)= split(//,$values_input[0]);
my(@string2)= split(//,$values_input[1]);
if (($#string1 == -1) || ($#string2 == -1)){
print "\n One of the string is empty O.K. ? \n";
}
my($larger)= &max($#string1, $#string2);
my($id_counter, $gap_counter, $non_equal_counter,$percent_homol,)=0;
for ($i = 0; $i<=$larger; $i++){
if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
$gap_counter+=1;
}elsif ($string1[$i] eq $string2[$i]){
$id_counter +=1;
}else{
$non_equal_counter += 1;
}
}
my($sum) = ($id_counter + $gap_counter + $non_equal_counter);
if ($sum != ($larger+1)){
print "\n There is something wrong in getting homology in get_pair_homol \n";
&caller_info;
}else{
$percent_homol=($id_counter/$sum)*100;
}
return ($percent_homol);
}
#________________________________________________________________________
# Title : file_size
# Usage : $outputfilesize = &file_size($input_file_name);
# Function : returns the size of any single testing file
# Example :
# Warning : Q is for quality of this sub. This can't be wrong.
# Keywords :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub file_size { my($infile)=$_[0];
if ( $size=(-s "$infile")){ return $size; }
}
#________________________________________________________________________
# Title : seq_comp_percent2
# Usage : @outarray = &seq_comp_percent2(@any_input_string_array);
# Function : get string seq COMPOSITION identities(a to z). gets array
# of strings and outs array of % numbers
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub seq_comp_percent2{ # simple and basic seq. id. eg. ABC on ABCABC is 50 % identical.
my(@input)=@_;
my(@array_of_ids2, $id2, @char1, @char2);
&array_chk(sort @input);
my($longest_str_size) = &get_long_str_size (@input), "\n";
my($shortest_str_size) = &get_short_str_size(@input), "\n";
print "longest_str_size",$longest_str_size;
print "shortest_str_size",$shortest_str_size;
if (($longest_str_size/$shortest_str_size) > 4){
print "\n The shortest string is less than 1/4 of the longest\n";
print " This is quite meaningless, but will go on\n";
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub cls{ my($cls) = `clear`;
print $cls;
}
#________________________________________________________________________
# Title : seq_comp_percent1
# Usage : @outarray = &seq_comp_percent1(@any_input_string_array);
# Function : get string seq identities(a to z). gets array of strings and outs array of % numbers
# Example :
# Warning :
# Keywords :
# Options :
# Returns : one ref. of an array
# Argument : one ref. of an array
# Category :
# Version :
#--------------------------------------------------------------------
sub seq_comp_percent1{ # this is affected by seq. length
my(@input)=@{$_[0]};
my(@array_of_ids1, $id1, @char1, @char2);
&array_chk(\@input);
@input = sort (@input);
$longest_str_size = &get_long_str_size (@input), "\n";
$shortest_str_size = &get_short_str_size(@input), "\n";
if (($longest_str_size/$shortest_str_size) > 4){
print "\n The shortest string is less than 1/4 of the longest\n";
print " This is quite meaningless, but will go on\n";
}
for $key1 (sort keys %hash1){
for $key2 (sort keys %hash2){
if ($key1 eq $key2){
$no_of_same = &min($hash1{$key1},$hash2{$key2});
$sum_of_same += $no_of_same;
last;
}
}
}
$identity = $sum_of_same*2/(&sum_array(@num_char1,@num_char2))*100;
# print "percent iden = ", $identity, "\n";
}
#________________________________________________________________________
# Title : get_id_among_2_2
# Usage : $id = &get_id_among_2(*charcount1, *charcount2) <- hashes
# Function : gets the % id of any two sequences, returns in 100.0% format.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
$no_of_same = &min($hash1{$key1},$hash2{$key2});
$sum_of_same += $no_of_same;
last;
}
}
}
$seq1=&sum_array(@num_char1);
$seq2=&sum_array(@num_char2);
$longer_seq = &max($seq1, $seq2);
$identity = $sum_of_same/$longer_seq*100;
#print "percent iden = ", $identity, "\n";
}
#________________________________________________________________________
# Title : array_average
# Usage : $output = &array_average(\@any_array);
# Function : (the same as average_array)
# Example :
# Warning : If divided by 0, it will automatically replace it with 1
# Keywords : get_array_average, av_array, average_array, get_average_array
# average_of_array, average_array
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(%hash, @pdbd_seqs, @superfamily, @members, @files, $over_write, $msp_file_gz,
$msp_seq_file_dir, $msp_file, $msp_file_long, $msp_file_gz_long,
$pdbd_seq_long, $final_merged_ISL_fasta,$pdbg_file, $pwd , $range_start,
$range_stop, @files_NOT_processed);
my $source_db_fasta=$ENV{'NRDB_FASTA'}; ## general default
my $evalue_thresh=0.001; # default
my $score_thresh=70; # default
my $range_thresh=10;
my $percent_id_thresh=0.95;
print "\n# (i) Running sub of: make_intermediate_sequence_library\n";
if($vars{'FASTA_DB'}=~/\S+/){ $source_db_fasta=$vars{'FASTA_DB'} }
if($vars{'MSP_DIR'}=~/\S+/){ $msp_seq_file_dir=$vars{'MSP_DIR'} }
if($vars{'p'}=~/\S+/){ $pdbg_file=$vars{'p'}; print "\n# (i) $vars{'p'} is given \n"; } ## PDBG file input
if($vars{'m'}=~/\S+/){ $msp_seq_file_dir=$vars{'m'} }
if($vars{'DB'}=~/\S+/){ $source_db_fasta=$vars{'DB'} }
if($vars{'E'}=~/\S+/){ $evalue_thresh=$vars{'E'} }
if($vars{'e'}=~/\S+/){ $evalue_thresh=$vars{'e'} }
if($vars{'s'}=~/\S+/){ $score_thresh=$vars{'s'} }
next;
}
if(@pdbd_seqs){ print "\n# (i) \@pdbd_seqs for $superfamily is: @pdbd_seqs\n";
}else{ print "\n# (E) \@pdbd_seqs is empty, strange, error in making $out_fasta_file \n\n";
}
for($j=0; $j< @pdbd_seqs; $j++){
$pdbd_seq=$pdbd_seqs[$j];
$pdbd_seq_long="pdb\_$pdbd_seqs[$j]";
my (@msp_content, $evalue, $sub_dir, $percentage_id, $range_length);
if($pdbd_seq=~/^ *$/){ next; }
$msp_file="$pdbd_seq\.msp";
$msp_file_gz="$pdbd_seq\.msp\.gz";
$msp_file_long="pdb\_$pdbd_seq\.msp"; ## this is to handle Sarah's pdb_ prefixed pdbd files
$msp_file_gz_long="pdb\_$pdbd_seq\.msp\.gz";
print "# (i) I am processing $msp_file <- subroutine:make_intermediate_sequence_library\n";
if($msp_file_gz=~/^([dec])\S/ or $msp_file_long=~/pdb_([dec])\S/){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`
print "\n# (i) Now I have finished reading in ONE msp file for $pdbd_seq ($superfamily) into \@msp_content";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`
# Processing each MSP line
#_______________________________________________________________________________________
my ($identical_one_added, $interm_seq);
for($k=0; $k< @msp_content; $k++){ ## NEW MSP format
if($msp_content[$k]=~/^ *(\S+) +(\S+) +(\S+) +\d+ +\d+ +[pdb_]*$pdbd_seq[_\d+\-\d+]* +(\d+) +(\d+) +(\S+)/){ # [pdb_]* is for Sarah's pdb_ prefix
#if($pdbd_seq eq $6){ next } ## this is to exclude the same seq match
$evalue=$2; $percentage_id=$3;
$range_length=$5-$4; $score =$1; $interm_seq=$6; $range_start=$4, $range_stop=$5;
if($interm_seq=~/(\S+)_\d+\-\d+/){ $interm_seq=$1; }
if($evalue < $evalue_thresh and $range_length > $range_thresh and $score > $score_thresh){
if($percentage_id <= $percent_id_thresh){ ## to keep one pdb seq ($percentage_id=1)
unless($interm_seq=~/^d\d\S/){ # to remove entries like: d1abc_10-20_d2acb__ (two pdb seqs)
$interm_hash{$superfamily} .=" $interm_seq\_$range_start\-$range_stop\_$pdbd_seq";
}
}elsif($percentage_id ==1 and $identical_one_added < 1){ ## this is to prevent more than 1 100% id seq
$interm_hash{$superfamily} .=" $interm_seq\_$range_start\-$range_stop\_$pdbd_seq";
$identical_one_added++;
}
}
}elsif($msp_content[$k]=~/^ *(\d+) +(\S+) +\d+ +\d+ +$pdbd_seq +(\d+) +(\d+) +(\S+)/){
$evalue=$2; $score=$1;
$range_length=$4-$3; $interm_seq=$5;
if($evalue < $evalue_thresh and $range_length > $range_thresh
and $score > $score_thresh){
$interm_hash{$superfamily} .=" $interm_seq\_$3\-$4\_$pdbd_seq";
( run in 0.613 second using v1.01-cache-2.11-cpan-709fd43a63f )