Bioinf

 view release on metacpan or  search on metacpan

Bioinf.pl  view on Meta::CPAN

# Returns   : [$av_correct, $num_enq_seq]
# Tips      :
# Argument  :
# Todo      :
# Author    : A Scientist
# Category  :
# Version   : 2.2
#-----------------------------------------------------------------------------
sub get_isearch_result_stat{
	my (@keys, $num_enq_seq, @pdbg_seqs_ori, $c, $d, $i, %correct_pairs,
	    $sum_correct, $sum_false, $match_seq, $percent_correct, $correct, @correct,
	    $av_correct, $av_false, $actual_e_value, $correct_matched,
	    %correcting_pairs, @correcting_pairs, %correct);

	my %seqs=%{$_[0]};
	my @pdbg_seqs=@{$_[1]};
	my $evalue=${$_[2]};
	my $pdbg_base=${$_[3]} || $ARGV[3];
	my $E_mult_factor1=${$_[4]};
	my $E_mult_factor2=${$_[4]};
        if(ref($_[5])){  $leng_thresh =${$_[5]}  }else{ $leng_thresh=$_[5]; }

Bioinf.pl  view on Meta::CPAN

                     }
                }
                if($correct_matched !=1){
                     $false_positive++;
                     unless( $correct{$sorted} ){
                          $correct_group{$base} .="Nomolog: $sorted $base  $msp_0{$sorted}\n";
                     }
                     $correct{$sorted} = "Nomolog: $base  $msp_0{$sorted}";
                }
	   }
           if(@match_seqs == 0){ @match_seqs=1; $percent_correct=0; }
	   $sum_correct += $correct;
	   $sum_false   += $false_positive;
	}
	$av_correct = $sum_correct/$num_enq_seq;
	$av_false   = $sum_false  /($num_enq_seq);

	#### $actual_e_value becomes whatever $E_mult_factor1 defined ~~~~~~~~~~~~
	if($E_mult_factor1 != 1){
	   $actual_e_value=$evalue * $E_mult_factor1;
	}elsif($E_mult_factor2 != 1){

Bioinf.pl  view on Meta::CPAN

    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my($merge, $verbose, $sat_file, $length_thresh, $factor, $indup, $indup_percent,
       $score, @temp_show_sub, $optimize, $file, $evalue, $over_write, $din_dom,
       $sum_seq_num, $base_1, $output_clu_file, $short_region, $large_region,
       $average_region, $dynamic_factor, @sub_clustering_clu_files);

    $factor=7; # default factor is 7 for 70%
    $length_thresh=30;

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Dealing with options
    #_________________________________________

Bioinf.pl  view on Meta::CPAN

    }if($char_opt=~/L/){       $large_region  ='L';
    }if($char_opt=~/A/){       $average_region='A';
    }if($vars{'T'}=~/\d+/){    $length_thresh= $vars{'T'};
    }if($vars{'l'}=~/\d+/){    $length_thresh= $vars{'l'}; ## synonym of 't'
    }if($vars{'f'}=~/\S+/){    $factor= $vars{'f'};
    }if($vars{'s'}=~/\d+/){    $score = $vars{'s'};
    }if($vars{'e'}=~/\d+/){    $evalue= $vars{'e'};
    }if($vars{'E'}=~/\d+/){    $evalue= $vars{'E'}; # synonym of e
    }

   $percent_fac=$factor*10;

   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   # (0) When one file input was given (yes, divclus can handle multiple files, Sarah!)
   #________________________________________________________________________________
   if(@file == 1){  #<=== @file has xxxx.msp, yyyy.msp  zzzz.msp ....,
        print "\n# (1) divide_clusters: One single file was given=> \"@file\"\n" if $verbose;
        $file=$file[0];
        $base_1=${&get_base_names($file)};

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Bioinf.pl  view on Meta::CPAN

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # (4) This is critical seqlet merging step
        #________________________________________________________________________________
        @out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, $dynamic_factor,
               "f=$factor", $short_region, $large_region, $average_region, $optimize)};

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # (5) This is showing the result in clu file format
        #________________________________________________________________________________
        @temp_show_sub=&show_subclusterings(\@out, $file, $sat_file, $dindom, $indup,
						   "e=$evalue", "p=$percent_fac", "f=$factor" );
        $good_bad       = $temp_show_sub[0];
        $indup_c        = $temp_show_sub[1];
        $sum_seq_num   += $temp_show_sub[2];
        push(@sub_clustering_out_files, @{$temp_show_sub[3]});

        if($good_bad==1){      push(@good, $file);
        }else{                 push(@bad, $file);       }

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # (6) Final write up stage (unecessary)

Bioinf.pl  view on Meta::CPAN

            }

            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            #  (4) Clustering the sets of merged seqlets => CORE algorithm
            #____________________________________________________________

            @out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, "f=$factor", $optimize, $dynamic_factor,
                   $short_region, $large_region, $average_region)};

            @temp_show_sub=&show_subclusterings(\@out, $big_msp_file, $sat_file, $dindom, $indup,
                                                    "e=$evalue", "p=$percent_fac", "f=$factor");
                        $good_bad       = $temp_show_sub[0];
                        $indup_c        = $temp_show_sub[1];
                        $sum_seq_num   += $temp_show_sub[2];
            push(@sub_clustering_out_files, @{$temp_show_sub[3]});

            if($good_bad==1){          push(@good, $big_msp_file);
            }else{         push(@bad, $big_msp_file);       }

          }
          #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`

Bioinf.pl  view on Meta::CPAN

	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
        my ($max_size, $sat_file_name, $clu_file_name,
        $ori_cluster_size, $ori_cluster_num, $good_bad, @keys, $percentage_fac,
        $indup, @sizes, $sum_seq_num, $indup_percent, $indup_count, %tem4,
        @sub_clustering_out_files);  # clusall_1e-5_clu_14-324_ss.sat
	my @out=@{$array[0]};
	$indup_count=0;

	if($char_opt=~/d/){	    $dindom=1;	}
	if($char_opt=~/i/){		$indup=1;	}
	if($vars{'f'}=~/\S+/){     $factor= $vars{'f'}; }
	if($vars{'p'}=~/\d+/){ $percentage_fac= int($vars{'p'}); }
	if($vars{'s'}=~/\d+/){	   $score = $vars{'s'};	}
	if($vars{'e'}=~/\d+/){	   $evalue= $vars{'e'};	}

	#print "\n# (1) show_subclusterings : \@file has : @file\n";
	if( $file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.msp/  or
		$file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.sat/   ){
		 $ori_cluster_size=$2;
		 $ori_cluster_num =$3;
		 $base=$1;
		 $sat_file_name="$base\.sat";

Bioinf.pl  view on Meta::CPAN

		         $ori_cluster_size,
		         $dindom,
		         $clu_file_name,
								 \%tem3, \%tem4,
								 $indup, );

           #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           # Local subroutine
           #_______________________________________________________________
	   sub print_summary_for_divclus{ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
               my(@keys, $indup_count, $x, $m, $percentage_fac);
               my $count=$_[0]; # count of cluster
	       my %tem2=%{$_[1]};	my $num_seq=@keys=sort keys %tem2;
	       my %tem=%{$_[2]};	my $ori_cluster_num=$_[3];
	       my $new_clus_NAME=$ori_cluster_num.'0'.$count.'0'.$num_seq;
	       my $ori_cluster_size=$_[4];
	       my $dindom=$_[5];	my %tem3=%{$_[7]};
	       my $indup=$_[9];	my (%internal_dup);
	       my %tem4=%{$_[8]};
               #~~~~~~~~~~ Domain Inside Domain ~~~~~~~~~~~~~~~~~
	       if($dindom){

Bioinf.pl  view on Meta::CPAN

                                     for($m=0; $m< @dup_keys; $m++){
                                             printf ("%-19s=> %s\n", $dup_keys[$m], $internal_dup{ $dup_keys[$m] } );
                                     }
                             }
                    }
                 }

                #~~~~~~~~~~ Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~
                print  CLU  "Cluster size $num_seq\n";
                                        printf CLU ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
                                          $new_clus_NAME, $evalue, $factor, $percentage_fac,
                                          $ori_cluster_size, $num_seq, $ori_cluster_num);
                print       "Cluster size $num_seq\n";
                printf     ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
                              $new_clus_NAME, $evalue, $factor, $percentage_fac,
                              $ori_cluster_size, $num_seq, $ori_cluster_num);
                for($x=0; $x <@keys; $x++){
                   printf CLU ("   %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
                               $num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
                   printf ("   %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
                          $num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
                }
                return($indup_count);
	   }
	}

Bioinf.pl  view on Meta::CPAN

#             1cdg_6taa      -------442---------------2222-----------------000-
#             1cdg_2aaa      -------222---------------2222-----------------000-
#
#             <Final output>
#             2aaa_6taa      0%
#             1cdg_6taa      67%
#             1cdg_2aaa      67%
#
# Warning   :
# Keywords  :
# Options   : 'p' or 'P' for percentage term(default)
#             'r' or 'R' for ratio term (0.0 - 1.0), where 1 means all the
#              segments were wrongly aligned.
#             's' or 'S' for Shift rate (it actually caculates the position shift
#              rate for the secondary structure segment.
#             'h' or 'H' for position Shift rate (it actually caculates the position
#              shift rate for helical segments). If this is the only option, it
#              will show the default percentage term rate for helical segments.
#              If used with 'r', it will give you ratio (0.0 - 1.0) for helical
#              segment. If used with 's' option, it will give you position shift
#              rate for only helical segments.
#             'e' or 'E' for position Shift rate (it actually caculates the position
#              shift rate for beta segments). If this is the only option, it will
#              show the default percentage term rate for beta segments. If used
#              with 'r', it will give you ratio (0.0 - 1.0) for beta. If used
#              with 's' option, it will give you position shift rate for only
#              beta segments.
# Returns   :
# Argument  : Two references of hashes. One for error rate the other for sec.
#             assignment.
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub get_segment_shift_rate{

Bioinf.pl  view on Meta::CPAN

# Title     : get_wrong_segment_rate
# Usage     : print_seq_in_block( &get_wrong_segment_rate(\%superposed_hash) );
# Function  : Treats the segment as one single big error.
#             calculates the wrong segment number compared to the correct ones.
# Example   : <input example> hash of 3 keys and values.
#             2aaa_6taa      -------00000---------00000000----0000-------00000-
#             1cdg_6taa      -------442---------------2222-----------------000-
#             1cdg_2aaa      -------222---------------2222-----------------000-
#
#             In the above there are two segments wrong in 3 segment blocks = 2/3
#             <output example> hash of 3 percentage rates.
#
#             2aaa_6taa      0 %
#             1cdg_6taa      66.6666666666667 %
#             1cdg_2aaa      66.6666666666667 %
#
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :

Bioinf.pl  view on Meta::CPAN

#                                                by 9(occurance) in the above
#             for '0' of the first hash, so (0, 2.222,  1, 2.1666,  2, 2.4 )
#             Average is the average of numbers
#             average value in 0-9 scale (or 0-100 with 'p' option)
#             So, if there are
#                  seq1 00111110000,   The 'a' value of 0 and 1 as in the seq2
#                  seq2 33000040000    is 0-> 6/6, 1-> 4/5, while the 'n'
#                                        calc would be, 0-> 6 (60%), 1-> 4(40%)
#
# Argument  : (\%hash1, \%hash2) or optionally (\%hash1, \%hash2, ['n', 'i', 'p', 'a'])
#             'n' => normalizing, 'p' => percentage out, 'i' => make int out, 'a'=> averaged
# Category  :
# Version   : 1.2
#--------------------------------------------------------------------
sub tally_2_hashes{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);

Bioinf.pl  view on Meta::CPAN

	 if( $char_opt=~/m/i){  $minus_whole_cs  = 'm'}

	@input = @{$array[0]};
	if(defined(${$_[2]})){ $base_level =${$_[2]}; }
	if(defined(${$_[3]})){ $scale  =${$_[3]}; }

	for ($t=0; $t< @input; $t++){ $length=length($input[$t]) if(length($input[$t])>$length);}
	if ($length < $window_size){  $window_size = $length;   }
	 #___________ getting ratio for the whole sequence ___________

	$compos_whole_seq=${&main::compos_id_percent_array(\@input)};  ## for whole composition rate
	$seq_id_whole_seq=${&main::seq_id_percent_array(\@input)};
	print "\nComposition ID of the alignment:  $compos_whole_seq\%\n";
	print "Sequence    ID of the alignment:  $seq_id_whole_seq\%\n";
	if ($seq_id_whole_seq == 0){  $ratio_whole_seq =0; }
	else{  $ratio_whole_seq = $compos_whole_seq/$seq_id_whole_seq;  }
	print "Composition and Sequ.  ID Ratio:   $ratio_whole_seq\n";

	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	##########   Initial Window size setting      ##############################
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	if   (  $ratio_whole_seq >9 ){ $window_size= 28; } # $ratio_whole_seq is

Bioinf.pl  view on Meta::CPAN

	 ##      Go back if the window size is too small due to gaps        ##
	 #####################################################################
	 if( (length($win_no_gap1) < $ori_win_size)&&($variable_win_size ==1) )
	 {
		$window_size+=1; $w--; next;
	 }

	 #####################################################################
	 ##      Getting Compos and Seq ids                                 ##
	 #####################################################################
	 $compos_id=${&main::compos_id_percent_array(\@array_of_2_seq)};
	 $seq_id   =${&main::seq_id_percent_array(\@array_of_2_seq)};


	 #####################################################################
	 ####    Go back if the Seq id is bigger than Compos id        #######
	 #####################################################################
	 if(($variable_win_size eq 'v') && ($seq_id >  $compos_id))
	 {
		$window_size+=1;  $w--;   next;
	 }

Bioinf.pl  view on Meta::CPAN

	my(@ratio_array, @array_of_2_seq, $seq_id, $offset, $half_of_w_size, $t, $length, $w,
	  $compos_id, $seq_id, $window_2, $window_1, $compos_whole_seq, $seq_id_whole_seq,
	  $ratio_whole_seq, $win_rate_div_by_whole_rate, $normalizing_factor, $lowest_rate );

	for ($t=0; $t < @input; $t++){
	 $length=length($input[$t]) if (length($input[$t])>$length);   }

	if ($length < $window_size){  $window_size = $length;   }

	  #___________ getting ratio for the whole sequence ___________
	$compos_whole_seq=${&compos_id_percent_array(\@input)};
	$seq_id_whole_seq=${&seq_id_percent_array(\@input)};
	if ($seq_id_whole_seq == 0){  $ratio_whole_seq =$compos_whole_seq/10; }
	else{  $ratio_whole_seq =$compos_whole_seq/$seq_id_whole_seq;  }

	  #___________ getting ratio for each window sequence ___________
	for ($w=0; $w < $length; $w++){
	 $offset = $w - int($window_size/2);  # $offset starts from -5 when window_size is 10.
	 $offset=0 if ($offset < 0);
	 $window_1=substr($input[0], $offset, $window_size);  # window_1 is one segment
	 $window_2=substr($input[1], $offset, $window_size);  # of defined length(size)
	 @array_of_2_seq=($window_1, $window_2); # making an array like this = ('ABCDE', 'BDESA')
	 $compos_id=${&compos_id_percent_array(\@array_of_2_seq)};
	 $seq_id   =${&seq_id_percent_array(\@array_of_2_seq)};
	#print "\n offset = $offset Wind1 = $window_1  Wind2 = $window_2 ";
	#print " Compos1 = $compos_id  Seqid = $seq_id \n";

	 #______  Handle special case when $seqid is zero > the rate becomes $compos_id/10 ______
	 if (($seq_id == 0) && ($compos_id != 0)){ $ratio_compos_vs_seqid = $compos_id/10;   }
	 elsif(($seq_id == $compos_id)&&($seq_id == 0)){ $ratio_compos_vs_seqid = 0;}
	 elsif(($seq_id == $compos_id)&&($seq_id == 100)){ $ratio_compos_vs_seqid = 0;}
	 else{ $ratio_compos_vs_seqid=($compos_id/$seq_id); }
	 push(@ratio_array, $ratio_compos_vs_seqid);  }

Bioinf.pl  view on Meta::CPAN

#
# Function  : gets the numbers of occurances for 1, 2, 3 ... position shifts.
#             If hash is given, it only looks at the values.
#             If multiple string, array, hash or combinations of these
#              are given, it will add up to one single result
# Example   :
# Warning   :
# Keywords  : composition of chars, composition table making,
#             make_composition, make composition table
#             occurances_of_char, get_char_occurances, occurances
#             get_percentage_occurances_of_char, percentage_occurances_of_char
# Options   : 'p' for percentage output of the char among others
#             'n' for NO name option when HASH input is given
# Returns   : one ref. of hash  (a =>5, b=>6, c=>4,,,,,)
# Argument  : one ref. of hash (seq1 alsdfjlsj
#                               seq2 asldfjsld
#                               seq3 owiurouou);
# Category  :
# Version   : 1.4
#--------------------------------------------------------------------
sub  get_occurances_of_char{
	my ($i, %H, $no_name, %out, $N,@splited, $val,$percentage_out,
	 $split, $sum);
	for($i=0; $i< @_; $i++){
	if($_[$i]=~/^[\-]?p$/i){
	   $percentage_out=1;   splice(@_, $i, 1); 	   $i--;
	}elsif($_[$i]=~/^[\-]?n$/i){
	   $no_name=1;	        splice(@_, $i, 1);	   $i--;
	}
	}

	for($i=0; $i< @_; $i++){
	if(  ref($_[$i]) eq 'HASH'){
	  my %H=%{$_[$i]};
	  my @names=keys %H;
	  for $key (@names){

Bioinf.pl  view on Meta::CPAN

	  @splited=@{$_[$i]};
	  for $split (@splited){  $out{$split}++; $sum++ }
	}elsif(ref($_[$i]) eq 'SCALAR'){
	   @splited = split(//, ${$_[$i]});
	   for $split (@splited){  $out{$split}++; $sum++ }
	}elsif( !(ref($_[$i])) ){
	   @splited = split(//, $_[$i]);
	   for $split (@splited){  $out{$split}++; $sum++ }
	}
	}
	if($percentage_out==1){
	 my @keys=keys %out;
	 my %percent;
	 for($i=0; $i< @keys; $i++){
		$percent{$keys[$i]} = $out{$keys[$i]}/$sum*100;
	 }
	 return(\%percent);
	}else{
	 return(\%out);
	}
}


#________________________________________________________________________
# Title     : make_composition_table
# Usage     : %occurances=%{&make_compos_table(\%key_and_value_for_seq)};
# Function  : gets the numbers of occurances for 1, 2, 3 ... position shifts.

Bioinf.pl  view on Meta::CPAN

    $aa{"c"} = "cys";  $aa{"n"} = "asn";  $aa{"e"} = "glu";  $aa{"q"} = "gln";
    $aa{"f"} = "phe";  $aa{"r"} = "arg";  $aa{"g"} = "gly";  $aa{"s"} = "ser";
    $aa{"h"} = "his";  $aa{"t"} = "thr";  $aa{"i"} = "ile";  $aa{"v"} = "val";
    $aa{"k"} = "lys";  $aa{"w"} = "trp";  $aa{"l"} = "leu";  $aa{"y"} = "tyr";
    return(\%aa);
}



#________________________________________________________________________
# Title     : amino_acid_compos_id_percent
# Usage     : $percent = &amino_acid_compos_id_percent (%any_hash_with_sequences);
#             The way identity(composition) is derived is;
#
# Function  : gets amino acid composition identity of any given
#             number of sequences(at least 2).
# Example   :
# Warning   :
# Keywords  : get_amino_acid_composition, get_protein_composition, composition
# Options   :
# Argument  : hash of at least 2 sequences.
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub amino_acid_compos_id_percent{
	my(%input)= %{$_[0]};
	my(@names)=keys (%input);
	my(@temp, $i, $j, $iden, @all_pairs_id, $iden_sum);
	my(%compos_table1, %compos_table2, $final_iden, $larger);
	for ($i=0; $i < @names; $i ++){  # putting seqs in arrays.
	 $input{$names[$i]}=~ tr/a-z/A-Z/;
	 $input{$names[$i]}=~ s/\W//g;
	 @{"string$i"}= split('', $input{$names[$i]});
	 $larger = @{"string$i"} if @{"string$i"} > $larger;
	}

Bioinf.pl  view on Meta::CPAN

		push (@all_pairs_id,  $iden );
	 }
	}
	for $iden (@all_pairs_id){  $iden_sum+= $iden;  }
	if(@all_pairs_id == 0){ @all_pairs_id =1; }
	$final_iden=$iden_sum/@all_pairs_id;
	\$final_iden;
}

#________________________________________________________________________
# Title     : seq_id_percent_array  (more than 2 elements array)
# Usage     : $percent = &seq_id_percent_array(@any_array_sequences);
#             The way identity(pairwise) is derived is;
#
# Function  : produces amino acid composition identity of any given number of sequences.
# Example   :
# Warning   : This can handle 'common gaps' in the sequences
# Keywords  : get_percent_composition_identity, seq_composition_identity,
#             percent_sequence_composition_id
#
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub seq_id_percent_array{
	my(@input, $denominator,@all_pairs_id, $percent_id);
	my($largest,$p,$i,$j,$k,$iden_residue_num,$iden,@temp,$iden_sum,$gap_num,$final_iden);
	for($d=0; $d<@_; $d++){
		if(ref($_[$d]) eq 'ARRAY'){ @input=@{$_[$d]}; }
		elsif( (ref($_[$d]) eq 'SCALAR') &&( ${$_[$d]}=~/^[aA]/) ){ $average_len_opt =1 }
		elsif( !(ref($_[$d])) && ( $_[$d] =~/^[aA]/) ){ $average_len_opt =1 } }
	if ((@input== 1)||( @input== 0)){
		print "\n\n \" $0 \"  requires at least 2 sequences\n\n";
		print "\n Abnormally dying at seq_id_percent_array in $0 \n\n";
		print chr(7); exit;}
	$shortest=length($input[0]);
	my($sans_gap_seq, $length_sum, $average_seq_len);
	for($p=0; $p < @input; $p++){
		$input[$p]=~ tr/a-z/A-Z/;
		$sans_gap_seq=$input[$p];
		$sans_gap_seq=~s/\W//g;
		$input[$p]=~ s/\W/./g;
		(@{"string$p"})=split('', $input[$p]);
		$largest = length($input[$p]) if length($input[$p]) > $largest;

Bioinf.pl  view on Meta::CPAN

	$average_seq_len = $length_sum/@input;
	for($i=0; $i< @input; $i++){
		for($j=$i+1; $j< @input; $j++){
			for ($k=0; $k <  $largest; $k ++){  # getting composition tables for two seqs.
				if ((${"string$i"}[$k] !~ /\W/)&&(${"string$i"}[$k] eq ${"string$j"}[$k])){
					$iden_residue_num++; }
				elsif((${"string$i"}[$k] =~ /\W/)&&(${"string$i"}[$k]=~ /\W/)){ $gap_num++; }}
			if( $average_len_opt == 1){ $denominator = $average_seq_len; }
			else{ $denominator = $shortest; }
			if($denominator == 0){ $denominator=1; }  # in the above it is 50% rather than 0.07%
			$percent_id=($iden_residue_num/($denominator))*100;
			push(@all_pairs_id, $percent_id);
			undef ($iden_residue_num, $gap_num);
		}
	}
	for (@all_pairs_id){  $iden_sum+=$_;    }
	$final_iden=$iden_sum/($#all_pairs_id+1);
	return( \$final_iden );
}

#________________________________________________________________________
# Title     : compos_id_percent_array  (more than 2 elements array)
# Usage     : $percent = &compos_id_percent_array(@any_array_sequences);
#             The way identity(composition) is derived is;
# Function  : produces amino acid composition identity of any given number of sequences.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub compos_id_percent_array{
	 my(@input)=@{$_[0]};
	 my($largest,$iden,@temp,$iden_sum,$final_iden, @all_pairs_id);
	for($i=0; $i<=$#input; $i++){  $input[$i]=~ tr/a-z/A-Z/;  $input[$i]=~ s/[\.\-\s]//g;
		@temp = split('', $input[$i]);  (@{"string$i"})= @temp;
		$largest = @{"string$i"} if @{"string$i"} > $largest;    }
	for($i=0; $i< @input; $i++){ #_________ permutating ___________
		for($j=$i+1; $j<=$#input; $j++){
			for ($k=0; $k <= $largest; $k ++){  # getting composition tables for two seqs.
				$compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);
				$compos_table2{${"string$j"}[$k]}++ if (${"string$j"}[$k] =~ /\w/);   }

Bioinf.pl  view on Meta::CPAN

					 sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
				 $sum_of_the_common_residue_no += $common;     }
			 $compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100;   }
	  \$compos_id;
	}
	#-----------------------------------------------------
	return ( \$final_iden ); # final identity for any given set of strings(seq).
}

#________________________________________________________________________________
# Title     : compos_id_percent_hash  (synonym of amino_acid_compos_id_percent)
# Usage     : $percent = &compos_id_percent_hash(%any_hash_with_sequences);
#             The way identity(composition) is derived is;
#
# Function  : gets amino acid composition identity of any given number of sequences.
# Example   :
# Warning   :
# Keywords  : get_amino_acid_composiiton
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#------------------------------------------------------------------------------
sub compos_id_percent_hash{ my(%input, @names);
	if(ref($_[0]) eq 'HASH'){  %input= %{$_[0]}; @names= keys  %input;  }
	else{ print "\n hash ref was not passed to compos_id_percent_hash\n"; exit; }
	my(@temp, $iden, @all_pairs_id, $i, $j, $k,$iden_sum);
	my(%compos_table1, %compos_table2, $final_iden, $larger);
	for ($i=0; $i < @names; $i ++){  $input{$names[$i]}=~ tr/a-z/A-Z/;
	 $input{$names[$i]}=~ s/\W//g;    @temp = split('', $input{$names[$i]});
	 (@{"string$i"})=@temp; $larger = @{"string$i"} if @{"string$i"}>$larger;}
	for ($i=0; $i < @names; $i++){
	 for ($j=$i; $j < @names; $j ++){
		if ($j == $i){ next; }
		for ($k=0; $k < $larger; $k ++){
		  $compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);

Bioinf.pl  view on Meta::CPAN

#________________________________________________________________________
# Title     : common_compos_id_hash (BUG free)
# Usage     : %hash = &common_compos_hash(\%any_hash1, \%any_hash1);
# Function  : actual calculation of identity
# Example   : ('A', 200, 'C', 191, D, 99)
#                  ('A', 290, 'C', 199, D, 100)
#             uses only two sequences.
# Warning   :
# Keywords  :
# Options   :
# Returns   : ref. of a scaler (in percent)  eg)  95
# Argument  : two references of hash of seqeunces.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub  common_compos_id_hash{
	my(%hash1)=%{$_[0]};
	my(%hash2)=%{$_[1]};
	my(%common_of_the_2)=();  my($common, $compos_id, $sum_of_the_common_residue_no);
	my(@values1) = values (%hash1);  my(@values2) = values (%hash2);
	my(@combined_values)=(@values1, @values2);

Bioinf.pl  view on Meta::CPAN

#________________________________________________________________________
# Title     : calc_compos_id_hash (the same as 'common_compos_hash')
# Usage     : %hash = &calc_compos_hash(\%any_hash1, \%any_hash1);
# Function  : actual calculation of identity
# Example   : ('A', 200, 'C', 191, D, 99)
#                  ('A', 290, 'C', 199, D, 100)
#             uses only two sequences.
# Warning   :
# Keywords  :
# Options   :
# Returns   : ref. of a scaler (in percent)  eg)  95
# Argument  : two references of hash of seqeunces.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub  calc_compos_id_hash{ my(%hash1)=%{$_[0]}; my(%hash2)=%{$_[1]}; my(%common_of_the_2)=();
	  my($common, $compos_id, $sum_residues, $sum_of_the_common_residue_no);
	  my(@values1) = values (%hash1);   my(@values2) = values (%hash2);
	  my(@combined_values)=(@values1,@values2);
	  for $elem (@combined_values){
			$sum_residues += $elem;   }
	  if($sum_residues == 0){ $compos_id =0; } # to prevent Illegal division error.
	  else{
		  for $key1(keys %hash1){
			  $common=&smaller_one($hash1{$key1}, $hash2{$key1});
				  sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
			  $sum_of_the_common_residue_no += $common;     }
		  $compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100;   }
	  \$compos_id;
}
#________________________________________________________________________
# Title     : get_percentage
# Usage     : %out= %{&get_percentage(\%result, '1')};
# Function  : calculates the percentage content of any single char over the whole
#             length of strings in it.
# Example   : if the string is  'seq  ABCDEEEEEFFEFE' given in a hash
#             if you put 'A' as one argument, it counts the occurances of 'A'
#             and gets the percentage of it.
# Warning   : This converts array and string input as ref. into arbitrary hash and
#             returns hash
#             programmed by A Biomatic
# Keywords  : get_percentage_of_char
# Options   : None yet.
# Returns   : Numerical Percentage
# Argument  : ref. for Scalar string or Array of chars or Hash  AND 'the target char'
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_percentage{
	my(@in, $k, $sort, $numerator, $residue, @out_hash_ref, %hash_out );
	for($k=0; $k< @_ ;$k++){
	  if( !ref($_[$k])&& (length($_[$k]) == 1 )){
		 $numerator = $_[$k];
	  }
	  elsif( (ref($_[$k]) eq 'SCALAR') && (length(${$_[$k]}) == 1 )){
		 $numerator = ${$_[$k]};
	  }
	  elsif(ref($_[$k]) eq "HASH")  { push(@in, $_[$k]); }
	  elsif(ref($_[$k]) eq "ARRAY") { push(@in, &convert_arr_and_str_2_hash($_[$k], $k));} #<-- conv array to hash.
	  elsif(ref($_[$k]) eq "SCALAR"){ push(@in, &convert_arr_and_str_2_hash($_[$k], $k));} #<-- conv array to hash.
	}
	####### final output is => @in of hash ref elements #######
	for (@in){   my(%H) = %{$_}; my(@keys)= sort keys %H;
	 for $name(@keys){
		 my($numerator_count);
		 my($seq_len) = length($H{$name}); print  "\n $name Sequence length: ", $seq_len, "\n";
		 my(@string) = split(//, $H{$name});
		 for $residue (@string){  if($residue =~/^$numerator$/){ $numerator_count ++; }}
		 my($percent) = $numerator_count/$seq_len *100;
		 $hash_out{$name}=$percent;    }
	 push(@out_hash_ref, \%hash_out);  }
	if(@out_hash_ref ==1){ return($out_hash_ref[0]); }
	elsif( @out_hash_ref > 1){ return(@out_hash_ref); }
}


#________________________________________________________________________
# Title     : pairwise_percent_id  (pairwise sequence identity in percentage)
# Usage     : $identity = ${&pairwise_percent_id(%arrayinput)};
#
# Function  : takes a ref. of a hash of names and sequences, returns
#             percent identity.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub pairwise_percent_id{
	my($i,$j,$k, @iden_array_ref);
	for($i=0; $i< @_; $i++){  my %input= %{$_[$i]};  my @names= sort keys %input;
	  my(@temp, $iden, @all_pairs_id, $whole_seq_len, $residue_sum1,$residue_sum2);
	  my($final_av_iden, $larger, $percent_for_pair,@percent_for_pair, $iden_sum);
	  for ($i=0; $i < @names; $i ++){ $input{$names[$i]}=~ tr/a-z/A-Z/;
		 @temp = split('', $input{$names[$i]});  (@{"string$i"})=@temp;
		 $larger = @{"string$i"} if @{"string$i"} > $larger; }
			 for ($i=0; $i < @names; $i++){       # to make permutated pairs.
				for ($j=$i+1; $j < @names; $j ++){
					for ($k=0; $k < $larger; $k ++){  # getting composition tables for two seqs.
					  $iden+=2 if ((${"string$i"}[$k] eq ${"string$j"}[$k])&&(${"string$i"}[$k] =~ /\w/));
					  $residue_sum1++ if (${"string$i"}[$k] =~ /\w/);
					  $residue_sum2++ if (${"string$j"}[$k] =~ /\w/);  }
					$whole_seq_len =($residue_sum1+$residue_sum2);
					$percent_for_pair = $iden/$whole_seq_len*100;
					push(@percent_for_pair,$percent_for_pair);
					$residue_sum1=0; $residue_sum2=0; $iden=0; } }
			 for $iden (@percent_for_pair){ $iden_sum+=$iden;}
	  $final_av_iden=$iden_sum/( @percent_for_pair );
	  push(@iden_array_ref, \$final_av_iden);  }
	if(@iden_array_ref ==1){ return($iden_array_ref[0]);}else{ return(@iden_array_ref);}
}


#________________________________________________________________________
# Title     : get_seq_identity
# Usage     : $identity = ${&get_seq_identity(%arrayinput)};
#
# Function  : takes a ref. of a hash of names and sequences, returns
#             percent identity. NOT composition identity.

# Example   :
# Warning   :
# Keywords  : get_sequence_identity
# Options   :
# Returns   :
# Argument  : hash(es) of sequences.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_seq_identity{
	my($i,$j,$k, $c, @iden_array_ref);
	for($c=0; $c< @_; $c++){
	  my %input= %{$_[$c]};
	  my @names= sort keys %input;
	  my(@temp, $iden, @all_pairs_id, $whole_seq_len, $residue_sum1,$residue_sum2);
	  my($final_av_iden, $larger, $percent_for_pair,@percent_for_pair, $iden_sum);
	  for ($i=0; $i < @names; $i ++){
		 $input{$names[$i]}=~ tr/a-z/A-Z/;
		 @temp = split(//, $input{$names[$i]});
		 @{"string$i"}=@temp;
		 $larger = @{"string$i"} if @{"string$i"} > $larger; }
		 for ($i=0; $i < @names; $i++){       # to make permutated pairs.
			for ($j=$i+1; $j < @names; $j ++){
				for ($k=0; $k < $larger; $k ++){
				  if ( ${"string$i"}[$k] eq ${"string$j"}[$k] and ${"string$i"}[$k] =~ /\w/){
					 $iden+=2 ;
				  }
				  $residue_sum1++ if (${"string$i"}[$k] =~ /\w/);
				  $residue_sum2++ if (${"string$j"}[$k] =~ /\w/);
				}
				$whole_seq_len =($residue_sum1+$residue_sum2);
				$percent_for_pair = $iden/$whole_seq_len*100;
				push(@percent_for_pair,$percent_for_pair);
				$residue_sum1=0;
				$residue_sum2=0;
				$iden=0;
			}
		 }
	  for $iden (@percent_for_pair){
		  $iden_sum+=$iden;
	  }
	  if(@percent_for_pair <1){ @percent_for_pair=(1); }
	  $final_av_iden=$iden_sum/( @percent_for_pair );
	  push(@iden_array_ref, \$final_av_iden);
	}
	if(@iden_array_ref ==1){
	   return($iden_array_ref[0]);
	}else{
	   return(@iden_array_ref);
	}
}




#________________________________________________________________________
# Title     : get_correct_percent_alignment_rate  (made for Bissan)
# Usage     : &get_correct_percent_alignment_rate(\$file1, \$file2);
# Function  : accepts two files and prints out the sequence identities of the alignment.
# Example   :
# Warning   : Alpha version,  A Biomatic , made for Bissan
# Keywords  :
# Options   : h  # for help
#             v  # for verbose printouts(prints actual sequences)
# Returns   : reference of Scalar for percentage correct alignment(for already
#             aligned sequences)
# Argument  : two sequence files which have identical sequence names.
# Category  :
# Version   :
#--------------------------------------------------------------------
sub get_correct_percent_alignment_rate{
	 my($i, $j, $k, $verbose, @string1, @string2, $larger, $seq_pair_id, @seq_pair_ids );
	 my(%inputhash1) = %{&read_any_seq_files($_[0])};
	 my(%inputhash2) = %{&read_any_seq_files($_[1])};
	 my(@names)= sort keys %inputhash1;
	 ######################################
	 ####### Sub argument handling ########
	 ######################################
	 for($k=0; $k< @_ ;$k++){
		if( !ref($_[$k])&& (length(${$_[$k]}) < 5)){  # when inputs are not ref.
		  if($_[$k]=~ /^[\-vV]$/){ $verbose = 1; next;}

Bioinf.pl  view on Meta::CPAN

	 }
	 print "\n", "_"x88, "\n";
	 my($whole_average_of_the_id)=${&array_average(\@seq_pair_ids)};
	 print "The whole average is; $whole_average_of_the_id\n";
	 if(@seq_pair_ids == 1){ return( \$seq_pair_ids[0] ); }
	 elsif(@seq_pair_ids > 1){ return( \@seq_pair_ids ); }
}


#________________________________________________________________________
# Title     : amino_acid_compos_id_percent_trend
# Usage     :
# Function  :
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub amino_acid_compos_id_percent_trend{
	my(%input) = %{$_[0]};
	my(@common, @string,@accumu_percent_iden)=(); my(%common_so_far, %compos_table);
	my($percent_id_so_far, $length_of_one_seq,$length_of_all_seq, $seq_no)=0;
	for $key(keys %input){
			$input{$key}=~s/[. \d-]//g;
			@string= split(//, $input{$key});
			print @string; print "\n";
			$length_of_one_seq = $#string+1;
			$length_of_all_seq +=$length_of_one_seq;
			$seq_no += 1;
			%compos_table  = &composition_table(@string);
			@check = keys (%common_so_far);
			if  ($#check < 0){ %common_so_far = %compos_table; }
			else{ %common_so_far= %{&common_compos_2_hash(\%common_so_far,\%compos_table)};}
			for $value(values %common_so_far){ $common_residue_sum +=$value; }
			$final_percent_id = $common_residue_sum/($length_of_all_seq/$seq_no)*100;
			$common_residue_sum =0;  }
	for $value(values %common_so_far){ $common_residue_sum +=$value; }
	$final_percent_id = $common_residue_sum/($length_of_all_seq/$seq_no)*100;
	return(\$final_percent_id);
}

#________________________________________________________________________
# Title     : composition_table   (can handle both nucleic and protein seq)
# Usage     : %output = %{&compos_table(@input_array1, @input_array2,,,,)};
#             example input
#
# Function  : returns a table of alphabet with occurances.
#             can handle any char, this converts char to upper case.
# Example   :

Bioinf.pl  view on Meta::CPAN

	my(%common_of_the_2)=(); my($common)=0;
	for $key1(keys %hash1){
	 $common=&smaller_one($hash1{$key1}, $hash2{$key1});
	 if ($common =~ /\d+/){
		$common_of_the_2{$key1}=$common; } }
	\%common_of_the_2;
}


#________________________________________________________________________
# Title     : pair_percent_id_trend
# Usage     : @array = &pair_percent_id_trend (%arrayinput);
# Function  :
# Example   : common gaps means only '.' (dots, not alphabets!!)
#             AAA....BBCB
#             AAAB..B.BCC  --> A.A.....BC. (as in an array)
#             A.AAA...BCA
#             The resulting array XXXXX..XXXX is literally like so.
#             This is to detect absurd gaps in the above.
#
#
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   :
#--------------------------------------------------------------------
sub pair_percent_id_trend{
	my(%input) = %{$_[0]};
	my(@common, @string,@accumu_percent_iden)=();
	my($percent_id_so_far)=0;
	for $key(keys %input){
	  my($len) = &smaller_one($#common, $#string) unless $#common < 0;
	  $input{$key}=~s/ //g;
	  @string= split(//, $input{$key});
	  $length_of_one_seq = $#string+1;
	  $length_of_all_seq +=$length_of_one_seq;
	  $seq_no += 1;
	  for ($k=0; $k <= $len;$k++){
		  if($#common == -1){
			 @common = @string;

Bioinf.pl  view on Meta::CPAN

		  }
		  elsif ( $string[$k] eq $common[$k] ){
			 $common[$k] = $string[$k];
			 $identical_count +=1;
		  }elsif( $string[$k] ne $common[$k]){
			 $common[$k]='.';
		  }
	  }
	  $num_of_iden_char = &count_num_of_char(\@common);
	  $av_seq_no = $length_of_all_seq/$seq_no;
	  $percent_id_so_far = $num_of_iden_char/$av_seq_no*100;

	  print "\n percent_id so far = $percent_id_so_far \n";
	  push(@accumu_percent_iden,$percent_id_so_far);

	} # end of for (after all sequences have been run).
	$num_of_iden_char = &count_num_of_char(@common);
	$av_seq_no = $length_of_all_seq/$seq_no;
	$percent_id_so_far = $num_of_iden_char/$av_seq_no*100;
	print "\n percent_id so far = $percent_id_so_far \n";
	\@accumu_percent_iden; # final ids array.
}
#________________________________________________________________________
# Title     : smaller_one
# Usage     : $smaller = & smaller_one($var, $var2);
#
# Function  : gets smaller value of the two inputs
# Example   : will return   5   with  &smaller_one(5, 50);
# Warning   : gets only digits!!
# Keywords  :
# Options   :

Bioinf.pl  view on Meta::CPAN

# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   :
#--------------------------------------------------------------------
sub count_num_of_char{
	 my(@input)={$_[0]};
	 my($num_of_char)=0;
	 for $elem(@input){  # this is for the percentage of TWO seqs.
		 if ($elem =~ /\w/){
					 $num_of_char +=1;
		 }
	 }
	 $num_of_char;
}
#________________________________________________________________________
# Title     : remov_com_column2  (this is the older and slower version)
# Usage     : %new_string = %{&remov_com_column2(\%input_hash)};
# Function  :

Bioinf.pl  view on Meta::CPAN

# ((-l option for long output )
#  NAME MOZ_HUMAN_part
#  HEADER  |- Residue -|  Pred  Rel      NAli   Asn
#  PRED    1    MET    M  c     0.000    0      ?
#  PRED    2    ALA    A  c     0.000    0      ?
#
# Warning   :
# Keywords  : open_prd_files, open_pred_files, predator, open_prdl_files
#             open_pre_files, secondary structure prediction file
# Options   : 's' for sequence output as well (\%sec_str, \%seq)
#             'p' for percentage of the sec. str.
#             'a' for accumulated percentage. This will
#                  set 'p' automatically
#             'n' for NO name when outputing Percentage of chars with
#                 HASH input to get_occurances_of_char sub.
#      $reverse_residue_order=r by r
# Returns   :
# Argument  :
# Category  :
# Version   : 1.8
#-----------------------------------------------------------
sub open_predator_files{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my( @out_ref, $seq_out, %sec_str, %seq, $percent_out, $NO_name_out,
        $short_form_out_detected, $long_form_out_detected, $accumulate,
        $reverse_residue_order, %rev_sec_str);
    if($char_opt=~/s/i){ $seq_out=1 }
    if($char_opt=~/a/i){ $accumulate=1  }
    if($char_opt=~/p/i){ $percent_out=1 }
    if($char_opt=~/n/i){ $NO_name_out='n' }
    if($char_opt=~/r/){  $reverse_residue_order='r' }

    for($i=0; $i< @file; $i++){
	 my (%sec_str, %seq) if($accumulate !=1);

         open(PREDATOR_FILE, "$file[$i]");
         print "\n# (INFO) open_predator_files: opening $file[$i]";
	 while(<PREDATOR_FILE>){
              #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`

Bioinf.pl  view on Meta::CPAN

                 $residue_3_letter=$2;
                 $residue_1_letter=$3;
                 $sec_str_predicted=$4;
                 $reliability=$5;
                 $sec_str{$position}=[$residue_1_letter, $sec_str_predicted, $reliability];
              }
	 }
         close (PREDATOR_FILE);
	 print "\n \%sec_str is: ", %sec_str, "\n" if ($debug == 1);
	 if($seq_out==1){ push(@out_ref, \%sec_str, \%seq);
	 }elsif($percent_out==1 ){
	      push(@out_ref, [%{&get_occurances_of_char(\%sec_str, $NO_name_out, 'p')}] );
	 }elsif($percent_out !=1){ push(@out_ref, \%sec_str) }

         #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
         # If -r option is set (for long form, this does not affect
         #____________________________________________________________
         if($short_form_out_detected and $reverse_residue_order){
              @keys=keys %sec_str;
              for($r=0; $r<@keys; $r++){
                  $rev_sec_str{$keys[$r]}=reverse($sec_str{$keys[$r]});
              }
              %sec_str=%rev_sec_str;

Bioinf.pl  view on Meta::CPAN

#             >P1;1cdg
#             secondary structure and phi angle
#             CCCCCCCCCCCCCCCCEEECCHHHHCCCCHHHCCCPHHCCCCPCC-CCCCCPCCHHHHHHHHHCPHHHHHPCCEE
#             >P1;1cdg
#             solvent accessibility
#             TTTTTTTTTTTFFFFFFFFFFFFFFTTTTTTTTTTTTTTTTTFTT-TTTTFFFFFTFFTTTFTTTFFTTFTFTFF
#             >P1;1cdg
#             DSSP
#             CCCCCCCCCCCCCCCCEEECCHHHHCCCCGGGCCCGGGCCCCCCC-CCCCCCCCHHHHHHHHHCCHHHHHCCCEE
#             >P1;1cdg
#             percentage accessibility
#             67523272360000000000000002213792129b722248085-14110000030015105660028040200
#             2ltn           ----TETTSFLITKFSPDQQNLIFQGDGYTT-KEKLTLTK------AVKNTVGRALYSSP
#             1loe           ----TETTSFSITKFGPDQQNLIFQGDGYTT-KERLTLTK------AVRNTVGRALYSSP
#
#             2ltn           ----CEEEEEEECCCCCCCCCEEEEPCCEEP-PPCEEEEC------CCCPCEEEEEECCC
#             1loe           ----CEEEEEEECCCCCCCCCEEEEPCCEEE-PPEEEEEC------CCCPCEEEEEECCC
#
#             2ltn           ----TTTTTTTTTTFTTTTTTFTTTTTFTFT-TTTFTFFT------TTTTTTFFFFTTTT
#             1loe           ----TTTTTTTTTTFTTTTTTFTTTTTFTFT-TTTFFFFT------TTTTTTFFFFTTTT
#

Bioinf.pl  view on Meta::CPAN

# Version   : 1.0
#--------------------------------------------------------------------
sub open_tem_files{
	my($a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, $q, $r,
	  $s, $t, $u, $v, $w, $x, $y, $z, $pwd, $file, $dir, $output, $in_dir,
	  %hash, @keys, @array, @hash, $option_string, $string, @in, $line,
	  $name, %out, $gap_chr, @str1, @str2, $num_opt, @file, @dir,
	  $char_opt, $char_opt_given, $num_opt_given,
	  @char_options, @file, $original_dir, @read_files, %array_msf, %array_jp,
	  $jp_file, $error_rate, $id_compos, @dir, @names, $name, $name_found,
	  @outref, %sequence, %secondary,%solvent_access, %DSSP, %percent_accessibility,
	  $name_found,$type_seq, $type_secon, $type_sol, $type_DSSP, $type_acc
	);
	##################################################
	##### Start of  general argument handling   ######
	##################################################
	for($k=0; $k< @_ ;$k++){
	  if( !ref($_[$k]) ){
			  if($_[$k]=~ /^[\-]*(\w)$/){
				  $char_opt  .= "\,$1";
				  $char_opt_given =1;

Bioinf.pl  view on Meta::CPAN

				} #from below ============== SECONDARY
			 }elsif(/^secondary structure and phi angle/){  $type_secon = 1;
			 }elsif(($type_secon ==1)&&(/^([\w\-]+)[\*]*$/)){
				$secondary{$name}.=$1;     #from below============= SOLVENT ACCESSIBILITY
			 }elsif(/^solvent accessibility/){  $type_sol = 1;
			 }elsif(($type_sol ==1)&&(/^([\w\-]+)[\*]*$/)){
				$solvent_access{$name}.=$1;     #from below========= DSSP
			 }elsif(/^DSSP/){  $type_DSSP = 1;
			 }elsif(($type_DSSP ==1)&&(/^([\w\-]+)[\*]*$/)){
				$DSSP{$name}.=$1;     #from below=================== PERCENTAGE ACCESSIBILITY
			 }elsif(/^percentage accessibility/){  $type_acc = 1;
			 }elsif(($type_acc ==1)&&(/^([\w\-]+)[\*]*$/)){
				$percent_accessibility{$name}.=$1;  } }
		  push(@outref,\%sequence,\%secondary,\%solvent_access,\%DSSP,\%percent_accessibility);
	  }  }
	if( ($char_opt =~ /s/i) || ( @outref  == 1 ) ){
	  return(\%sequence); }
	elsif( @outref > 1){ return(@outref); } # <-- contains (\%sequence,\%secondary,....)
}

#________________________________________________________________________
# Title     : open_hlx_files
# Usage     :
# Function  :

Bioinf.pl  view on Meta::CPAN

            chdir($original_dir);
            if($realfile1 =~/(\d+\-$no\.msf)$/){
                @dir=split(/\//, $realfile1);
                $dir=$dir[($#dir-1)];  # where am I ?
                # $jp_file = $original_dir.'/'.$dir.'.jp';
                # %array_msf =&open_msf_files($realfile1);
                # %array_jp  =&open_jp_files ($jp_file);
                # $array_ref_msf = \%array_msf;
                # $array_ref_jp  = \%array_jp;
                # $error_rate =&get_posi_shift_hash($array_ref_msf, $array_ref_jp);
                # $id_compos  =&amino_acid_compos_id_percent($array_ref_jp);
                # push(@rates_accumu,$error_rate);
                # push(@compos_id,$id_compos);
            }
        }
        else
        {
                               next;
        }
    }
}

Bioinf.pl  view on Meta::CPAN

		  }
	 }
	 $sum = ($id_counter + $gap_counter + $non_equal_counter);
	 if ($sum != ($larger+1)){
		  print "\n There is something wrong in getting homology in get_pair_homol \n";
		  &caller_info;
	 }
	 \$id_counter; # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title     : get_percent_homol_arr
# Usage     : $homology_out = ${&get_pair_homol(\@any_array_of_2_elem)};= @ar=(ABCDE..., CDEGA..)
# Function  : get pair wise seq. identity of any two strings, outputs a scalar (%)
# Example   :
# Warning   : reliable, but input seq. strings shouldn't contain spaces.
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_percent_homol_arr{
	 my(@input)=@{$_[0]};
	 $input[0] =~ tr/a-z/A-Z/; # capitalizing.
	 $input[1] =~ tr/a-z/A-Z/; # capitalizing.
	 my(@string1)= split(//,$input[0]);
	 my(@string2)= split(//,$input[1]);
	 if (($#string1 == -1) || ($#string2 == -1)){
		  print "\n One of the string is empty O.K. ? \n";
	 }
	 my($larger)= &max($#string1, $#string2);
	 my($id_counter, $gap_counter, $non_equal_counter, $sum,$percent_homol)=0;
	 for ($i = 0; $i<=$larger; $i++){
		  if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
				$gap_counter+=1;
		  }elsif ($string1[$i] eq $string2[$i]){
				$id_counter +=1;
		  }else{
				$non_equal_counter += 1;
		  }
	 }
	 $sum = ($id_counter + $gap_counter + $non_equal_counter);
	 if ($sum != ($larger+1)){
		  print "\n There is something wrong in getting homology in get_pair_homol \n";
		  &caller_info;
	 }else{
		  $percent_homol=($id_counter/$sum)*100;
	 }
	 return(\$percent_homol); # $id_counter is the homology counter;
}

#________________________________________________________________________
# Title     : get_pair_homol_hash
# Usage     : $homology_out = & get_pair_homol (%any_hash); , eg) %hash = (name1, ABCDE..., name2, CDEGA..)
# Function  : get pair wise seq. identity as a scalar count
# Example   :
# Warning   : reliable, but input seq. strings shouldn't contain spaces.
# Keywords  :
# Options   :

Bioinf.pl  view on Meta::CPAN

		  }
	 }
	 my($sum) = ($id_counter + $gap_counter + $non_equal_counter);
	 if ($sum != ($larger+1)){
		 print "\n There is something wrong in getting homology in get_pair_homol \n";
		 &caller_info;
			}
	 return ($id_counter); # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title     : get_percent_homo_hash
# Usage     : $homology_out = &get_pair_homol_hash(%any_hash); , eg) %hash = (name1, ABCDE..., name2, CDEGA..)
# Function  : get pair wise seq. identity(%) of any two strings put in as a hash
# Example   :
# Warning   : reliable, but input seq. strings shouldn't contain spaces.
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_percent_homo_hash{
	 my(%input)=@_;
	 &hash_chk(\%input);
	 my(@keys_input)= keys (%input);
	 my(@values_input) = values (%input);
	 $values_input[0] =~ tr/a-z/A-Z/; # capitalizing.
	 $values_input[1] =~ tr/a-z/A-Z/; # capitalizing.
	 my(@string1)= split(//,$values_input[0]);
	 my(@string2)= split(//,$values_input[1]);
	 if (($#string1 == -1) || ($#string2 == -1)){
					 print "\n One of the string is empty O.K. ? \n";
	 }
	 my($larger)= &max($#string1, $#string2);
	 my($id_counter, $gap_counter, $non_equal_counter,$percent_homol,)=0;
	 for ($i = 0; $i<=$larger; $i++){
		  if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
				$gap_counter+=1;
		  }elsif ($string1[$i] eq $string2[$i]){
				$id_counter +=1;
		  }else{
				$non_equal_counter += 1;
		  }
	 }
	 my($sum) = ($id_counter + $gap_counter + $non_equal_counter);
	 if ($sum != ($larger+1)){
		  print "\n There is something wrong in getting homology in get_pair_homol \n";
		  &caller_info;
	 }else{
		  $percent_homol=($id_counter/$sum)*100;
	 }
	 return ($percent_homol);
}


#________________________________________________________________________
# Title     : file_size
# Usage     : $outputfilesize = &file_size($input_file_name);
# Function  : returns the size of any single testing file
# Example   :
# Warning   : Q is for quality of this sub. This can't be wrong.
# Keywords  :

Bioinf.pl  view on Meta::CPAN

# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub file_size { my($infile)=$_[0];
	if ( $size=(-s "$infile")){ return $size; }
}

#________________________________________________________________________
# Title     : seq_comp_percent2
# Usage     : @outarray = &seq_comp_percent2(@any_input_string_array);
# Function  : get string seq COMPOSITION identities(a to z). gets array
#             of strings and outs array of % numbers
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub seq_comp_percent2{   	# simple and basic seq. id. eg. ABC on ABCABC is 50 % identical.
	my(@input)=@_;
	my(@array_of_ids2, $id2, @char1, @char2);
	&array_chk(sort @input);
	my($longest_str_size)  = &get_long_str_size (@input), "\n";
	my($shortest_str_size) = &get_short_str_size(@input), "\n";
	print "longest_str_size",$longest_str_size;
	print "shortest_str_size",$shortest_str_size;
	if (($longest_str_size/$shortest_str_size) > 4){
		  print "\n  The shortest string is less than 1/4 of the longest\n";
		  print "  This is quite meaningless, but will go on\n";

Bioinf.pl  view on Meta::CPAN

# Argument  :
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub cls{   my($cls) = `clear`;
    print $cls;
}


#________________________________________________________________________
# Title     : seq_comp_percent1
# Usage     : @outarray = &seq_comp_percent1(@any_input_string_array);
# Function  : get string seq identities(a to z). gets array of strings and outs array of % numbers
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   : one ref. of an array
# Argument  : one ref. of an array
# Category  :
# Version   :
#--------------------------------------------------------------------
sub seq_comp_percent1{ 		# this is affected by seq. length
		  my(@input)=@{$_[0]};
		  my(@array_of_ids1, $id1, @char1, @char2);
		  &array_chk(\@input);
		  @input = sort (@input);
		  $longest_str_size  = &get_long_str_size (@input), "\n";
		  $shortest_str_size = &get_short_str_size(@input), "\n";
		  if (($longest_str_size/$shortest_str_size) > 4){
					 print "\n The shortest string is less than 1/4 of the longest\n";
					 print " This is quite meaningless, but will go on\n";
		  }

Bioinf.pl  view on Meta::CPAN

	 for $key1 (sort keys %hash1){
					 for $key2 (sort keys %hash2){
								if ($key1 eq $key2){
										  $no_of_same = &min($hash1{$key1},$hash2{$key2});
										  $sum_of_same += $no_of_same;
										  last;
								}
					 }
	 }
	 $identity = $sum_of_same*2/(&sum_array(@num_char1,@num_char2))*100;
	 # print "percent iden = ", $identity, "\n";
}
#________________________________________________________________________
# Title     : get_id_among_2_2
# Usage     : $id = &get_id_among_2(*charcount1, *charcount2) <- hashes
# Function  : gets the % id of any two sequences, returns in  100.0% format.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :

Bioinf.pl  view on Meta::CPAN

					 $no_of_same = &min($hash1{$key1},$hash2{$key2});
					 $sum_of_same += $no_of_same;
					 last;
				}
			}
	 }
	 $seq1=&sum_array(@num_char1);
	 $seq2=&sum_array(@num_char2);
	 $longer_seq = &max($seq1, $seq2);
	 $identity = $sum_of_same/$longer_seq*100;
	 #print "percent iden = ", $identity, "\n";
}

#________________________________________________________________________
# Title     : array_average
# Usage     : $output = &array_average(\@any_array);
# Function  : (the same as average_array)
# Example   :
# Warning   : If divided by 0, it will automatically replace it with 1
# Keywords  : get_array_average, av_array, average_array, get_average_array
#             average_of_array, average_array

Bioinf.pl  view on Meta::CPAN

    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my(%hash, @pdbd_seqs, @superfamily, @members, @files, $over_write, $msp_file_gz,
         $msp_seq_file_dir, $msp_file, $msp_file_long, $msp_file_gz_long,
         $pdbd_seq_long, $final_merged_ISL_fasta,$pdbg_file, $pwd , $range_start,
         $range_stop, @files_NOT_processed);
    my $source_db_fasta=$ENV{'NRDB_FASTA'}; ## general default
    my $evalue_thresh=0.001; #  default
    my $score_thresh=70;     #  default
    my $range_thresh=10;
    my $percent_id_thresh=0.95;
    print "\n# (i) Running sub of: make_intermediate_sequence_library\n";

    if($vars{'FASTA_DB'}=~/\S+/){  $source_db_fasta=$vars{'FASTA_DB'} }
    if($vars{'MSP_DIR'}=~/\S+/){   $msp_seq_file_dir=$vars{'MSP_DIR'} }
    if($vars{'p'}=~/\S+/){         $pdbg_file=$vars{'p'}; print "\n# (i) $vars{'p'} is given \n"; } ## PDBG file input
    if($vars{'m'}=~/\S+/){         $msp_seq_file_dir=$vars{'m'} }
    if($vars{'DB'}=~/\S+/){        $source_db_fasta=$vars{'DB'} }
    if($vars{'E'}=~/\S+/){         $evalue_thresh=$vars{'E'} }
    if($vars{'e'}=~/\S+/){         $evalue_thresh=$vars{'e'} }
    if($vars{'s'}=~/\S+/){         $score_thresh=$vars{'s'} }

Bioinf.pl  view on Meta::CPAN

           next;
       }
       if(@pdbd_seqs){      print "\n# (i) \@pdbd_seqs for $superfamily is: @pdbd_seqs\n";
       }else{               print "\n# (E) \@pdbd_seqs is empty, strange, error in making $out_fasta_file \n\n";
       }

        for($j=0; $j< @pdbd_seqs; $j++){
             $pdbd_seq=$pdbd_seqs[$j];
             $pdbd_seq_long="pdb\_$pdbd_seqs[$j]";

             my (@msp_content, $evalue, $sub_dir, $percentage_id, $range_length);
             if($pdbd_seq=~/^ *$/){  next; }

             $msp_file="$pdbd_seq\.msp";
             $msp_file_gz="$pdbd_seq\.msp\.gz";
             $msp_file_long="pdb\_$pdbd_seq\.msp"; ## this is to handle Sarah's pdb_ prefixed pdbd files
             $msp_file_gz_long="pdb\_$pdbd_seq\.msp\.gz";

             print "# (i)      I am processing $msp_file  <- subroutine:make_intermediate_sequence_library\n";
             if($msp_file_gz=~/^([dec])\S/ or $msp_file_long=~/pdb_([dec])\S/){
                  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`

Bioinf.pl  view on Meta::CPAN


                  print "\n# (i) Now I have finished reading in ONE msp file for $pdbd_seq ($superfamily) into \@msp_content";

                  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`
                  # Processing each MSP line
                  #_______________________________________________________________________________________
                  my ($identical_one_added, $interm_seq);
                  for($k=0; $k< @msp_content; $k++){ ## NEW MSP format
                      if($msp_content[$k]=~/^ *(\S+) +(\S+) +(\S+) +\d+ +\d+ +[pdb_]*$pdbd_seq[_\d+\-\d+]* +(\d+) +(\d+) +(\S+)/){     # [pdb_]* is for Sarah's pdb_ prefix
                          #if($pdbd_seq eq $6){ next }  ## this is to exclude the same seq match
                          $evalue=$2;               $percentage_id=$3;
                          $range_length=$5-$4;      $score =$1;  $interm_seq=$6; $range_start=$4, $range_stop=$5;

                          if($interm_seq=~/(\S+)_\d+\-\d+/){ $interm_seq=$1; }

                          if($evalue < $evalue_thresh and $range_length > $range_thresh and $score > $score_thresh){
                              if($percentage_id <= $percent_id_thresh){  ## to keep one pdb seq ($percentage_id=1)
                                  unless($interm_seq=~/^d\d\S/){  # to remove entries like:  d1abc_10-20_d2acb__ (two pdb seqs)
                                      $interm_hash{$superfamily} .=" $interm_seq\_$range_start\-$range_stop\_$pdbd_seq";
                                  }
                              }elsif($percentage_id ==1 and $identical_one_added < 1){  ## this is to prevent more than 1 100% id seq
                                  $interm_hash{$superfamily} .=" $interm_seq\_$range_start\-$range_stop\_$pdbd_seq";
                                  $identical_one_added++;
                              }
                          }
                      }elsif($msp_content[$k]=~/^ *(\d+) +(\S+) +\d+ +\d+ +$pdbd_seq +(\d+) +(\d+) +(\S+)/){
                          $evalue=$2;              $score=$1;
                          $range_length=$4-$3;     $interm_seq=$5;
                          if($evalue < $evalue_thresh and $range_length > $range_thresh
                              and $score > $score_thresh){
                              $interm_hash{$superfamily} .=" $interm_seq\_$3\-$4\_$pdbd_seq";

Bioinf.pm  view on Meta::CPAN

package Bioinf;
require 5.000;
require Exporter;
use Carp;

@ISA = qw(Exporter);
@EXPORT= qw( ISS_server ONE_TO_THREE_LETTER One_To_Three_Letter Richardson_alpha_matrix
        Roman Sheraga_alpha_matrix abs_numerically add_columns
        add_ranges_in_msp_line amino_acid_compos_id_percent amino_acid_compos_id_percent_trend amino_acid_homology_matrix
        arabic array_average array_chk array_least_occur
        array_median array_most_occur array_sum ask_for_ENV_vars
        assign_options_to_variables attach_classification_to_pdb_seq average_array average_of_array
        beep bla_to_msf break_down_clu_file by_values
        calc_compos_id_hash calc_factorial calculate_protein_volume capitalize_sentence
        capitalize_word cc check_common_elements_in_array check_file_exists_in_path
        check_homology_of_seq_pair check_if_defined check_if_files_exist check_if_sec_str_form_hash
        check_input_file_extension check_linkage_of_2_similar_seqlet_sets check_parf_files chop_word
        cls clu_to_sso_to_msp cluster_merged_seqlet_sets com_gap_pos_hash
        common_compos_2_hash common_compos_id_hash compare_sec_template_with_db compos_id_percent_array
        compos_id_percent_hash composition_table compress_files_by_gzip condense_number_string
        condense_script convert_1_to_3_letter convert_3_to_1_letter convert_arr_and_str_2_hash
        convert_array_to_hash convert_bla_multaln_to_msf convert_bla_to_msf convert_bla_to_msp
        convert_char_to_0_or_1_hash convert_clu_to_msp convert_clu_to_sso_to_msp convert_dna_to_protein
        convert_hmmls_to_msp_files convert_mmp_to_mrg convert_msp_line_to_mmp_line convert_num_0_or_1_hash_opposite
        convert_num_to_0_or_1_hash convert_rna_to_protein convert_sso_to_msp convert_string_to_hash
        convert_to_anti_sense corelation_coefficient correct_head_box count_num_of_char
        cp create_sorted_cluster ctime default_help
        define_secondary_structure_segments delbut detect_file_format_type die_if_file_not_present
        diff_dates digitize_char dir_name dir_path
        dir_search dir_search_single divide_array divide_clusters

Bioinf.pm  view on Meta::CPAN

        extract_words fasta_append fasta_kt1_search fasta_out_seq_no
        fasta_output fasta_permute_array_write fasta_permute_hash_write fetch_seq
        fetch_sequence_from_db fetch_subroutines fetch_swiss_seq file_size
        fill_ending_space filter_by_string_length filter_hash_by_num_value filter_intermediates_by_E_value
        filter_seq_DB_by_seq_length find_central_seq_msp_chunk find_central_sequence find_low_complexity_region
        find_program_in_path find_seq_file_old find_seq_files find_source_perl_library
        follow_seqlet_link fromJulian full_pwd_path geanfammer
        geanfammer_main get_added_matched_regions_in_msp get_all_dirs_from_ENV get_all_msp_files
        get_av_and_sd_seq_length get_av_seq_length get_average_sequence_size get_averaged_prediction
        get_base_names get_column get_common_array_entry get_common_column
        get_common_hash_keys get_correct_percent_alignment_rate get_date get_dir_names_only
        get_domain_inside_domain get_each_posi_diff_hash get_extension_names get_false_positive_seq_matches
        get_file_dir_names get_file_extensions get_first_seq_in_alignment get_full_dir_names
        get_full_file_name get_full_path_dir_names get_full_pwd_path get_gap_positions
        get_hash_value_average get_high_score_blocks get_homology_info_of_seq_pairs get_host_by_addr
        get_host_by_name get_id_among_2 get_id_among_2_1 get_id_among_2_2
        get_internal_dup_in_a_cluster get_isearch_result_stat get_largest_element get_largest_file
        get_linked_sequence get_linux_kernel_version get_longest_str_size get_max_hash_by_value
        get_median get_median get_msp_enquiry_sequence get_msp_matched_sequence
        get_msp_range get_multiple_array_entry get_occurances_of_char get_occurances_of_shift_type_hash
        get_occurances_of_shift_type_hash_all get_overlapping_range get_overlapping_seq_match_size get_pair_homol_array
        get_pair_homol_hash get_path_dirs_from_ENV get_pdb_file_start_number get_peptide_occurance
        get_percent_homo_hash get_percent_homol_arr get_percentage get_perl_keywords
        get_posi_diff get_posi_diff_abs get_posi_diff_and_rms_hash get_posi_diff_hash
        get_posi_rates_hash_out get_posi_rates_hash_out get_posi_rates_hash_out_compact get_posi_rates_hash_out_jp
        get_posi_rates_hash_out_msf get_posi_sans_gaps get_posi_shift_hash get_posi_shift_hash_rms
        get_posi_shift_rate get_posi_shift_rms_hash get_posi_shift_rms_whole get_position_shift_rate
        get_probable_half get_pwd_dir get_pwd_dir_name get_residue_error_rate
        get_scop_correcting_pairs get_sd_of_length_diff get_segment_shift_rate get_seq_fragments
        get_seq_hash_sans_gaps get_seq_identity get_seqblock get_sequence_complexity
        get_sequence_number get_shortest_str_size get_smallest_file get_stat_FASTA_search_result_in_msp_0_files
        get_sub_hash get_subdir_names get_subroutine_calls get_time
        get_total_memory_size_in_linux get_unix_shell_name get_whole_pwd_path get_windows_cs_rate_array

Bioinf.pm  view on Meta::CPAN

        open_dna_files open_dssp_files open_embl_files open_fasta_files
        open_fil_file open_hlx_files open_hmmfs_files open_hmmls_files
        open_jp_files open_lottery_file open_msf_files open_msf_jp_files
        open_msp_files open_out_files open_pdb_files open_pdbg_files
        open_phd_files open_pir_files open_predator_files open_rms_files
        open_rms_files2 open_sdb_files open_self open_seq_alignment_files
        open_seq_files open_sequence_index_files open_slx_files open_sso_files
        open_sst_files open_sst_files_with_gap open_stride_dat_files open_stride_dat_files
        open_subdir_and_go_in_and_do open_swissprot_seq_files open_tem_files opendir_and_go
        opendir_and_go_in_and_do_something opendir_and_go_rand_fasta opendir_and_go_rand_fasta_and_clustal overlay_seq_by_certain_chars
        overlay_seq_for_identical_chars overlay_seq_hash pair_percent_id_trend pairwise_iden_pos
        pairwise_percent_id parse_arguments permute permute_binary
        pick_random_files pick_random_hash_pairs pir_permute_array_write pir_permute_hash_write
        pir_write pir_write plot_histogram_horizontally plot_vertically
        print_clusfile_from_hash print_in_block print_seq_in_block print_seq_in_block_old
        print_seq_in_block_with_print print_seq_in_columns produce_random_numbers push_if_not_already
        push_if_not_already put_gaps_every_x_position_in_string put_gaps_every_x_position_in_string_special put_gaps_in_hash
        put_msp_lines_to_hash_from_bla put_position_back_to_str_seq put_slash_before_special_chars pwd_dir_name
        pwd_path rand_DNA_seq_generate rand_RNA_seq_generate rand_sequence_mul_array
        rand_sequence_one_array rand_sequence_one_string rand_word randomise_file_contents
        randomise_lines read_all_head_boxes read_any_dir read_any_dir2
        read_any_dir_for_dir read_any_dir_simple read_any_seq_files read_blast_hits

Bioinf.pm  view on Meta::CPAN

        remove_file_extension remove_mail_header_in_files remove_non_char remove_repetitives_in_array
        remove_similar_seqlets remove_small_files remove_text replace_lines
        replace_subroutines replace_text reset_all_the_vars reset_shell_environment
        rev_abs_numerically rev_lines_pdb rev_sequence_mul_array reverse_hash
        reverse_sequences roman rotate_seq round
        round_number round_numbers run_fasta_sequence_search scale_for_horizontal_histogram
        scan_win_and_get_cs_rate_pairs scan_win_and_get_sc_rate_pairs scan_win_get_average scan_window_and_calc_average
        scan_window_and_calc_something scan_windows_and_get_compos_seqid_rate scoring scramble_array
        scramble_sequences sd se search_files_in_subdir
        search_palindromes search_self self_self_search send_mail
        sep seq_comp_percent1 seq_comp_percent2 seq_id_percent_array
        seq_to_regexp set_debug set_debug_option set_special_options
        shift_word shift_word_recursively show_array show_default_help
        show_hash show_in_fasta show_options show_subclusterings
        smaller_one sort_by_E_values sort_by_cluster_size sort_by_column
        sort_by_column_bigger_first sort_by_digits_in_string sort_by_hash_values sort_by_keys
        sort_files_by_size sort_files_by_time sort_hash_by_keys sort_hash_by_value
        sort_hash_by_value_and_make_array sort_hash_value_by_column sort_string_by_length sort_words_in_string
        split_fasta_files split_file_by_string split_files split_sequence
        sqrt_array square_array sso_to_msp ssp_permute_array_write
        ssp_permute_hash_write ssp_write steve_permute_array strip_rotated_seq

Bioinf.pm  view on Meta::CPAN

# Returns   : [$av_correct, $num_enq_seq]
# Tips      :
# Argument  :
# Todo      :
# Author    : A Scientist
# Category  :
# Version   : 2.2
#-----------------------------------------------------------------------------
sub get_isearch_result_stat{
	my (@keys, $num_enq_seq, @pdbg_seqs_ori, $c, $d, $i, %correct_pairs,
	    $sum_correct, $sum_false, $match_seq, $percent_correct, $correct, @correct,
	    $av_correct, $av_false, $actual_e_value, $correct_matched,
	    %correcting_pairs, @correcting_pairs, %correct);

	my %seqs=%{$_[0]};
	my @pdbg_seqs=@{$_[1]};
	my $evalue=${$_[2]};
	my $pdbg_base=${$_[3]} || $ARGV[3];
	my $E_mult_factor1=${$_[4]};
	my $E_mult_factor2=${$_[4]};
        if(ref($_[5])){  $leng_thresh =${$_[5]}  }else{ $leng_thresh=$_[5]; }

Bioinf.pm  view on Meta::CPAN

                     }
                }
                if($correct_matched !=1){
                     $false_positive++;
                     unless( $correct{$sorted} ){
                          $correct_group{$base} .="Nomolog: $sorted $base  $msp_0{$sorted}\n";
                     }
                     $correct{$sorted} = "Nomolog: $base  $msp_0{$sorted}";
                }
	   }
           if(@match_seqs == 0){ @match_seqs=1; $percent_correct=0; }
	   $sum_correct += $correct;
	   $sum_false   += $false_positive;
	}
	$av_correct = $sum_correct/$num_enq_seq;
	$av_false   = $sum_false  /($num_enq_seq);

	#### $actual_e_value becomes whatever $E_mult_factor1 defined ~~~~~~~~~~~~
	if($E_mult_factor1 != 1){
	   $actual_e_value=$evalue * $E_mult_factor1;
	}elsif($E_mult_factor2 != 1){

Bioinf.pm  view on Meta::CPAN

    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my($merge, $verbose, $sat_file, $length_thresh, $factor, $indup, $indup_percent,
       $score, @temp_show_sub, $optimize, $file, $evalue, $over_write, $din_dom,
       $sum_seq_num, $base_1, $output_clu_file, $short_region, $large_region,
       $average_region, $dynamic_factor, @sub_clustering_clu_files);

    $factor=7; # default factor is 7 for 70%
    $length_thresh=30;

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Dealing with options
    #_________________________________________

Bioinf.pm  view on Meta::CPAN

    }if($char_opt=~/L/){       $large_region  ='L';
    }if($char_opt=~/A/){       $average_region='A';
    }if($vars{'T'}=~/\d+/){    $length_thresh= $vars{'T'};
    }if($vars{'l'}=~/\d+/){    $length_thresh= $vars{'l'}; ## synonym of 't'
    }if($vars{'f'}=~/\S+/){    $factor= $vars{'f'};
    }if($vars{'s'}=~/\d+/){    $score = $vars{'s'};
    }if($vars{'e'}=~/\d+/){    $evalue= $vars{'e'};
    }if($vars{'E'}=~/\d+/){    $evalue= $vars{'E'}; # synonym of e
    }

   $percent_fac=$factor*10;

   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   # (0) When one file input was given (yes, divclus can handle multiple files, Sarah!)
   #________________________________________________________________________________
   if(@file == 1){  #<=== @file has xxxx.msp, yyyy.msp  zzzz.msp ....,
        print "\n# (1) divide_clusters: One single file was given=> \"@file\"\n" if $verbose;
        $file=$file[0];
        $base_1=${&get_base_names($file)};

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Bioinf.pm  view on Meta::CPAN

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # (4) This is critical seqlet merging step
        #________________________________________________________________________________
        @out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, $dynamic_factor,
               "f=$factor", $short_region, $large_region, $average_region, $optimize)};

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # (5) This is showing the result in clu file format
        #________________________________________________________________________________
        @temp_show_sub=&show_subclusterings(\@out, $file, $sat_file, $dindom, $indup,
						   "e=$evalue", "p=$percent_fac", "f=$factor" );
        $good_bad       = $temp_show_sub[0];
        $indup_c        = $temp_show_sub[1];
        $sum_seq_num   += $temp_show_sub[2];
        push(@sub_clustering_out_files, @{$temp_show_sub[3]});

        if($good_bad==1){      push(@good, $file);
        }else{                 push(@bad, $file);       }

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # (6) Final write up stage (unecessary)

Bioinf.pm  view on Meta::CPAN

            }

            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            #  (4) Clustering the sets of merged seqlets => CORE algorithm
            #____________________________________________________________

            @out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, "f=$factor", $optimize, $dynamic_factor,
                   $short_region, $large_region, $average_region)};

            @temp_show_sub=&show_subclusterings(\@out, $big_msp_file, $sat_file, $dindom, $indup,
                                                    "e=$evalue", "p=$percent_fac", "f=$factor");
                        $good_bad       = $temp_show_sub[0];
                        $indup_c        = $temp_show_sub[1];
                        $sum_seq_num   += $temp_show_sub[2];
            push(@sub_clustering_out_files, @{$temp_show_sub[3]});

            if($good_bad==1){          push(@good, $big_msp_file);
            }else{         push(@bad, $big_msp_file);       }

          }
          #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`

Bioinf.pm  view on Meta::CPAN

	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
        my ($max_size, $sat_file_name, $clu_file_name,
        $ori_cluster_size, $ori_cluster_num, $good_bad, @keys, $percentage_fac,
        $indup, @sizes, $sum_seq_num, $indup_percent, $indup_count, %tem4,
        @sub_clustering_out_files);  # clusall_1e-5_clu_14-324_ss.sat
	my @out=@{$array[0]};
	$indup_count=0;

	if($char_opt=~/d/){	    $dindom=1;	}
	if($char_opt=~/i/){		$indup=1;	}
	if($vars{'f'}=~/\S+/){     $factor= $vars{'f'}; }
	if($vars{'p'}=~/\d+/){ $percentage_fac= int($vars{'p'}); }
	if($vars{'s'}=~/\d+/){	   $score = $vars{'s'};	}
	if($vars{'e'}=~/\d+/){	   $evalue= $vars{'e'};	}

	#print "\n# (1) show_subclusterings : \@file has : @file\n";
	if( $file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.msp/  or
		$file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.sat/   ){
		 $ori_cluster_size=$2;
		 $ori_cluster_num =$3;
		 $base=$1;
		 $sat_file_name="$base\.sat";

Bioinf.pm  view on Meta::CPAN

		         $ori_cluster_size,
		         $dindom,
		         $clu_file_name,
								 \%tem3, \%tem4,
								 $indup, );

           #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           # Local subroutine
           #_______________________________________________________________
	   sub print_summary_for_divclus{ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
               my(@keys, $indup_count, $x, $m, $percentage_fac);
               my $count=$_[0]; # count of cluster
	       my %tem2=%{$_[1]};	my $num_seq=@keys=sort keys %tem2;
	       my %tem=%{$_[2]};	my $ori_cluster_num=$_[3];
	       my $new_clus_NAME=$ori_cluster_num.'0'.$count.'0'.$num_seq;
	       my $ori_cluster_size=$_[4];
	       my $dindom=$_[5];	my %tem3=%{$_[7]};
	       my $indup=$_[9];	my (%internal_dup);
	       my %tem4=%{$_[8]};
               #~~~~~~~~~~ Domain Inside Domain ~~~~~~~~~~~~~~~~~
	       if($dindom){

Bioinf.pm  view on Meta::CPAN

                                     for($m=0; $m< @dup_keys; $m++){
                                             printf ("%-19s=> %s\n", $dup_keys[$m], $internal_dup{ $dup_keys[$m] } );
                                     }
                             }
                    }
                 }

                #~~~~~~~~~~ Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~
                print  CLU  "Cluster size $num_seq\n";
                                        printf CLU ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
                                          $new_clus_NAME, $evalue, $factor, $percentage_fac,
                                          $ori_cluster_size, $num_seq, $ori_cluster_num);
                print       "Cluster size $num_seq\n";
                printf     ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
                              $new_clus_NAME, $evalue, $factor, $percentage_fac,
                              $ori_cluster_size, $num_seq, $ori_cluster_num);
                for($x=0; $x <@keys; $x++){
                   printf CLU ("   %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
                               $num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
                   printf ("   %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
                          $num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
                }
                return($indup_count);
	   }
	}

Bioinf.pm  view on Meta::CPAN

#             1cdg_6taa      -------442---------------2222-----------------000-
#             1cdg_2aaa      -------222---------------2222-----------------000-
#
#             <Final output>
#             2aaa_6taa      0%
#             1cdg_6taa      67%
#             1cdg_2aaa      67%
#
# Warning   :
# Keywords  :
# Options   : 'p' or 'P' for percentage term(default)
#             'r' or 'R' for ratio term (0.0 - 1.0), where 1 means all the
#              segments were wrongly aligned.
#             's' or 'S' for Shift rate (it actually caculates the position shift
#              rate for the secondary structure segment.
#             'h' or 'H' for position Shift rate (it actually caculates the position
#              shift rate for helical segments). If this is the only option, it
#              will show the default percentage term rate for helical segments.
#              If used with 'r', it will give you ratio (0.0 - 1.0) for helical
#              segment. If used with 's' option, it will give you position shift
#              rate for only helical segments.
#             'e' or 'E' for position Shift rate (it actually caculates the position
#              shift rate for beta segments). If this is the only option, it will
#              show the default percentage term rate for beta segments. If used
#              with 'r', it will give you ratio (0.0 - 1.0) for beta. If used
#              with 's' option, it will give you position shift rate for only
#              beta segments.
# Returns   :
# Argument  : Two references of hashes. One for error rate the other for sec.
#             assignment.
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub get_segment_shift_rate{

Bioinf.pm  view on Meta::CPAN

# Title     : get_wrong_segment_rate
# Usage     : print_seq_in_block( &get_wrong_segment_rate(\%superposed_hash) );
# Function  : Treats the segment as one single big error.
#             calculates the wrong segment number compared to the correct ones.
# Example   : <input example> hash of 3 keys and values.
#             2aaa_6taa      -------00000---------00000000----0000-------00000-
#             1cdg_6taa      -------442---------------2222-----------------000-
#             1cdg_2aaa      -------222---------------2222-----------------000-
#
#             In the above there are two segments wrong in 3 segment blocks = 2/3
#             <output example> hash of 3 percentage rates.
#
#             2aaa_6taa      0 %
#             1cdg_6taa      66.6666666666667 %
#             1cdg_2aaa      66.6666666666667 %
#
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :

Bioinf.pm  view on Meta::CPAN

#                                                by 9(occurance) in the above
#             for '0' of the first hash, so (0, 2.222,  1, 2.1666,  2, 2.4 )
#             Average is the average of numbers
#             average value in 0-9 scale (or 0-100 with 'p' option)
#             So, if there are
#                  seq1 00111110000,   The 'a' value of 0 and 1 as in the seq2
#                  seq2 33000040000    is 0-> 6/6, 1-> 4/5, while the 'n'
#                                        calc would be, 0-> 6 (60%), 1-> 4(40%)
#
# Argument  : (\%hash1, \%hash2) or optionally (\%hash1, \%hash2, ['n', 'i', 'p', 'a'])
#             'n' => normalizing, 'p' => percentage out, 'i' => make int out, 'a'=> averaged
# Category  :
# Version   : 1.2
#--------------------------------------------------------------------
sub tally_2_hashes{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);

Bioinf.pm  view on Meta::CPAN

	 if( $char_opt=~/m/i){  $minus_whole_cs  = 'm'}

	@input = @{$array[0]};
	if(defined(${$_[2]})){ $base_level =${$_[2]}; }
	if(defined(${$_[3]})){ $scale  =${$_[3]}; }

	for ($t=0; $t< @input; $t++){ $length=length($input[$t]) if(length($input[$t])>$length);}
	if ($length < $window_size){  $window_size = $length;   }
	 #___________ getting ratio for the whole sequence ___________

	$compos_whole_seq=${&main::compos_id_percent_array(\@input)};  ## for whole composition rate
	$seq_id_whole_seq=${&main::seq_id_percent_array(\@input)};
	print "\nComposition ID of the alignment:  $compos_whole_seq\%\n";
	print "Sequence    ID of the alignment:  $seq_id_whole_seq\%\n";
	if ($seq_id_whole_seq == 0){  $ratio_whole_seq =0; }
	else{  $ratio_whole_seq = $compos_whole_seq/$seq_id_whole_seq;  }
	print "Composition and Sequ.  ID Ratio:   $ratio_whole_seq\n";

	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	##########   Initial Window size setting      ##############################
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	if   (  $ratio_whole_seq >9 ){ $window_size= 28; } # $ratio_whole_seq is

Bioinf.pm  view on Meta::CPAN

	 ##      Go back if the window size is too small due to gaps        ##
	 #####################################################################
	 if( (length($win_no_gap1) < $ori_win_size)&&($variable_win_size ==1) )
	 {
		$window_size+=1; $w--; next;
	 }

	 #####################################################################
	 ##      Getting Compos and Seq ids                                 ##
	 #####################################################################
	 $compos_id=${&main::compos_id_percent_array(\@array_of_2_seq)};
	 $seq_id   =${&main::seq_id_percent_array(\@array_of_2_seq)};


	 #####################################################################
	 ####    Go back if the Seq id is bigger than Compos id        #######
	 #####################################################################
	 if(($variable_win_size eq 'v') && ($seq_id >  $compos_id))
	 {
		$window_size+=1;  $w--;   next;
	 }

Bioinf.pm  view on Meta::CPAN

	my(@ratio_array, @array_of_2_seq, $seq_id, $offset, $half_of_w_size, $t, $length, $w,
	  $compos_id, $seq_id, $window_2, $window_1, $compos_whole_seq, $seq_id_whole_seq,
	  $ratio_whole_seq, $win_rate_div_by_whole_rate, $normalizing_factor, $lowest_rate );

	for ($t=0; $t < @input; $t++){
	 $length=length($input[$t]) if (length($input[$t])>$length);   }

	if ($length < $window_size){  $window_size = $length;   }

	  #___________ getting ratio for the whole sequence ___________
	$compos_whole_seq=${&compos_id_percent_array(\@input)};
	$seq_id_whole_seq=${&seq_id_percent_array(\@input)};
	if ($seq_id_whole_seq == 0){  $ratio_whole_seq =$compos_whole_seq/10; }
	else{  $ratio_whole_seq =$compos_whole_seq/$seq_id_whole_seq;  }

	  #___________ getting ratio for each window sequence ___________
	for ($w=0; $w < $length; $w++){
	 $offset = $w - int($window_size/2);  # $offset starts from -5 when window_size is 10.
	 $offset=0 if ($offset < 0);
	 $window_1=substr($input[0], $offset, $window_size);  # window_1 is one segment
	 $window_2=substr($input[1], $offset, $window_size);  # of defined length(size)
	 @array_of_2_seq=($window_1, $window_2); # making an array like this = ('ABCDE', 'BDESA')
	 $compos_id=${&compos_id_percent_array(\@array_of_2_seq)};
	 $seq_id   =${&seq_id_percent_array(\@array_of_2_seq)};
	#print "\n offset = $offset Wind1 = $window_1  Wind2 = $window_2 ";
	#print " Compos1 = $compos_id  Seqid = $seq_id \n";

	 #______  Handle special case when $seqid is zero > the rate becomes $compos_id/10 ______
	 if (($seq_id == 0) && ($compos_id != 0)){ $ratio_compos_vs_seqid = $compos_id/10;   }
	 elsif(($seq_id == $compos_id)&&($seq_id == 0)){ $ratio_compos_vs_seqid = 0;}
	 elsif(($seq_id == $compos_id)&&($seq_id == 100)){ $ratio_compos_vs_seqid = 0;}
	 else{ $ratio_compos_vs_seqid=($compos_id/$seq_id); }
	 push(@ratio_array, $ratio_compos_vs_seqid);  }

Bioinf.pm  view on Meta::CPAN

#
# Function  : gets the numbers of occurances for 1, 2, 3 ... position shifts.
#             If hash is given, it only looks at the values.
#             If multiple string, array, hash or combinations of these
#              are given, it will add up to one single result
# Example   :
# Warning   :
# Keywords  : composition of chars, composition table making,
#             make_composition, make composition table
#             occurances_of_char, get_char_occurances, occurances
#             get_percentage_occurances_of_char, percentage_occurances_of_char
# Options   : 'p' for percentage output of the char among others
#             'n' for NO name option when HASH input is given
# Returns   : one ref. of hash  (a =>5, b=>6, c=>4,,,,,)
# Argument  : one ref. of hash (seq1 alsdfjlsj
#                               seq2 asldfjsld
#                               seq3 owiurouou);
# Category  :
# Version   : 1.4
#--------------------------------------------------------------------
sub  get_occurances_of_char{
	my ($i, %H, $no_name, %out, $N,@splited, $val,$percentage_out,
	 $split, $sum);
	for($i=0; $i< @_; $i++){
	if($_[$i]=~/^[\-]?p$/i){
	   $percentage_out=1;   splice(@_, $i, 1); 	   $i--;
	}elsif($_[$i]=~/^[\-]?n$/i){
	   $no_name=1;	        splice(@_, $i, 1);	   $i--;
	}
	}

	for($i=0; $i< @_; $i++){
	if(  ref($_[$i]) eq 'HASH'){
	  my %H=%{$_[$i]};
	  my @names=keys %H;
	  for $key (@names){

Bioinf.pm  view on Meta::CPAN

	  @splited=@{$_[$i]};
	  for $split (@splited){  $out{$split}++; $sum++ }
	}elsif(ref($_[$i]) eq 'SCALAR'){
	   @splited = split(//, ${$_[$i]});
	   for $split (@splited){  $out{$split}++; $sum++ }
	}elsif( !(ref($_[$i])) ){
	   @splited = split(//, $_[$i]);
	   for $split (@splited){  $out{$split}++; $sum++ }
	}
	}
	if($percentage_out==1){
	 my @keys=keys %out;
	 my %percent;
	 for($i=0; $i< @keys; $i++){
		$percent{$keys[$i]} = $out{$keys[$i]}/$sum*100;
	 }
	 return(\%percent);
	}else{
	 return(\%out);
	}
}


#________________________________________________________________________
# Title     : make_composition_table
# Usage     : %occurances=%{&make_compos_table(\%key_and_value_for_seq)};
# Function  : gets the numbers of occurances for 1, 2, 3 ... position shifts.

Bioinf.pm  view on Meta::CPAN

    $aa{"c"} = "cys";  $aa{"n"} = "asn";  $aa{"e"} = "glu";  $aa{"q"} = "gln";
    $aa{"f"} = "phe";  $aa{"r"} = "arg";  $aa{"g"} = "gly";  $aa{"s"} = "ser";
    $aa{"h"} = "his";  $aa{"t"} = "thr";  $aa{"i"} = "ile";  $aa{"v"} = "val";
    $aa{"k"} = "lys";  $aa{"w"} = "trp";  $aa{"l"} = "leu";  $aa{"y"} = "tyr";
    return(\%aa);
}



#________________________________________________________________________
# Title     : amino_acid_compos_id_percent
# Usage     : $percent = &amino_acid_compos_id_percent (%any_hash_with_sequences);
#             The way identity(composition) is derived is;
#
# Function  : gets amino acid composition identity of any given
#             number of sequences(at least 2).
# Example   :
# Warning   :
# Keywords  : get_amino_acid_composition, get_protein_composition, composition
# Options   :
# Argument  : hash of at least 2 sequences.
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub amino_acid_compos_id_percent{
	my(%input)= %{$_[0]};
	my(@names)=keys (%input);
	my(@temp, $i, $j, $iden, @all_pairs_id, $iden_sum);
	my(%compos_table1, %compos_table2, $final_iden, $larger);
	for ($i=0; $i < @names; $i ++){  # putting seqs in arrays.
	 $input{$names[$i]}=~ tr/a-z/A-Z/;
	 $input{$names[$i]}=~ s/\W//g;
	 @{"string$i"}= split('', $input{$names[$i]});
	 $larger = @{"string$i"} if @{"string$i"} > $larger;
	}

Bioinf.pm  view on Meta::CPAN

		push (@all_pairs_id,  $iden );
	 }
	}
	for $iden (@all_pairs_id){  $iden_sum+= $iden;  }
	if(@all_pairs_id == 0){ @all_pairs_id =1; }
	$final_iden=$iden_sum/@all_pairs_id;
	\$final_iden;
}

#________________________________________________________________________
# Title     : seq_id_percent_array  (more than 2 elements array)
# Usage     : $percent = &seq_id_percent_array(@any_array_sequences);
#             The way identity(pairwise) is derived is;
#
# Function  : produces amino acid composition identity of any given number of sequences.
# Example   :
# Warning   : This can handle 'common gaps' in the sequences
# Keywords  : get_percent_composition_identity, seq_composition_identity,
#             percent_sequence_composition_id
#
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub seq_id_percent_array{
	my(@input, $denominator,@all_pairs_id, $percent_id);
	my($largest,$p,$i,$j,$k,$iden_residue_num,$iden,@temp,$iden_sum,$gap_num,$final_iden);
	for($d=0; $d<@_; $d++){
		if(ref($_[$d]) eq 'ARRAY'){ @input=@{$_[$d]}; }
		elsif( (ref($_[$d]) eq 'SCALAR') &&( ${$_[$d]}=~/^[aA]/) ){ $average_len_opt =1 }
		elsif( !(ref($_[$d])) && ( $_[$d] =~/^[aA]/) ){ $average_len_opt =1 } }
	if ((@input== 1)||( @input== 0)){
		print "\n\n \" $0 \"  requires at least 2 sequences\n\n";
		print "\n Abnormally dying at seq_id_percent_array in $0 \n\n";
		print chr(7); exit;}
	$shortest=length($input[0]);
	my($sans_gap_seq, $length_sum, $average_seq_len);
	for($p=0; $p < @input; $p++){
		$input[$p]=~ tr/a-z/A-Z/;
		$sans_gap_seq=$input[$p];
		$sans_gap_seq=~s/\W//g;
		$input[$p]=~ s/\W/./g;
		(@{"string$p"})=split('', $input[$p]);
		$largest = length($input[$p]) if length($input[$p]) > $largest;

Bioinf.pm  view on Meta::CPAN

	$average_seq_len = $length_sum/@input;
	for($i=0; $i< @input; $i++){
		for($j=$i+1; $j< @input; $j++){
			for ($k=0; $k <  $largest; $k ++){  # getting composition tables for two seqs.
				if ((${"string$i"}[$k] !~ /\W/)&&(${"string$i"}[$k] eq ${"string$j"}[$k])){
					$iden_residue_num++; }
				elsif((${"string$i"}[$k] =~ /\W/)&&(${"string$i"}[$k]=~ /\W/)){ $gap_num++; }}
			if( $average_len_opt == 1){ $denominator = $average_seq_len; }
			else{ $denominator = $shortest; }
			if($denominator == 0){ $denominator=1; }  # in the above it is 50% rather than 0.07%
			$percent_id=($iden_residue_num/($denominator))*100;
			push(@all_pairs_id, $percent_id);
			undef ($iden_residue_num, $gap_num);
		}
	}
	for (@all_pairs_id){  $iden_sum+=$_;    }
	$final_iden=$iden_sum/($#all_pairs_id+1);
	return( \$final_iden );
}

#________________________________________________________________________
# Title     : compos_id_percent_array  (more than 2 elements array)
# Usage     : $percent = &compos_id_percent_array(@any_array_sequences);
#             The way identity(composition) is derived is;
# Function  : produces amino acid composition identity of any given number of sequences.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub compos_id_percent_array{
	 my(@input)=@{$_[0]};
	 my($largest,$iden,@temp,$iden_sum,$final_iden, @all_pairs_id);
	for($i=0; $i<=$#input; $i++){  $input[$i]=~ tr/a-z/A-Z/;  $input[$i]=~ s/[\.\-\s]//g;
		@temp = split('', $input[$i]);  (@{"string$i"})= @temp;
		$largest = @{"string$i"} if @{"string$i"} > $largest;    }
	for($i=0; $i< @input; $i++){ #_________ permutating ___________
		for($j=$i+1; $j<=$#input; $j++){
			for ($k=0; $k <= $largest; $k ++){  # getting composition tables for two seqs.
				$compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);
				$compos_table2{${"string$j"}[$k]}++ if (${"string$j"}[$k] =~ /\w/);   }

Bioinf.pm  view on Meta::CPAN

					 sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
				 $sum_of_the_common_residue_no += $common;     }
			 $compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100;   }
	  \$compos_id;
	}
	#-----------------------------------------------------
	return ( \$final_iden ); # final identity for any given set of strings(seq).
}

#________________________________________________________________________________
# Title     : compos_id_percent_hash  (synonym of amino_acid_compos_id_percent)
# Usage     : $percent = &compos_id_percent_hash(%any_hash_with_sequences);
#             The way identity(composition) is derived is;
#
# Function  : gets amino acid composition identity of any given number of sequences.
# Example   :
# Warning   :
# Keywords  : get_amino_acid_composiiton
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#------------------------------------------------------------------------------
sub compos_id_percent_hash{ my(%input, @names);
	if(ref($_[0]) eq 'HASH'){  %input= %{$_[0]}; @names= keys  %input;  }
	else{ print "\n hash ref was not passed to compos_id_percent_hash\n"; exit; }
	my(@temp, $iden, @all_pairs_id, $i, $j, $k,$iden_sum);
	my(%compos_table1, %compos_table2, $final_iden, $larger);
	for ($i=0; $i < @names; $i ++){  $input{$names[$i]}=~ tr/a-z/A-Z/;
	 $input{$names[$i]}=~ s/\W//g;    @temp = split('', $input{$names[$i]});
	 (@{"string$i"})=@temp; $larger = @{"string$i"} if @{"string$i"}>$larger;}
	for ($i=0; $i < @names; $i++){
	 for ($j=$i; $j < @names; $j ++){
		if ($j == $i){ next; }
		for ($k=0; $k < $larger; $k ++){
		  $compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);

Bioinf.pm  view on Meta::CPAN

#________________________________________________________________________
# Title     : common_compos_id_hash (BUG free)
# Usage     : %hash = &common_compos_hash(\%any_hash1, \%any_hash1);
# Function  : actual calculation of identity
# Example   : ('A', 200, 'C', 191, D, 99)
#                  ('A', 290, 'C', 199, D, 100)
#             uses only two sequences.
# Warning   :
# Keywords  :
# Options   :
# Returns   : ref. of a scaler (in percent)  eg)  95
# Argument  : two references of hash of seqeunces.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub  common_compos_id_hash{
	my(%hash1)=%{$_[0]};
	my(%hash2)=%{$_[1]};
	my(%common_of_the_2)=();  my($common, $compos_id, $sum_of_the_common_residue_no);
	my(@values1) = values (%hash1);  my(@values2) = values (%hash2);
	my(@combined_values)=(@values1, @values2);

Bioinf.pm  view on Meta::CPAN

#________________________________________________________________________
# Title     : calc_compos_id_hash (the same as 'common_compos_hash')
# Usage     : %hash = &calc_compos_hash(\%any_hash1, \%any_hash1);
# Function  : actual calculation of identity
# Example   : ('A', 200, 'C', 191, D, 99)
#                  ('A', 290, 'C', 199, D, 100)
#             uses only two sequences.
# Warning   :
# Keywords  :
# Options   :
# Returns   : ref. of a scaler (in percent)  eg)  95
# Argument  : two references of hash of seqeunces.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub  calc_compos_id_hash{ my(%hash1)=%{$_[0]}; my(%hash2)=%{$_[1]}; my(%common_of_the_2)=();
	  my($common, $compos_id, $sum_residues, $sum_of_the_common_residue_no);
	  my(@values1) = values (%hash1);   my(@values2) = values (%hash2);
	  my(@combined_values)=(@values1,@values2);
	  for $elem (@combined_values){
			$sum_residues += $elem;   }
	  if($sum_residues == 0){ $compos_id =0; } # to prevent Illegal division error.
	  else{
		  for $key1(keys %hash1){
			  $common=&smaller_one($hash1{$key1}, $hash2{$key1});
				  sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
			  $sum_of_the_common_residue_no += $common;     }
		  $compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100;   }
	  \$compos_id;
}
#________________________________________________________________________
# Title     : get_percentage
# Usage     : %out= %{&get_percentage(\%result, '1')};
# Function  : calculates the percentage content of any single char over the whole
#             length of strings in it.
# Example   : if the string is  'seq  ABCDEEEEEFFEFE' given in a hash
#             if you put 'A' as one argument, it counts the occurances of 'A'
#             and gets the percentage of it.
# Warning   : This converts array and string input as ref. into arbitrary hash and
#             returns hash
#             programmed by A Biomatic
# Keywords  : get_percentage_of_char
# Options   : None yet.
# Returns   : Numerical Percentage
# Argument  : ref. for Scalar string or Array of chars or Hash  AND 'the target char'
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_percentage{
	my(@in, $k, $sort, $numerator, $residue, @out_hash_ref, %hash_out );
	for($k=0; $k< @_ ;$k++){
	  if( !ref($_[$k])&& (length($_[$k]) == 1 )){
		 $numerator = $_[$k];
	  }
	  elsif( (ref($_[$k]) eq 'SCALAR') && (length(${$_[$k]}) == 1 )){
		 $numerator = ${$_[$k]};
	  }
	  elsif(ref($_[$k]) eq "HASH")  { push(@in, $_[$k]); }
	  elsif(ref($_[$k]) eq "ARRAY") { push(@in, &convert_arr_and_str_2_hash($_[$k], $k));} #<-- conv array to hash.
	  elsif(ref($_[$k]) eq "SCALAR"){ push(@in, &convert_arr_and_str_2_hash($_[$k], $k));} #<-- conv array to hash.
	}
	####### final output is => @in of hash ref elements #######
	for (@in){   my(%H) = %{$_}; my(@keys)= sort keys %H;
	 for $name(@keys){
		 my($numerator_count);
		 my($seq_len) = length($H{$name}); print  "\n $name Sequence length: ", $seq_len, "\n";
		 my(@string) = split(//, $H{$name});
		 for $residue (@string){  if($residue =~/^$numerator$/){ $numerator_count ++; }}
		 my($percent) = $numerator_count/$seq_len *100;
		 $hash_out{$name}=$percent;    }
	 push(@out_hash_ref, \%hash_out);  }
	if(@out_hash_ref ==1){ return($out_hash_ref[0]); }
	elsif( @out_hash_ref > 1){ return(@out_hash_ref); }
}


#________________________________________________________________________
# Title     : pairwise_percent_id  (pairwise sequence identity in percentage)
# Usage     : $identity = ${&pairwise_percent_id(%arrayinput)};
#
# Function  : takes a ref. of a hash of names and sequences, returns
#             percent identity.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub pairwise_percent_id{
	my($i,$j,$k, @iden_array_ref);
	for($i=0; $i< @_; $i++){  my %input= %{$_[$i]};  my @names= sort keys %input;
	  my(@temp, $iden, @all_pairs_id, $whole_seq_len, $residue_sum1,$residue_sum2);
	  my($final_av_iden, $larger, $percent_for_pair,@percent_for_pair, $iden_sum);
	  for ($i=0; $i < @names; $i ++){ $input{$names[$i]}=~ tr/a-z/A-Z/;
		 @temp = split('', $input{$names[$i]});  (@{"string$i"})=@temp;
		 $larger = @{"string$i"} if @{"string$i"} > $larger; }
			 for ($i=0; $i < @names; $i++){       # to make permutated pairs.
				for ($j=$i+1; $j < @names; $j ++){
					for ($k=0; $k < $larger; $k ++){  # getting composition tables for two seqs.
					  $iden+=2 if ((${"string$i"}[$k] eq ${"string$j"}[$k])&&(${"string$i"}[$k] =~ /\w/));
					  $residue_sum1++ if (${"string$i"}[$k] =~ /\w/);
					  $residue_sum2++ if (${"string$j"}[$k] =~ /\w/);  }
					$whole_seq_len =($residue_sum1+$residue_sum2);
					$percent_for_pair = $iden/$whole_seq_len*100;
					push(@percent_for_pair,$percent_for_pair);
					$residue_sum1=0; $residue_sum2=0; $iden=0; } }
			 for $iden (@percent_for_pair){ $iden_sum+=$iden;}
	  $final_av_iden=$iden_sum/( @percent_for_pair );
	  push(@iden_array_ref, \$final_av_iden);  }
	if(@iden_array_ref ==1){ return($iden_array_ref[0]);}else{ return(@iden_array_ref);}
}


#________________________________________________________________________
# Title     : get_seq_identity
# Usage     : $identity = ${&get_seq_identity(%arrayinput)};
#
# Function  : takes a ref. of a hash of names and sequences, returns
#             percent identity. NOT composition identity.

# Example   :
# Warning   :
# Keywords  : get_sequence_identity
# Options   :
# Returns   :
# Argument  : hash(es) of sequences.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_seq_identity{
	my($i,$j,$k, $c, @iden_array_ref);
	for($c=0; $c< @_; $c++){
	  my %input= %{$_[$c]};
	  my @names= sort keys %input;
	  my(@temp, $iden, @all_pairs_id, $whole_seq_len, $residue_sum1,$residue_sum2);
	  my($final_av_iden, $larger, $percent_for_pair,@percent_for_pair, $iden_sum);
	  for ($i=0; $i < @names; $i ++){
		 $input{$names[$i]}=~ tr/a-z/A-Z/;
		 @temp = split(//, $input{$names[$i]});
		 @{"string$i"}=@temp;
		 $larger = @{"string$i"} if @{"string$i"} > $larger; }
		 for ($i=0; $i < @names; $i++){       # to make permutated pairs.
			for ($j=$i+1; $j < @names; $j ++){
				for ($k=0; $k < $larger; $k ++){
				  if ( ${"string$i"}[$k] eq ${"string$j"}[$k] and ${"string$i"}[$k] =~ /\w/){
					 $iden+=2 ;
				  }
				  $residue_sum1++ if (${"string$i"}[$k] =~ /\w/);
				  $residue_sum2++ if (${"string$j"}[$k] =~ /\w/);
				}
				$whole_seq_len =($residue_sum1+$residue_sum2);
				$percent_for_pair = $iden/$whole_seq_len*100;
				push(@percent_for_pair,$percent_for_pair);
				$residue_sum1=0;
				$residue_sum2=0;
				$iden=0;
			}
		 }
	  for $iden (@percent_for_pair){
		  $iden_sum+=$iden;
	  }
	  if(@percent_for_pair <1){ @percent_for_pair=(1); }
	  $final_av_iden=$iden_sum/( @percent_for_pair );
	  push(@iden_array_ref, \$final_av_iden);
	}
	if(@iden_array_ref ==1){
	   return($iden_array_ref[0]);
	}else{
	   return(@iden_array_ref);
	}
}




#________________________________________________________________________
# Title     : get_correct_percent_alignment_rate  (made for Bissan)
# Usage     : &get_correct_percent_alignment_rate(\$file1, \$file2);
# Function  : accepts two files and prints out the sequence identities of the alignment.
# Example   :
# Warning   : Alpha version,  A Biomatic , made for Bissan
# Keywords  :
# Options   : h  # for help
#             v  # for verbose printouts(prints actual sequences)
# Returns   : reference of Scalar for percentage correct alignment(for already
#             aligned sequences)
# Argument  : two sequence files which have identical sequence names.
# Category  :
# Version   :
#--------------------------------------------------------------------
sub get_correct_percent_alignment_rate{
	 my($i, $j, $k, $verbose, @string1, @string2, $larger, $seq_pair_id, @seq_pair_ids );
	 my(%inputhash1) = %{&read_any_seq_files($_[0])};
	 my(%inputhash2) = %{&read_any_seq_files($_[1])};
	 my(@names)= sort keys %inputhash1;
	 ######################################
	 ####### Sub argument handling ########
	 ######################################
	 for($k=0; $k< @_ ;$k++){
		if( !ref($_[$k])&& (length(${$_[$k]}) < 5)){  # when inputs are not ref.
		  if($_[$k]=~ /^[\-vV]$/){ $verbose = 1; next;}

Bioinf.pm  view on Meta::CPAN

	 }
	 print "\n", "_"x88, "\n";
	 my($whole_average_of_the_id)=${&array_average(\@seq_pair_ids)};
	 print "The whole average is; $whole_average_of_the_id\n";
	 if(@seq_pair_ids == 1){ return( \$seq_pair_ids[0] ); }
	 elsif(@seq_pair_ids > 1){ return( \@seq_pair_ids ); }
}


#________________________________________________________________________
# Title     : amino_acid_compos_id_percent_trend
# Usage     :
# Function  :
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub amino_acid_compos_id_percent_trend{
	my(%input) = %{$_[0]};
	my(@common, @string,@accumu_percent_iden)=(); my(%common_so_far, %compos_table);
	my($percent_id_so_far, $length_of_one_seq,$length_of_all_seq, $seq_no)=0;
	for $key(keys %input){
			$input{$key}=~s/[. \d-]//g;
			@string= split(//, $input{$key});
			print @string; print "\n";
			$length_of_one_seq = $#string+1;
			$length_of_all_seq +=$length_of_one_seq;
			$seq_no += 1;
			%compos_table  = &composition_table(@string);
			@check = keys (%common_so_far);
			if  ($#check < 0){ %common_so_far = %compos_table; }
			else{ %common_so_far= %{&common_compos_2_hash(\%common_so_far,\%compos_table)};}
			for $value(values %common_so_far){ $common_residue_sum +=$value; }
			$final_percent_id = $common_residue_sum/($length_of_all_seq/$seq_no)*100;
			$common_residue_sum =0;  }
	for $value(values %common_so_far){ $common_residue_sum +=$value; }
	$final_percent_id = $common_residue_sum/($length_of_all_seq/$seq_no)*100;
	return(\$final_percent_id);
}

#________________________________________________________________________
# Title     : composition_table   (can handle both nucleic and protein seq)
# Usage     : %output = %{&compos_table(@input_array1, @input_array2,,,,)};
#             example input
#
# Function  : returns a table of alphabet with occurances.
#             can handle any char, this converts char to upper case.
# Example   :

Bioinf.pm  view on Meta::CPAN

	my(%common_of_the_2)=(); my($common)=0;
	for $key1(keys %hash1){
	 $common=&smaller_one($hash1{$key1}, $hash2{$key1});
	 if ($common =~ /\d+/){
		$common_of_the_2{$key1}=$common; } }
	\%common_of_the_2;
}


#________________________________________________________________________
# Title     : pair_percent_id_trend
# Usage     : @array = &pair_percent_id_trend (%arrayinput);
# Function  :
# Example   : common gaps means only '.' (dots, not alphabets!!)
#             AAA....BBCB
#             AAAB..B.BCC  --> A.A.....BC. (as in an array)
#             A.AAA...BCA
#             The resulting array XXXXX..XXXX is literally like so.
#             This is to detect absurd gaps in the above.
#
#
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   :
#--------------------------------------------------------------------
sub pair_percent_id_trend{
	my(%input) = %{$_[0]};
	my(@common, @string,@accumu_percent_iden)=();
	my($percent_id_so_far)=0;
	for $key(keys %input){
	  my($len) = &smaller_one($#common, $#string) unless $#common < 0;
	  $input{$key}=~s/ //g;
	  @string= split(//, $input{$key});
	  $length_of_one_seq = $#string+1;
	  $length_of_all_seq +=$length_of_one_seq;
	  $seq_no += 1;
	  for ($k=0; $k <= $len;$k++){
		  if($#common == -1){
			 @common = @string;

Bioinf.pm  view on Meta::CPAN

		  }
		  elsif ( $string[$k] eq $common[$k] ){
			 $common[$k] = $string[$k];
			 $identical_count +=1;
		  }elsif( $string[$k] ne $common[$k]){
			 $common[$k]='.';
		  }
	  }
	  $num_of_iden_char = &count_num_of_char(\@common);
	  $av_seq_no = $length_of_all_seq/$seq_no;
	  $percent_id_so_far = $num_of_iden_char/$av_seq_no*100;

	  print "\n percent_id so far = $percent_id_so_far \n";
	  push(@accumu_percent_iden,$percent_id_so_far);

	} # end of for (after all sequences have been run).
	$num_of_iden_char = &count_num_of_char(@common);
	$av_seq_no = $length_of_all_seq/$seq_no;
	$percent_id_so_far = $num_of_iden_char/$av_seq_no*100;
	print "\n percent_id so far = $percent_id_so_far \n";
	\@accumu_percent_iden; # final ids array.
}
#________________________________________________________________________
# Title     : smaller_one
# Usage     : $smaller = & smaller_one($var, $var2);
#
# Function  : gets smaller value of the two inputs
# Example   : will return   5   with  &smaller_one(5, 50);
# Warning   : gets only digits!!
# Keywords  :
# Options   :

Bioinf.pm  view on Meta::CPAN

# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   :
#--------------------------------------------------------------------
sub count_num_of_char{
	 my(@input)={$_[0]};
	 my($num_of_char)=0;
	 for $elem(@input){  # this is for the percentage of TWO seqs.
		 if ($elem =~ /\w/){
					 $num_of_char +=1;
		 }
	 }
	 $num_of_char;
}
#________________________________________________________________________
# Title     : remov_com_column2  (this is the older and slower version)
# Usage     : %new_string = %{&remov_com_column2(\%input_hash)};
# Function  :

Bioinf.pm  view on Meta::CPAN

# ((-l option for long output )
#  NAME MOZ_HUMAN_part
#  HEADER  |- Residue -|  Pred  Rel      NAli   Asn
#  PRED    1    MET    M  c     0.000    0      ?
#  PRED    2    ALA    A  c     0.000    0      ?
#
# Warning   :
# Keywords  : open_prd_files, open_pred_files, predator, open_prdl_files
#             open_pre_files, secondary structure prediction file
# Options   : 's' for sequence output as well (\%sec_str, \%seq)
#             'p' for percentage of the sec. str.
#             'a' for accumulated percentage. This will
#                  set 'p' automatically
#             'n' for NO name when outputing Percentage of chars with
#                 HASH input to get_occurances_of_char sub.
#      $reverse_residue_order=r by r
# Returns   :
# Argument  :
# Category  :
# Version   : 1.8
#-----------------------------------------------------------
sub open_predator_files{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my( @out_ref, $seq_out, %sec_str, %seq, $percent_out, $NO_name_out,
        $short_form_out_detected, $long_form_out_detected, $accumulate,
        $reverse_residue_order, %rev_sec_str);
    if($char_opt=~/s/i){ $seq_out=1 }
    if($char_opt=~/a/i){ $accumulate=1  }
    if($char_opt=~/p/i){ $percent_out=1 }
    if($char_opt=~/n/i){ $NO_name_out='n' }
    if($char_opt=~/r/){  $reverse_residue_order='r' }

    for($i=0; $i< @file; $i++){
	 my (%sec_str, %seq) if($accumulate !=1);

         open(PREDATOR_FILE, "$file[$i]");
         print "\n# (INFO) open_predator_files: opening $file[$i]";
	 while(<PREDATOR_FILE>){
              #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`

Bioinf.pm  view on Meta::CPAN

                 $residue_3_letter=$2;
                 $residue_1_letter=$3;
                 $sec_str_predicted=$4;
                 $reliability=$5;
                 $sec_str{$position}=[$residue_1_letter, $sec_str_predicted, $reliability];
              }
	 }
         close (PREDATOR_FILE);
	 print "\n \%sec_str is: ", %sec_str, "\n" if ($debug == 1);
	 if($seq_out==1){ push(@out_ref, \%sec_str, \%seq);
	 }elsif($percent_out==1 ){
	      push(@out_ref, [%{&get_occurances_of_char(\%sec_str, $NO_name_out, 'p')}] );
	 }elsif($percent_out !=1){ push(@out_ref, \%sec_str) }

         #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
         # If -r option is set (for long form, this does not affect
         #____________________________________________________________
         if($short_form_out_detected and $reverse_residue_order){
              @keys=keys %sec_str;
              for($r=0; $r<@keys; $r++){
                  $rev_sec_str{$keys[$r]}=reverse($sec_str{$keys[$r]});
              }
              %sec_str=%rev_sec_str;

Bioinf.pm  view on Meta::CPAN

#             >P1;1cdg
#             secondary structure and phi angle
#             CCCCCCCCCCCCCCCCEEECCHHHHCCCCHHHCCCPHHCCCCPCC-CCCCCPCCHHHHHHHHHCPHHHHHPCCEE
#             >P1;1cdg
#             solvent accessibility
#             TTTTTTTTTTTFFFFFFFFFFFFFFTTTTTTTTTTTTTTTTTFTT-TTTTFFFFFTFFTTTFTTTFFTTFTFTFF
#             >P1;1cdg
#             DSSP
#             CCCCCCCCCCCCCCCCEEECCHHHHCCCCGGGCCCGGGCCCCCCC-CCCCCCCCHHHHHHHHHCCHHHHHCCCEE
#             >P1;1cdg
#             percentage accessibility
#             67523272360000000000000002213792129b722248085-14110000030015105660028040200
#             2ltn           ----TETTSFLITKFSPDQQNLIFQGDGYTT-KEKLTLTK------AVKNTVGRALYSSP
#             1loe           ----TETTSFSITKFGPDQQNLIFQGDGYTT-KERLTLTK------AVRNTVGRALYSSP
#
#             2ltn           ----CEEEEEEECCCCCCCCCEEEEPCCEEP-PPCEEEEC------CCCPCEEEEEECCC
#             1loe           ----CEEEEEEECCCCCCCCCEEEEPCCEEE-PPEEEEEC------CCCPCEEEEEECCC
#
#             2ltn           ----TTTTTTTTTTFTTTTTTFTTTTTFTFT-TTTFTFFT------TTTTTTFFFFTTTT
#             1loe           ----TTTTTTTTTTFTTTTTTFTTTTTFTFT-TTTFFFFT------TTTTTTFFFFTTTT
#

Bioinf.pm  view on Meta::CPAN

# Version   : 1.0
#--------------------------------------------------------------------
sub open_tem_files{
	my($a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, $q, $r,
	  $s, $t, $u, $v, $w, $x, $y, $z, $pwd, $file, $dir, $output, $in_dir,
	  %hash, @keys, @array, @hash, $option_string, $string, @in, $line,
	  $name, %out, $gap_chr, @str1, @str2, $num_opt, @file, @dir,
	  $char_opt, $char_opt_given, $num_opt_given,
	  @char_options, @file, $original_dir, @read_files, %array_msf, %array_jp,
	  $jp_file, $error_rate, $id_compos, @dir, @names, $name, $name_found,
	  @outref, %sequence, %secondary,%solvent_access, %DSSP, %percent_accessibility,
	  $name_found,$type_seq, $type_secon, $type_sol, $type_DSSP, $type_acc
	);
	##################################################
	##### Start of  general argument handling   ######
	##################################################
	for($k=0; $k< @_ ;$k++){
	  if( !ref($_[$k]) ){
			  if($_[$k]=~ /^[\-]*(\w)$/){
				  $char_opt  .= "\,$1";
				  $char_opt_given =1;

Bioinf.pm  view on Meta::CPAN

				} #from below ============== SECONDARY
			 }elsif(/^secondary structure and phi angle/){  $type_secon = 1;
			 }elsif(($type_secon ==1)&&(/^([\w\-]+)[\*]*$/)){
				$secondary{$name}.=$1;     #from below============= SOLVENT ACCESSIBILITY
			 }elsif(/^solvent accessibility/){  $type_sol = 1;
			 }elsif(($type_sol ==1)&&(/^([\w\-]+)[\*]*$/)){
				$solvent_access{$name}.=$1;     #from below========= DSSP
			 }elsif(/^DSSP/){  $type_DSSP = 1;
			 }elsif(($type_DSSP ==1)&&(/^([\w\-]+)[\*]*$/)){
				$DSSP{$name}.=$1;     #from below=================== PERCENTAGE ACCESSIBILITY
			 }elsif(/^percentage accessibility/){  $type_acc = 1;
			 }elsif(($type_acc ==1)&&(/^([\w\-]+)[\*]*$/)){
				$percent_accessibility{$name}.=$1;  } }
		  push(@outref,\%sequence,\%secondary,\%solvent_access,\%DSSP,\%percent_accessibility);
	  }  }
	if( ($char_opt =~ /s/i) || ( @outref  == 1 ) ){
	  return(\%sequence); }
	elsif( @outref > 1){ return(@outref); } # <-- contains (\%sequence,\%secondary,....)
}

#________________________________________________________________________
# Title     : open_hlx_files
# Usage     :
# Function  :

Bioinf.pm  view on Meta::CPAN

            chdir($original_dir);
            if($realfile1 =~/(\d+\-$no\.msf)$/){
                @dir=split(/\//, $realfile1);
                $dir=$dir[($#dir-1)];  # where am I ?
                # $jp_file = $original_dir.'/'.$dir.'.jp';
                # %array_msf =&open_msf_files($realfile1);
                # %array_jp  =&open_jp_files ($jp_file);
                # $array_ref_msf = \%array_msf;
                # $array_ref_jp  = \%array_jp;
                # $error_rate =&get_posi_shift_hash($array_ref_msf, $array_ref_jp);
                # $id_compos  =&amino_acid_compos_id_percent($array_ref_jp);
                # push(@rates_accumu,$error_rate);
                # push(@compos_id,$id_compos);
            }
        }
        else
        {
                               next;
        }
    }
}

Bioinf.pm  view on Meta::CPAN

		  }
	 }
	 $sum = ($id_counter + $gap_counter + $non_equal_counter);
	 if ($sum != ($larger+1)){
		  print "\n There is something wrong in getting homology in get_pair_homol \n";
		  &caller_info;
	 }
	 \$id_counter; # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title     : get_percent_homol_arr
# Usage     : $homology_out = ${&get_pair_homol(\@any_array_of_2_elem)};= @ar=(ABCDE..., CDEGA..)
# Function  : get pair wise seq. identity of any two strings, outputs a scalar (%)
# Example   :
# Warning   : reliable, but input seq. strings shouldn't contain spaces.
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_percent_homol_arr{
	 my(@input)=@{$_[0]};
	 $input[0] =~ tr/a-z/A-Z/; # capitalizing.
	 $input[1] =~ tr/a-z/A-Z/; # capitalizing.
	 my(@string1)= split(//,$input[0]);
	 my(@string2)= split(//,$input[1]);
	 if (($#string1 == -1) || ($#string2 == -1)){
		  print "\n One of the string is empty O.K. ? \n";
	 }
	 my($larger)= &max($#string1, $#string2);
	 my($id_counter, $gap_counter, $non_equal_counter, $sum,$percent_homol)=0;
	 for ($i = 0; $i<=$larger; $i++){
		  if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
				$gap_counter+=1;
		  }elsif ($string1[$i] eq $string2[$i]){
				$id_counter +=1;
		  }else{
				$non_equal_counter += 1;
		  }
	 }
	 $sum = ($id_counter + $gap_counter + $non_equal_counter);
	 if ($sum != ($larger+1)){
		  print "\n There is something wrong in getting homology in get_pair_homol \n";
		  &caller_info;
	 }else{
		  $percent_homol=($id_counter/$sum)*100;
	 }
	 return(\$percent_homol); # $id_counter is the homology counter;
}

#________________________________________________________________________
# Title     : get_pair_homol_hash
# Usage     : $homology_out = & get_pair_homol (%any_hash); , eg) %hash = (name1, ABCDE..., name2, CDEGA..)
# Function  : get pair wise seq. identity as a scalar count
# Example   :
# Warning   : reliable, but input seq. strings shouldn't contain spaces.
# Keywords  :
# Options   :

Bioinf.pm  view on Meta::CPAN

		  }
	 }
	 my($sum) = ($id_counter + $gap_counter + $non_equal_counter);
	 if ($sum != ($larger+1)){
		 print "\n There is something wrong in getting homology in get_pair_homol \n";
		 &caller_info;
			}
	 return ($id_counter); # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title     : get_percent_homo_hash
# Usage     : $homology_out = &get_pair_homol_hash(%any_hash); , eg) %hash = (name1, ABCDE..., name2, CDEGA..)
# Function  : get pair wise seq. identity(%) of any two strings put in as a hash
# Example   :
# Warning   : reliable, but input seq. strings shouldn't contain spaces.
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_percent_homo_hash{
	 my(%input)=@_;
	 &hash_chk(\%input);
	 my(@keys_input)= keys (%input);
	 my(@values_input) = values (%input);
	 $values_input[0] =~ tr/a-z/A-Z/; # capitalizing.
	 $values_input[1] =~ tr/a-z/A-Z/; # capitalizing.
	 my(@string1)= split(//,$values_input[0]);
	 my(@string2)= split(//,$values_input[1]);
	 if (($#string1 == -1) || ($#string2 == -1)){
					 print "\n One of the string is empty O.K. ? \n";
	 }
	 my($larger)= &max($#string1, $#string2);
	 my($id_counter, $gap_counter, $non_equal_counter,$percent_homol,)=0;
	 for ($i = 0; $i<=$larger; $i++){
		  if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
				$gap_counter+=1;
		  }elsif ($string1[$i] eq $string2[$i]){
				$id_counter +=1;
		  }else{
				$non_equal_counter += 1;
		  }
	 }
	 my($sum) = ($id_counter + $gap_counter + $non_equal_counter);
	 if ($sum != ($larger+1)){
		  print "\n There is something wrong in getting homology in get_pair_homol \n";
		  &caller_info;
	 }else{
		  $percent_homol=($id_counter/$sum)*100;
	 }
	 return ($percent_homol);
}


#________________________________________________________________________
# Title     : file_size
# Usage     : $outputfilesize = &file_size($input_file_name);
# Function  : returns the size of any single testing file
# Example   :
# Warning   : Q is for quality of this sub. This can't be wrong.
# Keywords  :

Bioinf.pm  view on Meta::CPAN

# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub file_size { my($infile)=$_[0];
	if ( $size=(-s "$infile")){ return $size; }
}

#________________________________________________________________________
# Title     : seq_comp_percent2
# Usage     : @outarray = &seq_comp_percent2(@any_input_string_array);
# Function  : get string seq COMPOSITION identities(a to z). gets array
#             of strings and outs array of % numbers
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub seq_comp_percent2{   	# simple and basic seq. id. eg. ABC on ABCABC is 50 % identical.
	my(@input)=@_;
	my(@array_of_ids2, $id2, @char1, @char2);
	&array_chk(sort @input);
	my($longest_str_size)  = &get_long_str_size (@input), "\n";
	my($shortest_str_size) = &get_short_str_size(@input), "\n";
	print "longest_str_size",$longest_str_size;
	print "shortest_str_size",$shortest_str_size;
	if (($longest_str_size/$shortest_str_size) > 4){
		  print "\n  The shortest string is less than 1/4 of the longest\n";
		  print "  This is quite meaningless, but will go on\n";

Bioinf.pm  view on Meta::CPAN

# Argument  :
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub cls{   my($cls) = `clear`;
    print $cls;
}


#________________________________________________________________________
# Title     : seq_comp_percent1
# Usage     : @outarray = &seq_comp_percent1(@any_input_string_array);
# Function  : get string seq identities(a to z). gets array of strings and outs array of % numbers
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   : one ref. of an array
# Argument  : one ref. of an array
# Category  :
# Version   :
#--------------------------------------------------------------------
sub seq_comp_percent1{ 		# this is affected by seq. length
		  my(@input)=@{$_[0]};
		  my(@array_of_ids1, $id1, @char1, @char2);
		  &array_chk(\@input);
		  @input = sort (@input);
		  $longest_str_size  = &get_long_str_size (@input), "\n";
		  $shortest_str_size = &get_short_str_size(@input), "\n";
		  if (($longest_str_size/$shortest_str_size) > 4){
					 print "\n The shortest string is less than 1/4 of the longest\n";
					 print " This is quite meaningless, but will go on\n";
		  }

Bioinf.pm  view on Meta::CPAN

	 for $key1 (sort keys %hash1){
					 for $key2 (sort keys %hash2){
								if ($key1 eq $key2){
										  $no_of_same = &min($hash1{$key1},$hash2{$key2});
										  $sum_of_same += $no_of_same;
										  last;
								}
					 }
	 }
	 $identity = $sum_of_same*2/(&sum_array(@num_char1,@num_char2))*100;
	 # print "percent iden = ", $identity, "\n";
}
#________________________________________________________________________
# Title     : get_id_among_2_2
# Usage     : $id = &get_id_among_2(*charcount1, *charcount2) <- hashes
# Function  : gets the % id of any two sequences, returns in  100.0% format.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :

Bioinf.pm  view on Meta::CPAN

					 $no_of_same = &min($hash1{$key1},$hash2{$key2});
					 $sum_of_same += $no_of_same;
					 last;
				}
			}
	 }
	 $seq1=&sum_array(@num_char1);
	 $seq2=&sum_array(@num_char2);
	 $longer_seq = &max($seq1, $seq2);
	 $identity = $sum_of_same/$longer_seq*100;
	 #print "percent iden = ", $identity, "\n";
}

#________________________________________________________________________
# Title     : array_average
# Usage     : $output = &array_average(\@any_array);
# Function  : (the same as average_array)
# Example   :
# Warning   : If divided by 0, it will automatically replace it with 1
# Keywords  : get_array_average, av_array, average_array, get_average_array
#             average_of_array, average_array

Bioinf.pm  view on Meta::CPAN

    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my(%hash, @pdbd_seqs, @superfamily, @members, @files, $over_write, $msp_file_gz,
         $msp_seq_file_dir, $msp_file, $msp_file_long, $msp_file_gz_long,
         $pdbd_seq_long, $final_merged_ISL_fasta,$pdbg_file, $pwd , $range_start,
         $range_stop, @files_NOT_processed);
    my $source_db_fasta=$ENV{'NRDB_FASTA'}; ## general default
    my $evalue_thresh=0.001; #  default
    my $score_thresh=70;     #  default
    my $range_thresh=10;
    my $percent_id_thresh=0.95;
    print "\n# (i) Running sub of: make_intermediate_sequence_library\n";

    if($vars{'FASTA_DB'}=~/\S+/){  $source_db_fasta=$vars{'FASTA_DB'} }
    if($vars{'MSP_DIR'}=~/\S+/){   $msp_seq_file_dir=$vars{'MSP_DIR'} }
    if($vars{'p'}=~/\S+/){         $pdbg_file=$vars{'p'}; print "\n# (i) $vars{'p'} is given \n"; } ## PDBG file input
    if($vars{'m'}=~/\S+/){         $msp_seq_file_dir=$vars{'m'} }
    if($vars{'DB'}=~/\S+/){        $source_db_fasta=$vars{'DB'} }
    if($vars{'E'}=~/\S+/){         $evalue_thresh=$vars{'E'} }
    if($vars{'e'}=~/\S+/){         $evalue_thresh=$vars{'e'} }
    if($vars{'s'}=~/\S+/){         $score_thresh=$vars{'s'} }

Bioinf.pm  view on Meta::CPAN

           next;
       }
       if(@pdbd_seqs){      print "\n# (i) \@pdbd_seqs for $superfamily is: @pdbd_seqs\n";
       }else{               print "\n# (E) \@pdbd_seqs is empty, strange, error in making $out_fasta_file \n\n";
       }

        for($j=0; $j< @pdbd_seqs; $j++){
             $pdbd_seq=$pdbd_seqs[$j];
             $pdbd_seq_long="pdb\_$pdbd_seqs[$j]";

             my (@msp_content, $evalue, $sub_dir, $percentage_id, $range_length);
             if($pdbd_seq=~/^ *$/){  next; }

             $msp_file="$pdbd_seq\.msp";
             $msp_file_gz="$pdbd_seq\.msp\.gz";
             $msp_file_long="pdb\_$pdbd_seq\.msp"; ## this is to handle Sarah's pdb_ prefixed pdbd files
             $msp_file_gz_long="pdb\_$pdbd_seq\.msp\.gz";

             print "# (i)      I am processing $msp_file  <- subroutine:make_intermediate_sequence_library\n";
             if($msp_file_gz=~/^([dec])\S/ or $msp_file_long=~/pdb_([dec])\S/){
                  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`

Bioinf.pm  view on Meta::CPAN


                  print "\n# (i) Now I have finished reading in ONE msp file for $pdbd_seq ($superfamily) into \@msp_content";

                  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`
                  # Processing each MSP line
                  #_______________________________________________________________________________________
                  my ($identical_one_added, $interm_seq);
                  for($k=0; $k< @msp_content; $k++){ ## NEW MSP format
                      if($msp_content[$k]=~/^ *(\S+) +(\S+) +(\S+) +\d+ +\d+ +[pdb_]*$pdbd_seq[_\d+\-\d+]* +(\d+) +(\d+) +(\S+)/){     # [pdb_]* is for Sarah's pdb_ prefix
                          #if($pdbd_seq eq $6){ next }  ## this is to exclude the same seq match
                          $evalue=$2;               $percentage_id=$3;
                          $range_length=$5-$4;      $score =$1;  $interm_seq=$6; $range_start=$4, $range_stop=$5;

                          if($interm_seq=~/(\S+)_\d+\-\d+/){ $interm_seq=$1; }

                          if($evalue < $evalue_thresh and $range_length > $range_thresh and $score > $score_thresh){
                              if($percentage_id <= $percent_id_thresh){  ## to keep one pdb seq ($percentage_id=1)
                                  unless($interm_seq=~/^d\d\S/){  # to remove entries like:  d1abc_10-20_d2acb__ (two pdb seqs)
                                      $interm_hash{$superfamily} .=" $interm_seq\_$range_start\-$range_stop\_$pdbd_seq";
                                  }
                              }elsif($percentage_id ==1 and $identical_one_added < 1){  ## this is to prevent more than 1 100% id seq
                                  $interm_hash{$superfamily} .=" $interm_seq\_$range_start\-$range_stop\_$pdbd_seq";
                                  $identical_one_added++;
                              }
                          }
                      }elsif($msp_content[$k]=~/^ *(\d+) +(\S+) +\d+ +\d+ +$pdbd_seq +(\d+) +(\d+) +(\S+)/){
                          $evalue=$2;              $score=$1;
                          $range_length=$4-$3;     $interm_seq=$5;
                          if($evalue < $evalue_thresh and $range_length > $range_thresh
                              and $score > $score_thresh){
                              $interm_hash{$superfamily} .=" $interm_seq\_$3\-$4\_$pdbd_seq";



( run in 0.613 second using v1.01-cache-2.11-cpan-709fd43a63f )