Bioinf

 view release on metacpan or  search on metacpan

Bioinf.pl  view on Meta::CPAN

        'd1eny__ d1nhp_2', 'd1pbe_1 d2pgd_2',
        'd1ldb_1 d1pbe_1', 'd1ldb_1 d1lvl_2',
        'd1gesa2 d1hlpa1', 'd1dhr__ d1nhp_2',
        'd1hdca_ d1tde_1', 'd1gesa1 d1psda2',
        'd1pbe_1 d2cmd_1', 'd1tde_2 d1udpa_',
        'd1pbe_1 d2dlda2', 'd1hdca_ d1tde_2',
        'd1gesa2 d1ldb_1', 'd1psda2 d2tpra1',
        'd1gdha2 d1lvl_2', 'd1tde_1 d2dlda2',
        'd1ldm_1 d1pbe_1', 'd1pbe_1 d1scua2',
        'd1gesa1 d2ohxa2', 'd1lvl_2 d2naca2',
        'd1gd1o1 d1lvl_1', 'd1fvl__ d1kst__',
        'd1kst__ d2ech__', 'd1hsaa2 d1std__', ## d1hsaa.. is NOT homol, but to fix a problem in E_100_e_0.0005_j30_segged_2092
        'd1afp__ d1hfi__'
        );


     for($i=0; $i< @correcting_pairs; $i++){
                     $correcting_pairs{$correcting_pairs[$i]}=$correcting_pairs[$i];
     }
     return(\%correcting_pairs);
}

#__________________________________________________________________
# Title     : get_isearch_result_stat
# Usage     : &get_self_isearch_stat(\%stat2, \@pdbg_seqs, \$evalue);
# Function  :
# Example   : Following input (hash eg: %stat2, input with the first word as key)
#              will become columnar output.
#
#    d1ash__ d1bam__ d1mba__ d2lhb__
#    d1baba_ d1flp__ d1hbg__ d1hlb__ d1mba__ d1mbd__ d2lhb__ d3aaha_ d3sdha_
#    d1cpca_ d1cpcb_ d1gof_1 d2ts1_1
#
#    Will become:
#      ....
#      d1ash__ d2lhb__ Homolog: G1   98 0.012
#      d1baba_ d1flp__ Homolog: G1   82 0.072
#      d1baba_ d1hbg__ Homolog: G1   79 0.13
#      d1baba_ d2lhb__ Homolog: G1   228 8e-12
#      d1baba_ d3aaha_ Nomolog: G1   74 2
#      d1baba_ d3sdha_ Homolog: G1   92 0.012
#      d1cola_ d1hbg__ Nomolog: G1   79 0.59
#      d1cpca_ d1cpcb_ Homolog: G1   176 4.9e-08
#      ....
#
# Keywords  : get_stat_interm_search, get_intermediate_search_stat
# Options   : _  for debugging.
#             #  for debugging.
# Package   : Bio
# Reference : http://sonja.acad.cai.cam.ac.uk/perl_for_bio.html
# Returns   : [$av_correct, $num_enq_seq]
# Tips      :
# Argument  :
# Todo      :
# Author    : A Scientist
# Category  :
# Version   : 2.2
#-----------------------------------------------------------------------------
sub get_isearch_result_stat{
	my (@keys, $num_enq_seq, @pdbg_seqs_ori, $c, $d, $i, %correct_pairs,
	    $sum_correct, $sum_false, $match_seq, $percent_correct, $correct, @correct,
	    $av_correct, $av_false, $actual_e_value, $correct_matched,
	    %correcting_pairs, @correcting_pairs, %correct);

	my %seqs=%{$_[0]};
	my @pdbg_seqs=@{$_[1]};
	my $evalue=${$_[2]};
	my $pdbg_base=${$_[3]} || $ARGV[3];
	my $E_mult_factor1=${$_[4]};
	my $E_mult_factor2=${$_[4]};
        if(ref($_[5])){  $leng_thresh =${$_[5]}  }else{ $leng_thresh=$_[5]; }
	my %msp_0=%{$_[6]};
	my %msp_00=%{$_[7]};

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# %correcting_pairs is a correcting table for old pdb40d file classi
		#_____________________________________________________________________
		@correcting_pairs=(  # should be pairs
                                'd2kauc1 d2kauc2',              'd1pkya1 d1pkya2',
                                'd1pvda2 d1trka1',                              'd1pbe_1 d1pbe_2',
				'd1poxa3 d1pvda2',				'd1efga1 d1efga2',
				'd1dsba1 d1dsba2',				'd2gsta1 d2gsta2',
				'd1bct__ d1brd__',				'd1qora1 d1qora2',
				'd2ohxa1 d2ohxa2',				'd1efga2 d1eft_1',
				'd1tada1 d1tada2',				'd1gsea1 d1gsea2',
				'd1gesa2 d2tmda3',				'd1lvl_2 d2tmda3',
				'd2tmda3 d2tpra2',				'd1tde_1 d2tmda3',
				'd1nhp_2 d2tmda3',				'd1gesa1 d2tmda3',
				'd1lvl_1 d2tmda3',				'd2tmda3 d2tpra1',
				'd1fcda1 d2tmda3',				'd1nhp_1 d2tmda3',
				'd1tde_2 d2tmda3',				'd1pbe_1 d2tmda3',
				'd1ebha1 d1ebha2',				'd1gesa2 d2dlda2', ## 3.4.1  with 3.15.1
				'd1gesa2 d1psda2',				'd1nhp_2 d2dlda2',
				'd1ldm_1 d1tde_2',				'd1coy_1 d1ldb_1',
				'd1lvl_2 d1psda2',				'd1psda2 d1tde_2',
				'd1hyha1 d1tde_2',				'd1fcda1 d1ldm_1',
				'd1hdca_ d1nhp_2',				'd1fcda1 d1hlpa1',
				'd1llda1 d1lvl_2',				'd2dlda2 d2tpra2',
				'd1ldm_1 d1nhp_2',				'd1llda1 d1pbe_1',
				'd1gdha2 d2tpra1',				'd1ldb_1 d1nhp_2',
                                'd1gesa2 d1scua2',              'd1fcda1 d1hyha1',
                                'd1gesa1 d1hlpa1',              'd1gdha2 d1gesa2',
                                'd1lvl_2 d2dlda2',              'd1gesa1 d2dlda2',
                                'd1nhp_2 d2ohxa2',              'd1tde_2 d2dlda2',
                                'd1nhp_1 d2cmd_1',              'd1fcda1 d1ldb_1',
                                'd1lvl_1 d2ohxa2',              'd1nhp_2 d2naca2',
                                'd1pbe_1 d2ohxa2',              'd1gdha2 d1nhp_2',
                                'd2cmd_1 d2tpra1',              'd1tde_1 d2cmd_1',
                                'd1llda1 d1nhp_2',              'd1hlpa1 d1nhp_2',
                                'd1nhp_1 d2dlda2',              'd1hyha1 d1nhp_2',
                                'd1nhp_2 d1psda2',              'd1fcda1 d2cmd_1',
                                'd1fcda1 d1llda1',              'd1lvl_2 d1udpa_',
                                'd1psda2 d2tpra2',              'd1hdca_ d1lvl_2',
                                'd1gesa2 d1llda1',              'd1nhp_2 d1qora2',
                                'd1ldm_1 d2tpra1',              'd1coy_1 d2dlda2',
                                'd2dlda2 d2tpra1',              'd1hdca_ d1pbe_1',
                                'd1coy_1 d1gdha2',              'd1nhp_2 d2cmd_1',
                                'd1llda1 d1tde_1',              'd1llda1 d1lvl_1',
                                'd1bdma1 d2tpra1',              'd1gd1o1 d2tpra2',
                                'd1ldb_1 d1lvl_1',              'd1hlpa1 d1tde_2',
                                'd1coy_1 d1psda2',              'd1nhp_2 d1udpa_',

Bioinf.pl  view on Meta::CPAN

                                'd1gesa2 d1hlpa1',              'd1dhr__ d1nhp_2',
                                'd1hdca_ d1tde_1',              'd1gesa1 d1psda2',
                                'd1pbe_1 d2cmd_1',              'd1tde_2 d1udpa_',
                                'd1pbe_1 d2dlda2',              'd1hdca_ d1tde_2',
                                'd1gesa2 d1ldb_1',              'd1psda2 d2tpra1',
                                'd1gdha2 d1lvl_2',              'd1tde_1 d2dlda2',
                                'd1ldm_1 d1pbe_1',              'd1pbe_1 d1scua2',
                                'd1gesa1 d2ohxa2',              'd1lvl_2 d2naca2',
                                'd1gd1o1 d1lvl_1'
                                );


		for($i=0; $i< @correcting_pairs; $i++){
				$correcting_pairs{$correcting_pairs[$i]}=$correcting_pairs[$i];
		}
	if($E_mult_factor1=~/^ *$/){ $E_mult_factor1=1; };


	@keys=sort keys %seqs;
	@keys=@{&strip_sequence_ranges(\@keys)};
	@keys=@{&remove_dup_in_array(\@keys)};
	@pdbg_seqs_ori=@pdbg_seqs;
	$num_enq_seq=@pdbg_seqs;
	print "\n# In get_isearch_result_stat: PDBG seqs $num_enq_seq \n=> @pdbg_seqs\n\n" if $verbose;

	#@pdbg_seqs=@{&strip_sequence_ranges(\@pdbg_seqs)};
	#@pdbg_seqs=@{&remove_dup_in_array(\@pdbg_seqs)};

	if($num_enq_seq < 2){ print "\n# \$num_enq_seq is less than 2 @pdbg_seqs $base\n"; exit; }

	for($c=0; $c < @keys; $c++){
	   my($enq_seq, $correct, $false_positive);
	   $num_of_matched=@match_seqs=split(/ +/, $seqs{$keys[$c]});
			 $enq_seq=$keys[$c];

	   for($d=0; $d< @match_seqs; $d++){
                my($correct_matched, @sorted);

                $match_seq=$match_seqs[$d];
                                      $sorted=join(' ', sort ($enq_seq, $match_seq) );

                for($i=0; $i< @pdbg_seqs; $i++){
                     if($match_seq =~/d?$pdbg_seqs[$i]/i or $correcting_pairs{$sorted} ){
                          print "\n# \$match_seq = $match_seq, \$pdbg_seqs $pdbg_seqs[$i] \$enq_seq: $enq_seq\n"  if $verbose;
                          $correct++;
                          $correct_matched=1;
                          unless($correct{$sorted}){
                                   $correct_group{$base} .="Homolog: $sorted $base  $msp_0{$sorted}\n";
                          }
                          $correct{$sorted} = "Homolog: $base  $msp_0{$sorted}";
                     }
                }
                if($correct_matched !=1){
                     $false_positive++;
                     unless( $correct{$sorted} ){
                          $correct_group{$base} .="Nomolog: $sorted $base  $msp_0{$sorted}\n";
                     }
                     $correct{$sorted} = "Nomolog: $base  $msp_0{$sorted}";
                }
	   }
           if(@match_seqs == 0){ @match_seqs=1; $percent_correct=0; }
	   $sum_correct += $correct;
	   $sum_false   += $false_positive;
	}
	$av_correct = $sum_correct/$num_enq_seq;
	$av_false   = $sum_false  /($num_enq_seq);

	#### $actual_e_value becomes whatever $E_mult_factor1 defined ~~~~~~~~~~~~
	if($E_mult_factor1 != 1){
	   $actual_e_value=$evalue * $E_mult_factor1;
	}elsif($E_mult_factor2 != 1){
	   $actual_e_value= $evalue * $E_mult_factor2;
	}else{ $actual_e_value=$evalue }

	$num_enq_seq--;
	$sum_correct_for_additional = $num_enq_seq+1;
	$match_count=$sum_correct_for_additional * $av_correct;
	#$sum_correct= $sum_correct_for_additional;
	if($verbose){
           printf ("%-10s %-12s %-13f %-13f %-7s %-7s %-7s %-7s %-4s\n", $pdbg_base,
		$actual_e_value, $av_correct, $av_false, $num_enq_seq,
		$sum_correct_for_additional, $sum_false, $match_count, $leng_thresh);
	}

	@correct_new=@{&remove_dup_in_array(\@correct_new)};
	for($i=0; $i< @correct_new; $i++){
	    print "\n# correct new: $correct_new[$i]" ;
	}
	$num_correct=$match_count/2;

	print "Num of non-reflective correcct:  $num_correct  Nomolog: $sum_false  \n\n" if $verbose;
	return([$av_correct, $sum_correct, $num_enq_seq, \%correct, \%correct_group]);
}



#__________________________________________________________________
# Title     : strip_sequence_ranges
# Usage     :
# Function  :
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : remove_sequence_ranges, remove_sequence_name_ranges,
#             remove_ranges_in_sequences, strip_sequence_name_ranges,
# Options   : _  for debugging.
#             #  for debugging.
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sub strip_sequence_ranges{
    my (@out, $i);
    my @in=@{$_[0]} or @in=@_;
    for($i=0; $i< @in; $i++){
        if($in[$i]=~/^(\S+)_\d+\-\d+/){
	     push(@out, $1);
	}else{
	     push(@out, $in[$i]);
        }

Bioinf.pl  view on Meta::CPAN

                system("$default_search_method -T $score_thresh -E $evalue_cutoff $file[$i] $target_DB > $output_hmm_result");
            }else{
                print "Running: $default_search_method -t $score_thresh $file[$i] $target_DB \> $output_hmm_result\n";
                system("$default_search_method -t $score_thresh $file[$i] $target_DB > $output_hmm_result");
            }
        }else{
            print "\n# The $out_hmm_file file already exists. To overwrite use -o opt\n";
        }
        push(@out_hmm_file_names, $output_hmm_result);
    }
    if(@out_hmm_file_names > 1){
       return(\@out_hmm_file_names);
    }else{
       return(\$out_hmm_file_names[0]);
    }
}



#_______________________________________________________________________
# Title     : divide_clusters
# Usage     : &divide_clusters(\@file);
# Function  : This is the main funciton for divclus.pl
#               divides complex single linkage cluster into smaller duplication
#               module level sub clusters.
# Example   : &divide_clusters(\@file, $verbose, $range, $merge, $sat_file,
# 	                $dindom, $indup, "T=$length_thresh", "e=$evalue", $over_write,
#                   $optimize, "s=$score", "f=$factor");
#
# Keywords  : divicl, divclus, div_clus, divide clusters
# Options   : _  for debugging.
#   f=<digit>   for determing the factor in filtering out non-homologous
#                  regions, 7 = 70% now!!
#   l=<digit>   for seqlet(duplication module) length threshold
#   t=<digit>   for seqlet(duplication module) length threshold
#                  (same as l opt, confusing, huh? )
#   s=<digit>   for score threshold
#   e=<digit>   for evalue threshold
#   z           for activating remove_similar_sequences, rather than remove_dup....
#   o           for overwriting
#   v           for verbose printout (infor)
#   D           for dynamic factor
#   S  $short_region=  S by S -S  # taking shorter region overlap in removing similar reg
#   L  $large_region=  L by L -L  # taking larger  region overlap in removing similar reg
#   A  $average_region=A by A -A  # taking average region overlap in removing similar reg
#   o  for $over_write
#
# Version   : 2.9
#------------------------------------------------------------------------
sub divide_clusters{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my($merge, $verbose, $sat_file, $length_thresh, $factor, $indup, $indup_percent,
       $score, @temp_show_sub, $optimize, $file, $evalue, $over_write, $din_dom,
       $sum_seq_num, $base_1, $output_clu_file, $short_region, $large_region,
       $average_region, $dynamic_factor, @sub_clustering_clu_files);

    $factor=7; # default factor is 7 for 70%
    $length_thresh=30;

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Dealing with options
    #_________________________________________
    if($char_opt=~/m/){        $merge='m';
    }if($char_opt=~/v/){       $verbose='v'; # for showing debugging information
    }if($char_opt=~/i/){       $indup='i';
    }if($char_opt=~/z/){       $optimize='z';
    }if($char_opt=~/o/){       $over_write='o';
    }if($char_opt=~/d/){       $din_dom='d';
    }if($char_opt=~/s/){       $sat_file='s';
    }if($char_opt=~/y/){       $dynamic_factor='y';
    }if($char_opt=~/S/){       $short_region  ='S';
    }if($char_opt=~/L/){       $large_region  ='L';
    }if($char_opt=~/A/){       $average_region='A';
    }if($vars{'T'}=~/\d+/){    $length_thresh= $vars{'T'};
    }if($vars{'l'}=~/\d+/){    $length_thresh= $vars{'l'}; ## synonym of 't'
    }if($vars{'f'}=~/\S+/){    $factor= $vars{'f'};
    }if($vars{'s'}=~/\d+/){    $score = $vars{'s'};
    }if($vars{'e'}=~/\d+/){    $evalue= $vars{'e'};
    }if($vars{'E'}=~/\d+/){    $evalue= $vars{'E'}; # synonym of e
    }

   $percent_fac=$factor*10;

   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   # (0) When one file input was given (yes, divclus can handle multiple files, Sarah!)
   #________________________________________________________________________________
   if(@file == 1){  #<=== @file has xxxx.msp, yyyy.msp  zzzz.msp ....,
        print "\n# (1) divide_clusters: One single file was given=> \"@file\"\n" if $verbose;
        $file=$file[0];
        $base_1=${&get_base_names($file)};

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # (2) Define the output cluster file name:  eg, 3-232_cluster_F7.clu , F7 means factor used is 7
        #______________________________________________________________________________________________
        $output_clu_file="$base_1\_F${factor}\.clu";

        if( !$over_write and -s $output_clu_file){
            print "\n# $output_clu_file Already EXISTS, skipping. Use \'o\' opt to overwrite\n"; exit;
        }
        print "# (2) divide_clusters: processing ONE single file \"@file\" with merge_sequence_in_msp_file\n" if $verbose;

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # (3) merge_sequence_in_msp_file does not do much. Just filtering and producing
        #     sequences in ISPA_PBS_21-215 VPR_PBS_160-354 format from msp format
        #________________________________________________________________________________
        @grouped_seq_names=@{&merge_sequence_in_msp_file(\@file, "s=$score", $optimize, $din_dom, $sat_file,
                $optimize, "T=$length_thresh", "e=$evalue", "f=$factor", "$range", "$merge", $verbose,
                $short_region, $large_region, $average_region, $over_write, $dynamic_factor)};

        if($verbose){
            print "\n\n# (3) divide_clusters: finished running \"merge_sequence_in_msp_file\"\n  ==>";
            for($i=0; $i< @grouped_seq_names; $i++){
               print "\n-->> $grouped_seq_names[$i]";
            }
        }

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # (4) This is critical seqlet merging step
        #________________________________________________________________________________
        @out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, $dynamic_factor,
               "f=$factor", $short_region, $large_region, $average_region, $optimize)};

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # (5) This is showing the result in clu file format
        #________________________________________________________________________________
        @temp_show_sub=&show_subclusterings(\@out, $file, $sat_file, $dindom, $indup,
						   "e=$evalue", "p=$percent_fac", "f=$factor" );
        $good_bad       = $temp_show_sub[0];
        $indup_c        = $temp_show_sub[1];
        $sum_seq_num   += $temp_show_sub[2];
        push(@sub_clustering_out_files, @{$temp_show_sub[3]});

        if($good_bad==1){      push(@good, $file);
        }else{                 push(@bad, $file);       }

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # (6) Final write up stage (unecessary)
        #_______________________________________________________________
	&write_good_bad_list_in_divide_clusters(\@good, \@bad);

   }
   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   # when more than one single file input is given
   #____________________________________________________________
   elsif(@file >1 ){
       my (@good, @bad);
       if($indup =~/i/i){   open (INDUP, ">indup_stat\.txt");  } # this is not essential.

       for($i=0; $i< @file; $i++){
            my (@grouped_seq_names, @temp_show_sub);
            my $indup_c=0;
            my $big_msp_file=$file[$i];
            unless(-s $big_msp_file){ print "\n# (E) \$big_msp_file does not exist\n"; exit }

            $base_1=${&get_base_names($big_msp_file)};
            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # (1) Define the output cluster file name:  eg, 3-232_cluster_F7.clu , F7 means factor used is 7
            #______________________________________________________________________________________________
            $output_clu_file="$base_1\_F${factor}\.clu";

            print "\n# DIVCLUS: just before merge_sequence_in_msp_file, \$output_clu_file is $output_clu_file from input file $big_msp_file" if $verbose;
            if( !$over_write and -s $output_clu_file){
                print "\n# $output_clu_file Already EXISTS, skipping. Use \'w\' opt to overwrite\n";
                next;  }

            print "\n# (1)  divide_clusters: processing file \"$big_msp_file\" for $output_clu_file" if $verbose;

            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            #  (2) If clu file(eg 2-1618_ss.clu ) is in pwd, tries to skip
            #____________________________________________________________
            if((-s $output_clu_file) > 10 and $over_write !~/o/){
                print "# $output_clu_file exists, skipping, use \"o\" option to overwrite\n";  next;
            }

            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # (3) merge_sequence_in_msp_file does not do much. Just filtering and producing
            #     sequences in ISPA_PBS_21-215 VPR_PBS_160-354 format of STRING from msp format
            #     $big_msp_file is an MSP file
            #________________________________________________________________________________
            print "\n# (i) divide_clusters : I am merging seq in $big_msp_file file for $output_clu_file\n" if $verbose;
            @grouped_seq_names=@{&merge_sequence_in_msp_file(\$big_msp_file, "s=$score", $din_dom, $sat_file, $optimize,
                                "T=$length_thresh", "e=$evalue", "f=$factor", "$range", "$merge", $verbose, $over_write,
                                 $short_region, $large_region, $average_region, $dynamic_factor )};

            if($verbose){
                print "\n# \@file has more than one input file\n # The result of \"merge_sequence_in_msp_file\"\n";
                print "@grouped_seq_names";
            }

            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            #  (4) Clustering the sets of merged seqlets => CORE algorithm
            #____________________________________________________________

            @out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, "f=$factor", $optimize, $dynamic_factor,
                   $short_region, $large_region, $average_region)};

            @temp_show_sub=&show_subclusterings(\@out, $big_msp_file, $sat_file, $dindom, $indup,
                                                    "e=$evalue", "p=$percent_fac", "f=$factor");
                        $good_bad       = $temp_show_sub[0];
                        $indup_c        = $temp_show_sub[1];
                        $sum_seq_num   += $temp_show_sub[2];
            push(@sub_clustering_out_files, @{$temp_show_sub[3]});

            if($good_bad==1){          push(@good, $big_msp_file);
            }else{         push(@bad, $big_msp_file);       }

          }
          #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
          &write_good_bad_list_in_divide_clusters(\@good, \@bad);
          sub write_good_bad_list_in_divide_clusters{
               my  (@good, @bad, $i); @good=@{$_[0]}; @bad=@{$_[1]};
               open(GOODBAD, ">good_bad.list");
               print GOODBAD "GOOD: all link    : 000\n";
               for($i=0; $i< @good; $i++){  print GOODBAD "$good[$i]\n";  }
               print GOODBAD "BAD : Not all link: 000\n";
               for($i=0; $i< @bad; $i++){   print GOODBAD "$bad[$i]\n";   }
               close(GOODBAD);
          }
          #_______________________________________________________________

   }
   return(\@sub_clustering_out_files); # contains (xxxx.clu, yyy.clu,, )
}



#______________________________________________________________________________
# Title     : remove_file_extension
# Usage     :
# Function  :
# Example   :
# Keywords  :
# Options   :
# Author    : jong@salt2.med.harvard.edu,
# Category  :
# Version   : 1.0
#------------------------------------------------------------------------------
sub remove_file_extension{
    my (@modified_files, $i, @files);
    @files=@_;
    for($i=0; $i< @files; $i++){
        $base=${&get_base_names($files[$i])};
        rename($files[$i], $base);
        push(@modified_files, $base);
    }
    return(\@modified_files);
}

#______________________________________________________________________________
# Title     : remove_small_files
# Usage     : @files_removed=@{&remove_small_files(@ARGV)};
# Function  :
# Example   :
# Keywords  :
# Options   :
# Author    : jong@salt2.med.harvard.edu,
# Category  :
# Version   : 1.0

Bioinf.pl  view on Meta::CPAN

											 print "\n# new seqlet : $seqlets[$i]\n" if $verbose;
					   splice(@seqlets, $i+1, 1);
					   $i--;
				   }else{
					   if($start1 < $start2){
							$short_start=$start2; $large_start=$start1;  ## note that short start should be $start2 if $start2 is bigger
					   }else{
							$short_start=$start1; $large_start=$start2;
					   }
					   if($end1 < $end2){
							$short_end=$end1;  $large_end=$end2;
					   }else{
							$short_end=$end2;  $large_end=$end1;
					   }
					   if($shorter_region){
						   $seqlets[$i]="$seq1\_$short_start\-${short_end}$tail1";
					   }elsif($larger_region){
						   $seqlets[$i]="$seq1\_$large_start\-${large_end}$tail1";
											 }

					   splice(@seqlets, $i+1, 1);
					   $i--;
			       }
			   }
			}
		 }
	  }
	 }
	 print "\n# (3) remove_similar_seqlets: The final out are: @seqlets\n" if $verbose;
	 return(\@seqlets);
}



#__________________________________________________________________________
# Title     : show_subclusterings
# Usage     : &show_subclusterings(\@out);
# Function  : This is the very final sub of divclus.pl
# Example   : @temp_show_sub=&show_subclusterings(\@out, $file, $sat_file, $dindom, $indup);
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : print_subclusterings, sum_subclusterings, write_subclustering
#             show_clusterings, display_subclusterings
# Options   :
#             f  for file output, eg: xxxxxxx.sat
# Category  :
# Version   : 2.7
#-------------------------------------------------------------------------
sub show_subclusterings{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
        my ($max_size, $sat_file_name, $clu_file_name,
        $ori_cluster_size, $ori_cluster_num, $good_bad, @keys, $percentage_fac,
        $indup, @sizes, $sum_seq_num, $indup_percent, $indup_count, %tem4,
        @sub_clustering_out_files);  # clusall_1e-5_clu_14-324_ss.sat
	my @out=@{$array[0]};
	$indup_count=0;

	if($char_opt=~/d/){	    $dindom=1;	}
	if($char_opt=~/i/){		$indup=1;	}
	if($vars{'f'}=~/\S+/){     $factor= $vars{'f'}; }
	if($vars{'p'}=~/\d+/){ $percentage_fac= int($vars{'p'}); }
	if($vars{'s'}=~/\d+/){	   $score = $vars{'s'};	}
	if($vars{'e'}=~/\d+/){	   $evalue= $vars{'e'};	}

	#print "\n# (1) show_subclusterings : \@file has : @file\n";
	if( $file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.msp/  or
		$file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.sat/   ){
		 $ori_cluster_size=$2;
		 $ori_cluster_num =$3;
		 $base=$1;
		 $sat_file_name="$base\.sat";
		 $clu_file_name="$base\.clu";
	}else{
				 print "\n# (2) LINE:",__LINE__," The \@file input to show_subclusterings is not the right format, dying\n";
				 print "\n     Sarah!, right format looks like: 13-234.msp or 8-420_cluster.msp \n";  exit;
	}

        open(CLU, ">$clu_file_name") or die "\n# (ERROR) show_subclusterings failed miserably to open \"$clu_file_name\" \n";
        push(@sub_clustering_out_files, $clu_file_name);


	@out=@{&sort_string_by_length(\@out)};

	for($i=0; $i< @out; $i++){ # @out has ( 'YAL054C_98-695 YBR041W_90-617', 'YBR115C_230-842 YBR222C_16-537 YER015W_121-686', etc)
	   my $count+=$i+1;
	   my ( $int_dup_number, $sub_clu_size, $seq_with_range, @sp, $new_clus_NAME,
	        %tem, %tem2, %tem3, $j, @keys, $num_seq);
	   if($out[$i]=~/^ *$/){ next }
	   @sp=sort split(/ +/, $out[$i]);

	   for($j=0; $j < @sp; $j++){
		  $seq_with_range=$sp[$j];
		  if($seq_with_range=~/^((\S+)_((\d+)\-(\d+)))/){
			 $tem{$2}++;
			 $tem2{$2}.=sprintf("%-15s ", $1);
			 $tem3{$2} =$3;
			 $tem4{$2} =$5-$4;
		  }
	   }

	   @keys=sort keys %tem;
	   $num_seq=$sub_clu_size=@keys;

	   if($max_size < $sub_clu_size){
		  $max_size=$sub_clu_size; ## This is to collect the sizes of clusters to see if it is good.
	   }
	   $indup_count= &print_summary_for_divclus(
		         $count, \%tem2, \%tem,
		         $ori_cluster_num,
		         $ori_cluster_size,
		         $dindom,
		         $clu_file_name,
								 \%tem3, \%tem4,
								 $indup, );

           #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           # Local subroutine
           #_______________________________________________________________
	   sub print_summary_for_divclus{ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
               my(@keys, $indup_count, $x, $m, $percentage_fac);
               my $count=$_[0]; # count of cluster
	       my %tem2=%{$_[1]};	my $num_seq=@keys=sort keys %tem2;
	       my %tem=%{$_[2]};	my $ori_cluster_num=$_[3];
	       my $new_clus_NAME=$ori_cluster_num.'0'.$count.'0'.$num_seq;
	       my $ori_cluster_size=$_[4];
	       my $dindom=$_[5];	my %tem3=%{$_[7]};
	       my $indup=$_[9];	my (%internal_dup);
	       my %tem4=%{$_[8]};
               #~~~~~~~~~~ Domain Inside Domain ~~~~~~~~~~~~~~~~~
	       if($dindom){
	          for($x=0; $x <@keys; $x++){
                       @domain_inside_domain=@{&get_domain_inside_domain($tem2{$keys[$x]})};
                       @domain_inside_domain=@{&remove_dup_in_array(\@domain_inside_domain)};
                       for($m=0; $m< @domain_inside_domain; $m++){ print "  # Dindom: $m : $domain_inside_domain[$m]\n";   }
                       print "\n";
		  }
               }
               #==========================================================================================

	       #~~~~~~~~~~ Internal duplication  ~~~~~~~~~~~~~~
	       if($indup==1){
		   # @keys is the same as sub cluster size,
		   for($x=0; $x < @keys; $x++){
                             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                             # Checks each sequence for duplication
                             #___________________________________________________
                             my %internal_dup=%{&get_internal_dup_in_a_cluster( $tem2{$keys[$x]} )};
                             my @dup_keys=keys %internal_dup;
                             if(@dup_keys > 0){
                                     #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
                                     #  This calculates the actual duplicated number rather than jus tthe sequences
                                     #______________________________________________________________________________
                                     $indup_count++;
                                     printf ("%-14s %-12s %-4s", $keys[$x], $new_clus_NAME, $num_seq);
                                     for($m=0; $m< @dup_keys; $m++){
                                             printf ("%-19s=> %s\n", $dup_keys[$m], $internal_dup{ $dup_keys[$m] } );
                                     }
                             }
                    }
                 }

                #~~~~~~~~~~ Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~
                print  CLU  "Cluster size $num_seq\n";
                                        printf CLU ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
                                          $new_clus_NAME, $evalue, $factor, $percentage_fac,
                                          $ori_cluster_size, $num_seq, $ori_cluster_num);
                print       "Cluster size $num_seq\n";
                printf     ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
                              $new_clus_NAME, $evalue, $factor, $percentage_fac,
                              $ori_cluster_size, $num_seq, $ori_cluster_num);
                for($x=0; $x <@keys; $x++){
                   printf CLU ("   %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
                               $num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
                   printf ("   %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
                          $num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
                }
                return($indup_count);
	   }
	}
		close(CLU); ## this is a bug fix

	if($max_size == $ori_cluster_size){   $good_bad=1;
	}else{	                              $good_bad=0;	}

       print "\n# Sarah, Do you think the subclusterings are O.K.?" if $verbose;
       print "\n#   Tell me, if you feel suspicious, jong\@salts.med.harvard.edu\n\n" if $verbose;
       return($good_bad, $indup_count, $ori_cluster_size, \@sub_clustering_out_files);
}





#__________________________________________________________________________
# Title     : exchange_query_with_match_in_msp
# Usage     : @exchanged_msp=@{&exchange_query_with_match_in_msp(\@file)};
# Function  :
# Example   :
# Keywords  : swap_query_with_match_in_msp, invert_query_with_match_in_msp,
#             swap_query_seq_with_match_seq_in_msp,
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.1
#----------------------------------------------------------------------------
sub exchange_query_with_match_in_msp{

	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	my(%exchanged_msp, @sorted_by_query_seq_names, @new_msp_lines);
	$open_msp_files_x_opt = 'x';
	if($char_opt=~/n/){ $names_only='n' }
	%exchanged_msp=%{&open_msp_files(@file, $open_msp_files_x_opt, $names_only )};

	@new_msp_lines=values %exchanged_msp;
	@sorted_by_query_seq_names=
	   map{ $_->[0] } sort {$a->[1] cmp $b->[1]} map {/^\d+ +\S+ +\d+ +\d+ +(\S+)/ && [$_, $1] } @new_msp_lines;
	return(\@sorted_by_query_seq_names);
}

Bioinf.pl  view on Meta::CPAN

# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub Richardson_alpha_matrix{
	%Richardson_alpha_matrix = ( ## ref: Protein Eng. V.8, no9, pp905-913, 1995
		  'A', 	1.80,
		  'C', 	0.70,
		  'D', 	1.00,
		  'E', 	0.80,
		  'F', 	1.30,
		  'G', 	0.,     ## <<----
		  'H', 	0.69,
		  'I', 	1.14,
		  'K', 	0.94,
		  'L', 	1.14,
		  'M', 	1.20,
		  'N', 	0.78,
		  'P',    0.19,
		  'Q', 	0.98,
		  'R', 	1.03,
		  'S', 	0.76,
		  'T', 	0.82,
		  'V', 	0.95,
		  'W', 	1.11,
		  'Y',    1.02
	);
	return(%Richardson_alpha_matrix);
}


#________________________________________________________________________
# Title     : get_segment_shift_rate
# Usage     : &get_segment_shift_rate(\%hash_for_errors, \%hash_for_sec_str);
# Function  : calculates the secondary structure segment shift rate.
# Example   : <input example> First block is for the first hash input
#                             and Second is for the second hash input.
#
#             1cdg_6taa      00000442222222222242222222222777700000007000000000
#             1cdg_2aaa      00000442222222222242222222222777700000007000000000
#             2aaa_6taa      00000000000000000000000000000000000000000000000000
#
#             1cdg_6taa      -------EEE-----------EE--EEEE------EE---------EEE-
#             1cdg_2aaa      -------EEE-----------EE--EEEE------EE---------EEE-
#             2aaa_6taa      -------EEEEE------EE-EEEEEEEE----EEEE-------EEEEE-
#
#             <intermediate output example>
#             2aaa_6taa      -------00000---------00000000----0000-------00000-
#             1cdg_6taa      -------442---------------2222-----------------000-
#             1cdg_2aaa      -------222---------------2222-----------------000-
#
#             <Final output>
#             2aaa_6taa      0%
#             1cdg_6taa      67%
#             1cdg_2aaa      67%
#
# Warning   :
# Keywords  :
# Options   : 'p' or 'P' for percentage term(default)
#             'r' or 'R' for ratio term (0.0 - 1.0), where 1 means all the
#              segments were wrongly aligned.
#             's' or 'S' for Shift rate (it actually caculates the position shift
#              rate for the secondary structure segment.
#             'h' or 'H' for position Shift rate (it actually caculates the position
#              shift rate for helical segments). If this is the only option, it
#              will show the default percentage term rate for helical segments.
#              If used with 'r', it will give you ratio (0.0 - 1.0) for helical
#              segment. If used with 's' option, it will give you position shift
#              rate for only helical segments.
#             'e' or 'E' for position Shift rate (it actually caculates the position
#              shift rate for beta segments). If this is the only option, it will
#              show the default percentage term rate for beta segments. If used
#              with 'r', it will give you ratio (0.0 - 1.0) for beta. If used
#              with 's' option, it will give you position shift rate for only
#              beta segments.
# Returns   :
# Argument  : Two references of hashes. One for error rate the other for sec.
#             assignment.
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub get_segment_shift_rate{
	my($i, $k, $j, @hash, $option_string, %h, %superposed_hash,
	  $name, %out, $gap_chr, @str1, @str2, %temp, %hash_error, %hash_secondary);
	#"""""""""""""""""""""""""""""""""""""""""
	#       general argument handling        #
	#"""""""""""""""""""""""""""""""""""""""""
	for($k=0; $k< @_ ;$k++){
	  if( ( !ref($_[$k]) )&&($_[$k]=~ /^(\w)$/) ){
		  $option_string  .= $1;    }
	  elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(\w)$/) ){
		  $option_string  .= $1;    }
	  elsif(ref($_[$k]) eq "HASH") {
		  %temp = %{$_[$k]};
		  my(@keys)= sort keys (%temp);
		  my($temp_seq) = $temp{$keys[0]};

		  if($temp_seq=~/\d\d+/){
			  %hash_error = %temp; }
		  else{ %hash_secondary = %temp; }
	  }
	}#### OUTPUT are  : %hash_error  &  %hash_secondary
	#"""""""""""""""""""""""""""""""""""""""""
	#       general argument handling end    #
	#"""""""""""""""""""""""""""""""""""""""""
	%hash_secondary =%{&tidy_secondary_structure_segments(\%hash_secondary)};
	%superposed_hash =%{&superpose_seq_hash(\%hash_error, \%hash_secondary)};
	%h=%{&get_wrong_segment_rate(\%superposed_hash)};
	return(\%h);
}

#________________________________________________________________________
# Title     : get_wrong_segment_rate
# Usage     : print_seq_in_block( &get_wrong_segment_rate(\%superposed_hash) );
# Function  : Treats the segment as one single big error.
#             calculates the wrong segment number compared to the correct ones.
# Example   : <input example> hash of 3 keys and values.
#             2aaa_6taa      -------00000---------00000000----0000-------00000-
#             1cdg_6taa      -------442---------------2222-----------------000-
#             1cdg_2aaa      -------222---------------2222-----------------000-
#
#             In the above there are two segments wrong in 3 segment blocks = 2/3
#             <output example> hash of 3 percentage rates.
#
#             2aaa_6taa      0 %
#             1cdg_6taa      66.6666666666667 %
#             1cdg_2aaa      66.6666666666667 %
#
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_wrong_segment_rate{
	my($a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, $q, $r,
	  $s, $t, $u, $v, $w, $x, $y, $z, %h, $seg_min,
	  %hash, @keys, @array, @hash, $option_string, $string,
	  $name, %out, $gap_chr, @str1, @str2, $seg, $len, $wrong_seg, $correct_seg
	);
	%hash=%{$_[0]};
	$seg_min =$_[1];
	if($seg_min !~/\d+/){ $seg_min = 3; } ### Default segmin is 3
	@keys = sort keys (%hash);
	for $k (@keys){
	 my($string) = $hash{$k}; $string =~s/\,//g;
	 my(@segments) = split(/[\-\.\ ]+/, $string);
	 for $seg (@segments){
		$len=length($seg);
		if( $len >= $seg_min){
			if($seg =~/[1-9]/){
				$wrong_seg ++;  }
			else{ $correct_seg ++; }
		}
	 }
	 $h{$k}= ($wrong_seg/($wrong_seg + $correct_seg)*100).' %';
	 $wrong_seg=$correct_seg='';
	}
	\%h;
}


#________________________________________________________________________
# Title     : tidy_secondary_structure_segments
# Usage     : print_seq_in_block(&tidy_secondary_structure_segments(\%hash, 'e4', 'h4'), 's');
#
# Function  : receives any secondary structure assignment hashes and
#             tidys up them. That is removes very shoft secondary structure
#             regions like( --HH--, -E-, -EE- ) according to the given minimum
#             lengths(threshold) of segments by you.
# Example   : print_seq_in_block(&tidy_secondary_structure_segments(\%hash, 'e4', 'h4'), 's');
#             <makes following into the next block>
#
#             1cdg_2aaa      -------EEE-----------EE--EEEE------EE---------EEE-
#             1cdg_6taa      -------EEE-----------EE--EEEE------EE---------EEE-
#             2aaa_6taa      -------EEEEE------EE-EEEEEEEE----EEEE-------EEEEE-
#
#             <example output>
#
#             1cdg_6taa      -------------------------EEEE---------------------
#             1cdg_2aaa      -------------------------EEEE---------------------

Bioinf.pl  view on Meta::CPAN

					 }
					 elsif( (length($2)- length($c)) == -2){
							 $ATOM=$1; $RES=$3;
							 chop($ATOM); chop($RES); chop($ATOM); chop($RES);
							 print F2 "$ATOM$c$RES$c$5\n";
							 next;
					 }
				}
		  }
	 }
	 close F2;

	 ##################################################################################
	 #   Final result
	 ##################################################################################

	 print "\n Files   $file1, $file2  are created \n\n\n";

}


#________________________________________________________________________
# Title     : tally_2_hashes (used for get_cs_rate_for_pairs_stat.pl )
# Usage     : ($ref1, $ref2) = &tally_2_hashes(\%hash1, \%hash2, ['n', 'a', 'p', 'i']);
#              %tally_addedup=%{$ref1};    '0' position had addedup value of 1000
#              %tally_occurances=%{$ref2}; '0' position had occurred 100 times,
#                                          '0' on average had 10 in its
#                                              corresponding hash positions
# Function  : Makes hashes of tallied occurances and summed up values for disits in
#             positions.
#             calculates the occurances or occurance rates of CS rate positions.
#             The hashes should have numbers.
# Example   : you put two hash refs. (ass. array) as args (\%hash1, \%hash2)
#             The hashes are like; hash1  (name1, 0000011111, name2, 0000122222 );
#                                  hash2  (name3, 1324..1341, name4, 13424444.. );
#
#             1) The resulting 1st hash output is (0, 20,   1, 13,     2, 12)
#             which means that 0 added up to 24 in the second arg hash positions
#                              1 added up to 15 in the second arg hash positions
#                              2 added up to 18 in the second arg hash positions
#             'p' option only works with 'n' or 'a'
#             2) The resulting 2nd hash output is (0, 5,   1, 5)
#             which means that 0 occurred 5 times in the first input hash
#                              1 occurred 5 times in the first input hash
#             'p' option only works with 'n' or 'a'
# Warning   :
# Keywords  :  tally two hashes of numbers.
# Options   : [a n i p]
# Returns   : ($ref1, $ref2), ie, two references of hash
#             averaging option causes division of 20(added up value)
#                                                by 9(occurance) in the above
#             for '0' of the first hash, so (0, 2.222,  1, 2.1666,  2, 2.4 )
#             Average is the average of numbers
#             average value in 0-9 scale (or 0-100 with 'p' option)
#             So, if there are
#                  seq1 00111110000,   The 'a' value of 0 and 1 as in the seq2
#                  seq2 33000040000    is 0-> 6/6, 1-> 4/5, while the 'n'
#                                        calc would be, 0-> 6 (60%), 1-> 4(40%)
#
# Argument  : (\%hash1, \%hash2) or optionally (\%hash1, \%hash2, ['n', 'i', 'p', 'a'])
#             'n' => normalizing, 'p' => percentage out, 'i' => make int out, 'a'=> averaged
# Category  :
# Version   : 1.2
#--------------------------------------------------------------------
sub tally_2_hashes{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

	%hash0 = %{$hash[0]};
	%hash1 = %{$hash[1]};
	@keys1=  keys %hash0;  ### No need to sort here as you will return hash at the end
	@keys2=  keys %hash1;

	if($char_opt =~ /p/i ){ $factor =100; }

	for($i=0; $i < @keys1; $i++ ){

	  ###################################################
	  ##  Gap char detection
	  ###################################################
	  if($hash0{$keys1[$i]} =~ /([\,\-])\S+[\,\-]/){ $gap_char1 = $1; }else{ $gap_char1=''; }
	  if($hash1{$keys2[$i]} =~ /([\,\-])\S+[\,\-]/){ $gap_char2 = $1; }else{ $gap_char2=''; }


	  ###################################################
	  ##  Split the value string by gap char
	  ###################################################
	  @string1=split(/$gap_char1/, $hash0{$keys1[$i]});
	  @string2=split(/$gap_char2/, $hash1{$keys2[$i]});
	  ### @string1 => (0,0,0,0,1,1,1,1,1) @string2 => (3,4,2,13,2,1,23,3)


	  ################################################################
	  ##  Main calc part, you get %tally_all_occur and %tally_occur
	  ################################################################
	  for($j=0; $j < @string1; $j++){
		  $tally_all_occur{$string1[$j]}++ ; ## <-- number of all the positions
		  if( ($string2[$j]=~/[\d\^]+/)&&($string1[$j]=~/[\d\^]+/) ){
			  $tally_occur{$string1[$j]}+=$string2[$j] ; # %tally_occur is for added up counts
		  }                                             # %tally_all_occur is for only the position
	  }                                                #  occurances of '0', '1' or whatever. To know
																		#  how many '0' entry were you should use this.
	  ####################################################################################
	  ##  When options were put, do more calc on %tally_all_occur and %tally_occur
	  ####################################################################################
	  if($char_opt =~ /a/i ){
		 print "\n           $char_opt ";
		 my(@cs_rates) = sort keys %tally_all_occur;
		 for($k=0; $k < @cs_rates; $k++){
			 if($tally_all_occur{$cs_rates[$k]} == 0){
				 $tally{$cs_rates[$k]} =0; next;}
			 if($char_opt =~ /i/i ){
				 $tally{$cs_rates[$k]}=int($tally_occur{$cs_rates[$k]}/$tally_all_occur{$cs_rates[$k]}); }

Bioinf.pl  view on Meta::CPAN

		$out_hash{$title}=join(",", @out_rate);
	 }
	}
	return( \%out_hash );
}

#________________________________________________________________________
# Title     : get_windows_sc_rate_array
# Usage     : @out_rate = @{&get_windows_compos_and_seqid_rate_array(\@seq, \$win_size)};
# Function  : actual working part of scan_windows_and_get_compos_seqid_rate
# Example   :
# Warning   :
# Class     :
# Keywords  :
# Options   :
# Reference :
# Returns   : \@ratio_array, \$ratio_whole_seq
# Tips      :
# Argument  : (\@input, \$window_size);  @input => ('ABCDEFG.HIK', 'DFD..ASDFAFS', 'ASDFASDFASAS');
#             Input ar => ( 'ABCDEFG
#                'DFD..ASDFAFS'
#                'ASDFASDFASAS' )  as the name of  @sequences.
# Todo      :
# Author    : A Biomatic
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_windows_sc_rate_array{
	my($base_level)=1; my($scale)=1; my($window_size, $show_calculation, $redu_window,
	 @input, @input0, @input1, $variable_win_size, $apply_factor,
	 @ratio_array, @array_of_2_seq, $seq_id, $offset, $half_of_w_size, $length,
	 $compos_id, $seq_id, $window_2, $window_1, $compos_whole_seq, $seq_id_whole_seq,
	 $ratio_whole_seq, $win_rate_div_by_whole_rate, $normalizing_factor, $lowest_rate,
	 $winsize_reduc_factor, $largest_win_reached, $ori_win_size);

	#""""""""""""""""""""""< handle_arguments{ head Ver 1.3 >"""""""""""""""""""
	my(@A )=&handle_arguments( @_ );my( $num_opt )=${$A[7]};my( $char_opt )=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};my(@raw_string)=@{$A[9]};
	my($i, $j, $c, $d, $e, $f, $g, $h, $k, $l, $p, $q, $r, $s, $t, $u, $v, $w, $x,$y,$z);
	if($debug==1){ print "\n   \@hash has \"@hash\"\n   \@raw_string has   \"@raw_string\"
	\@array has \"@array\"\n   \@char_opt has   \"@char_opt\"\n   \@file has \"@file\"
	\@string has \"@string\""; }
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 if( $num_opt =~/^(d+)/){  $window_size= $1; }
	 if( $char_opt=~/v/i){  $variable_win_size= 'v' }
	 if( $char_opt=~/s/i){  $show_calculation = 's'}
	 if( $char_opt=~/r/i){  $redu_window  = 'r'}
	 if( $char_opt=~/f/i){  $apply_factor  = 'f'}
	 if( $char_opt=~/d/i){  $make_gap_dot  = 'd'}
	 if( $char_opt=~/m/i){  $minus_whole_cs  = 'm'}

	@input = @{$array[0]};
	if(defined(${$_[2]})){ $base_level =${$_[2]}; }
	if(defined(${$_[3]})){ $scale  =${$_[3]}; }

	for ($t=0; $t< @input; $t++){ $length=length($input[$t]) if(length($input[$t])>$length);}
	if ($length < $window_size){  $window_size = $length;   }
	 #___________ getting ratio for the whole sequence ___________

	$compos_whole_seq=${&main::compos_id_percent_array(\@input)};  ## for whole composition rate
	$seq_id_whole_seq=${&main::seq_id_percent_array(\@input)};
	print "\nComposition ID of the alignment:  $compos_whole_seq\%\n";
	print "Sequence    ID of the alignment:  $seq_id_whole_seq\%\n";
	if ($seq_id_whole_seq == 0){  $ratio_whole_seq =0; }
	else{  $ratio_whole_seq = $compos_whole_seq/$seq_id_whole_seq;  }
	print "Composition and Sequ.  ID Ratio:   $ratio_whole_seq\n";

	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	##########   Initial Window size setting      ##############################
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	if   (  $ratio_whole_seq >9 ){ $window_size= 28; } # $ratio_whole_seq is
	elsif(  $ratio_whole_seq >8 ){ $window_size= 26; } # a whole CS rate !!
	elsif(  $ratio_whole_seq >7 ){ $window_size= 24; }
	elsif(  $ratio_whole_seq >6 ){ $window_size= 22; }
	elsif(  $ratio_whole_seq >5 ){ $window_size= 20; }
	elsif(  $ratio_whole_seq >4 ){ $window_size= 18; }
	elsif(  $ratio_whole_seq >3 ){ $window_size= 16; }
	elsif(  $ratio_whole_seq >2 ){ $window_size= 12; }
	elsif(  $ratio_whole_seq >0 ){ $window_size= 8; }

	print "Window size used is :  $window_size\n";
	#$window_size = 10;
	$largest_win_reached = 24;
	$ori_win_size        = $window_size;

	#----------- Spliting the seq. into arrays to enable $make_gap_dot var -----
	if( $make_gap_dot =~ /^[dD]+/){
	 $input[0] =~s/,//g;             $input[1] =~s/,//g;
	 @input0 = split('', $input[0]); @input1 = split('', $input[1]);
	}

	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	#        MAIN Calc part
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	for ($w=0; $w < $length; $w++){

	 $largest_win_reached = $window_size if $window_size > $largest_win_reached;

	 #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 ###          FACTOR calculation                                    ##
	 #####################################################################
	 $factor= ($window_size/40)*2  if ($apply_factor eq 'f');

	 #####################################################################
	 ##       Turn RATES to DOTs when there are gaps                    ##
	 #####################################################################
	 # @input0, @input1 are the whole length sequence splited.
	 if($make_gap_dot =~ /^[dD]+/){
		if( ($input0[$w] eq '.') || ($input1[$w] eq '.') ){
		  $ratio_compos_vs_seqid = '.';
		  push(@ratio_array, $ratio_compos_vs_seqid);
	  next;
	}
	 }

	 #####################################################################
	 ###               Getting small windows                            ##
	 #####################################################################
	 $offset = $w - int($window_size/2); # $offset starts from -5 when window_size is 10.
	 $offset = 0 if ($offset < 0);
	 if($variable_win_size ne 'v'){ $window_size = $ori_win_size;}
	 $window_1=substr($input[0], $offset, $window_size);  # window_1 is one segment
	 $window_2=substr($input[1], $offset, $window_size);  # of defined length(size)
	 @array_of_2_seq=($window_1, $window_2);

	 #####################################################################
	 ##      This is to remove the common gaps in the two windows       ##
	 #####################################################################
	 ($win_no_gap1, $win_no_gap2) = @{&main::remov_com_column(\@array_of_2_seq)};

	 #####################################################################
	 ##      Go back if the window size is too small due to gaps        ##
	 #####################################################################
	 if( (length($win_no_gap1) < $ori_win_size)&&($variable_win_size ==1) )
	 {
		$window_size+=1; $w--; next;
	 }

	 #####################################################################
	 ##      Getting Compos and Seq ids                                 ##
	 #####################################################################
	 $compos_id=${&main::compos_id_percent_array(\@array_of_2_seq)};
	 $seq_id   =${&main::seq_id_percent_array(\@array_of_2_seq)};


	 #####################################################################
	 ####    Go back if the Seq id is bigger than Compos id        #######
	 #####################################################################
	 if(($variable_win_size eq 'v') && ($seq_id >  $compos_id))
	 {
		$window_size+=1;  $w--;   next;
	 }

	 #####################################################################
	 ####   Special ID value handling                              #######
	 #####################################################################
	 if   (($seq_id    == 0  ) || ($compos_id == 0)){ $compos_id = 1;}


	 #####################################################################
	 ####     The actual calculation                               #######
	 #####################################################################
	 if(  $minus_whole_cs  eq 'm' ){  ### this substracts rates with whole CS rate
	   $ratio_compos_vs_seqid = int( $seq_id/$compos_id*10 - $ratio_whole_seq );
	   if( $ratio_compos_vs_seqid <= 0){ $ratio_compos_vs_seqid =0; }
	   if( $ratio_compos_vs_seqid > 9){ $ratio_compos_vs_seqid = 9; }
	   if( ($apply_factor eq 'f')&&($variable_win_size eq 'v') ){
		  $ratio_compos_vs_seqid =int($ratio_compos_vs_seqid * $factor); }   }
	 else{
		$ratio_compos_vs_seqid = int($seq_id/$compos_id*10);
		if( ($apply_factor eq 'f')&&($variable_win_size eq 'v') ){
		   $ratio_compos_vs_seqid =int($ratio_compos_vs_seqid * $factor); }
		if( $ratio_compos_vs_seqid > 9){ $ratio_compos_vs_seqid = 9; }
	 }

	 #$ratio_compos_vs_seqid =int(($seq_id/$compos_id)*10-$factor);}
	 #$seq_id/abs($seq_id-$compos_id+0.1)*


	 #####################################################################
	 #######       OUT of the loop (at the near to the end  ##############
	 #####################################################################
	 if( ($w + $window_size/3) > $length ){ $ratio_compos_vs_seqid='.'; }


	 #####################################################################
	 ####    When 's'how option is set(defined at prompt)          #######
	 #####################################################################
	 if( $show_calculation eq 's' ){
		printf ("SC=%-4s %-45s Seq=%-3.2s Compos=%-3.2s W=%-2s F=%-2s\n",
			  $ratio_compos_vs_seqid, $win_no_gap1,$seq_id, $compos_id, $window_size, $factor);
		printf ("        %-45s \n\n", $win_no_gap2);
	 }

	 #####################################################################
	 # Reducing increased window size according to SC rate (option 'r') ##
	 #####################################################################
	 if( ($variable_win_size eq 'v')&&($redu_window eq 'r') ){
		if( $window_size > $ori_win_size ){
	   if( $window_size > ($length/2)){ print chr(7);
	      print "\n The increased window size is over half of seq. suspicious !! \n";
	      print "\n Disable 'v' (for variable window size), at prompt and run again\n\n";
	   }

Bioinf.pl  view on Meta::CPAN

# Options   :
# Returns   : a reference of a hash.
# Argument  : One ref. for hash, one ref. for a scalar.
# Category  :
# Version   :
#--------------------------------------------------------------------
sub scan_windows_and_get_compos_seqid_rate{
	my($base_l)=1;
	my($scale)=1; # these are default params.
	my(%input)=%{$_[0]};
	my($window_size)=${$_[1]};
	if(defined(${$_[2]})){ $base_l=${$_[2]}; } ### <---$base_c is the baseline controller for sensitivity.
	if(defined(${$_[3]})){ $scale =${$_[3]}; } ### <---$base_c is the baseline controller for sensitivity.
	my(@sequences,@out_rate,$i,$title,$window_1,$window_2,$ratio_compos_vs_seqid,@array_of_2_seq,%out_hash );
	my(@keys)= keys %input;
	my($whole_rate, $whole_rate_ref ,$out_rate_arr_ref);
	for ($i=0; $i<=$#keys; $i++){
	 $sequences[$i]= $input{$keys[$i]};   }
	($out_rate_arr_ref,$whole_rate_ref)=&get_windows_compos_and_seqid_rate_array(\@sequences,\$window_size,\$base_l,\$scale);
	@out_rate=@{$out_rate_arr_ref};  $whole_rate=${$whole_rate_ref};
	$title="CS_rate\($whole_rate\)";
	$out_hash{$title}=join(",", @out_rate);
	return( \%out_hash );
}


#________________________________________________________________________
# Title     : get_windows_cs_rate_array
# Usage     : @out_rate = @{&get_windows_cs_rate_array(\@seq, \$win_size)};
# Function  : actual working part of scan_windows_and_get_compos_seqid_rate
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   : \@ratio_array, \$ratio_whole_seq
# Argument  : (\@input, \$window_size);  @input => ('ABCDEFG.HIK', 'DFD..ASDFAFS', 'ASDFASDFASAS');
#             Input ar => ( 'ABCDEFG
#                'DFD..ASDFAFS'
#                'ASDFASDFASAS' )  as the name of  @sequences.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_windows_cs_rate_array{
	my(@input)=@{$_[0]};
	my($base_level)=1;
	my($scale)=1;
	my($window_size)=${$_[1]};

	if(defined(${$_[2]})){ $base_level =${$_[2]}; }   if(defined(${$_[3]})){ $scale  =${$_[3]}; }

	my(@ratio_array, @array_of_2_seq, $seq_id, $offset, $half_of_w_size, $t, $length, $w,
	  $compos_id, $seq_id, $window_2, $window_1, $compos_whole_seq, $seq_id_whole_seq,
	  $ratio_whole_seq, $win_rate_div_by_whole_rate, $normalizing_factor, $lowest_rate );

	for ($t=0; $t < @input; $t++){
	 $length=length($input[$t]) if (length($input[$t])>$length);   }

	if ($length < $window_size){  $window_size = $length;   }

	  #___________ getting ratio for the whole sequence ___________
	$compos_whole_seq=${&compos_id_percent_array(\@input)};
	$seq_id_whole_seq=${&seq_id_percent_array(\@input)};
	if ($seq_id_whole_seq == 0){  $ratio_whole_seq =$compos_whole_seq/10; }
	else{  $ratio_whole_seq =$compos_whole_seq/$seq_id_whole_seq;  }

	  #___________ getting ratio for each window sequence ___________
	for ($w=0; $w < $length; $w++){
	 $offset = $w - int($window_size/2);  # $offset starts from -5 when window_size is 10.
	 $offset=0 if ($offset < 0);
	 $window_1=substr($input[0], $offset, $window_size);  # window_1 is one segment
	 $window_2=substr($input[1], $offset, $window_size);  # of defined length(size)
	 @array_of_2_seq=($window_1, $window_2); # making an array like this = ('ABCDE', 'BDESA')
	 $compos_id=${&compos_id_percent_array(\@array_of_2_seq)};
	 $seq_id   =${&seq_id_percent_array(\@array_of_2_seq)};
	#print "\n offset = $offset Wind1 = $window_1  Wind2 = $window_2 ";
	#print " Compos1 = $compos_id  Seqid = $seq_id \n";

	 #______  Handle special case when $seqid is zero > the rate becomes $compos_id/10 ______
	 if (($seq_id == 0) && ($compos_id != 0)){ $ratio_compos_vs_seqid = $compos_id/10;   }
	 elsif(($seq_id == $compos_id)&&($seq_id == 0)){ $ratio_compos_vs_seqid = 0;}
	 elsif(($seq_id == $compos_id)&&($seq_id == 100)){ $ratio_compos_vs_seqid = 0;}
	 else{ $ratio_compos_vs_seqid=($compos_id/$seq_id); }
	 push(@ratio_array, $ratio_compos_vs_seqid);  }

	$lowest_rate = ${&min_elem_array(\@ratio_array)};

	if($lowest_rate ==0){ $normalizing_factor=1; $ratio_whole_seq=0; }else{
	 $normalizing_factor=($ratio_whole_seq/$lowest_rate);
	}

	for (@ratio_array){  # the minimum value becomes equal to the whole seq. rate.
	 $_ = int($scale*($_*$normalizing_factor - ($ratio_whole_seq*$base_level))); #<<<----
	 $_=  '^' if($_ > 9); $_=  '_' if($_ < 0);
	}

	$ratio_whole_seq=int($ratio_whole_seq);
	return( \@ratio_array, \$ratio_whole_seq);
}

#________________________________________________________________________
# Title     : read_any_seq_files
# Usage     : %out_seq=%{&read_any_seq_files(\$input_file_name)};
# Function  : Tries to find given input regardless it is full pathname, with or
#             without extension. If not in pwd, it searches the dirs exhaustively.
# Example   : (*out1,  *out2) =&read_any_seq_files(\$input1, \$input2);
#             : (@out_ref_array)=@{&read_any_seq_files(\$input1, \$input2)};
#             : (%one_hash_out) =%{&read_any_seq_files(\$input1)};
# Warning   :
# Keywords  : open_any_seq_files,
# Options   :
# Returns   : 1 ref. for a HASH of sequence ONLY if there was one hash input
#             1 array (not REF.) of references for multiple hashes.
# Argument  : one of more ref. for scalar.
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub read_any_seq_files{
	my(@out_hash_ref_list, $sub, $o, $ext );
	my(@in)=@_;
	for($o=0; $o< @in; $o++){
	 my($found, %out, @file_ext_accepted, $found_file, $sub);
	 if(ref($_[$o])){
		 @file_ext_accepted=('msf', 'fasta','jp','aln','ali','pir',
								  'slx', 'dna','fas','pdb','rms','brk', 'dssp');
		 if( ! -e ${$in[$o]}  or -B ${$in[$o]} or -z ${$in[$o]}  ){
			 print "\n#SUB read_any_seq_files: ${$in[$o]} no seq file exists(or not passed at all) for $0 \n\n",
			 chr(7);
			 exit;
		 }
		 $found_file=${&find_seq_files($in[$o])};
		 print "# in read_any_seq_files, \$found_file => $found_file\n";

		 for $ext(@file_ext_accepted){
			$sub ="open\_$ext\_files";

Bioinf.pl  view on Meta::CPAN

	);
	%array1 = %{&hash_common(\%array1, \%array2)};
	%array2 = %{&hash_common(\%array2, \%array1)};
	%array1 = %{&remov_com_column(\%array1)}; # this removes wrong gaps(in '.' form, in MSF)
	%array2 = %{&remov_com_column(\%array2)};

	@names= keys %array2;
	for $name (@names){
	  @string1 =split('', $array1{$name});
	  @string2 =split('', $array2{$name}); # ! @string2 is the structural. ! (used)

			 @seq_position1  = @{&get_posi_sans_gaps(\$array1{$name})};
			 @seq_position2  = @{&get_posi_sans_gaps(\$array2{$name})}; # @seq_position2 is structural

			  $len_of_seq =($#seq_position2+1);
					 push(@whole_length, $len_of_seq);

			 @position_diffs = @{&get_posi_diff(\@seq_position1, \@seq_position2)};
			 @position_corrected1 = @{&put_position_back_to_str_seq(\@string2, \@position_diffs)};
			 #print "@position_corrected1";
			 $array3{$name}=join(",", @position_corrected1); # array3 is for disply of seq.
	}                      # !! split and join char is ',';

	# %array3 has the form.  These numbers are position differences between the same sequences
	#                        one from str. one from seq.
	# seq1  1,1,2,3,.,2,3,.,1,.,0,0,0,1,1,1,1,1,2
	# seq2  1,1,2,1,.,1,3,.,1,.,0,0,1,0,1,1,1,3,2
	# seq3  1,1,2,3,.,2,3,.,1,.,1,1,0,0,1,1,1,3,2
	my(%final_posi_diffs) =%{&get_posi_diff_hash(\%array3)};
	my($sum_of_posi_diffs)=${&sum_hash(\%final_posi_diffs)};
	my($av_of_posi_diffs) =$sum_of_posi_diffs/($#names); # dividing by seq number.
	my($sum_seq_length)   =${&sum_array(\@whole_length)};
	my($av_rate)          =$av_of_posi_diffs/($sum_seq_length);
	&print_seq_in_block(\%final_posi_diffs); # <--- leave this
	for (values %final_posi_diffs){
	 my(@splited) = split(',', $_);
			for (@splited){
			  $out{$_}++ if ($_ =~ /\d+/);	 }
	}
	# the final result is %out which has accumulated entries with occurances
}



#________________________________________________________________________
# Title     : get_occurances_of_char
# Usage     : %occurances_shft_type=%{&get_occurances_of_char(\%final_posi_diffs)};
#             %char_occur=%{&get_occurances_of_char(\@ref_array_of_chars)};
#             %char_occur=%{&get_occurances_of_char(\$ref_string_of_chars)};
#             %char_occur=%{&get_occurances_of_char($string_of_chars)};
#
# Function  : gets the numbers of occurances for 1, 2, 3 ... position shifts.
#             If hash is given, it only looks at the values.
#             If multiple string, array, hash or combinations of these
#              are given, it will add up to one single result
# Example   :
# Warning   :
# Keywords  : composition of chars, composition table making,
#             make_composition, make composition table
#             occurances_of_char, get_char_occurances, occurances
#             get_percentage_occurances_of_char, percentage_occurances_of_char
# Options   : 'p' for percentage output of the char among others
#             'n' for NO name option when HASH input is given
# Returns   : one ref. of hash  (a =>5, b=>6, c=>4,,,,,)
# Argument  : one ref. of hash (seq1 alsdfjlsj
#                               seq2 asldfjsld
#                               seq3 owiurouou);
# Category  :
# Version   : 1.4
#--------------------------------------------------------------------
sub  get_occurances_of_char{
	my ($i, %H, $no_name, %out, $N,@splited, $val,$percentage_out,
	 $split, $sum);
	for($i=0; $i< @_; $i++){
	if($_[$i]=~/^[\-]?p$/i){
	   $percentage_out=1;   splice(@_, $i, 1); 	   $i--;
	}elsif($_[$i]=~/^[\-]?n$/i){
	   $no_name=1;	        splice(@_, $i, 1);	   $i--;
	}
	}

	for($i=0; $i< @_; $i++){
	if(  ref($_[$i]) eq 'HASH'){
	  my %H=%{$_[$i]};
	  my @names=keys %H;
	  for $key (@names){
		 for $split ( split(//, $H{$key}) ){
			if($no_name==1){ $N=$split
			}else{ $N="$key\_$split"; }
		    $out{$N}++; $sum++
		 }
	  }
	}elsif(ref($_[$i]) eq 'ARRAY'){
	  @splited=@{$_[$i]};
	  for $split (@splited){  $out{$split}++; $sum++ }
	}elsif(ref($_[$i]) eq 'SCALAR'){
	   @splited = split(//, ${$_[$i]});
	   for $split (@splited){  $out{$split}++; $sum++ }
	}elsif( !(ref($_[$i])) ){
	   @splited = split(//, $_[$i]);
	   for $split (@splited){  $out{$split}++; $sum++ }
	}
	}
	if($percentage_out==1){
	 my @keys=keys %out;
	 my %percent;
	 for($i=0; $i< @keys; $i++){
		$percent{$keys[$i]} = $out{$keys[$i]}/$sum*100;
	 }
	 return(\%percent);
	}else{
	 return(\%out);
	}
}


#________________________________________________________________________
# Title     : make_composition_table
# Usage     : %occurances=%{&make_compos_table(\%key_and_value_for_seq)};
# Function  : gets the numbers of occurances for 1, 2, 3 ... position shifts.
# Example   :
# Warning   :
# Keywords  : composition of chars, composition table making, make composition table
#             make_composition_table, get_composition, get_amino_acid_composition
#             protein_composition, make_aa_composition_tablem, aa_composition
# Options   :
# Returns   : one ref. of hash  (a =>5, b=>6, c=>4,,,,,)
# Argument  : one ref. of hash (seq1 alsdfjlsj
#                               seq2 asldfjsld
#                               seq3 owiurouou);
# Category  :
# Version   : 1.2
#--------------------------------------------------------------------
sub  make_composition_table{
     my %input = %{$_[0]};
     my (@splited, $split, %out );
     for (values %input){
        @splited = split(//, $_);
        for $split (@splited){  $out{$split}++; }
     }
     return(\%out);
}

#________________________________________________________________________
# Title     : make_composition_ratio_table_simple
# Usage     : %occurances=%{&make_compos_ratio_table(\%final_posi_diffs)};
# Function  : gets ratio of the numbers of occurances for any chars.
# Example   :
# Warning   : This pools all the sequences, to not distinct seq composition if
#              you put more than one seq.
# Keywords  : composition table, composition of chars, composition table making,
#             make composition table, make_composition_table
# Options   :
# Returns   : one ref. of hash  (a =>0.05, b=>0.06, c=>0.04,,,,,)
# Argument  : one ref. of hash (seq1 alsdfjlsj
#                               seq2 asldfjsld
#                               seq3 owiurouou);
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub  make_composition_ratio_table_simple{
	my %input = %{$_[0]};
	my %out;
	my (@keys, $i, %ratio_out, $each_char_occur, @splited, $split, $all_occur );
	for (values %input){
	 @splited = split(//, $_);
	 for $split (@splited){  $out{$split}++; $all_occur ++; }
	}
	@keys = keys %out;
	for ($i=0; $i < @keys; $i ++){

Bioinf.pl  view on Meta::CPAN

#--------------------------------------------------------------------
sub three_to_one_letter{  my(%aa);
	 $aa{"ala"} = "a";  $aa{"met"} = "m";  $aa{"asp"} = "d";  $aa{"pro"} = "p";
	 $aa{"cys"} = "c";  $aa{"asn"} = "n";  $aa{"glu"} = "e";  $aa{"gln"} = "q";
	 $aa{"phe"} = "f";  $aa{"arg"} = "r";  $aa{"gly"} = "g";  $aa{"ser"} = "s";
	 $aa{"his"} = "h";  $aa{"thr"} = "t";  $aa{"ile"} = "i";  $aa{"val"} = "v";
	 $aa{"lys"} = "k";  $aa{"trp"} = "w";  $aa{"leu"} = "l";  $aa{"tyr"} = "y";
	return(\%aa);
}


#________________________________________________________________________
# Title     : convert_3_to_1_letter
# Usage     : %three_letter  = &three_to_one_letter ;   # takes no arguments (void).
# Function  : a hash of one letter to 3 letter amino acid code , returns a hash
# Example   :
# Warning   :
# Keywords  : 321, 3to1 3_to_1 THREE_TO_ONE_LETTER Three_To_One_Letter
#             convert_3_to_1, convert_3_to_1_aa_name
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub convert_3_to_1_letter{  my(%aa);
    $aa{"ala"} = "a";  $aa{"met"} = "m";  $aa{"asp"} = "d";  $aa{"pro"} = "p";
    $aa{"cys"} = "c";  $aa{"asn"} = "n";  $aa{"glu"} = "e";  $aa{"gln"} = "q";
    $aa{"phe"} = "f";  $aa{"arg"} = "r";  $aa{"gly"} = "g";  $aa{"ser"} = "s";
    $aa{"his"} = "h";  $aa{"thr"} = "t";  $aa{"ile"} = "i";  $aa{"val"} = "v";
    $aa{"lys"} = "k";  $aa{"trp"} = "w";  $aa{"leu"} = "l";  $aa{"tyr"} = "y";
    return(\%aa);
}

#________________________________________________________________________
# Title     : convert_1_to_3_letter
# Usage     : %three_letter  = &three_to_one_letter ;   # takes no arguments (void).
# Function  : a hash of one letter to 3 letter amino acid code , returns a hash
# Example   :
# Warning   :
# Keywords  : 123, 1to3 1_to_3 one_TO_three_LETTER One_To_Three_Letter
#             convert_1_to_3, convert_1_to_3_aa_name
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub convert_1_to_3_letter{  my(%aa);
    $aa{"a"} = "ala";  $aa{"m"} = "met";  $aa{"d"} = "asp";  $aa{"p"} = "pro";
    $aa{"c"} = "cys";  $aa{"n"} = "asn";  $aa{"e"} = "glu";  $aa{"q"} = "gln";
    $aa{"f"} = "phe";  $aa{"r"} = "arg";  $aa{"g"} = "gly";  $aa{"s"} = "ser";
    $aa{"h"} = "his";  $aa{"t"} = "thr";  $aa{"i"} = "ile";  $aa{"v"} = "val";
    $aa{"k"} = "lys";  $aa{"w"} = "trp";  $aa{"l"} = "leu";  $aa{"y"} = "tyr";
    return(\%aa);
}



#________________________________________________________________________
# Title     : amino_acid_compos_id_percent
# Usage     : $percent = &amino_acid_compos_id_percent (%any_hash_with_sequences);
#             The way identity(composition) is derived is;
#
# Function  : gets amino acid composition identity of any given
#             number of sequences(at least 2).
# Example   :
# Warning   :
# Keywords  : get_amino_acid_composition, get_protein_composition, composition
# Options   :
# Argument  : hash of at least 2 sequences.
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub amino_acid_compos_id_percent{
	my(%input)= %{$_[0]};
	my(@names)=keys (%input);
	my(@temp, $i, $j, $iden, @all_pairs_id, $iden_sum);
	my(%compos_table1, %compos_table2, $final_iden, $larger);
	for ($i=0; $i < @names; $i ++){  # putting seqs in arrays.
	 $input{$names[$i]}=~ tr/a-z/A-Z/;
	 $input{$names[$i]}=~ s/\W//g;
	 @{"string$i"}= split('', $input{$names[$i]});
	 $larger = @{"string$i"} if @{"string$i"} > $larger;
	}
	for ($i=0; $i < @names; $i++){   # to make permutated pairs.
	 for ($j=$i; $j < @names; $j ++){
		if ($j == $i){ next; }
		for ($k=0; $k < $larger; $k ++){  # getting composition tables for two seqs.
		  $compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);
		  $compos_table2{${"string$j"}[$k]}++ if (${"string$j"}[$k] =~ /\w/);
		}
		$iden = ${&common_compos_id_hash(\%compos_table1, \%compos_table2)};
		%compos_table1=();  %compos_table2=();
		push (@all_pairs_id,  $iden );
	 }
	}
	for $iden (@all_pairs_id){  $iden_sum+= $iden;  }
	if(@all_pairs_id == 0){ @all_pairs_id =1; }
	$final_iden=$iden_sum/@all_pairs_id;
	\$final_iden;
}

#________________________________________________________________________
# Title     : seq_id_percent_array  (more than 2 elements array)
# Usage     : $percent = &seq_id_percent_array(@any_array_sequences);
#             The way identity(pairwise) is derived is;
#
# Function  : produces amino acid composition identity of any given number of sequences.
# Example   :
# Warning   : This can handle 'common gaps' in the sequences
# Keywords  : get_percent_composition_identity, seq_composition_identity,
#             percent_sequence_composition_id
#
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub seq_id_percent_array{
	my(@input, $denominator,@all_pairs_id, $percent_id);
	my($largest,$p,$i,$j,$k,$iden_residue_num,$iden,@temp,$iden_sum,$gap_num,$final_iden);
	for($d=0; $d<@_; $d++){
		if(ref($_[$d]) eq 'ARRAY'){ @input=@{$_[$d]}; }
		elsif( (ref($_[$d]) eq 'SCALAR') &&( ${$_[$d]}=~/^[aA]/) ){ $average_len_opt =1 }
		elsif( !(ref($_[$d])) && ( $_[$d] =~/^[aA]/) ){ $average_len_opt =1 } }
	if ((@input== 1)||( @input== 0)){
		print "\n\n \" $0 \"  requires at least 2 sequences\n\n";
		print "\n Abnormally dying at seq_id_percent_array in $0 \n\n";
		print chr(7); exit;}
	$shortest=length($input[0]);
	my($sans_gap_seq, $length_sum, $average_seq_len);
	for($p=0; $p < @input; $p++){
		$input[$p]=~ tr/a-z/A-Z/;
		$sans_gap_seq=$input[$p];
		$sans_gap_seq=~s/\W//g;
		$input[$p]=~ s/\W/./g;
		(@{"string$p"})=split('', $input[$p]);
		$largest = length($input[$p]) if length($input[$p]) > $largest;
		$shortest = length($sans_gap_seq) if length($sans_gap_seq) < $shortest;
		$length_sum += length($sans_gap_seq);
	}
	$average_seq_len = $length_sum/@input;
	for($i=0; $i< @input; $i++){
		for($j=$i+1; $j< @input; $j++){
			for ($k=0; $k <  $largest; $k ++){  # getting composition tables for two seqs.
				if ((${"string$i"}[$k] !~ /\W/)&&(${"string$i"}[$k] eq ${"string$j"}[$k])){
					$iden_residue_num++; }
				elsif((${"string$i"}[$k] =~ /\W/)&&(${"string$i"}[$k]=~ /\W/)){ $gap_num++; }}
			if( $average_len_opt == 1){ $denominator = $average_seq_len; }
			else{ $denominator = $shortest; }
			if($denominator == 0){ $denominator=1; }  # in the above it is 50% rather than 0.07%
			$percent_id=($iden_residue_num/($denominator))*100;
			push(@all_pairs_id, $percent_id);
			undef ($iden_residue_num, $gap_num);
		}
	}
	for (@all_pairs_id){  $iden_sum+=$_;    }
	$final_iden=$iden_sum/($#all_pairs_id+1);
	return( \$final_iden );
}

#________________________________________________________________________
# Title     : compos_id_percent_array  (more than 2 elements array)
# Usage     : $percent = &compos_id_percent_array(@any_array_sequences);
#             The way identity(composition) is derived is;
# Function  : produces amino acid composition identity of any given number of sequences.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub compos_id_percent_array{
	 my(@input)=@{$_[0]};
	 my($largest,$iden,@temp,$iden_sum,$final_iden, @all_pairs_id);
	for($i=0; $i<=$#input; $i++){  $input[$i]=~ tr/a-z/A-Z/;  $input[$i]=~ s/[\.\-\s]//g;
		@temp = split('', $input[$i]);  (@{"string$i"})= @temp;
		$largest = @{"string$i"} if @{"string$i"} > $largest;    }
	for($i=0; $i< @input; $i++){ #_________ permutating ___________
		for($j=$i+1; $j<=$#input; $j++){
			for ($k=0; $k <= $largest; $k ++){  # getting composition tables for two seqs.
				$compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);
				$compos_table2{${"string$j"}[$k]}++ if (${"string$j"}[$k] =~ /\w/);   }
			$iden =${&calc_compos_id_hash(\%compos_table1, \%compos_table2)};
			push(@all_pairs_id, $iden);  %compos_table1=();  %compos_table2=();   }   }
	for $iden (@all_pairs_id){  $iden_sum+=$iden;  }
	$final_iden=$iden_sum/(@all_pairs_id);
	#-----------------------------------------------------
	#  Input here is like :  %hash1= (A,3,B,3,C,4,D,4), %hash2= (A,4,B,1,C,4)
	sub  calc_compos_id_hash{  # input is like this;
	  my(%hash1)=%{$_[0]};
	  my(%hash2)=%{$_[1]};
	  my(%common_of_the_2)=();
	  my($common, $compos_id, $sum_residues, $sum_of_the_common_residue_no);
	  my(@values1) = values (%hash1);
	  my(@values2) = values (%hash2);
	  my(@combined_values)=(@values1,@values2);
	  for $elem (@combined_values){  $sum_residues += $elem;   }
	  if($sum_residues == 0){ $compos_id =0; } # to prevent Illegal division error.
	  else{ for $key1(keys %hash1){
				 $common=&smaller_one($hash1{$key1}, $hash2{$key1});
					 sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
				 $sum_of_the_common_residue_no += $common;     }
			 $compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100;   }
	  \$compos_id;
	}
	#-----------------------------------------------------
	return ( \$final_iden ); # final identity for any given set of strings(seq).
}

#________________________________________________________________________________
# Title     : compos_id_percent_hash  (synonym of amino_acid_compos_id_percent)
# Usage     : $percent = &compos_id_percent_hash(%any_hash_with_sequences);
#             The way identity(composition) is derived is;
#
# Function  : gets amino acid composition identity of any given number of sequences.
# Example   :
# Warning   :
# Keywords  : get_amino_acid_composiiton
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#------------------------------------------------------------------------------
sub compos_id_percent_hash{ my(%input, @names);
	if(ref($_[0]) eq 'HASH'){  %input= %{$_[0]}; @names= keys  %input;  }
	else{ print "\n hash ref was not passed to compos_id_percent_hash\n"; exit; }
	my(@temp, $iden, @all_pairs_id, $i, $j, $k,$iden_sum);
	my(%compos_table1, %compos_table2, $final_iden, $larger);
	for ($i=0; $i < @names; $i ++){  $input{$names[$i]}=~ tr/a-z/A-Z/;
	 $input{$names[$i]}=~ s/\W//g;    @temp = split('', $input{$names[$i]});
	 (@{"string$i"})=@temp; $larger = @{"string$i"} if @{"string$i"}>$larger;}
	for ($i=0; $i < @names; $i++){
	 for ($j=$i; $j < @names; $j ++){
		if ($j == $i){ next; }
		for ($k=0; $k < $larger; $k ++){
		  $compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);
		  $compos_table2{${"string$j"}[$k]}++ if (${"string$j"}[$k] =~ /\w/); }
		$iden = ${&common_compos_id_hash(\%compos_table1, \%compos_table2)};
		%compos_table1=(); %compos_table2=(); push (@all_pairs_id, $iden); }}
	for $iden (@all_pairs_id){ $iden_sum+=$iden; }
	$final_iden=$iden_sum/(@all_pairs_id);
	return(\$final_iden);
}
#________________________________________________________________________
# Title     : common_compos_id_hash (BUG free)
# Usage     : %hash = &common_compos_hash(\%any_hash1, \%any_hash1);
# Function  : actual calculation of identity
# Example   : ('A', 200, 'C', 191, D, 99)
#                  ('A', 290, 'C', 199, D, 100)
#             uses only two sequences.
# Warning   :
# Keywords  :
# Options   :
# Returns   : ref. of a scaler (in percent)  eg)  95
# Argument  : two references of hash of seqeunces.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub  common_compos_id_hash{
	my(%hash1)=%{$_[0]};
	my(%hash2)=%{$_[1]};
	my(%common_of_the_2)=();  my($common, $compos_id, $sum_of_the_common_residue_no);
	my(@values1) = values (%hash1);  my(@values2) = values (%hash2);
	my(@combined_values)=(@values1, @values2);
	my($sum_residues) = ${&sum_array(\@combined_values)};
	for $key1(keys %hash1){
	 $common=&smaller_one($hash1{$key1}, $hash2{$key1});
	 sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
	 $sum_of_the_common_residue_no += $common;
	}
	if( $sum_residues == 0){ $sum_residues =1 }
	$compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100;
	\$compos_id;
}


#________________________________________________________________________
# Title     : calc_compos_id_hash (the same as 'common_compos_hash')
# Usage     : %hash = &calc_compos_hash(\%any_hash1, \%any_hash1);
# Function  : actual calculation of identity
# Example   : ('A', 200, 'C', 191, D, 99)
#                  ('A', 290, 'C', 199, D, 100)
#             uses only two sequences.
# Warning   :
# Keywords  :
# Options   :
# Returns   : ref. of a scaler (in percent)  eg)  95
# Argument  : two references of hash of seqeunces.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub  calc_compos_id_hash{ my(%hash1)=%{$_[0]}; my(%hash2)=%{$_[1]}; my(%common_of_the_2)=();
	  my($common, $compos_id, $sum_residues, $sum_of_the_common_residue_no);
	  my(@values1) = values (%hash1);   my(@values2) = values (%hash2);
	  my(@combined_values)=(@values1,@values2);
	  for $elem (@combined_values){
			$sum_residues += $elem;   }
	  if($sum_residues == 0){ $compos_id =0; } # to prevent Illegal division error.
	  else{
		  for $key1(keys %hash1){
			  $common=&smaller_one($hash1{$key1}, $hash2{$key1});
				  sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
			  $sum_of_the_common_residue_no += $common;     }
		  $compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100;   }
	  \$compos_id;
}
#________________________________________________________________________
# Title     : get_percentage
# Usage     : %out= %{&get_percentage(\%result, '1')};
# Function  : calculates the percentage content of any single char over the whole
#             length of strings in it.
# Example   : if the string is  'seq  ABCDEEEEEFFEFE' given in a hash
#             if you put 'A' as one argument, it counts the occurances of 'A'
#             and gets the percentage of it.
# Warning   : This converts array and string input as ref. into arbitrary hash and
#             returns hash
#             programmed by A Biomatic
# Keywords  : get_percentage_of_char
# Options   : None yet.
# Returns   : Numerical Percentage
# Argument  : ref. for Scalar string or Array of chars or Hash  AND 'the target char'
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_percentage{
	my(@in, $k, $sort, $numerator, $residue, @out_hash_ref, %hash_out );
	for($k=0; $k< @_ ;$k++){
	  if( !ref($_[$k])&& (length($_[$k]) == 1 )){
		 $numerator = $_[$k];
	  }
	  elsif( (ref($_[$k]) eq 'SCALAR') && (length(${$_[$k]}) == 1 )){
		 $numerator = ${$_[$k]};
	  }
	  elsif(ref($_[$k]) eq "HASH")  { push(@in, $_[$k]); }
	  elsif(ref($_[$k]) eq "ARRAY") { push(@in, &convert_arr_and_str_2_hash($_[$k], $k));} #<-- conv array to hash.
	  elsif(ref($_[$k]) eq "SCALAR"){ push(@in, &convert_arr_and_str_2_hash($_[$k], $k));} #<-- conv array to hash.
	}
	####### final output is => @in of hash ref elements #######
	for (@in){   my(%H) = %{$_}; my(@keys)= sort keys %H;
	 for $name(@keys){
		 my($numerator_count);
		 my($seq_len) = length($H{$name}); print  "\n $name Sequence length: ", $seq_len, "\n";
		 my(@string) = split(//, $H{$name});
		 for $residue (@string){  if($residue =~/^$numerator$/){ $numerator_count ++; }}
		 my($percent) = $numerator_count/$seq_len *100;
		 $hash_out{$name}=$percent;    }
	 push(@out_hash_ref, \%hash_out);  }
	if(@out_hash_ref ==1){ return($out_hash_ref[0]); }
	elsif( @out_hash_ref > 1){ return(@out_hash_ref); }
}


#________________________________________________________________________
# Title     : pairwise_percent_id  (pairwise sequence identity in percentage)
# Usage     : $identity = ${&pairwise_percent_id(%arrayinput)};
#
# Function  : takes a ref. of a hash of names and sequences, returns
#             percent identity.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub pairwise_percent_id{
	my($i,$j,$k, @iden_array_ref);
	for($i=0; $i< @_; $i++){  my %input= %{$_[$i]};  my @names= sort keys %input;
	  my(@temp, $iden, @all_pairs_id, $whole_seq_len, $residue_sum1,$residue_sum2);
	  my($final_av_iden, $larger, $percent_for_pair,@percent_for_pair, $iden_sum);
	  for ($i=0; $i < @names; $i ++){ $input{$names[$i]}=~ tr/a-z/A-Z/;
		 @temp = split('', $input{$names[$i]});  (@{"string$i"})=@temp;
		 $larger = @{"string$i"} if @{"string$i"} > $larger; }
			 for ($i=0; $i < @names; $i++){       # to make permutated pairs.
				for ($j=$i+1; $j < @names; $j ++){
					for ($k=0; $k < $larger; $k ++){  # getting composition tables for two seqs.
					  $iden+=2 if ((${"string$i"}[$k] eq ${"string$j"}[$k])&&(${"string$i"}[$k] =~ /\w/));
					  $residue_sum1++ if (${"string$i"}[$k] =~ /\w/);
					  $residue_sum2++ if (${"string$j"}[$k] =~ /\w/);  }
					$whole_seq_len =($residue_sum1+$residue_sum2);
					$percent_for_pair = $iden/$whole_seq_len*100;
					push(@percent_for_pair,$percent_for_pair);
					$residue_sum1=0; $residue_sum2=0; $iden=0; } }
			 for $iden (@percent_for_pair){ $iden_sum+=$iden;}
	  $final_av_iden=$iden_sum/( @percent_for_pair );
	  push(@iden_array_ref, \$final_av_iden);  }
	if(@iden_array_ref ==1){ return($iden_array_ref[0]);}else{ return(@iden_array_ref);}
}


#________________________________________________________________________
# Title     : get_seq_identity
# Usage     : $identity = ${&get_seq_identity(%arrayinput)};
#
# Function  : takes a ref. of a hash of names and sequences, returns
#             percent identity. NOT composition identity.

# Example   :
# Warning   :
# Keywords  : get_sequence_identity
# Options   :
# Returns   :
# Argument  : hash(es) of sequences.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_seq_identity{
	my($i,$j,$k, $c, @iden_array_ref);
	for($c=0; $c< @_; $c++){
	  my %input= %{$_[$c]};
	  my @names= sort keys %input;
	  my(@temp, $iden, @all_pairs_id, $whole_seq_len, $residue_sum1,$residue_sum2);
	  my($final_av_iden, $larger, $percent_for_pair,@percent_for_pair, $iden_sum);
	  for ($i=0; $i < @names; $i ++){
		 $input{$names[$i]}=~ tr/a-z/A-Z/;
		 @temp = split(//, $input{$names[$i]});
		 @{"string$i"}=@temp;
		 $larger = @{"string$i"} if @{"string$i"} > $larger; }
		 for ($i=0; $i < @names; $i++){       # to make permutated pairs.
			for ($j=$i+1; $j < @names; $j ++){
				for ($k=0; $k < $larger; $k ++){
				  if ( ${"string$i"}[$k] eq ${"string$j"}[$k] and ${"string$i"}[$k] =~ /\w/){
					 $iden+=2 ;
				  }
				  $residue_sum1++ if (${"string$i"}[$k] =~ /\w/);
				  $residue_sum2++ if (${"string$j"}[$k] =~ /\w/);
				}
				$whole_seq_len =($residue_sum1+$residue_sum2);
				$percent_for_pair = $iden/$whole_seq_len*100;
				push(@percent_for_pair,$percent_for_pair);
				$residue_sum1=0;
				$residue_sum2=0;
				$iden=0;
			}
		 }
	  for $iden (@percent_for_pair){
		  $iden_sum+=$iden;
	  }
	  if(@percent_for_pair <1){ @percent_for_pair=(1); }
	  $final_av_iden=$iden_sum/( @percent_for_pair );
	  push(@iden_array_ref, \$final_av_iden);
	}
	if(@iden_array_ref ==1){
	   return($iden_array_ref[0]);
	}else{
	   return(@iden_array_ref);
	}
}




#________________________________________________________________________
# Title     : get_correct_percent_alignment_rate  (made for Bissan)
# Usage     : &get_correct_percent_alignment_rate(\$file1, \$file2);
# Function  : accepts two files and prints out the sequence identities of the alignment.
# Example   :
# Warning   : Alpha version,  A Biomatic , made for Bissan
# Keywords  :
# Options   : h  # for help
#             v  # for verbose printouts(prints actual sequences)
# Returns   : reference of Scalar for percentage correct alignment(for already
#             aligned sequences)
# Argument  : two sequence files which have identical sequence names.
# Category  :
# Version   :
#--------------------------------------------------------------------
sub get_correct_percent_alignment_rate{
	 my($i, $j, $k, $verbose, @string1, @string2, $larger, $seq_pair_id, @seq_pair_ids );
	 my(%inputhash1) = %{&read_any_seq_files($_[0])};
	 my(%inputhash2) = %{&read_any_seq_files($_[1])};
	 my(@names)= sort keys %inputhash1;
	 ######################################
	 ####### Sub argument handling ########
	 ######################################
	 for($k=0; $k< @_ ;$k++){
		if( !ref($_[$k])&& (length(${$_[$k]}) < 5)){  # when inputs are not ref.
		  if($_[$k]=~ /^[\-vV]$/){ $verbose = 1; next;}
		}
		elsif((ref($_[$k]) eq "SCALAR")&&(length(${$_[$k]})<5)){  #  when inputs are  ref.
		  if(${$_[$k]}=~ /^[\-vV]$/){$verbose = 1;next;}          # should shorter than 5
		}
	 }
	 for($i =0; $i < @names; $i++){
		print "\n\n==== Processing structural $names[$i] against artificial $names[$i]\n";
		$inputhash1{$names[$i]} =~ tr/a-z/A-Z/;
		$inputhash2{$names[$i]} =~ tr/a-z/A-Z/;
		@string1=split(//, $inputhash1{$names[$i]});
		@string2=split(//, $inputhash2{$names[$i]});
		print "\n The string1 is  ",@string1,"\n" if $verbose ==1;
		print "\n The string2 is  ",@string2,"\n" if $verbose ==1;
		(@string2 > @string1) ? ($larger=@string2, $smaller=@string1) : ($larger=@string1, $smaller=@string2);
		$true_seq=$inputhash1{$names[$i]};
		$true_seq=~s/\W//g;
		$true_len=length($true_seq);
		print "\n True seq length:   $true_len  , Whole length inc gap: $larger\n";
		for($j = 0; $j < $larger; $j++){
		  $iden_sum++ if ($string1[$j] eq $string2[$j])&& !($string1[$j]=~/\W/); }
		$seq_pair_id =($iden_sum/$true_len) * 100;
		print "\nID between structural and artifical alignment is  $seq_pair_id \%" , "\n";
		push(@seq_pair_ids, $seq_pair_id);
		undef( $iden_sum, $seq_pair_id );
	 }
	 print "\n", "_"x88, "\n";
	 my($whole_average_of_the_id)=${&array_average(\@seq_pair_ids)};
	 print "The whole average is; $whole_average_of_the_id\n";
	 if(@seq_pair_ids == 1){ return( \$seq_pair_ids[0] ); }
	 elsif(@seq_pair_ids > 1){ return( \@seq_pair_ids ); }
}


#________________________________________________________________________
# Title     : amino_acid_compos_id_percent_trend
# Usage     :
# Function  :
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub amino_acid_compos_id_percent_trend{
	my(%input) = %{$_[0]};
	my(@common, @string,@accumu_percent_iden)=(); my(%common_so_far, %compos_table);
	my($percent_id_so_far, $length_of_one_seq,$length_of_all_seq, $seq_no)=0;
	for $key(keys %input){
			$input{$key}=~s/[. \d-]//g;
			@string= split(//, $input{$key});
			print @string; print "\n";
			$length_of_one_seq = $#string+1;
			$length_of_all_seq +=$length_of_one_seq;
			$seq_no += 1;
			%compos_table  = &composition_table(@string);
			@check = keys (%common_so_far);
			if  ($#check < 0){ %common_so_far = %compos_table; }
			else{ %common_so_far= %{&common_compos_2_hash(\%common_so_far,\%compos_table)};}
			for $value(values %common_so_far){ $common_residue_sum +=$value; }
			$final_percent_id = $common_residue_sum/($length_of_all_seq/$seq_no)*100;
			$common_residue_sum =0;  }
	for $value(values %common_so_far){ $common_residue_sum +=$value; }
	$final_percent_id = $common_residue_sum/($length_of_all_seq/$seq_no)*100;
	return(\$final_percent_id);
}

#________________________________________________________________________
# Title     : composition_table   (can handle both nucleic and protein seq)
# Usage     : %output = %{&compos_table(@input_array1, @input_array2,,,,)};
#             example input
#
# Function  : returns a table of alphabet with occurances.
#             can handle any char, this converts char to upper case.
# Example   :
# Warning   : converts all SMALL letters to Capital letters before counting!!
# Keywords  :
# Options   :
# Returns   : %hash1 = ('A',3, 'C',2, 'D',1, 'Q',2, 'S',1), %hash2,,,
# Argument  :
# Category  :
# Version   :
#--------------------------------------------------------------------
sub composition_table{
	my($i, @input,%input,$input,$j,@ref_out);
	for($i=0;$i<@_; $i++){
	 my(%alphabet)=();
	 if( ref($_[$i]) eq 'ARRAY'){ @input=@{$_[$i]}; undef(%alphabet);
		 for ($j=0; $j<=$#input; $j++){ $input[$j] =~ tr/a-y/A-Y/;
			 $alphabet{$input[$j]}+=1 if ($input[$j] =~/[A-Y]/) }
		 push(@ref_out, \%alphabet); }
	 elsif( ref($_[$i]) eq 'HASH'){ %input=%{$_[$i]};@input=keys %input;undef(%alphabet);
		 for ($j=0; $j< @input; $j++){ $input[$j] =~ tr/a-y/A-Y/;
			 $alphabet{$input[$j]}+=1 if ($input[$j] =~/[A-Y]/) }
		 push(@ref_out, \%alphabet); }
	 elsif( ref($_[$i]) eq 'SCALAR'){ $input=${$_[$i]}; $input=s/\,//g if $input=~/\,/;
		 @input=split('', $input); undef(%alphabet);
		 for ($j=0; $j<@input; $j++){ $input[$j] =~ tr/a-y/A-Y/;
			 $alphabet{$input[$j]}+=1; }
		 push(@ref_out, \%alphabet); }    }
	if(@ref_out ==1){
	 return($ref_out[0]);
	}else{ return(@ref_out); }
}

#________________________________________________________________________
# Title     : common_compos_2_hash
# Usage     : %hash = &common_compos_hash(\%any_hash1, \%any_hash1);
# Function  :
# Example   : common gaps means only '.' (dots, not alphabets!!)
#             AAA....BBCB
#             AAAB..B.BCC  --> A.A.....BC. (as in an array)
#             A.AAA...BCA
# Warning   :
# Keywords  :
# Options   :
# Returns   : a hash (string1, number1, string2, number2, string3, number3, ...)
# Argument  : two references of hash of seqeunces.
# Category  :
# Version   :
#--------------------------------------------------------------------
sub  common_compos_2_hash{ my(%hash1)=%{$_[0]}; my(%hash2)=%{$_[1]};
	my(%common_of_the_2)=(); my($common)=0;
	for $key1(keys %hash1){
	 $common=&smaller_one($hash1{$key1}, $hash2{$key1});
	 if ($common =~ /\d+/){
		$common_of_the_2{$key1}=$common; } }
	\%common_of_the_2;
}


#________________________________________________________________________
# Title     : pair_percent_id_trend
# Usage     : @array = &pair_percent_id_trend (%arrayinput);
# Function  :
# Example   : common gaps means only '.' (dots, not alphabets!!)
#             AAA....BBCB
#             AAAB..B.BCC  --> A.A.....BC. (as in an array)
#             A.AAA...BCA
#             The resulting array XXXXX..XXXX is literally like so.
#             This is to detect absurd gaps in the above.
#
#
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   :
#--------------------------------------------------------------------
sub pair_percent_id_trend{
	my(%input) = %{$_[0]};
	my(@common, @string,@accumu_percent_iden)=();
	my($percent_id_so_far)=0;
	for $key(keys %input){
	  my($len) = &smaller_one($#common, $#string) unless $#common < 0;
	  $input{$key}=~s/ //g;
	  @string= split(//, $input{$key});
	  $length_of_one_seq = $#string+1;
	  $length_of_all_seq +=$length_of_one_seq;
	  $seq_no += 1;
	  for ($k=0; $k <= $len;$k++){
		  if($#common == -1){
			 @common = @string;
			 last;
		  }
		  elsif (($string[$k] =~ /^(\W)/)&&($1 ne $previous_non_char)){
			 $non_char_count +=1;
			 $previous_non_char=$1;
		  }
		  elsif ( $string[$k] eq $common[$k] ){
			 $common[$k] = $string[$k];
			 $identical_count +=1;
		  }elsif( $string[$k] ne $common[$k]){
			 $common[$k]='.';
		  }
	  }
	  $num_of_iden_char = &count_num_of_char(\@common);
	  $av_seq_no = $length_of_all_seq/$seq_no;
	  $percent_id_so_far = $num_of_iden_char/$av_seq_no*100;

	  print "\n percent_id so far = $percent_id_so_far \n";
	  push(@accumu_percent_iden,$percent_id_so_far);

	} # end of for (after all sequences have been run).
	$num_of_iden_char = &count_num_of_char(@common);
	$av_seq_no = $length_of_all_seq/$seq_no;
	$percent_id_so_far = $num_of_iden_char/$av_seq_no*100;
	print "\n percent_id so far = $percent_id_so_far \n";
	\@accumu_percent_iden; # final ids array.
}
#________________________________________________________________________
# Title     : smaller_one
# Usage     : $smaller = & smaller_one($var, $var2);
#
# Function  : gets smaller value of the two inputs
# Example   : will return   5   with  &smaller_one(5, 50);
# Warning   : gets only digits!!
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   :
#--------------------------------------------------------------------
sub smaller_one{
	if (($_[0], $_[1]=~/\d+/)||($_[0] > $_[1])){
	 return $_[1];
	}elsif(($_[0], $_[1]=~/\d+/)||($_[0] <= $_[1])){
	 return $_[0];
	}else{
	 print "\n I am sub 'smaller_one', the input were not digits \n";
	}
}

#________________________________________________________________________
# Title     : count_num_of_char
# Usage     : $num_char = &count_num_of_char(@input_array_of_single_char);
# Function  : takes only ARRAY and counts the number of char. Each elem should be
#             a single char.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   :
#--------------------------------------------------------------------
sub count_num_of_char{
	 my(@input)={$_[0]};
	 my($num_of_char)=0;
	 for $elem(@input){  # this is for the percentage of TWO seqs.
		 if ($elem =~ /\w/){
					 $num_of_char +=1;
		 }
	 }
	 $num_of_char;
}
#________________________________________________________________________
# Title     : remov_com_column2  (this is the older and slower version)
# Usage     : %new_string = %{&remov_com_column2(\%input_hash)};
# Function  :
# Example   : seq1  ABCDE------DDD         seq1  ABCDE--DDD
#             seq2  ABCDEE-----DD-  ==>    seq2  ABCDEE-DD-
#             seq3  ---DEE----DDE-         seq3  ---DEEDDE-
#                         ^^^^
#             from above the 4 columns of gap will be removed
#             To remove absurd gaps in multiple sequence alignment
# Warning   :
# Keywords  :
# Options   :
# Returns   : a ref. of a hash.
#
#               <input hash>                   <out hash>
#
# Argument  : accepts reference for a hash.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub remov_com_column2{
	my(%input) = %{$_[0]};
	my(@common)=();
	my($len)=0;
	my(@string)=();
	my(@new_string)=();
	my(@string2)=();
	my(%new_string);

	########## Finds common gaps ###########
	for $key(keys %input){
	  $len  = &smaller_one($#common, $#string) unless $#common < 0;
		  sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
	  @string = split(/|| /, $input{$key});
	  for ($k=0; $k <= $len;$k++){
		  if($#common == -1){
			  @common = (@string);
			  last;
		  }
		  if (($string[$k] eq '.')&&($common[$k] eq '.')){
			  $common[$k]='.';
		  }else{
			  $common[$k]='X';
		  }
	  }
	}

	########## removes gaps ###########
	for $key2 (keys %input){
		@string2 = split(//, $input{$key2});
		for ($i=0; $i <= $#string2; $i++){
		  if ($common[$i] eq $string2[$i]){
			  print;

Bioinf.pl  view on Meta::CPAN

			 }else{                        ### If not option is set, just write
				%TEM = @in;
				$LEN = ${&max_elem_string_array_show_hash( keys %TEM)};
				 if($LEN > $KL){ $KL = $LEN + $GAP +2};
				 printf ("%-${KL}s ", $in[$k]);  $k++; # print $in[$k], "\t";  $k++;
				 printf ("%-${VL}s\n",$in[$k]);        # print $in[$k], "\n";
			 }
		 }
		  #________________________________________________________
		  # Title    : max_elem_string_array_show_hash
		  # Keywords : largest string length of array
		  # Function : gets the largest string length of element of any array of numbers.
		  # Usage    : ($out1, $out2)=@{&max_elem_array(\@array1, \@array2)};
		  #            ($out1)       =${&max_elem_array(\@array1)          };
		  # Argument : numerical arrays
		  # returns  : one or more ref. for scalar numbers.
		  # Version  : 1.1
		  #-------------------------------------------------------
		  sub max_elem_string_array_show_hash{
			 my(@input, $i, $max_elem);
			 @input = @{$_[0]} || @_;
			 for($i=0; $i< @input ; $i++){
					$max_elem = length($input[0]);
					if (length($input[$i]) > $max_elem){
						$max_elem = length($input[$i]);
					}
			 }
			 \$max_elem;
		  }
		  #####################################insert_gaps_in_seq_hash
	 }
	}
}

#_______________________________________________________________
# Title     :  open_predator_files
# Usage     :
# Function  : gets sec. str. prediction of predator and puts in hash
#             If 's' option is given, it also gives sequence hash ref
#             as the second output ref. This can handle the 2 types
#             of output format of predator. So, the output can will
#             be different according to inputs.
# Example   :
#  There are 2 types of output.  The short output:>
#
#  > MOZ_HUMAN_part
#                .         .         .         .         .
#  1    LDHKTLYYDVEPFLFYVLTQNDVKGCHLVGYFSKEKHCQQKYNVSCIMIL   50
#       ___EEEEEE__HHHHHHH_______EEE____________EEEEEEEEE_
#
# ((-l option for long output )
#  NAME MOZ_HUMAN_part
#  HEADER  |- Residue -|  Pred  Rel      NAli   Asn
#  PRED    1    MET    M  c     0.000    0      ?
#  PRED    2    ALA    A  c     0.000    0      ?
#
# Warning   :
# Keywords  : open_prd_files, open_pred_files, predator, open_prdl_files
#             open_pre_files, secondary structure prediction file
# Options   : 's' for sequence output as well (\%sec_str, \%seq)
#             'p' for percentage of the sec. str.
#             'a' for accumulated percentage. This will
#                  set 'p' automatically
#             'n' for NO name when outputing Percentage of chars with
#                 HASH input to get_occurances_of_char sub.
#      $reverse_residue_order=r by r
# Returns   :
# Argument  :
# Category  :
# Version   : 1.8
#-----------------------------------------------------------
sub open_predator_files{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my( @out_ref, $seq_out, %sec_str, %seq, $percent_out, $NO_name_out,
        $short_form_out_detected, $long_form_out_detected, $accumulate,
        $reverse_residue_order, %rev_sec_str);
    if($char_opt=~/s/i){ $seq_out=1 }
    if($char_opt=~/a/i){ $accumulate=1  }
    if($char_opt=~/p/i){ $percent_out=1 }
    if($char_opt=~/n/i){ $NO_name_out='n' }
    if($char_opt=~/r/){  $reverse_residue_order='r' }

    for($i=0; $i< @file; $i++){
	 my (%sec_str, %seq) if($accumulate !=1);

         open(PREDATOR_FILE, "$file[$i]");
         print "\n# (INFO) open_predator_files: opening $file[$i]";
	 while(<PREDATOR_FILE>){
              #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
              # Simple sec str input form
              #________________________________________________
              if(/\> *(\S+)/){
                 $name=$1; $short_form_out_detected=1; print "\n# (INFO) \"$name\" was found in $file[$i]";
              }elsif($short_form_out_detected and /^[\t ]+([CHE_]+) *$/i){
                 $sec_str{$name}.=$1;
              }elsif($short_form_out_detected and  $seq_out==1  and /[ \t]*\d+[ \t]*(\w+)[\t ]*\d+/){
                 $seq{$name}.=$1;
              }elsif($short_form_out_detected and /^[\t ]+\d+ +\S/){
                 next;
              #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
              # Sec str form HASH input. Complex form
              #________________________________________________
              }elsif(/^ *NAME +(\S+)/){
                 $name=$1;   $long_form_out_detected=1;
              }elsif($long_form_out_detected and /^ *\S+ +(\d+) +(\S+) +(\S+) +(\S) +(\S+) +\S+ +\S+/){
                 $position=$1;
                 $residue_3_letter=$2;
                 $residue_1_letter=$3;
                 $sec_str_predicted=$4;
                 $reliability=$5;
                 $sec_str{$position}=[$residue_1_letter, $sec_str_predicted, $reliability];
              }
	 }
         close (PREDATOR_FILE);
	 print "\n \%sec_str is: ", %sec_str, "\n" if ($debug == 1);
	 if($seq_out==1){ push(@out_ref, \%sec_str, \%seq);
	 }elsif($percent_out==1 ){
	      push(@out_ref, [%{&get_occurances_of_char(\%sec_str, $NO_name_out, 'p')}] );
	 }elsif($percent_out !=1){ push(@out_ref, \%sec_str) }

         #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
         # If -r option is set (for long form, this does not affect
         #____________________________________________________________
         if($short_form_out_detected and $reverse_residue_order){
              @keys=keys %sec_str;
              for($r=0; $r<@keys; $r++){
                  $rev_sec_str{$keys[$r]}=reverse($sec_str{$keys[$r]});
              }
              %sec_str=%rev_sec_str;
         }
    }
    print "\n# (INFO) \$long_form_out_detected is returned from open_predator_files\n" if $long_form_out_detected;
    print "\n# (INFO) \$short_form_out_detected is returned from open_predator_files\n" if $short_form_out_detected;
    if(@out_ref==1){
       return($out_ref[0]);
    }elsif(@out_ref>1){
       return(@out_ref);
    }
}



#_______________________________________________________________________________
# Title     : open_phd_files
# Usage     : &open_phd_files(\$file_name, $options,,,,,);
# Function  : open phd files and put sequences in a hash(s) (run open_phd_files.pl to
#             get some ideas on how this works. type  'open_phd_files.pl xxx.phdo s',
#             it will produce 5 different hashes of secondary structure pred.
# Example   :
# Warning   : All the spaces are converted to '_'
# Keywords  :
# Options   : $secondary, $access, $PHD_sec, $Rel_sec, $prH_sec, $prE_sec, $prL_sec,
#                  $prL_sec, $SUB_sec, $P_3_acc, $PHD_acc, $Rel_acc, $SUB_acc);
#   $attach_class_info_in_seq_name=c by c ## this makes seq_name   seq_name_PHD_s
#   $simple_seq_with_name_hash=s by s
#
# Returns   : one or more hashes(ref.) secondary structure prediction of PHD server
#             --- The PHD secondary server output which are read by open_phd_files -----
#             1 =>       PHD sec |         HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH     HHHHHHH|
#             2 =>       Rel sec |987544342178899999999987678999998478999999999995679771688999|
#             3 =>       prH sec |001222323478899999999987778999998678999999999986110115788999
#             4 =>       prE sec |000010000101000000000000010000000000000000000000000000010000
#             5 =>       prL sec |987666565410000000000001110000001211000000000002789774100000
#             6 =>       SUB sec |LLLL
#             7 =>       P_3 acc |eeeeeeeeee bbeeebbbebbbbebeeee b bbebbebb eebeebe eee eebbeb|
#             8 =>       PHD acc |988787787630066600060000606667515007007005760671847885760160
#             9 =>       Rel acc |979685546222352421667053233245604127749164753790316552446141
#             0 =>       SUB acc |eeeeeeeee
#             types of PHD output, like 1 for 'PHD sec', 2 for 'Rel sec' etc.
# Argument  : one or more file names and options. Files should be PHD server's result.
# Version   : 1.6
#-------------------------------------------------------------------------------
sub open_phd_files{
  my(@names, $i, $j,$n, $s, @in, @out_hash_ref_list, $base, @option);
  my($secondary, $access, %PHD_sec, %Rel_sec, %prH_sec, %prE_sec, %prL_sec, %prL_sec,
     %SUB_sec, %P_3_acc, %PHD_acc, %Rel_acc, %SUB_acc, $simple_seq_with_name_hash,
     $PHD_sec_on, $Rel_sec_on, $prH_sec_on, $prE_sec_on, $prL_sec_on, $SUB_sec_on,
     $P_3_acc_on, $PHD_acc_on, $Rel_acc_on, $SUB_acc_on, $residues,
     $PHD_sec, $Rel_sec, $prH_sec, $prE_sec, $prL_sec, $SUB_sec, $P_3_acc,

Bioinf.pl  view on Meta::CPAN

	}else{
	  if(@out_hash_ref_list == 1){ return($out_hash_ref_list[0]); }
	  elsif(@out_hash_ref_list > 1){ return(@out_hash_ref_list);  }
	}
}

#________________________________________________________________________
# Title     : open_dna_files  (genbank file opener)
# Usage     : ($out, $out2) = @{&open_dna_files(\$inputfile1, \$inputfile2)};
#             : (@out)        = @{&open_dna_files(\$inputfile1, \$inputfile2)};
#             ---------- Example of dna file --- dna files are genbank file format
#
#
#             1 ggatcttgct gaatacatgg tggcacaatt gaaattagat ccgcgaattt
#               tcatcaaaac
#             61 agcgggatta tggtcaacaa atccgtaaaa atgaaaagcc tgtcttgcga
#               caggcttttt
#             121 tatttgaatg taatcctcac tggtaaacgt ttaacgccaa agacaaaggg
#               actagggatc
#             181 gcttcaagct tttcatcatg agcagctttt tcgatacaag ctgacattga
#
# Function  : open dna files and put sequences in a hash(s)
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   : (@out_array_of_refs)
# Argument  : (\$inputfile1, \$inputfile2, .... )};
# Category  :
# Version   :
#--------------------------------------------------------------------
sub open_dna_files{  my(@in)=@_; my(@names, $i,$n, $s, %hash,@out_hash_ref_list);
	for($i=0; $i<=$#in; $i++){
	 if(ref($in[$i])){ unless (-e ${$_[$i]}){ next; }
		 open(FILE_1,"${$_[$i]}");  undef(%hash);
		 while(<FILE_1>){      # file1 needs to be xxxx.msf for the moment, automatic later
			 if(/Name\:\s+(\S+)\s+/){ $n=$1; $n=~s/\,//g; }
			 if((/\s+\d+\s([acgt ]+)$/)||(/\s\s\s\s+([acgt ]+)$/)){
				$s=$1; $s=~s/ //g; $s=~tr/a-z/A-Z/; $hash{$n}.=$s; }     }
		 push(@out_hash_ref_list, \%hash); } }
	if(@out_hash_ref_list  == 1 ){ return(\%hash); }
	elsif(@out_hash_ref_list > 1){ return(@out_hash_ref_list); } # <-- contains (\%out_seq0, \%out_seq1, \%out_seq2, .... )
}
#________________________________________________________________________
# Title     : open_tem_files
# Usage     : ($r1, $r2, $r3, $r4, $r5)=&open_tem_files(\$infile1, \$inputfile2..)};
#             ---------- Example of xxxx
#             >P1;1cdg
#             sequence
#             APDTSVSNKQNFSTDVIYQIFTDRFSDGNPANNPTGAAFDGTCTN-LRLYCGGDWQGIINKINDGYLTGMGVTAI
#             >P1;1cdg
#             secondary structure and phi angle
#             CCCCCCCCCCCCCCCCEEECCHHHHCCCCHHHCCCPHHCCCCPCC-CCCCCPCCHHHHHHHHHCPHHHHHPCCEE
#             >P1;1cdg
#             solvent accessibility
#             TTTTTTTTTTTFFFFFFFFFFFFFFTTTTTTTTTTTTTTTTTFTT-TTTTFFFFFTFFTTTFTTTFFTTFTFTFF
#             >P1;1cdg
#             DSSP
#             CCCCCCCCCCCCCCCCEEECCHHHHCCCCGGGCCCGGGCCCCCCC-CCCCCCCCHHHHHHHHHCCHHHHHCCCEE
#             >P1;1cdg
#             percentage accessibility
#             67523272360000000000000002213792129b722248085-14110000030015105660028040200
#             2ltn           ----TETTSFLITKFSPDQQNLIFQGDGYTT-KEKLTLTK------AVKNTVGRALYSSP
#             1loe           ----TETTSFSITKFGPDQQNLIFQGDGYTT-KERLTLTK------AVRNTVGRALYSSP
#
#             2ltn           ----CEEEEEEECCCCCCCCCEEEEPCCEEP-PPCEEEEC------CCCPCEEEEEECCC
#             1loe           ----CEEEEEEECCCCCCCCCEEEEPCCEEE-PPEEEEEC------CCCPCEEEEEECCC
#
#             2ltn           ----TTTTTTTTTTFTTTTTTFTTTTTFTFT-TTTFTFFT------TTTTTTFFFFTTTT
#             1loe           ----TTTTTTTTTTFTTTTTTFTTTTTFTFT-TTTFFFFT------TTTTTTFFFFTTTT
#
#             2ltn           ----CEEEEEEECCCCCCCCCEEEEECCEEC-CCCEEEEC------CCCCCEEEEEECCC
#             1loe           ----CEEEEEEECCCCCCCCCEEEEECCEEE-CCEEEEEC------CCCCCEEEEEECCC
#
#             2ltn           ----543251b16504681c50422650502-75201006------35681200001453
#             1loe           ----6532e1508a07981b50422750404-8a200006------36672200001453
# Function  : opens JPO's xxxx.tem file, stores in 5 hashes. (usually one tem file)
# Example   :
# Warning   :
# Keywords  :
# Options   : -n, n, or N for removing any gaps in the sequences.
#             -s, s, or S for getting only the sequences.
# Returns   : ($r1, $r2, $r3, $r4, $r5) <= these are references for hashes.
# Argument  : (\$inputfile1, \$inputfile2, .... )};
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub open_tem_files{
	my($a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, $q, $r,
	  $s, $t, $u, $v, $w, $x, $y, $z, $pwd, $file, $dir, $output, $in_dir,
	  %hash, @keys, @array, @hash, $option_string, $string, @in, $line,
	  $name, %out, $gap_chr, @str1, @str2, $num_opt, @file, @dir,
	  $char_opt, $char_opt_given, $num_opt_given,
	  @char_options, @file, $original_dir, @read_files, %array_msf, %array_jp,
	  $jp_file, $error_rate, $id_compos, @dir, @names, $name, $name_found,
	  @outref, %sequence, %secondary,%solvent_access, %DSSP, %percent_accessibility,
	  $name_found,$type_seq, $type_secon, $type_sol, $type_DSSP, $type_acc
	);
	##################################################
	##### Start of  general argument handling   ######
	##################################################
	for($k=0; $k< @_ ;$k++){
	  if( !ref($_[$k]) ){
			  if($_[$k]=~ /^[\-]*(\w)$/){
				  $char_opt  .= "\,$1";
				  $char_opt_given =1;
			  }elsif($_[$k]=~ /^\-(\w\w+)$/){       ## When multiple option is given,
				  my(@char_options) = split(/|\,/, $1); ## '-' should be used. eg. '-HEGI'
				  $char_opt .= join("\,", @char_options);  ## as an option string.
				  $char_opt_given = 1;
			  }elsif($_[$k]=~ /^([\-]*\d)$/){
				  $num_opt   .= "\,$1";  ## delimiter is ','
				  $num_opt_given = 1;
			  }elsif(-f $_[$k]){     ## When file is given,
				  push(@file, \$_[$k] );
			  }elsif(-d $_[$k]){     ## When dir is given,
				  push(@dir, \$_[$k] );    }
	  }elsif( ref($_[$k]) ){
			if(ref($_[$k]) eq "SCALAR")
				{if(${$_[$k]} =~ /^[\-]*(\w)$/){  ## check if it has '-' before option char
					$char_opt  .= "\,$1";  ## delimiter for option char is ','
					$char_opt_given = 1;
				}elsif(${$_[$k]}=~ /^\-(\w\w+)$/){       ## When multiple option is given,
					my(@char_options) = split(/|\,/, $1); ## '-' should be used. for eg. '-HEGI'
					$char_opt  .= join("\,", @char_options);  ## as an option string.
					$char_opt_given =1;
				}elsif(${$_[$k]}=~ /^([\-]*\d)$/){
					$num_opt   .= "\,$1";  ## delimiter is ','
					$num_opt_given = 1;
				}elsif(-f ${$_[$k]}){     ## When file is given,
					push(@file, $_[$k] );
				}elsif(-d ${$_[$k]}){     ## When dir is given,
					push(@dir, $_[$k] );  }
			}elsif(ref($_[$k]) eq "ARRAY"){  ## When ARRAY is given,
					push(@array, $_[$k]);
			}elsif(ref($_[$k]) eq "HASH"){   ## When HASH is given,
					push(@hash, $_[$k]);
			}
	  }
	  ###################################################
	  ## The output of this option handling section is
	  ## one or combination of these:
	  ## $char_opt_given   ##<<-- Simple boolean '1' or none
	  ## $num_opt_given    ##<<-- Simple boolean '1' or none
	  ## $char_opt, as ('A,B,C')
	  ## $num_opt,  as ('1,-2,3')
	  ## @file          as (\file1, \file2,...)
	  ## @dir           as (\dir1, \dir2,...)
	  ## @array         as (\array1, \array2,,,)
	  ## @hash          as (\hash1, \hash2,,,,)
	  ###################################################
	}
	################################################
	##### END of  general argument handling   ######
	################################################

	for($i=0; $i < @file; $i++){
	 if(ref($file[$i])){ unless(-T ${$file[$i]}){ next; }
		 open(FILE_1, "${$file[$i]}");
		 while(<FILE_1>){
			 if(/^\>P1\;([\w\-]+)/){ $name=$1; #=================== SEQUENCE
				($type_seq, $type_secon, $type_sol, $type_DSSP, $type_acc)=();
			 }elsif(/^sequence/){  $type_seq = 1;
			 }elsif(($type_seq ==1)&&(/^([\w\-]+)[\*]*$/)){
				my($line) = $1;
				if( $char_opt =~ /n/i){  ## to remove the gaps etc.
					$line=~s/\W//g;
					$sequence{$name}.=$line;
				}else{
					$sequence{$name}.=$line;
				} #from below ============== SECONDARY
			 }elsif(/^secondary structure and phi angle/){  $type_secon = 1;
			 }elsif(($type_secon ==1)&&(/^([\w\-]+)[\*]*$/)){
				$secondary{$name}.=$1;     #from below============= SOLVENT ACCESSIBILITY
			 }elsif(/^solvent accessibility/){  $type_sol = 1;
			 }elsif(($type_sol ==1)&&(/^([\w\-]+)[\*]*$/)){
				$solvent_access{$name}.=$1;     #from below========= DSSP
			 }elsif(/^DSSP/){  $type_DSSP = 1;
			 }elsif(($type_DSSP ==1)&&(/^([\w\-]+)[\*]*$/)){
				$DSSP{$name}.=$1;     #from below=================== PERCENTAGE ACCESSIBILITY
			 }elsif(/^percentage accessibility/){  $type_acc = 1;
			 }elsif(($type_acc ==1)&&(/^([\w\-]+)[\*]*$/)){
				$percent_accessibility{$name}.=$1;  } }
		  push(@outref,\%sequence,\%secondary,\%solvent_access,\%DSSP,\%percent_accessibility);
	  }  }
	if( ($char_opt =~ /s/i) || ( @outref  == 1 ) ){
	  return(\%sequence); }
	elsif( @outref > 1){ return(@outref); } # <-- contains (\%sequence,\%secondary,....)
}

#________________________________________________________________________
# Title     : open_hlx_files
# Usage     :
# Function  :
#             Example of hlx file (For Bo Nielson)
#             Residue Frame Score Probability
#             1 M   a  1.00563E+00 2.05479E-03
#             2 T   b  1.01814E+00 2.52053E-03
#             3 R   c  1.01814E+00 2.52053E-03
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   : list of ref. for hash(es)
# Argument  :
# Category  :
# Version   :
#--------------------------------------------------------------------
sub open_hlx_files{  my(@in)=@_; my(@names, $i,$n, $s, %hash,@out_hash_ref_list);
	for($i=0; $i< @in; $i++){
	 if(ref($in[$i])){ unless(-e ${$in[$i]}){ next; }}
	 open(FILE_1, "${$in[$i]}");
	 while(<FILE_1>){
		if(/^[\s]+([\d]+)\s+(\w+)\s+(\w+)\s+\S+\s+(\S+)$/){
		  $hash_residue{$1}=($2); # residue num is key, residue is value.
		  $hash_frame{$1}=($3);   # residue num is key, frame is  value.
		  $hash_prob{$1}=($4);    # residue num is key, probability is  value.
		}
	 }
	 push(@out_hash_ref_list, \%hash_residue, \%hash_frame, \%hash_prob);
	}
	if($#out_hash_ref_list  == 0 ){ return(\%hash_residue); }
	elsif($#out_hash_ref_list > 0){ return(@out_hash_ref_list); }
}


#________________________________________________________________________
# Title     : open_jp_files  (bug free!!)
# Usage     : %out_hash=%{&open_jp_files(\$file_name)};
# Function  : reads jp files and stores results in a hash.
# Example   :
# Warning   : All the spaces  '-' !!!
# Keywords  :
# Options   :
# Returns   : a reference of a hash for names and  their sequences.
# Argument  :
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub open_jp_files{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};

Bioinf.pl  view on Meta::CPAN

								if ($i % 400 == 0){
													 $julian += 365;
								} elsif ($i % 4 == 0){
													 $julian += 366;
								} else {
													 $julian += 365;
								}
		  }
		  for ($i = 1; $i < $mo; ++$i){
								if ($i == 1 || $i == 3 || $i == 5 ||
													 $i == 7 || $i == 8 || $i == 10 ||
													 $i == 12) {
													 $julian += 31;
								} elsif ( $i == 4 || $i == 6 ||
													 $i == 9 || $i == 11) {
													 $julian += 30;
								} elsif ($leapflag eq "TRUE"){
													 $julian += 29;
								} else {
													 $julian += 28;
								}
		  }
		  $julian += $dy;
}

#________________________________________________________________________
# Title     : opendir_and_go
# Usage     : &opendir_and_go($input_dir); #$inputdir='/nfs/ind4/ccpe1/people/A Biomatic /jpo/align';
# Function  : open dir and process all files if you wish, and then go in any sub
#             dir of it. Using recursion. created by A Biomatic
#             if any file is linked, it skips that file.
# Example   : as in my 'indexing.pl' for perl file indexer.
# Warning   : Seems to work fine.
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub opendir_and_go{
    my($original_dir)=$_[0];
    my(@read_files)=&read_any_dir($original_dir);
    foreach $file(@read_files){
        my($dir)=$split_path[$#split_path];
        if (-l $realfile1){
                               next;
        }elsif (-d $realfile1){
                 &opendir_and_go($realfile1);
        }elsif (-f $realfile1){ #<<------ This is where things match
            chdir($original_dir);
            if($realfile1 =~/(\d+\-$no\.msf)$/){
                @dir=split(/\//, $realfile1);
                $dir=$dir[($#dir-1)];  # where am I ?
                # $jp_file = $original_dir.'/'.$dir.'.jp';
                # %array_msf =&open_msf_files($realfile1);
                # %array_jp  =&open_jp_files ($jp_file);
                # $array_ref_msf = \%array_msf;
                # $array_ref_jp  = \%array_jp;
                # $error_rate =&get_posi_shift_hash($array_ref_msf, $array_ref_jp);
                # $id_compos  =&amino_acid_compos_id_percent($array_ref_jp);
                # push(@rates_accumu,$error_rate);
                # push(@compos_id,$id_compos);
            }
        }
        else
        {
                               next;
        }
    }
}
#________________________________________________________________________
# Title     : occurances
# Usage     : sort occurances (@any_array_with_repeating_element);
# Function  : this is for sort, to sort things according to the higher num. of occu.
# Example   :
# Warning   : This is from 21 DAYS book, page 373.
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub occurances{
	 $occurance_hash{$a} <=> $occurance_hash{$b};
}

#________________________________________________________________________
# Title     : extract_ori_seq
#             nt5
# Usage     : &extract_ori_seq($input_file, $output_file, $out_seq_no, *array2);
# Function  : extract seqs. which are from struc. alignment only. to be analysed.
#             after mul. alignment with added seq. you can extract original str.
#             sequ. by using this. The output always has ...msff  ext.
#             *array_ali is the JPO's or true alignment hash.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub extract_ori_seq{
	 local($input_file, $output_file, $out_seq_no, *array1, *array2) = @_; # something like $dir.$mul_factor.msf
		  local(%array_ext) = &open_msf_files("$input_file");
		  %array_ext = &hash_substract(*array_ext, *array2); # getting rid of added seq.
		  %array_ext = &hash_common(*array_ext, *array1);
		  open(OUTPUT, ">$output_file"); # this is different from $dir.msff
	 printf OUTPUT "PileUp\n\n\n";
	 printf OUTPUT "  MSF:%5d  Type: P                     Check:    0  ..\n",$ls;
	 print  OUTPUT "\n\n";
		  my(@keys3) = ( keys %array_ext );
		  $max = &max_str_value_hash(%array_ext);
	 $ls = $max;
	 $seq3 = ($#keys3+1);
	 for($j=0; $j < $seq3 ;$j++){
					 $name3=$keys3[$j];
					 printf OUTPUT " Name: %10s     Len: %5d  Check:    0  Weight:  1.00\n",$name3,$ls;

Bioinf.pl  view on Meta::CPAN

						  while($is < $ie){
										  $iee=$is+10;
										  $iee=$ls if $iee > $ls;
										  $out.=' ';
										  while($is < $iee){
													 $char=substr($string,$is,1);
													 if($char ne ' '){
													 $char =~ tr/a-z/A-Z/;
													 $out.=$char;
													 $char='.' if $char eq '-';
													 }
													 $is++;
										  }
						  }
				print OUTPUT "$out\n";
				}
				print OUTPUT "\n";	# open(OUTPUT, ">$dir.msff");
	 }
}	#  end of extract_ori_seq
#________________________________________________________________________
# Title     : get_pair_homol_array
# Usage     : $hom_out_count = ${&get_pair_homol_array(\@any_array_of_2_elem)};= @ar=(ABCDE..., CDEGA..)
# Function  : get pair wise seq. !! Number of pair identical residues.
# Example   :
# Warning   : reliable, but input seq. strings shouldn't contain spaces.
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_pair_homol_array{
	 my(@input)=@{$_[0]};
	 $input[0] =~ tr/a-z/A-Z/; # capitalizing.
	 $input[1] =~ tr/a-z/A-Z/; # capitalizing.
	 my(@string1)= split(//,$input[0]);
	 my(@string2)= split(//,$input[1]);
	 if (($#string1 == -1) || ($#string2 == -1)){
		  print "\n One of the string is empty O.K. ? \n";
	 }
	 my($larger)= &max($#string1, $#string2);
	 my($id_counter, $gap_counter, $non_equal_counter, $sum)=0;
	 for ($i = 0; $i<=$larger; $i++){
		  if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
				$gap_counter+=1;
		  }elsif ($string1[$i] eq $string2[$i]){
				$id_counter +=1;
		  }else{
				$non_equal_counter += 1;
		  }
	 }
	 $sum = ($id_counter + $gap_counter + $non_equal_counter);
	 if ($sum != ($larger+1)){
		  print "\n There is something wrong in getting homology in get_pair_homol \n";
		  &caller_info;
	 }
	 \$id_counter; # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title     : get_percent_homol_arr
# Usage     : $homology_out = ${&get_pair_homol(\@any_array_of_2_elem)};= @ar=(ABCDE..., CDEGA..)
# Function  : get pair wise seq. identity of any two strings, outputs a scalar (%)
# Example   :
# Warning   : reliable, but input seq. strings shouldn't contain spaces.
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_percent_homol_arr{
	 my(@input)=@{$_[0]};
	 $input[0] =~ tr/a-z/A-Z/; # capitalizing.
	 $input[1] =~ tr/a-z/A-Z/; # capitalizing.
	 my(@string1)= split(//,$input[0]);
	 my(@string2)= split(//,$input[1]);
	 if (($#string1 == -1) || ($#string2 == -1)){
		  print "\n One of the string is empty O.K. ? \n";
	 }
	 my($larger)= &max($#string1, $#string2);
	 my($id_counter, $gap_counter, $non_equal_counter, $sum,$percent_homol)=0;
	 for ($i = 0; $i<=$larger; $i++){
		  if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
				$gap_counter+=1;
		  }elsif ($string1[$i] eq $string2[$i]){
				$id_counter +=1;
		  }else{
				$non_equal_counter += 1;
		  }
	 }
	 $sum = ($id_counter + $gap_counter + $non_equal_counter);
	 if ($sum != ($larger+1)){
		  print "\n There is something wrong in getting homology in get_pair_homol \n";
		  &caller_info;
	 }else{
		  $percent_homol=($id_counter/$sum)*100;
	 }
	 return(\$percent_homol); # $id_counter is the homology counter;
}

#________________________________________________________________________
# Title     : get_pair_homol_hash
# Usage     : $homology_out = & get_pair_homol (%any_hash); , eg) %hash = (name1, ABCDE..., name2, CDEGA..)
# Function  : get pair wise seq. identity as a scalar count
# Example   :
# Warning   : reliable, but input seq. strings shouldn't contain spaces.
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_pair_homol_hash{
	 my(%input)=@_;
	 &hash_chk(\%input);
	 my(@keys_input)= keys (%input);
	 my(@values_input) = values (%input);
	 $values_input[0] =~ tr/a-z/A-Z/; # capitalizing.
	 $values_input[1] =~ tr/a-z/A-Z/; # capitalizing.
	 my(@string1)= split(//,$values_input[0]);
	 my(@string2)= split(//,$values_input[1]);
	 if (($#string1 == -1) || ($#string2 == -1)){
					 print "\n One of the string is empty O.K. ? \n";
	 }
	 my($larger)= &max($#string1, $#string2);
	 my($id_counter, $gap_counter, $non_equal_counter)=0;
	 for ($i = 0; $i<=$larger; $i++){
		  if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
				$gap_counter+=1;
		  }elsif ($string1[$i] eq $string2[$i]){
				$id_counter +=1;
		  }else{
				$non_equal_counter += 1;
		  }
	 }
	 my($sum) = ($id_counter + $gap_counter + $non_equal_counter);
	 if ($sum != ($larger+1)){
		 print "\n There is something wrong in getting homology in get_pair_homol \n";
		 &caller_info;
			}
	 return ($id_counter); # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title     : get_percent_homo_hash
# Usage     : $homology_out = &get_pair_homol_hash(%any_hash); , eg) %hash = (name1, ABCDE..., name2, CDEGA..)
# Function  : get pair wise seq. identity(%) of any two strings put in as a hash
# Example   :
# Warning   : reliable, but input seq. strings shouldn't contain spaces.
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_percent_homo_hash{
	 my(%input)=@_;
	 &hash_chk(\%input);
	 my(@keys_input)= keys (%input);
	 my(@values_input) = values (%input);
	 $values_input[0] =~ tr/a-z/A-Z/; # capitalizing.
	 $values_input[1] =~ tr/a-z/A-Z/; # capitalizing.
	 my(@string1)= split(//,$values_input[0]);
	 my(@string2)= split(//,$values_input[1]);
	 if (($#string1 == -1) || ($#string2 == -1)){
					 print "\n One of the string is empty O.K. ? \n";
	 }
	 my($larger)= &max($#string1, $#string2);
	 my($id_counter, $gap_counter, $non_equal_counter,$percent_homol,)=0;
	 for ($i = 0; $i<=$larger; $i++){
		  if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
				$gap_counter+=1;
		  }elsif ($string1[$i] eq $string2[$i]){
				$id_counter +=1;
		  }else{
				$non_equal_counter += 1;
		  }
	 }
	 my($sum) = ($id_counter + $gap_counter + $non_equal_counter);
	 if ($sum != ($larger+1)){
		  print "\n There is something wrong in getting homology in get_pair_homol \n";
		  &caller_info;
	 }else{
		  $percent_homol=($id_counter/$sum)*100;
	 }
	 return ($percent_homol);
}


#________________________________________________________________________
# Title     : file_size
# Usage     : $outputfilesize = &file_size($input_file_name);
# Function  : returns the size of any single testing file
# Example   :
# Warning   : Q is for quality of this sub. This can't be wrong.
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub file_size { my($infile)=$_[0];
	if ( $size=(-s "$infile")){ return $size; }
}

#________________________________________________________________________
# Title     : seq_comp_percent2
# Usage     : @outarray = &seq_comp_percent2(@any_input_string_array);
# Function  : get string seq COMPOSITION identities(a to z). gets array
#             of strings and outs array of % numbers
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub seq_comp_percent2{   	# simple and basic seq. id. eg. ABC on ABCABC is 50 % identical.
	my(@input)=@_;
	my(@array_of_ids2, $id2, @char1, @char2);
	&array_chk(sort @input);
	my($longest_str_size)  = &get_long_str_size (@input), "\n";
	my($shortest_str_size) = &get_short_str_size(@input), "\n";
	print "longest_str_size",$longest_str_size;
	print "shortest_str_size",$shortest_str_size;
	if (($longest_str_size/$shortest_str_size) > 4){
		  print "\n  The shortest string is less than 1/4 of the longest\n";
		  print "  This is quite meaningless, but will go on\n";
	}
	for ($i = 0; $i <= $#input ; $i++){
		  if ($input[$i]=~/(\W)/){
			  &remove_non_char($input[$i]);
		  }
		  @char1 = split(/|\s+|\.+|\-+/, $input[$i]);   # splitting into char.
		  foreach $char (@char1){
			  if ($char eq ' '){
					 next;
			  }
			  $charcount1{$char} +=1; # making array of ['A' => 6, 'B'=>2...]
		  }
		  for($j = $i+1 ; $j <= $#input; $j++){
			  if ($input[$j]=~/(\W)/){
				  &remove_non_char($input[$j]);
			  }
			  @char2 = split(/|\s+|\.+|\-+/, $input[$j]); # splitting into char.
			  for $char (@char2){
				  $charcount2{$char} +=1; 	 # making ary of ['A' => 6, ..]
			  }
			  $id2 = &get_id_among_2_2(*charcount1, *charcount2); # gets % id.
			  push (@array_of_ids2, $id2);
			  %charcount2=();
		  }
		  %charcount1=();
	}
	@array_of_ids2;
}

######################################################################################
###########  file and dir handling stuff #################
###############################################################################

#________________________________________________________________________
# Title     : get_full_file_name
# Usage     : $any_path = ${&get_full_dir_path($any_directory)}; or &dir_path('.') for pwd.
# Function  : returns full directory path (= pwd ), eg.  /nfs/ind4/ccpe1/people/A Biomatic
# Example   :
# Warning   :
# Keywords  : get_long_path_name, get_complete_path_name
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_full_file_name{
	my($pwd)=`pwd`;
	chomp($pwd);
	\$pwd;

Bioinf.pl  view on Meta::CPAN

# Title     : hash_output_chk
# Usage     : &hash_output_chk(\%outing_hash);
# Function  : checks hash output of any subroutine.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub hash_output_chk{
	for($i=0; $i<= $#_; $i++){
	 my(%tem)=%{$_[$i]};  my(@keys)=keys %tem;
	 for ($j =0; $j<@keys; $j++){
		unless(($keys[$j]=~/[\s\S]+/)&&($tem{$keys[$j]}=~/[\s\S]+/)){
		  print "\n Err. at Hash_output_chk at $0 \n", chr(7); exit;
		}
	 }
	}
}

#________________________________________________________________________
# Title     : n
# Usage     : &n;
# Function  : puts one single new line
# Example   :
# Warning   :
# Keywords  : put_new_line, new_line
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   :
#--------------------------------------------------------------------
sub n{
    print "\n";
}


#________________________________________________________________________
# Title     : cls
# Usage     : &cls;
# Function  : clears screen
# Example   :
# Warning   :
# Keywords  : clear_screen
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub cls{   my($cls) = `clear`;
    print $cls;
}


#________________________________________________________________________
# Title     : seq_comp_percent1
# Usage     : @outarray = &seq_comp_percent1(@any_input_string_array);
# Function  : get string seq identities(a to z). gets array of strings and outs array of % numbers
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   : one ref. of an array
# Argument  : one ref. of an array
# Category  :
# Version   :
#--------------------------------------------------------------------
sub seq_comp_percent1{ 		# this is affected by seq. length
		  my(@input)=@{$_[0]};
		  my(@array_of_ids1, $id1, @char1, @char2);
		  &array_chk(\@input);
		  @input = sort (@input);
		  $longest_str_size  = &get_long_str_size (@input), "\n";
		  $shortest_str_size = &get_short_str_size(@input), "\n";
		  if (($longest_str_size/$shortest_str_size) > 4){
					 print "\n The shortest string is less than 1/4 of the longest\n";
					 print " This is quite meaningless, but will go on\n";
		  }
		  for ($i = 0; $i <= $#input ; $i++){
					 if ($input[$i]=~/(\W)/){
								print "\n Warn: seq($input[$i] contains non char\n";
								&remove_non_char($input[$i]);
					 }
					 @char1 = split(/|\s+|\.+|\-+/, $input[$i]);  # splitting into char.
					 foreach $char (@char1){
								$charcount1{$char} +=1; # making array of ['A' => 6, 'B'=>2...]
					 }
					 for($j = $i+1 ; $j <= $#input; $j++){
								if ($input[$j]=~/(\W)/){
										  print "\n Warn: seq($input[$i] contains non char\n";
										  &remove_non_char($input[$j]);
								}
								@char2 = split(/|\s+|\.+|\-+/, $input[$j]);  # splitting into
								foreach $char (@char2){
										  $charcount2{$char} +=1; # making array of ['A' => 6, 'B'=>2...]
								}
								$id1 = &get_id_among_2_1(*charcount1, *charcount2); # gets % id.
								# print %charcount1,"\n";
								push (@array_of_ids1, $id1);
								%charcount2=();
					 }
					 %charcount1=();
		  }
		  \@array_of_ids1;
}


#________________________________________________________________________
# Title     : get_id_among_2_1
# Usage     : $id = &get_id_among_2(*charcount1, *charcount2) <- hashes
# Function  : gets the % id of any two sequences, returns in  100.0% format.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   :
#--------------------------------------------------------------------
sub get_id_among_2_1{ 	# 66.67 % if ABC with ABCABC  (due to diff. seq. length.)
	 local(*hash1, *hash2)= @_;
	 my($identity, $no_of_same, $sum_of_same, $av);
	 my(@num_char1)=values %hash1;
	 my(@num_char2)=values %hash2;
	 for $key1 (sort keys %hash1){
					 for $key2 (sort keys %hash2){
								if ($key1 eq $key2){
										  $no_of_same = &min($hash1{$key1},$hash2{$key2});
										  $sum_of_same += $no_of_same;
										  last;
								}
					 }
	 }
	 $identity = $sum_of_same*2/(&sum_array(@num_char1,@num_char2))*100;
	 # print "percent iden = ", $identity, "\n";
}
#________________________________________________________________________
# Title     : get_id_among_2_2
# Usage     : $id = &get_id_among_2(*charcount1, *charcount2) <- hashes
# Function  : gets the % id of any two sequences, returns in  100.0% format.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_id_among_2_2{       #  eg) 50% if ABC with AABBCC or ABCABC
	 local(*hash1, *hash2)= @_;
	 my($identity, $no_of_same, $sum_of_same, $av);
	 #print %hash1,"\n";
	 #print %hash2,"\n";
	 my(@num_char1)=values %hash1;
	 my(@num_char2)=values %hash2;
	 for $key1 (sort keys %hash1){
			for $key2 (sort keys %hash2){
				if ($key1 eq $key2){
					 $no_of_same = &min($hash1{$key1},$hash2{$key2});
					 $sum_of_same += $no_of_same;
					 last;
				}
			}
	 }
	 $seq1=&sum_array(@num_char1);
	 $seq2=&sum_array(@num_char2);
	 $longer_seq = &max($seq1, $seq2);
	 $identity = $sum_of_same/$longer_seq*100;
	 #print "percent iden = ", $identity, "\n";
}

#________________________________________________________________________
# Title     : array_average
# Usage     : $output = &array_average(\@any_array);
# Function  : (the same as average_array)
# Example   :
# Warning   : If divided by 0, it will automatically replace it with 1
# Keywords  : get_array_average, av_array, average_array, get_average_array
#             average_of_array, average_array
# Options   :
# Returns   : single scaler digit.
# Argument  : takes one array reference.
# Category  :
# Version   : 1.2
#--------------------------------------------------------------------
sub array_average{
	my(@input)= @{$_[0]};
	my $int_option = ${$_[1]} || $_[1];
	my($item,$average,$num,$sum);
	my $num_of_elem = @input;

	for $item(@input){
	 if( $item =~ /^$/ ){  ## If it matches nothing. '$item == 0' does not work !!!
		$num_of_elem --; ## This is to make sure that the denominator does not
	 }                  ## count blank element. (to get correct element number)
	 else{ $sum += $item;  }
	}
	if($num_of_elem ==0){ $num_of_elem =1; }  ## To prevent 'Division by 0' error
	if($int_option =~ /[\-]*i[nt]*/){
	  $average= int( $sum/$num_of_elem );
	}else{   $average = $sum/$num_of_elem }

	return(\$average);
}

#________________________________________________________________________
# Title     : average_array
# Usage     : $output = &average_array(\@any_array);
# Function  : (the same as array_average)
# Example   :
# Warning   : If divided by 0, it will automatically replace it with 1
# Keywords  :
# Options   :
# Returns   : single scaler digit.
# Argument  : takes one array reference.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub average_array{
	my(@input)= @{$_[0]};
	my $int_option = ${$_[1]} if ref($_[1]);
	my $int_option =  $_[1]  if !ref($_[1]);
	my($item,$average,$num,$sum);
	my $num_of_elem = @input;

	for $item(@input){
	 if( $item =~ /^$/ ){  ## If it matches nothing. '$item == 0' does not work !!!
		$num_of_elem --; ## This is to make sure that the denominator does not
	 }                  ## count blank element. (to get correct element number)

Bioinf.pl  view on Meta::CPAN

    if($_[2]=~/E2=(\S+)/){
      $E_value2=$1;
    }

    for($i=0; $i< @input_interm_lines; $i++){
     if($input_interm_lines[$i]=~/\S+ +\d+ +(\S+) +\S+ +\d+ +(\S+) +\S+/){
            if($1 < $E_value1 and $2 < $E_value2){
               push(@filtered_lines, $input_interm_lines[$i]);
            }
     }
    }
    return(\@filtered_lines);
}


#___________________________________________________________________________________
# Title     : make_intermediate_sequence_library
# Usage     : &make_intermediate_sequence_library(\@files, "FASTA_DB=$owl_db_fasta");
#               while @files have either pdbs or pdbg file (PDB grouping file)
# Function  : extracts intermediate sequences from OWL fasta database to
#             make intermediate seq library
#             This looks for /gn0/jong/DB/PDB/PDB95D_against_OWL/E/$msp_file_gz
#                and         /gn0/jong/DB/PDB/PDB95D_against_OWL/D/$msp_file_gz
# Example   :
# Keywords  : make_interm_library_for_each_group, make_interm_lib,
#             make_intermediate_library, compile_interm_library, create_interm_library,
# Options   :
#      'FASTA_DB' for sequence source fasta file  eg:  "FASTA_DB=$source_db_fasta"
#      o  for overwrite option (overwrites 1.2.3.fa like file)
#      MSP_DIR= for msp seq file result directory
#      m=       for msp seq file result direc (same as MSP_DIR)
#      e=       for E value thresh
#   $pdbg_file= by p=
#      E=       for E value thresh
#      s=       for score thresh
#
# Returns   :
# Argument  :
# Category  :
# Version   : 2.0
#-----------------------------------------------------------------------------------
sub make_intermediate_sequence_library{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my(%hash, @pdbd_seqs, @superfamily, @members, @files, $over_write, $msp_file_gz,
         $msp_seq_file_dir, $msp_file, $msp_file_long, $msp_file_gz_long,
         $pdbd_seq_long, $final_merged_ISL_fasta,$pdbg_file, $pwd , $range_start,
         $range_stop, @files_NOT_processed);
    my $source_db_fasta=$ENV{'NRDB_FASTA'}; ## general default
    my $evalue_thresh=0.001; #  default
    my $score_thresh=70;     #  default
    my $range_thresh=10;
    my $percent_id_thresh=0.95;
    print "\n# (i) Running sub of: make_intermediate_sequence_library\n";

    if($vars{'FASTA_DB'}=~/\S+/){  $source_db_fasta=$vars{'FASTA_DB'} }
    if($vars{'MSP_DIR'}=~/\S+/){   $msp_seq_file_dir=$vars{'MSP_DIR'} }
    if($vars{'p'}=~/\S+/){         $pdbg_file=$vars{'p'}; print "\n# (i) $vars{'p'} is given \n"; } ## PDBG file input
    if($vars{'m'}=~/\S+/){         $msp_seq_file_dir=$vars{'m'} }
    if($vars{'DB'}=~/\S+/){        $source_db_fasta=$vars{'DB'} }
    if($vars{'E'}=~/\S+/){         $evalue_thresh=$vars{'E'} }
    if($vars{'e'}=~/\S+/){         $evalue_thresh=$vars{'e'} }
    if($vars{'s'}=~/\S+/){         $score_thresh=$vars{'s'} }
    if($char_opt=~/o/){            $over_write='o' }

    if( !(-s $pdbg_file )){
        print "\n# (W) Is your pdbg file given to me? \n\n";
        if(-s $file[0]){            $pdbg_file=$file[0];
        }else{            die "\n# (E) I can not find \$pdbg_file \n";        }
    }else{
        print "\n# (i) \$pdbg_file $file[0] does exist, GOOD! \n";
    }

    print "\n# (1) make_intermediate_sequence_library: \$evalue_thresh is $evalue_thresh, opening $pdbg_file file\n";

    open(PDBG, "$pdbg_file");

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Collecting sequence members for superfamilies
    #_______________________________________________
    while(<PDBG>){
       if(/\>(\S+) +(\d+\.\d+\.\d+)\.\d+\.\d+/){
            $super_family=$2;
            $hash{$super_family} .=" $1"; ## %hash has 'd1cus_ d2eng_ ...'
       }
    }
    close (PDBG);

    @superfamily=sort keys %hash;
    print "\n# (i) make_intermediate_sequence_library: \@superfamily has @superfamily\n\n" if $verbose;
    unless(@superfamily){  die "\n# (E) \@superfamily is too small to go on \n\n"; }
    $pwd=`pwd`; chomp($pwd);
    chdir($msp_seq_file_dir);

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # collecting superfamily member sequences to a hash
    #____________________________________________________
    for($i=0; $i< @superfamily; $i++){
       my (%interm_hash, $out_fasta_file );
       my $superfamily=$superfamily[$i];
       my @pdbd_seqs=split(/ +/, $hash{$superfamily});
       $out_fasta_file="$superfamily[$i]\.fa"; #<----------------- final output file name like 1.2.1.fa
       if(-s $out_fasta_file and !$over_write){
           print "\n# (1) make_intermediate_sequence_library: $out_fasta_file Already EXISTS and no o opt. skipping\n";
           next;
       }
       if(@pdbd_seqs){      print "\n# (i) \@pdbd_seqs for $superfamily is: @pdbd_seqs\n";
       }else{               print "\n# (E) \@pdbd_seqs is empty, strange, error in making $out_fasta_file \n\n";
       }

        for($j=0; $j< @pdbd_seqs; $j++){
             $pdbd_seq=$pdbd_seqs[$j];
             $pdbd_seq_long="pdb\_$pdbd_seqs[$j]";

             my (@msp_content, $evalue, $sub_dir, $percentage_id, $range_length);
             if($pdbd_seq=~/^ *$/){  next; }

             $msp_file="$pdbd_seq\.msp";
             $msp_file_gz="$pdbd_seq\.msp\.gz";
             $msp_file_long="pdb\_$pdbd_seq\.msp"; ## this is to handle Sarah's pdb_ prefixed pdbd files
             $msp_file_gz_long="pdb\_$pdbd_seq\.msp\.gz";

             print "# (i)      I am processing $msp_file  <- subroutine:make_intermediate_sequence_library\n";
             if($msp_file_gz=~/^([dec])\S/ or $msp_file_long=~/pdb_([dec])\S/){
                  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`
                  # Trying to locate the search result file as like 'd1hlb__.msp.gz' in various dirctory
                  #_______________________________________________________________________________________
                  $sub_dir1="\U$1";
                  $sub_dir2="$1"; ## just in case the subdir name was not in capital
                  if(-s $msp_file){
                       open(MSP_FILE, "$msp_file");  @msp_content=<MSP_FILE>; close (MSP_FILE);
                  }elsif(-s $msp_file_long){
                       open(MSP_FILE, "$msp_file_long"); @msp_content=<MSP_FILE>; close (MSP_FILE);
                       print "\n# (i) \@msp_content is read \n";
                  }elsif( -s "$msp_seq_file_dir\/$sub_dir1/$msp_file_long"){
                       open(MSP_FILE, "$msp_seq_file_dir\/$sub_dir1/$msp_file_long"); @msp_content=<MSP_FILE>; close (MSP_FILE);
                       print "\n# (i) \@msp_content is read \n";
                  }elsif( -s "$msp_seq_file_dir\/$sub_dir2/$msp_file_long"){
                       open(MSP_FILE, "$msp_seq_file_dir\/$sub_dir2/$msp_file_long"); @msp_content=<MSP_FILE>; close (MSP_FILE);
                       print "\n# (i) \@msp_content is read \n";
                  }elsif( -s "$msp_seq_file_dir\/$msp_file_long"){
                       open(MSP_FILE, "$msp_seq_file_dir\/$msp_file_long"); @msp_content=<MSP_FILE>; close (MSP_FILE);
                       print "\n# (i) \@msp_content is read \n";
                  }elsif( -s "$msp_seq_file_dir\/$msp_file_long"){
                       open(MSP_FILE, "$msp_seq_file_dir\/$msp_file_long"); @msp_content=<MSP_FILE>; close (MSP_FILE);
                       print "\n# (i) \@msp_content is read \n";
                  }elsif( -s $msp_file_gz){ @msp_content=`gunzip -c $msp_file_gz`;
                  }elsif( -s "./$sub_dir1/$msp_file_gz"){  @msp_content=`gunzip -c ./$sub_dir1/$msp_file_gz`;
                  }elsif( -s "./$sub_dir2/$msp_file_gz"){  @msp_content=`gunzip -c ./$sub_dir2/$msp_file_gz`;
                  }elsif( -s $msp_file_gz_long){ @msp_content=`gunzip -c $msp_file_gz_long`;
                  }elsif( -s "./$sub_dir1/$msp_file_gz_long"){  @msp_content=`gunzip -c ./$sub_dir1/$msp_file_gz_long`;
                  }elsif( -s "./$sub_dir2/$msp_file_gz_long"){  @msp_content=`gunzip -c ./$sub_dir2/$msp_file_gz_long`;
                  }elsif( -s "$msp_seq_file_dir\/$sub_dir1/$msp_file_gz"){
                           @msp_content=`gunzip -c $msp_seq_file_dir\/$sub_dir1/$msp_file_gz`;
                  }elsif( -s "$msp_seq_file_dir\/$sub_dir2/$msp_file_gz"){
                           @msp_content=`gunzip -c $msp_seq_file_dir\/$sub_dir2/$msp_file_gz`;
                  }elsif( -s "$msp_seq_file_dir\/$sub_dir1/$msp_file_gz_long"){
                           @msp_content=`gunzip -c $msp_seq_file_dir\/$sub_dir1/$msp_file_gz_long`;
                  }elsif( -s "$msp_seq_file_dir\/$sub_dir2/$msp_file_gz_long"){
                           @msp_content=`gunzip -c $msp_seq_file_dir\/$sub_dir2/$msp_file_gz_long`;
                  }else{
                       print "\n# Error Error Error Error Error Error Error Error Error Error \n";
                       print "# (E) PWD is ", `pwd`, "$msp_seq_file_dir\/$sub_dir2, $msp_seq_file_dir\/$sub_dir1\n";
                       print "# (E) make_intermediate_sequence_library ($superfamily): cant find $msp_file_gz or $msp_file\n";
                       push(@files_NOT_processed, $msp_file);
                       print chr(7), chr(7), chr(7);
                       next;
                  }

                  print "\n# (i) Now I have finished reading in ONE msp file for $pdbd_seq ($superfamily) into \@msp_content";

                  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`
                  # Processing each MSP line
                  #_______________________________________________________________________________________
                  my ($identical_one_added, $interm_seq);
                  for($k=0; $k< @msp_content; $k++){ ## NEW MSP format
                      if($msp_content[$k]=~/^ *(\S+) +(\S+) +(\S+) +\d+ +\d+ +[pdb_]*$pdbd_seq[_\d+\-\d+]* +(\d+) +(\d+) +(\S+)/){     # [pdb_]* is for Sarah's pdb_ prefix
                          #if($pdbd_seq eq $6){ next }  ## this is to exclude the same seq match
                          $evalue=$2;               $percentage_id=$3;
                          $range_length=$5-$4;      $score =$1;  $interm_seq=$6; $range_start=$4, $range_stop=$5;

                          if($interm_seq=~/(\S+)_\d+\-\d+/){ $interm_seq=$1; }

                          if($evalue < $evalue_thresh and $range_length > $range_thresh and $score > $score_thresh){
                              if($percentage_id <= $percent_id_thresh){  ## to keep one pdb seq ($percentage_id=1)
                                  unless($interm_seq=~/^d\d\S/){  # to remove entries like:  d1abc_10-20_d2acb__ (two pdb seqs)
                                      $interm_hash{$superfamily} .=" $interm_seq\_$range_start\-$range_stop\_$pdbd_seq";
                                  }
                              }elsif($percentage_id ==1 and $identical_one_added < 1){  ## this is to prevent more than 1 100% id seq
                                  $interm_hash{$superfamily} .=" $interm_seq\_$range_start\-$range_stop\_$pdbd_seq";
                                  $identical_one_added++;
                              }
                          }
                      }elsif($msp_content[$k]=~/^ *(\d+) +(\S+) +\d+ +\d+ +$pdbd_seq +(\d+) +(\d+) +(\S+)/){
                          $evalue=$2;              $score=$1;
                          $range_length=$4-$3;     $interm_seq=$5;
                          if($evalue < $evalue_thresh and $range_length > $range_thresh
                              and $score > $score_thresh){
                              $interm_hash{$superfamily} .=" $interm_seq\_$3\-$4\_$pdbd_seq";
                          }
                      }
                  }

             }

        }
        chdir($pwd);

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        #
        #__________________________________
        $family_name=$superfamily[$i];
        @members=@{ &remove_similar_seqlets($interm_hash{$superfamily[$i]}) };
        %seq=%{&fetch_sequence_from_db($source_db_fasta, \@members)};
        &write_fasta(\%seq, $out_fasta_file);
     }

     #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     # Merging (Compiling) the superfamily fasta files into one
     #_________________________________________________
     $final_merged_ISL_fasta=${&merge_superfam_fasta_files_for_ISL}; ## returns the final compiled_interm_lib.fa file name
     print "\n# (E) Following files could not be processed\n @files_NOT_processed \n";
     return( \$final_merged_ISL_fasta);
}



#________________________________________________________________________________
# Title     : read_machine_readable_sso_lines
# Usage     : @out_refs=@{&read_machine_readable_sso_lines(\@SSO, $get_alignment,
#                           $create_sso, $upper_expect_limit,$new_format, $lower_expect_limit,
#                           $attach_range_in_names, $attach_range_in_names2)};
# Function  :
# Example   :
# Keywords  : read_m10_sso_lines read_msso_lines
# Options   : a c r r2 n
#             u= for upper E value limit
#             l= for lower E value limit
# Category  :
# Version   : 1.5
#--------------------------------------------------------------------------------
sub read_machine_readable_sso_lines{
   my ($upper_expect_limit, $lower_expect_limit)=(50,0);
   my (%match, @out_refs, $query_found, $query_sq_stop, $query_sq_statrt, $match_found,
      $match_seq, $match_found2, $i, $j,$match_found3, $overlap, $sw_score,
      $match_sq_stop, $match_seq2, $sw_ident, $name_range, $query_seq,
      $al_display_start, $match_seq_count);
   for($i=0; $i< @_; $i++){
       if($_[$i]=~/u=(\S+)/){    $upper_expect_limit=$1 }



( run in 1.667 second using v1.01-cache-2.11-cpan-39bf76dae61 )