Bioinf

 view release on metacpan or  search on metacpan

Bioinf.pl  view on Meta::CPAN

								if($evalue > $E_value ){
										if($enquiry=~/^(\S+)_\d+\-\d+/){
											 $msp_0{"$1"} ="" unless $msp_0{"$1"};
											 next;
										}else{
											 $msp_0{"$enquiry"} ="" unless $msp_0{"$enquiry"};;
											 next;
										}
								}

								if($score < $score_thresh1){     next;     }
								if($enquiry=~/^(\S+)_\d+\-\d+/){
										$msp_0{"$1"} .="$match_seq ";
								}else{
										$msp_0{"$enquiry"} .="$match_seq ";
										$msp_00{join(' ', sort($enquiry, $match_seq))} = " $score $evalue";
								}
						}
				}
				%stat=%msp_0;

				#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
				# filtering duplicates
				#____________________________________________
				@keys=keys %stat;
				for($k=0; $k< @keys; $k++){
						@split=split(/ +/,$stat{$keys[$k]});
						@non_dup=@{&remove_dup_in_array(\@split)};
						for($j=0; $j<@non_dup; $j++){
								if($non_dup[$j]=~/^ *$/){
										splice(@non_dup, $j, 1);       $j--;
										next;
								}
								if($non_dup[$j] eq $keys[$k]){
										splice(@non_dup, $j, 1);       $j--;
										next;
								}
						}
						$stat2{$keys[$k]}=join(' ', @non_dup);
				}

				#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				# Showing the actual matched sequences
				# %stat has following contents
				#    d1ash__ d1bam__ d1mba__ d2lhb__
				#    d1baba_ d1flp__ d1hbg__ d1hlb__ d1mba__ d1mbd__ d2lhb__ d3aaha_ d3sdha_
				#    d1cpca_ d1cpcb_ d1gof_1 d2ts1_1
				#______________________________________________________________________________
				if($verbose=~/v/){
					 @keys= sort keys %stat2;
					 for($k=0; $k< @keys; $k++){
							print "$keys[$k]: $stat2{$keys[$k]}\n";
					 }
				}

				#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				# Getting statistics
				#_________________________________________
				$evalue=$s;
				$E_mult_factor1=1;
				@output=@{&get_isearch_result_stat(\%stat2, \@pdbg_seqs, \$evalue,
									\$base, \$E_mult_factor1,  $leng_thresh, \%msp_00)};
				%correct=%{$output[3]};
				%final_stat_big_hash=(%final_stat_big_hash, %correct);
				if($verbose){
						@keys=sort keys %correct;
						for($k=0; $k< @keys; $k++){
							 print "$keys[$k] $correct{$keys[$k]}\n";
						}
				}
		}
		return(\%final_stat_big_hash);
}


#________________________________________________________________________________
# Title     : get_scop_correcting_pairs
# Usage     : %correct=%{&get_scop_correcting_pairs()};
# Function  :
# Example   :
# Keywords  : get_pdb_correcting_pairs , correct_pairs_in_scop, correct_homology_pairs
# Options   :
# Category  :
# Version   : 1.4
#--------------------------------------------------------------------------------
sub get_scop_correcting_pairs{
    my (%correcting_pairs, @correcting_pairs);

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # %correcting_pairs is a correcting table for old pdb40d file classi
    #_____________________________________________________________________
    @correcting_pairs=(  # should be pairs
        'd2kauc1 d2kauc2', 'd1pkya1 d1pkya2',
        'd1pvda2 d1trka1', 'd1pbe_1 d1pbe_2',
        'd1poxa3 d1pvda2', 'd1efga1 d1efga2',
        'd1dsba1 d1dsba2', 'd2gsta1 d2gsta2',
        'd1bct__ d1brd__', 'd1qora1 d1qora2',
        'd2ohxa1 d2ohxa2', 'd1efga2 d1eft_1',
        'd1tada1 d1tada2', 'd1gsea1 d1gsea2',
        'd1gesa2 d2tmda3', 'd1lvl_2 d2tmda3',
        'd2tmda3 d2tpra2', 'd1tde_1 d2tmda3',
        'd1nhp_2 d2tmda3', 'd1gesa1 d2tmda3',
        'd1lvl_1 d2tmda3', 'd2tmda3 d2tpra1',
        'd1fcda1 d2tmda3', 'd1nhp_1 d2tmda3',
        'd1tde_2 d2tmda3', 'd1pbe_1 d2tmda3',
        'd1ebha1 d1ebha2', 'd1gesa2 d2dlda2', ## 3.4.1  with
        'd1gesa2 d1psda2', 'd1nhp_2 d2dlda2',
        'd1ldm_1 d1tde_2', 'd1coy_1 d1ldb_1',
        'd1lvl_2 d1psda2', 'd1psda2 d1tde_2',
        'd1hyha1 d1tde_2', 'd1fcda1 d1ldm_1',
        'd1hdca_ d1nhp_2', 'd1fcda1 d1hlpa1',
        'd1llda1 d1lvl_2', 'd2dlda2 d2tpra2',
        'd1ldm_1 d1nhp_2', 'd1llda1 d1pbe_1',
        'd1gdha2 d2tpra1', 'd1ldb_1 d1nhp_2',
        'd1gesa2 d1scua2', 'd1fcda1 d1hyha1',
        'd1gesa1 d1hlpa1', 'd1gdha2 d1gesa2',
        'd1lvl_2 d2dlda2', 'd1gesa1 d2dlda2',
        'd1nhp_2 d2ohxa2', 'd1tde_2 d2dlda2', # 3.4.1. with 3.18.1, 3.17.1.
        'd1nhp_1 d2cmd_1', 'd1fcda1 d1ldb_1',
        'd1lvl_1 d2ohxa2', 'd1nhp_2 d2naca2',
        'd1pbe_1 d2ohxa2', 'd1gdha2 d1nhp_2',
        'd2cmd_1 d2tpra1', 'd1tde_1 d2cmd_1',
        'd1llda1 d1nhp_2', 'd1hlpa1 d1nhp_2',
        'd1nhp_1 d2dlda2', 'd1hyha1 d1nhp_2',
        'd1nhp_2 d1psda2', 'd1fcda1 d2cmd_1',
        'd1fcda1 d1llda1', 'd1lvl_2 d1udpa_',
        'd1psda2 d2tpra2', 'd1hdca_ d1lvl_2',
        'd1gesa2 d1llda1', 'd1nhp_2 d1qora2',
        'd1ldm_1 d2tpra1', 'd1coy_1 d2dlda2',
        'd2dlda2 d2tpra1', 'd1hdca_ d1pbe_1',
        'd1coy_1 d1gdha2', 'd1nhp_2 d2cmd_1',
        'd1llda1 d1tde_1', 'd1llda1 d1lvl_1',
        'd1bdma1 d2tpra1', 'd1gd1o1 d2tpra2',
        'd1ldb_1 d1lvl_1', 'd1hlpa1 d1tde_2',
        'd1coy_1 d1psda2', 'd1nhp_2 d1udpa_',
        'd1llda1 d1tde_2', 'd1tde_2 d2cmd_1',
        'd1llda1 d2tpra2', 'd1ldb_1 d1tde_1',
        'd1coy_1 d1hlpa1', 'd1coy_1 d2cmd_1',
        'd1bdma1 d1gesa2', 'd1hyha1 d2tpra2',
        'd1gesa2 d1hyha1', 'd1gesa2 d2ohxa2',
        'd1ldb_1 d1tde_2', 'd1hlpa1 d1pbe_1',
        'd1ldm_1 d2tpra2', 'd2ohxa2 d2tpra1',
        'd1ldb_1 d2tpra2', 'd1gesa2 d1ldm_1',
        'd1lvl_2 d1qora2', 'd1gesa1 d2naca2',
        'd1coy_1 d1llda1', 'd1coy_1 d1hyha1',
        'd1coy_1 d1ldm_1', 'd1ldm_1 d1lvl_2',
        'd1eny__ d1nhp_2', 'd1pbe_1 d2pgd_2',
        'd1ldb_1 d1pbe_1', 'd1ldb_1 d1lvl_2',
        'd1gesa2 d1hlpa1', 'd1dhr__ d1nhp_2',
        'd1hdca_ d1tde_1', 'd1gesa1 d1psda2',
        'd1pbe_1 d2cmd_1', 'd1tde_2 d1udpa_',
        'd1pbe_1 d2dlda2', 'd1hdca_ d1tde_2',
        'd1gesa2 d1ldb_1', 'd1psda2 d2tpra1',
        'd1gdha2 d1lvl_2', 'd1tde_1 d2dlda2',
        'd1ldm_1 d1pbe_1', 'd1pbe_1 d1scua2',
        'd1gesa1 d2ohxa2', 'd1lvl_2 d2naca2',
        'd1gd1o1 d1lvl_1', 'd1fvl__ d1kst__',
        'd1kst__ d2ech__', 'd1hsaa2 d1std__', ## d1hsaa.. is NOT homol, but to fix a problem in E_100_e_0.0005_j30_segged_2092
        'd1afp__ d1hfi__'
        );


     for($i=0; $i< @correcting_pairs; $i++){
                     $correcting_pairs{$correcting_pairs[$i]}=$correcting_pairs[$i];
     }
     return(\%correcting_pairs);
}

#__________________________________________________________________
# Title     : get_isearch_result_stat
# Usage     : &get_self_isearch_stat(\%stat2, \@pdbg_seqs, \$evalue);
# Function  :
# Example   : Following input (hash eg: %stat2, input with the first word as key)
#              will become columnar output.
#
#    d1ash__ d1bam__ d1mba__ d2lhb__
#    d1baba_ d1flp__ d1hbg__ d1hlb__ d1mba__ d1mbd__ d2lhb__ d3aaha_ d3sdha_
#    d1cpca_ d1cpcb_ d1gof_1 d2ts1_1
#
#    Will become:
#      ....
#      d1ash__ d2lhb__ Homolog: G1   98 0.012
#      d1baba_ d1flp__ Homolog: G1   82 0.072
#      d1baba_ d1hbg__ Homolog: G1   79 0.13
#      d1baba_ d2lhb__ Homolog: G1   228 8e-12
#      d1baba_ d3aaha_ Nomolog: G1   74 2
#      d1baba_ d3sdha_ Homolog: G1   92 0.012
#      d1cola_ d1hbg__ Nomolog: G1   79 0.59
#      d1cpca_ d1cpcb_ Homolog: G1   176 4.9e-08
#      ....
#
# Keywords  : get_stat_interm_search, get_intermediate_search_stat
# Options   : _  for debugging.
#             #  for debugging.
# Package   : Bio
# Reference : http://sonja.acad.cai.cam.ac.uk/perl_for_bio.html
# Returns   : [$av_correct, $num_enq_seq]
# Tips      :
# Argument  :
# Todo      :
# Author    : A Scientist
# Category  :
# Version   : 2.2
#-----------------------------------------------------------------------------
sub get_isearch_result_stat{
	my (@keys, $num_enq_seq, @pdbg_seqs_ori, $c, $d, $i, %correct_pairs,
	    $sum_correct, $sum_false, $match_seq, $percent_correct, $correct, @correct,
	    $av_correct, $av_false, $actual_e_value, $correct_matched,
	    %correcting_pairs, @correcting_pairs, %correct);

	my %seqs=%{$_[0]};
	my @pdbg_seqs=@{$_[1]};
	my $evalue=${$_[2]};
	my $pdbg_base=${$_[3]} || $ARGV[3];
	my $E_mult_factor1=${$_[4]};
	my $E_mult_factor2=${$_[4]};
        if(ref($_[5])){  $leng_thresh =${$_[5]}  }else{ $leng_thresh=$_[5]; }
	my %msp_0=%{$_[6]};
	my %msp_00=%{$_[7]};

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# %correcting_pairs is a correcting table for old pdb40d file classi
		#_____________________________________________________________________
		@correcting_pairs=(  # should be pairs
                                'd2kauc1 d2kauc2',              'd1pkya1 d1pkya2',
                                'd1pvda2 d1trka1',                              'd1pbe_1 d1pbe_2',
				'd1poxa3 d1pvda2',				'd1efga1 d1efga2',
				'd1dsba1 d1dsba2',				'd2gsta1 d2gsta2',
				'd1bct__ d1brd__',				'd1qora1 d1qora2',
				'd2ohxa1 d2ohxa2',				'd1efga2 d1eft_1',
				'd1tada1 d1tada2',				'd1gsea1 d1gsea2',

Bioinf.pl  view on Meta::CPAN

       }else{
           print "\n# You seemed made a mistake, O.K., I will kill myself!\n\n";
           print chr(7);  exit;
       }
   }

   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   # (0) If blast is chosen run Blast
   #_________________________________________________________
   if($algorithm=~/[psi\-]*[pb][last]*/i){
      print "\n# (i) Doing PSI search with @file\n";
      @final_out=@{&do_psi_blast_search(\@file, "d=$source_DB_file",
                           "i=$input_seq_file",  $over_write,
                           $make_msp_in_sub_dir_opt, $Lean_output)};
      return(\@final_out); #<<<<<<----------- F I N I S H
   }

   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   # (1) Controlling which kind of search it should do. Do save_in_gz_in_sub_dir first if d is set
   #______________________________________________________________________________________________
   if( $make_msp_in_sub_dir_opt ){  ## convert sso to msp and put in sub dir like /D/, /S/

       for($x=0; $x < @seq_names; $x++){
          my ($over_write_sso_by_age, $over_write_msp_by_age,  %single_seq,
              $out_file_sso_gz_name, $out_file_msp_name, $out_file_gz_name, $existing_sso);
          my ($seq_name, $seq)= ($seq_names[$x], $seq_input{$seq_names[$x]});
          my $first_char= substr("\U$seq_name", 0, $sub_dir_size);
          mkdir ("$first_char", 0777) unless -d $first_char;
          chdir("$first_char");
          print "\n# (i) do_sequence_search: You set \'d\' or \'D\' opt\n";
          print "# (i) making subDIRs ($first_char) with $seq_name $sequence_DB to store MSP files\n";

          #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          # Let's make each fasta file for each seq to be used in searching
          #_____________________________________________________________________
          my $temp_file_name="$seq_name.fa";
          %single_seq=($seq_name, $seq_input{$seq_name});
          &write_fasta(\%single_seq, $temp_file_name ); ## e for writing each file

          #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          # Making output file name according to the option given
          #_______________________________________________________________________
          if($machine_readable and $algorithm=~/[fastassearch]+/){
                 $out_file_sso_name="$seq_name\.msso";
          }else{ $out_file_sso_name="$seq_name\.sso";      }
          $out_file_sso_gz_name="$out_file_sso_name\.gz";
          $out_file_msp_name="$seq_name\.msp";
          $out_file_gz_name="$seq_name\.msp\.gz";

          #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          # Check if SSO file already there
          #_______________________________________________________________________
          if(-s $out_sso_file){ $existing_sso=$out_file_sso_name }
          elsif(-s $out_sso_gz_name){ $existing_sso=$out_file_sso_gz_name }
          if(-s $out_msp_name){ $existing_msp=$out_file_msp_name }
          elsif(-s $out_gz_name){ $existing_msp=$out_file_gz_name }

          #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          # If the dates of files created are long ago, overwrite to refresh
          #____________________________________________________________________
          if(  (localtime(time- (stat($existing_sso))[9]))[3] > $age_in_days_of_out_file ){
               $over_write_sso_by_age='o';
          }
          if(  (localtime(time- (stat($existing_msp))[9]))[3] > $age_in_days_of_out_file ){
               $over_write_msp_by_age='o';
          }

          #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          #  To check if the target seq DB is in ../
          #________________________________________________
          if(-s $sequence_DB){
              print "\n# (i) Good, target \$sequence_DB $sequence_DB is in this working dir\n";
          }elsif( -s "../$sequence_DB"){ $sequence_DB="../$sequence_DB"; }

          #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          # (2) Searching: Making MSP files directly,  MSP file format is the major format used in geanfammer!
          #_________________________________________________________________________________________
          if($char_opt =~/D/){ #### To make MSP file
               if( !(-s $out_file_gz_name or -s $out_file_msp_name) or $over_write or $over_write_msp_by_age){
                    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                    # (2.1) Running  run_fasta_sequence_search !!
                    #_______________________________________________________
                    print "\n# (i) Running  run_fasta_sequence_search !!\n";
                    $gzipped_msp_file=${&run_fasta_sequence_search(
                                       "a=$algorithm",
                                       "O=$out_file_msp_name",
                                       "File=$temp_file_name", "e=$E_val",
                                       "DB=$sequence_DB", "k=$k_tuple", "$machine_readable")};
                    $gzipped_sso_file=${&compress_files_by_gzip($out_file_sso_name)};
               }else{
                   print "\n#  Line No. ", __LINE__,", $out_file_gz_name already exists or
                         \$over_write is set or NOT older than $age_in_days_of_out_file\n";
               }
          }
          #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          # To make gzipped SSO files and MSP files
          #_______________________________________________
          elsif($create_sso or $char_opt=~/m/){ ### To make gzipped SSO files
               if( !(-s $out_file_sso_name or -s $out_file_sso_gz_name ) or $over_write or $over_write_sso_by_age){
                   print "\n# (i) Running  run_fasta_sequence_search with \"\$create_sso option\"!!\n\n";
                   $gzipped_msp_file=${&run_fasta_sequence_search(
                                       "a=$algorithm",
                                       "O=$out_file_msp_name", "$create_sso",
                                       "File=$temp_file_name", "e=$E_val",
                                       "DB=$sequence_DB", "k=$k_tuple", "$machine_readable")};

                   $gzipped_sso_file=${&compress_files_by_gzip($out_file_sso_name)};
               }else{
                   print "\n#  Line No. ", __LINE__,", $out_file_gz_name already exists or
                         \$over_write is set or NOT older than $age_in_days_of_out_file\n";
               }
          }else{
               if( !(-s $out_file_sso_name or -s $out_file_sso_gz_name ) or $over_write or $over_write_sso_by_age){
                   system(" $algorithm -m 10 -H  -E $E_val $temp_file_name $sequence_DB $k_tuple > $out_file_sso_name");
                   system("gzip $out_file_sso_name");
               }else{
                   print "\n#  Line No. ", __LINE__,", $out_file_gz_name already exists or
                         \$over_write is set or NOT older than $age_in_days_of_out_file\n";
               }
          }
          unlink("$seq_name.fa") if -s "$seq_name.fa";
          unlink("$first_char/$seq_name\.fa") if -s "$first_char/$seq_name\.fa";
          print "\n# Sub dir $first_char and $seq_name\.msp has been made, finishing do_sequence_search\n";
          chdir ('..');
      }
      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # F I N I S H
      #________________________________________
      goto EXIT;
   } # if ($char_opt =~/[dD]/){  is finished


   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   # (2) Writing on PWD. This is the big single MSP output
   #____________________________________________________________
   $Single_msp_out_file="$base_name\.msp" if($single_big_msp eq 's');
   if(-s $Single_msp_out_file and !$over_write ){
       print "\n# (i) $Single_msp_out_file exists, and no \$over_write is set, skipping \n";
       push(@final_out, $Single_msp_out_file);
   }else{  $over_write  ='o';  }

   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   # Check if it is necessary to write each sequences.fa files
   #______________________________________________________
   if( $over_write ){  &write_fasta_seq_by_seq(\%seq_input, 'e'); } ## e for writing each seq file

   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
   #  When, you didn't use "DB=$XXX" and "File=$FXXX" format, first file input is DB etc
   #_______________________________________________________________________________________
   $defined_all_ok=&check_if_defined($input_file_name, $sequence_DB);
   if(!$defined_all_ok){ print "\n# (E) FATAL: do_sequence_search: You did not use \"DB=\$XXX\" format\n"; exit   };

   print "\n# Finished writing the enquiry fasta files from \%seq_input by write_fasta";
   print "\n# I am in do_sequence_search sub, Target database used :  $sequence_DB with seqs of \'@seq_names\'\n";


   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   #  Main search with given @seq_names
   #______________________________________________________________
   for($j=0; $j< @seq_names; $j++){  # @seq_names has sequence names coming from  (@seq_names) = keys %seq_input;
       my ($over_write_sso_by_age, @temp, $existing_sso, $out_gz_name,
           $over_write_msp_by_age, $existing_msp, $out_msp_file, $seq_name);
       $seq_name=$seq_names[$j];
       $each_seq_fasta="$seq_name\.fa";
       $out_msp_file="$seq_name\.msp";
       $out_gz_name="$seq_name\.msp\.gz";
       $out_msso_file="$seq_name\.msso";

       &die_if_file_not_present($each_seq_fasta);

       print "\n# (i) :-) Found $each_seq_fasta is searched against $sequence_DB\n";
       if($algorithm=~/fasta/){       $out_sso_file="$seq_name\.fsso";
       }elsif($algorithm=~/ssearch/){ $out_sso_file="$seq_name\.ssso"; }
       $out_sso_gz_name="$out_sso_name\.gz";

       if(-s $out_sso_file){ $existing_sso=$out_sso_file }
       elsif(-s $out_sso_gz_name){ $existing_sso=$out_sso_gz_name }
       if(-s $out_msp_file){ $existing_msp=$out_msp_file }
       elsif(-s $out_gz_name){ $existing_msp=$out_gz_name }
       if(  (localtime(time- (stat($existing_sso))[9]))[3] > $age_in_days_of_out_file ){
            $over_write_sso_by_age='o';
       }
       if(  (localtime(time- (stat($existing_msp))[9]))[3] > $age_in_days_of_out_file ){
            $over_write_msp_by_age='o';
       }

       #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
       #  To check if the target seq DB is in ../
       #________________________________________________
       if(-s $sequence_DB){ print "\n# (i) \$sequence_DB $sequence_DB exists, Good\n";
       }elsif( -s "../$sequence_DB"){ $sequence_DB="../$sequence_DB";
       }elsif( -s "../../$sequence_DB"){ $sequence_DB="../../$sequence_DB"; }

       #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
       # If MSP file already exist
       #_____________________________________________________________
       if( -s $out_msp_file and !$over_write_msp_by_age and !$over_write ){
            print "\n# (i) File: $out_msp_file exists, skipping, to overwrite use \'o\' opt or set days";
            push(@final_out, $out_msp_file);
       }else{  ## -E is for e value cutoff. -b is for num of seq fetched
           #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~~~~~~~~~~~~~~
           #  K-tuple is 1 by default. If xxxx.sso exsts, skip running fasta or ssearch
           #________________________________________________________________________________
           if(-s $out_sso_file and !$over_write ){ ## If SSO is already present, JUST READ IT!
                print "\n# (i) Just opening existing $out_sso_file $out_sso_file $out_msp_file $over_write_msp_by_age $over_write\n";
                open(SSO_ALREADY, "$out_sso_file");
                @temp=<SSO_ALREADY>;
                print "\n# (i) \@temp has ", scalar(@temp), " lines\n";
                close(SSO_ALREADY);
                &compress_files_by_gzip($out_sso_file);
           }else{ ## Run FASTA HERE
                print "\n# (i) Running \"run_fasta_sequence_search\" ";
                $gzipped_msp_file=${&run_fasta_sequence_search(
                                   "a=$algorithm",
                                   "O=$out_msp_file", "$create_sso",
                                   "File=$each_seq_fasta", "e=$E_val",
                                   "DB=$sequence_DB", "k=$k_tuple", "$machine_readable")};
                push(@final_out, $gzipped_msp_file) if -s $gzipped_msp_file ;
                unlink($each_seq_fasta) if $Lean_output;
           }
       }
       if($machine_readable and $create_sso and -s $out_sso_file){ &cp($out_sso_file, $out_msso_file); }
   } # end of for($j=0; $j< @seq_names; $j++){
   return(\@final_out);
   EXIT:

} # do_sequence_search



#__________________________________________________________________________
# Title     : do_hmm_sequence_search
# Usage     : &do_hmm_sequence_search(\@file, "method=$default_search_method",
#								$over_write, "DB=$pdbd40_seq_fasta");
#
# Function  : does hmm sequence search using Sean Eddy's HMMER (hmmls, hmmfs)
# Example   :
# Keywords  : do_seq_search_with_hmm, do_hmmt_sequence_search
# Options   :
#    "method=ls"  for turning hmmls search option on (default)
#    "method=fs"  for turning hmmfs search option on
#    method= by method=
#   o  for overwriting existint xxxxx.hmm files

Bioinf.pl  view on Meta::CPAN

	 return(\$final_time);
}


#________________________________________________________________________
# Title     : get_date
# Usage     : @outformat = &get_date;  eg result >  (010595 1-May-1995)
# Function  : returns date: $date6d (6 digit format) and
#             $datec (dd-mmm-yyyy format), Tim's version is 'getdate' in th_lib.pl
# Example   : 30-Nov-1995
# Keywords  : get_present_date,
# Options   :
# Returns   : ref of an array for (1-May-1995 and 010595)
# Argument  :
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------
sub get_date{
	 my($date_alphabet, $date6d);
	 my(@time) = localtime(time);
	 my($ty,$tm,$td) = ($time[5],$time[4],$time[3]);
	 my($year) = '19' . $ty;
	 my($mon) = (Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[$tm];
	 my($day) = $td;
	 if($day < 10){
		  $day = ' ' . $day;
	 }
	 $date_alphabet = $day.'-'.$mon.'-'.$year;
	 $tm++;
	 if($tm < 10){
		  $tm = '0'.$tm;
	 }
	 if($td < 10){
		  $td = '0'.$td;
	 }
	 $date6d = $td.$tm.$ty;
	 return ([$date_alphabet, $date6d]);
}

#__________________________________________________________________________
# Title     : if_file_older_than_x_days
# Usage     : if( ${&if_file_older_than_x_days($ARGV[0], $days)} > 0){
# Function  : checks the date of last modi of file given and compares with
#             present time. Substracts diff and returns the actual diff days.
# Example   :
# Keywords  : how_old_file, how_old, is_file_older_than_x_days, file_age,
#             file_age_in_days, check_file_age
# Options   :
# Returns   : the actual days older, so NON-ZERO, otherwise, 0
# Argument  :
# Version   : 1.2
#----------------------------------------------------------------------------
sub if_file_older_than_x_days{
    my($how_old_days);
    my $days=1; # default
    if(@_ < 2){ print "\n# if_file_older_than_x_days needs 2 args\n"; exit; }
    my $file=${$_[0]} || $_[0];
    $days=${$_[1]} || $_[1];
    unless(-s $file){  print "\n# if_file_older_than_x_days: $file does NOT exist\n"; exit; }

    if(lstat($file)){ # to handle Symbolyc link
       print "\n# (i) if_file_older_than_x_days: running lstat\n";
       $how_old_days=(localtime(time- (lstat($file))[9]))[3]; ## should be lstat not stat
    }else{
       print "\n# (i) if_file_older_than_x_days: running stat\n";
       $how_old_days=(localtime(time- (stat($file))[9]))[3]; ## should be lstat not stat
    }
    if($how_old_days > $days and $how_old_days < 10000){
       print "\n# if_file_older_than_x_days: $file is older than $days\n";
       return(\$days);
    }else{
       print "\n# if_file_older_than_x_days: $file is NOT older than $days\n";
       return(0);
    }
}



#________________________________________________________________________
# Title     : array_chk
# Usage     : &array_chk(\@any_array_to_chk);
# Function  : checks if any inputting array is empty or with one element.
# Example   : This is used only with subs which accepts array inputs.
# Warning   :
# Keywords  : array_check
# Options   :
# Returns   : nothing, prints out messages to STDOUT
# Argument  : gets on ref. of array.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub array_chk{  my(@input)=@{$_[0]};
	if (@input == 0){
	 &caller_info;
	 print "\n >>> $0 \n";
	 print "\n >>> Error: Input array to this subroutine was empty\n", chr(7);
	 print "\n To continue prog. type \'y\', or \'n\' to quit (with enter).\n ---->";
	 $key = ${&yes_or_no};
	 if($key ne 'y'){  print "\n !! Aborting the operation !! \n"; exit(0); }}
	elsif ($#input == 0){
	 print "\n >>> Warn: Input array to this subroutine was only one, O.K ?\n";
	 print "\n >>> It means your input was not an array at all, probable	error\n";
	 &caller_info;
	 #________________________________________________________
	 # Title    : caller_info
	 # Function : tells you calleing programs and sub's information with file, subname, main, etc
	 # Usage    : &caller_info; (just embed anywhere you want to check.
	 #----------------------------------------------------------------------
	 sub caller_info{	    # caller(1), the num. tells you which info you choose
		my($i)=1;
		while(($pack, $file, $line, $subname, $args) = caller($i++)){
		  my($level) = $i-1;
		  print "\n", chr(169)," This sub info was made by \&caller_info subroutine";
		  print "\n ", chr(164)," Package  from => $pack ";
		  print "\n ", chr(164)," Exe. file was => $file ";
		  print "\n ", chr(164)," Line was  at? => $line (in $file)";
		  print "\n ", chr(164)," Name of  sub? => $subname";
		  print "\n ", chr(164)," How many arg? => $args";
		  print "\n ", chr(164)," Level of sub? => $level (1 is for where \&caller_info is )\n\n";
		}
	 }
	 #________________________________________________________
	 #________________________________________________________
	 # Title    : yes_or_no
	 # returns  : ref. of a Scalar for 'y' or 'n'
	 # Usage    : $yes_or_no = ${&yes_or_no};

Bioinf.pl  view on Meta::CPAN

			   "u=$upper_expect_limit",
			   "l=$lower_expect_limit",
			   "k=$k_tuple",
			   $add_range,
			   $create_sso,
			   "t=$Score_thresh",
			   "m=$margin",
			   "a=$algorithm",
			   $msp_directly_opt,
			   $machine_readable,
			   $new_format )};
		  print "\n# File created: @file_created \n";
		}

   	if($do_in_batch or $scramble_order and @file > 0){
	   for($i=0; $i< @file; $i++){
					%fasta_seqs=%{&open_fasta_files(\$file[$i])};
		  if($char_opt=~/r/){ ## reverse the query seqs.
			  %fasta_seqs=%{&reverse_sequences(\%fasta_seqs)};
		  }
		  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		  #  Main sequence search
		  #___________________________________________________
		  my @file_created=@{&do_sequence_search(\%fasta_seqs,
				       "FILE_AGE=$age_in_days_of_out_file",
                                       "DB=$input_db_file",
                                       "File=$file[$i]",
                                       $single_big_msp,
                                       $over_write,
                                       "u=$upper_expect_limit",
                                       "l=$lower_expect_limit",
                                       "k=$k_tuple",
                                       $add_range,
                                       $create_sso,
                                       "t=$Score_thresh",
                                       "m=$margin",
                                       "a=$algorithm",
                                       $msp_directly_opt,
                                       $machine_readable,
                                       $new_format )};
		  print "\n# File created: @file_created \n";
	   }
	 }elsif(@file > 0){ ## reads in the big database file continously
	   my $make_gz_in_sub_dir_opt='d';
	   my ($ori_seq_name, $first_char, $seq_file_msp_name,
	       $seq, $seq_name, $first_char);
	 	   for($i=0; $i< @file; $i++){
		   open(FASTA, "$file[$i]");
		   while(<FASTA>){
			  if( /\> *((\w)\S+)/ ){
				  $ori_seq_name=$1;
				  if($seq=~/\S/ and $seq_name=~/\S/){
                                              my ($overwrite_by_age, $existing_msp_file);
                                              $seq_file_name="$seq_name\.fa";
                                              $seq_file_msp_name="$seq_name\.msp";
                                              $seq_file_msp_gz_name="$seq_name\.msp\.gz";
                                              $first_char= substr("\U$seq_name", 0, 1);
                                              if(-s "$first_char\/$seq_file_msp_name"){  $existing_msp_file="$first_char\/$seq_file_msp_name"; }
                                              elsif(-s "$first_char\/$seq_file_msp_gz_name"){  $existing_msp_file="$first_char\/$seq_file_msp_gz_name"; }

                                              if(  (localtime(time- (stat($existing_msp_file))[9]))[3] > $age_in_days_of_out_file ) {
                                                                     $overwrite_by_age='o';
                                                                     print "\n# interm_lib_search: $seq_file_msp_name is older than $age_in_days_of_out_file days, ovrwrting\n";
                                              }

                                              if( !$over_write  and (-s "$first_char\/$seq_file_msp_name" or -s "$first_char\/$seq_file_msp_gz_name")
                                                                     and !$overwrite_by_age ){
						 print "\n# interm_lib_search: $first_char\/$seq_file_msp_name already exists or newer than $age_in_days_of_out_file \n";
						 $seq='';
					 }else{
						 &do_sequence_search({"$seq_name", "$seq"},
						                      "DB=$input_db_file" ,
																							"FILE_AGE=$age_in_days_of_out_file",
						                      "File=$seq_file_name",
																							$single_msp, $over_write,
																							"u=$upper_expect_limit",
																							"$make_gz_in_sub_dir_opt",
																							"l=$lower_expect_limit",
																							"k=$k_tuple",
																							$make_msp_in_sub_dir_opt );
						 $seq='';
					 }
			      }
				  $seq_name=$ori_seq_name;
				  $first_char="\U$2"; # for storing output
			  }elsif(eof){
			      $seq.=$_;
				  if($seq=~/\S/ and $seq_name=~/\S/){
										 my ($overwrite_by_age, $existing_msp_file);
										 $seq_file_name="$seq_name\.fa";
					 $seq_file_msp_name="$seq_name\.msp";
										 $seq_file_msp_gz_name="$seq_name\.msp\.gz";
										 $first_char= substr("\U$seq_name", 0, 1);
										 if(-s "$first_char\/$seq_file_msp_name"){  $existing_msp_file="$first_char\/$seq_file_msp_name"; }
										 elsif(-s "$first_char\/$seq_file_msp_gz_name"){  $existing_msp_file="$first_char\/$seq_file_msp_gz_name"; }

										 if(  (localtime(time- (stat($existing_msp_file))[9]))[3] > $age_in_days_of_out_file ) {
													$overwrite_by_age='o';
													print "\n# interm_lib_search : $seq_file_msp_name is older than $age_in_days_of_out_file days, ovrwrting\n";
										 }

					 if( !$over_write  and (-s "$first_char\/$seq_file_msp_name" or -s "$first_char\/$seq_file_msp_gz_name")
					     and !$overwrite_by_age ){
						 print "\n# $first_char\/$seq_file_msp_name already exists. (interm_lib_search) \n";
					 }else{
						 &do_sequence_search({"$seq_name", "$seq"},
						                      "DB=$input_db_file" ,
																							"FILE_AGE=$age_in_days_of_out_file",
																							"File=$seq_file_name",
							                  $single_msp, $over_write,
							                  "u=$upper_expect_limit",
							                  "$make_gz_in_sub_dir_opt",
							                  "l=$lower_expect_limit",
							                  "k=$k_tuple",
							                  $make_msp_in_sub_dir_opt );
						 $seq='';
					 }
			      }
			  }elsif(/^(\w+)$/){
				  $seq.=$1;
			  }
		   }
		   close FASTA;
	   }
	 }
}



#______________________________________________________________________
# Title     : geanfammer
# Usage     : &geanfammer(\@your_genome_or_db_to_analyse_file,
#                          $verbose);
#
# Function  : Creates a domain level clustering file from a given
#              FASTA format sequence DB. It has been used for complete
#              genome sequence analysis. Can use PSI-blat, fasta, ssearch
#
#              ------------ USAGE INFORMATION -------------------
#             The parameters you put are important for the meaningful
#               protein family maker.
#             The most important one is the E and e options (Mostly,
#               they can have same value).
#             Large E is for setting the threshold for the single
#               linkage clustering.
#             This means, any sequence hit BELOW the threshold
#               (which is good ) will be linked.
#             For example, if Seq1 matched with Seq2 with E value
#              of FASTA search:
#              0.001, and you set the threshold 0.1, then YOU
#              ordered the geanfammer to regard them a family.
#
#             The second small e option is for the dividing a complex
#              and wrong cluster into correct more correct
#              duplication modules. This is necessary as a
#              lot of multidomain proteins can be clustered together
#              WRONGLY by single linkage.

Bioinf.pl  view on Meta::CPAN





#____________________________________________________________
# Title    : read_any_dir_for_dir
# Function : read any dir and REMOVES the '.' and '..' entries. And
#            then put in array.
# Usage    : @file_list = @{&read_any_dir(\$absolute_path_dir_name, ....)};
# Argument : takes one or more scaler references.
# Returns  : one ref. of array.
# Keywords : read_any_dir_for_dir_command
# Warning  : This does not report '.', '..', '#xxxx', ',xxxx', etc. only legitimate
# Warning  : file and dir names are reported.
# Version  : 1.2
#----------------------------------------------------------------------
sub read_any_dir_for_dir{

    my ($i,%big_files, $in_dir, $size_sum, @final_files, @in,
        @target_file_names, $in_dir, $k, @possible_dirs, @read_files, @stat);
    @in=@_;
		for($k=0; $k < @in; $k++){
			 if   ( ($in[$k] eq '.') || !(defined($in[$k]))){    $in_dir='.';  splice(@in, $k, 1); $k-- }
			 elsif(!(ref($in[$k]))){   $in_dir=$in[$k];     }
			 elsif(ref($in[$k]) eq 'SCALAR'){
					 $in_dir =${$in[$k]};
					 splice(@in, $k, 1); $k--;next;   }
			 elsif(ref($in[$k]) eq 'ARRAY'){
					@target_file_names= @{$in[$k]}; splice(@in, $k, 1); $k--;  }

			 if($in_dir!~/^\.$/ and $in_dir =~ /^([\w\-\.]+)$/){  $in_dir="\.\/$in_dir";   # if it is a short dir name
					 unless(-d $in_dir){ $in_dir=${&dir_search_special(\$in_dir, \@target_file_names)} }
			 }
			 print "\n# $0: read_any_dir_for_dir: \$in_dir is  $in_dir\n";
		}
			 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
			 # This is to search the input dir names in your whole dirs
			 #____________________________________________________________
			 sub dir_search_special{
                             my($in_dir)=${$in[0]};
                             my(@ENV_dir, @probable_dir_list, @dirs,@possible_dirs, $final_dir);
                             if($in_dir =~ /\/([\w\.\-]+)$/){   $in_dir = $1; }
                             @probable_dir_list=('ALIGN', 'PDB', 'PATH', 'HOME', 'JPO', 'PIRDIR', 'PDBSST','PDBENT',
                                                                                                                                     'BLASTDB', 'PIRDIR', 'SWDIR');
                             for (@probable_dir_list){ @dirs=split(':', $ENV{$_});
                             for (@dirs){ if (/$in_dir$/){ $final_dir = $_; } } }
                             if(@possible_dirs <1){  # goes up one level and tries to find dir.
                                             my($pwd)=`pwd`;
                                             chomp($pwd);
                                             my(@temp)=split('/', $pwd);
                                             pop(@temp);
                                             my($up_pwd)=join('/', @temp);
                                             $in_dir="$up_pwd\/$in_dir";
                                             $final_dir=$in_dir if (-d $in_dir);
                             }
                             return(\$final_dir);
			 }#~~~~~~~ End of sub ~~~~~~~~~~~

		 @read_files = @{&read_file_names_only(\$in_dir, \@target_file_names)};
		 for($i=0; $i < @read_files; $i ++){
					@stat=stat($read_files[$i]);
					$size_sum+=$stat[7];
					if($stat[7] > 1000000){ $big_files{$stat[7]} = $read_files[$i]; }
					if( ($read_files[$i]=~/^[\W]+$/)||($read_files[$i] =~ / +/)){
							splice( @read_files, $i, 1 ); $i--  }
					if( ($read_files[$i]=~/\.\.+/)||($read_files[$i] =~ /\#+/)||($read_files[$i]=~/\,+/)){
							splice( @read_files, $i, 1 ); $i-- }
					if(-d "$in_dir\/$read_files[$i]" ){ push(@read_dirs, $read_files[$i]); next}
		 }
		 push(@final_files, @read_files);
		 return(\@final_files, \%big_files,  \$size_sum, \@read_dirs);
}

#________________________________________________________________________________
# Title     : produce_random_numbers
# Usage     : @rand_nums=@{&produce_random_numbers($how_many, $range)};
# Function  :
# Example   : @rand_nums=@{&produce_random_numbers($how_many, $range)};
#              while $how_many->10, $range->10
# Keywords  : generate_random_numbers, make_random_numbers, create_random_numbers
#             random_numbers, get_random_numbers
# Options   :
# Category  :
# Version   : 1.1
#--------------------------------------------------------------------------------
sub produce_random_numbers{
    my (%hash, $how_many, $i, $range);
    unless(@_ > 1){
                    print "\n# $0: produce_random_numbers needs 2 args. 1st for howmany, 2nd for range\n";
                    exit;
    }
    $how_many=${$_[0]} || $_[0];
    $range   =${$_[1]} || $_[1];
    for($i=0; $i<$how_many; $i++){
             $hash{$i}=int(rand($range))+1; ## if your range is 10, it will make numbers 1-10
    }
    return([values %hash]);
}

#________________________________________________________________________________
# Title     : read_seq_matrix_files
# Usage     : %matrix=%{&read_seq_matrix_files(\@file)};
# Function  : Makes similarrity matrix hash(reflexive, so it has AT as well as TA)
#             %matrix looks like this:  $matrix{X}{Y}= 4
# Example   :
# Keywords  : get_2D_aa_matrix, read_seq_matrix
# Options   :
#     $reflexive_combi=r by r -r
# Category  :
# Version   : 1.2
#--------------------------------------------------------------------------------
sub read_seq_matrix_files{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }



( run in 1.048 second using v1.01-cache-2.11-cpan-39bf76dae61 )