Bioinf
view release on metacpan or search on metacpan
if($evalue > $E_value ){
if($enquiry=~/^(\S+)_\d+\-\d+/){
$msp_0{"$1"} ="" unless $msp_0{"$1"};
next;
}else{
$msp_0{"$enquiry"} ="" unless $msp_0{"$enquiry"};;
next;
}
}
if($score < $score_thresh1){ next; }
if($enquiry=~/^(\S+)_\d+\-\d+/){
$msp_0{"$1"} .="$match_seq ";
}else{
$msp_0{"$enquiry"} .="$match_seq ";
$msp_00{join(' ', sort($enquiry, $match_seq))} = " $score $evalue";
}
}
}
%stat=%msp_0;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# filtering duplicates
#____________________________________________
@keys=keys %stat;
for($k=0; $k< @keys; $k++){
@split=split(/ +/,$stat{$keys[$k]});
@non_dup=@{&remove_dup_in_array(\@split)};
for($j=0; $j<@non_dup; $j++){
if($non_dup[$j]=~/^ *$/){
splice(@non_dup, $j, 1); $j--;
next;
}
if($non_dup[$j] eq $keys[$k]){
splice(@non_dup, $j, 1); $j--;
next;
}
}
$stat2{$keys[$k]}=join(' ', @non_dup);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Showing the actual matched sequences
# %stat has following contents
# d1ash__ d1bam__ d1mba__ d2lhb__
# d1baba_ d1flp__ d1hbg__ d1hlb__ d1mba__ d1mbd__ d2lhb__ d3aaha_ d3sdha_
# d1cpca_ d1cpcb_ d1gof_1 d2ts1_1
#______________________________________________________________________________
if($verbose=~/v/){
@keys= sort keys %stat2;
for($k=0; $k< @keys; $k++){
print "$keys[$k]: $stat2{$keys[$k]}\n";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Getting statistics
#_________________________________________
$evalue=$s;
$E_mult_factor1=1;
@output=@{&get_isearch_result_stat(\%stat2, \@pdbg_seqs, \$evalue,
\$base, \$E_mult_factor1, $leng_thresh, \%msp_00)};
%correct=%{$output[3]};
%final_stat_big_hash=(%final_stat_big_hash, %correct);
if($verbose){
@keys=sort keys %correct;
for($k=0; $k< @keys; $k++){
print "$keys[$k] $correct{$keys[$k]}\n";
}
}
}
return(\%final_stat_big_hash);
}
#________________________________________________________________________________
# Title : get_scop_correcting_pairs
# Usage : %correct=%{&get_scop_correcting_pairs()};
# Function :
# Example :
# Keywords : get_pdb_correcting_pairs , correct_pairs_in_scop, correct_homology_pairs
# Options :
# Category :
# Version : 1.4
#--------------------------------------------------------------------------------
sub get_scop_correcting_pairs{
my (%correcting_pairs, @correcting_pairs);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# %correcting_pairs is a correcting table for old pdb40d file classi
#_____________________________________________________________________
@correcting_pairs=( # should be pairs
'd2kauc1 d2kauc2', 'd1pkya1 d1pkya2',
'd1pvda2 d1trka1', 'd1pbe_1 d1pbe_2',
'd1poxa3 d1pvda2', 'd1efga1 d1efga2',
'd1dsba1 d1dsba2', 'd2gsta1 d2gsta2',
'd1bct__ d1brd__', 'd1qora1 d1qora2',
'd2ohxa1 d2ohxa2', 'd1efga2 d1eft_1',
'd1tada1 d1tada2', 'd1gsea1 d1gsea2',
'd1gesa2 d2tmda3', 'd1lvl_2 d2tmda3',
'd2tmda3 d2tpra2', 'd1tde_1 d2tmda3',
'd1nhp_2 d2tmda3', 'd1gesa1 d2tmda3',
'd1lvl_1 d2tmda3', 'd2tmda3 d2tpra1',
'd1fcda1 d2tmda3', 'd1nhp_1 d2tmda3',
'd1tde_2 d2tmda3', 'd1pbe_1 d2tmda3',
'd1ebha1 d1ebha2', 'd1gesa2 d2dlda2', ## 3.4.1 with
'd1gesa2 d1psda2', 'd1nhp_2 d2dlda2',
'd1ldm_1 d1tde_2', 'd1coy_1 d1ldb_1',
'd1lvl_2 d1psda2', 'd1psda2 d1tde_2',
'd1hyha1 d1tde_2', 'd1fcda1 d1ldm_1',
'd1hdca_ d1nhp_2', 'd1fcda1 d1hlpa1',
'd1llda1 d1lvl_2', 'd2dlda2 d2tpra2',
'd1ldm_1 d1nhp_2', 'd1llda1 d1pbe_1',
'd1gdha2 d2tpra1', 'd1ldb_1 d1nhp_2',
'd1gesa2 d1scua2', 'd1fcda1 d1hyha1',
'd1gesa1 d1hlpa1', 'd1gdha2 d1gesa2',
'd1lvl_2 d2dlda2', 'd1gesa1 d2dlda2',
'd1nhp_2 d2ohxa2', 'd1tde_2 d2dlda2', # 3.4.1. with 3.18.1, 3.17.1.
'd1nhp_1 d2cmd_1', 'd1fcda1 d1ldb_1',
'd1lvl_1 d2ohxa2', 'd1nhp_2 d2naca2',
'd1pbe_1 d2ohxa2', 'd1gdha2 d1nhp_2',
'd2cmd_1 d2tpra1', 'd1tde_1 d2cmd_1',
'd1llda1 d1nhp_2', 'd1hlpa1 d1nhp_2',
'd1nhp_1 d2dlda2', 'd1hyha1 d1nhp_2',
'd1nhp_2 d1psda2', 'd1fcda1 d2cmd_1',
'd1fcda1 d1llda1', 'd1lvl_2 d1udpa_',
'd1psda2 d2tpra2', 'd1hdca_ d1lvl_2',
'd1gesa2 d1llda1', 'd1nhp_2 d1qora2',
'd1ldm_1 d2tpra1', 'd1coy_1 d2dlda2',
'd2dlda2 d2tpra1', 'd1hdca_ d1pbe_1',
'd1coy_1 d1gdha2', 'd1nhp_2 d2cmd_1',
'd1llda1 d1tde_1', 'd1llda1 d1lvl_1',
'd1bdma1 d2tpra1', 'd1gd1o1 d2tpra2',
'd1ldb_1 d1lvl_1', 'd1hlpa1 d1tde_2',
'd1coy_1 d1psda2', 'd1nhp_2 d1udpa_',
'd1llda1 d1tde_2', 'd1tde_2 d2cmd_1',
'd1llda1 d2tpra2', 'd1ldb_1 d1tde_1',
'd1coy_1 d1hlpa1', 'd1coy_1 d2cmd_1',
'd1bdma1 d1gesa2', 'd1hyha1 d2tpra2',
'd1gesa2 d1hyha1', 'd1gesa2 d2ohxa2',
'd1ldb_1 d1tde_2', 'd1hlpa1 d1pbe_1',
'd1ldm_1 d2tpra2', 'd2ohxa2 d2tpra1',
'd1ldb_1 d2tpra2', 'd1gesa2 d1ldm_1',
'd1lvl_2 d1qora2', 'd1gesa1 d2naca2',
'd1coy_1 d1llda1', 'd1coy_1 d1hyha1',
'd1coy_1 d1ldm_1', 'd1ldm_1 d1lvl_2',
'd1eny__ d1nhp_2', 'd1pbe_1 d2pgd_2',
'd1ldb_1 d1pbe_1', 'd1ldb_1 d1lvl_2',
'd1gesa2 d1hlpa1', 'd1dhr__ d1nhp_2',
'd1hdca_ d1tde_1', 'd1gesa1 d1psda2',
'd1pbe_1 d2cmd_1', 'd1tde_2 d1udpa_',
'd1pbe_1 d2dlda2', 'd1hdca_ d1tde_2',
'd1gesa2 d1ldb_1', 'd1psda2 d2tpra1',
'd1gdha2 d1lvl_2', 'd1tde_1 d2dlda2',
'd1ldm_1 d1pbe_1', 'd1pbe_1 d1scua2',
'd1gesa1 d2ohxa2', 'd1lvl_2 d2naca2',
'd1gd1o1 d1lvl_1', 'd1fvl__ d1kst__',
'd1kst__ d2ech__', 'd1hsaa2 d1std__', ## d1hsaa.. is NOT homol, but to fix a problem in E_100_e_0.0005_j30_segged_2092
'd1afp__ d1hfi__'
);
for($i=0; $i< @correcting_pairs; $i++){
$correcting_pairs{$correcting_pairs[$i]}=$correcting_pairs[$i];
}
return(\%correcting_pairs);
}
#__________________________________________________________________
# Title : get_isearch_result_stat
# Usage : &get_self_isearch_stat(\%stat2, \@pdbg_seqs, \$evalue);
# Function :
# Example : Following input (hash eg: %stat2, input with the first word as key)
# will become columnar output.
#
# d1ash__ d1bam__ d1mba__ d2lhb__
# d1baba_ d1flp__ d1hbg__ d1hlb__ d1mba__ d1mbd__ d2lhb__ d3aaha_ d3sdha_
# d1cpca_ d1cpcb_ d1gof_1 d2ts1_1
#
# Will become:
# ....
# d1ash__ d2lhb__ Homolog: G1 98 0.012
# d1baba_ d1flp__ Homolog: G1 82 0.072
# d1baba_ d1hbg__ Homolog: G1 79 0.13
# d1baba_ d2lhb__ Homolog: G1 228 8e-12
# d1baba_ d3aaha_ Nomolog: G1 74 2
# d1baba_ d3sdha_ Homolog: G1 92 0.012
# d1cola_ d1hbg__ Nomolog: G1 79 0.59
# d1cpca_ d1cpcb_ Homolog: G1 176 4.9e-08
# ....
#
# Keywords : get_stat_interm_search, get_intermediate_search_stat
# Options : _ for debugging.
# # for debugging.
# Package : Bio
# Reference : http://sonja.acad.cai.cam.ac.uk/perl_for_bio.html
# Returns : [$av_correct, $num_enq_seq]
# Tips :
# Argument :
# Todo :
# Author : A Scientist
# Category :
# Version : 2.2
#-----------------------------------------------------------------------------
sub get_isearch_result_stat{
my (@keys, $num_enq_seq, @pdbg_seqs_ori, $c, $d, $i, %correct_pairs,
$sum_correct, $sum_false, $match_seq, $percent_correct, $correct, @correct,
$av_correct, $av_false, $actual_e_value, $correct_matched,
%correcting_pairs, @correcting_pairs, %correct);
my %seqs=%{$_[0]};
my @pdbg_seqs=@{$_[1]};
my $evalue=${$_[2]};
my $pdbg_base=${$_[3]} || $ARGV[3];
my $E_mult_factor1=${$_[4]};
my $E_mult_factor2=${$_[4]};
if(ref($_[5])){ $leng_thresh =${$_[5]} }else{ $leng_thresh=$_[5]; }
my %msp_0=%{$_[6]};
my %msp_00=%{$_[7]};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# %correcting_pairs is a correcting table for old pdb40d file classi
#_____________________________________________________________________
@correcting_pairs=( # should be pairs
'd2kauc1 d2kauc2', 'd1pkya1 d1pkya2',
'd1pvda2 d1trka1', 'd1pbe_1 d1pbe_2',
'd1poxa3 d1pvda2', 'd1efga1 d1efga2',
'd1dsba1 d1dsba2', 'd2gsta1 d2gsta2',
'd1bct__ d1brd__', 'd1qora1 d1qora2',
'd2ohxa1 d2ohxa2', 'd1efga2 d1eft_1',
'd1tada1 d1tada2', 'd1gsea1 d1gsea2',
}else{
print "\n# You seemed made a mistake, O.K., I will kill myself!\n\n";
print chr(7); exit;
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (0) If blast is chosen run Blast
#_________________________________________________________
if($algorithm=~/[psi\-]*[pb][last]*/i){
print "\n# (i) Doing PSI search with @file\n";
@final_out=@{&do_psi_blast_search(\@file, "d=$source_DB_file",
"i=$input_seq_file", $over_write,
$make_msp_in_sub_dir_opt, $Lean_output)};
return(\@final_out); #<<<<<<----------- F I N I S H
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Controlling which kind of search it should do. Do save_in_gz_in_sub_dir first if d is set
#______________________________________________________________________________________________
if( $make_msp_in_sub_dir_opt ){ ## convert sso to msp and put in sub dir like /D/, /S/
for($x=0; $x < @seq_names; $x++){
my ($over_write_sso_by_age, $over_write_msp_by_age, %single_seq,
$out_file_sso_gz_name, $out_file_msp_name, $out_file_gz_name, $existing_sso);
my ($seq_name, $seq)= ($seq_names[$x], $seq_input{$seq_names[$x]});
my $first_char= substr("\U$seq_name", 0, $sub_dir_size);
mkdir ("$first_char", 0777) unless -d $first_char;
chdir("$first_char");
print "\n# (i) do_sequence_search: You set \'d\' or \'D\' opt\n";
print "# (i) making subDIRs ($first_char) with $seq_name $sequence_DB to store MSP files\n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Let's make each fasta file for each seq to be used in searching
#_____________________________________________________________________
my $temp_file_name="$seq_name.fa";
%single_seq=($seq_name, $seq_input{$seq_name});
&write_fasta(\%single_seq, $temp_file_name ); ## e for writing each file
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Making output file name according to the option given
#_______________________________________________________________________
if($machine_readable and $algorithm=~/[fastassearch]+/){
$out_file_sso_name="$seq_name\.msso";
}else{ $out_file_sso_name="$seq_name\.sso"; }
$out_file_sso_gz_name="$out_file_sso_name\.gz";
$out_file_msp_name="$seq_name\.msp";
$out_file_gz_name="$seq_name\.msp\.gz";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Check if SSO file already there
#_______________________________________________________________________
if(-s $out_sso_file){ $existing_sso=$out_file_sso_name }
elsif(-s $out_sso_gz_name){ $existing_sso=$out_file_sso_gz_name }
if(-s $out_msp_name){ $existing_msp=$out_file_msp_name }
elsif(-s $out_gz_name){ $existing_msp=$out_file_gz_name }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If the dates of files created are long ago, overwrite to refresh
#____________________________________________________________________
if( (localtime(time- (stat($existing_sso))[9]))[3] > $age_in_days_of_out_file ){
$over_write_sso_by_age='o';
}
if( (localtime(time- (stat($existing_msp))[9]))[3] > $age_in_days_of_out_file ){
$over_write_msp_by_age='o';
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To check if the target seq DB is in ../
#________________________________________________
if(-s $sequence_DB){
print "\n# (i) Good, target \$sequence_DB $sequence_DB is in this working dir\n";
}elsif( -s "../$sequence_DB"){ $sequence_DB="../$sequence_DB"; }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) Searching: Making MSP files directly, MSP file format is the major format used in geanfammer!
#_________________________________________________________________________________________
if($char_opt =~/D/){ #### To make MSP file
if( !(-s $out_file_gz_name or -s $out_file_msp_name) or $over_write or $over_write_msp_by_age){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2.1) Running run_fasta_sequence_search !!
#_______________________________________________________
print "\n# (i) Running run_fasta_sequence_search !!\n";
$gzipped_msp_file=${&run_fasta_sequence_search(
"a=$algorithm",
"O=$out_file_msp_name",
"File=$temp_file_name", "e=$E_val",
"DB=$sequence_DB", "k=$k_tuple", "$machine_readable")};
$gzipped_sso_file=${&compress_files_by_gzip($out_file_sso_name)};
}else{
print "\n# Line No. ", __LINE__,", $out_file_gz_name already exists or
\$over_write is set or NOT older than $age_in_days_of_out_file\n";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To make gzipped SSO files and MSP files
#_______________________________________________
elsif($create_sso or $char_opt=~/m/){ ### To make gzipped SSO files
if( !(-s $out_file_sso_name or -s $out_file_sso_gz_name ) or $over_write or $over_write_sso_by_age){
print "\n# (i) Running run_fasta_sequence_search with \"\$create_sso option\"!!\n\n";
$gzipped_msp_file=${&run_fasta_sequence_search(
"a=$algorithm",
"O=$out_file_msp_name", "$create_sso",
"File=$temp_file_name", "e=$E_val",
"DB=$sequence_DB", "k=$k_tuple", "$machine_readable")};
$gzipped_sso_file=${&compress_files_by_gzip($out_file_sso_name)};
}else{
print "\n# Line No. ", __LINE__,", $out_file_gz_name already exists or
\$over_write is set or NOT older than $age_in_days_of_out_file\n";
}
}else{
if( !(-s $out_file_sso_name or -s $out_file_sso_gz_name ) or $over_write or $over_write_sso_by_age){
system(" $algorithm -m 10 -H -E $E_val $temp_file_name $sequence_DB $k_tuple > $out_file_sso_name");
system("gzip $out_file_sso_name");
}else{
print "\n# Line No. ", __LINE__,", $out_file_gz_name already exists or
\$over_write is set or NOT older than $age_in_days_of_out_file\n";
}
}
unlink("$seq_name.fa") if -s "$seq_name.fa";
unlink("$first_char/$seq_name\.fa") if -s "$first_char/$seq_name\.fa";
print "\n# Sub dir $first_char and $seq_name\.msp has been made, finishing do_sequence_search\n";
chdir ('..');
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# F I N I S H
#________________________________________
goto EXIT;
} # if ($char_opt =~/[dD]/){ is finished
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) Writing on PWD. This is the big single MSP output
#____________________________________________________________
$Single_msp_out_file="$base_name\.msp" if($single_big_msp eq 's');
if(-s $Single_msp_out_file and !$over_write ){
print "\n# (i) $Single_msp_out_file exists, and no \$over_write is set, skipping \n";
push(@final_out, $Single_msp_out_file);
}else{ $over_write ='o'; }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Check if it is necessary to write each sequences.fa files
#______________________________________________________
if( $over_write ){ &write_fasta_seq_by_seq(\%seq_input, 'e'); } ## e for writing each seq file
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# When, you didn't use "DB=$XXX" and "File=$FXXX" format, first file input is DB etc
#_______________________________________________________________________________________
$defined_all_ok=&check_if_defined($input_file_name, $sequence_DB);
if(!$defined_all_ok){ print "\n# (E) FATAL: do_sequence_search: You did not use \"DB=\$XXX\" format\n"; exit };
print "\n# Finished writing the enquiry fasta files from \%seq_input by write_fasta";
print "\n# I am in do_sequence_search sub, Target database used : $sequence_DB with seqs of \'@seq_names\'\n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main search with given @seq_names
#______________________________________________________________
for($j=0; $j< @seq_names; $j++){ # @seq_names has sequence names coming from (@seq_names) = keys %seq_input;
my ($over_write_sso_by_age, @temp, $existing_sso, $out_gz_name,
$over_write_msp_by_age, $existing_msp, $out_msp_file, $seq_name);
$seq_name=$seq_names[$j];
$each_seq_fasta="$seq_name\.fa";
$out_msp_file="$seq_name\.msp";
$out_gz_name="$seq_name\.msp\.gz";
$out_msso_file="$seq_name\.msso";
&die_if_file_not_present($each_seq_fasta);
print "\n# (i) :-) Found $each_seq_fasta is searched against $sequence_DB\n";
if($algorithm=~/fasta/){ $out_sso_file="$seq_name\.fsso";
}elsif($algorithm=~/ssearch/){ $out_sso_file="$seq_name\.ssso"; }
$out_sso_gz_name="$out_sso_name\.gz";
if(-s $out_sso_file){ $existing_sso=$out_sso_file }
elsif(-s $out_sso_gz_name){ $existing_sso=$out_sso_gz_name }
if(-s $out_msp_file){ $existing_msp=$out_msp_file }
elsif(-s $out_gz_name){ $existing_msp=$out_gz_name }
if( (localtime(time- (stat($existing_sso))[9]))[3] > $age_in_days_of_out_file ){
$over_write_sso_by_age='o';
}
if( (localtime(time- (stat($existing_msp))[9]))[3] > $age_in_days_of_out_file ){
$over_write_msp_by_age='o';
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To check if the target seq DB is in ../
#________________________________________________
if(-s $sequence_DB){ print "\n# (i) \$sequence_DB $sequence_DB exists, Good\n";
}elsif( -s "../$sequence_DB"){ $sequence_DB="../$sequence_DB";
}elsif( -s "../../$sequence_DB"){ $sequence_DB="../../$sequence_DB"; }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If MSP file already exist
#_____________________________________________________________
if( -s $out_msp_file and !$over_write_msp_by_age and !$over_write ){
print "\n# (i) File: $out_msp_file exists, skipping, to overwrite use \'o\' opt or set days";
push(@final_out, $out_msp_file);
}else{ ## -E is for e value cutoff. -b is for num of seq fetched
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~~~~~~~~~~~~~~
# K-tuple is 1 by default. If xxxx.sso exsts, skip running fasta or ssearch
#________________________________________________________________________________
if(-s $out_sso_file and !$over_write ){ ## If SSO is already present, JUST READ IT!
print "\n# (i) Just opening existing $out_sso_file $out_sso_file $out_msp_file $over_write_msp_by_age $over_write\n";
open(SSO_ALREADY, "$out_sso_file");
@temp=<SSO_ALREADY>;
print "\n# (i) \@temp has ", scalar(@temp), " lines\n";
close(SSO_ALREADY);
&compress_files_by_gzip($out_sso_file);
}else{ ## Run FASTA HERE
print "\n# (i) Running \"run_fasta_sequence_search\" ";
$gzipped_msp_file=${&run_fasta_sequence_search(
"a=$algorithm",
"O=$out_msp_file", "$create_sso",
"File=$each_seq_fasta", "e=$E_val",
"DB=$sequence_DB", "k=$k_tuple", "$machine_readable")};
push(@final_out, $gzipped_msp_file) if -s $gzipped_msp_file ;
unlink($each_seq_fasta) if $Lean_output;
}
}
if($machine_readable and $create_sso and -s $out_sso_file){ &cp($out_sso_file, $out_msso_file); }
} # end of for($j=0; $j< @seq_names; $j++){
return(\@final_out);
EXIT:
} # do_sequence_search
#__________________________________________________________________________
# Title : do_hmm_sequence_search
# Usage : &do_hmm_sequence_search(\@file, "method=$default_search_method",
# $over_write, "DB=$pdbd40_seq_fasta");
#
# Function : does hmm sequence search using Sean Eddy's HMMER (hmmls, hmmfs)
# Example :
# Keywords : do_seq_search_with_hmm, do_hmmt_sequence_search
# Options :
# "method=ls" for turning hmmls search option on (default)
# "method=fs" for turning hmmfs search option on
# method= by method=
# o for overwriting existint xxxxx.hmm files
return(\$final_time);
}
#________________________________________________________________________
# Title : get_date
# Usage : @outformat = &get_date; eg result > (010595 1-May-1995)
# Function : returns date: $date6d (6 digit format) and
# $datec (dd-mmm-yyyy format), Tim's version is 'getdate' in th_lib.pl
# Example : 30-Nov-1995
# Keywords : get_present_date,
# Options :
# Returns : ref of an array for (1-May-1995 and 010595)
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub get_date{
my($date_alphabet, $date6d);
my(@time) = localtime(time);
my($ty,$tm,$td) = ($time[5],$time[4],$time[3]);
my($year) = '19' . $ty;
my($mon) = (Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[$tm];
my($day) = $td;
if($day < 10){
$day = ' ' . $day;
}
$date_alphabet = $day.'-'.$mon.'-'.$year;
$tm++;
if($tm < 10){
$tm = '0'.$tm;
}
if($td < 10){
$td = '0'.$td;
}
$date6d = $td.$tm.$ty;
return ([$date_alphabet, $date6d]);
}
#__________________________________________________________________________
# Title : if_file_older_than_x_days
# Usage : if( ${&if_file_older_than_x_days($ARGV[0], $days)} > 0){
# Function : checks the date of last modi of file given and compares with
# present time. Substracts diff and returns the actual diff days.
# Example :
# Keywords : how_old_file, how_old, is_file_older_than_x_days, file_age,
# file_age_in_days, check_file_age
# Options :
# Returns : the actual days older, so NON-ZERO, otherwise, 0
# Argument :
# Version : 1.2
#----------------------------------------------------------------------------
sub if_file_older_than_x_days{
my($how_old_days);
my $days=1; # default
if(@_ < 2){ print "\n# if_file_older_than_x_days needs 2 args\n"; exit; }
my $file=${$_[0]} || $_[0];
$days=${$_[1]} || $_[1];
unless(-s $file){ print "\n# if_file_older_than_x_days: $file does NOT exist\n"; exit; }
if(lstat($file)){ # to handle Symbolyc link
print "\n# (i) if_file_older_than_x_days: running lstat\n";
$how_old_days=(localtime(time- (lstat($file))[9]))[3]; ## should be lstat not stat
}else{
print "\n# (i) if_file_older_than_x_days: running stat\n";
$how_old_days=(localtime(time- (stat($file))[9]))[3]; ## should be lstat not stat
}
if($how_old_days > $days and $how_old_days < 10000){
print "\n# if_file_older_than_x_days: $file is older than $days\n";
return(\$days);
}else{
print "\n# if_file_older_than_x_days: $file is NOT older than $days\n";
return(0);
}
}
#________________________________________________________________________
# Title : array_chk
# Usage : &array_chk(\@any_array_to_chk);
# Function : checks if any inputting array is empty or with one element.
# Example : This is used only with subs which accepts array inputs.
# Warning :
# Keywords : array_check
# Options :
# Returns : nothing, prints out messages to STDOUT
# Argument : gets on ref. of array.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub array_chk{ my(@input)=@{$_[0]};
if (@input == 0){
&caller_info;
print "\n >>> $0 \n";
print "\n >>> Error: Input array to this subroutine was empty\n", chr(7);
print "\n To continue prog. type \'y\', or \'n\' to quit (with enter).\n ---->";
$key = ${&yes_or_no};
if($key ne 'y'){ print "\n !! Aborting the operation !! \n"; exit(0); }}
elsif ($#input == 0){
print "\n >>> Warn: Input array to this subroutine was only one, O.K ?\n";
print "\n >>> It means your input was not an array at all, probable error\n";
&caller_info;
#________________________________________________________
# Title : caller_info
# Function : tells you calleing programs and sub's information with file, subname, main, etc
# Usage : &caller_info; (just embed anywhere you want to check.
#----------------------------------------------------------------------
sub caller_info{ # caller(1), the num. tells you which info you choose
my($i)=1;
while(($pack, $file, $line, $subname, $args) = caller($i++)){
my($level) = $i-1;
print "\n", chr(169)," This sub info was made by \&caller_info subroutine";
print "\n ", chr(164)," Package from => $pack ";
print "\n ", chr(164)," Exe. file was => $file ";
print "\n ", chr(164)," Line was at? => $line (in $file)";
print "\n ", chr(164)," Name of sub? => $subname";
print "\n ", chr(164)," How many arg? => $args";
print "\n ", chr(164)," Level of sub? => $level (1 is for where \&caller_info is )\n\n";
}
}
#________________________________________________________
#________________________________________________________
# Title : yes_or_no
# returns : ref. of a Scalar for 'y' or 'n'
# Usage : $yes_or_no = ${&yes_or_no};
"u=$upper_expect_limit",
"l=$lower_expect_limit",
"k=$k_tuple",
$add_range,
$create_sso,
"t=$Score_thresh",
"m=$margin",
"a=$algorithm",
$msp_directly_opt,
$machine_readable,
$new_format )};
print "\n# File created: @file_created \n";
}
if($do_in_batch or $scramble_order and @file > 0){
for($i=0; $i< @file; $i++){
%fasta_seqs=%{&open_fasta_files(\$file[$i])};
if($char_opt=~/r/){ ## reverse the query seqs.
%fasta_seqs=%{&reverse_sequences(\%fasta_seqs)};
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main sequence search
#___________________________________________________
my @file_created=@{&do_sequence_search(\%fasta_seqs,
"FILE_AGE=$age_in_days_of_out_file",
"DB=$input_db_file",
"File=$file[$i]",
$single_big_msp,
$over_write,
"u=$upper_expect_limit",
"l=$lower_expect_limit",
"k=$k_tuple",
$add_range,
$create_sso,
"t=$Score_thresh",
"m=$margin",
"a=$algorithm",
$msp_directly_opt,
$machine_readable,
$new_format )};
print "\n# File created: @file_created \n";
}
}elsif(@file > 0){ ## reads in the big database file continously
my $make_gz_in_sub_dir_opt='d';
my ($ori_seq_name, $first_char, $seq_file_msp_name,
$seq, $seq_name, $first_char);
for($i=0; $i< @file; $i++){
open(FASTA, "$file[$i]");
while(<FASTA>){
if( /\> *((\w)\S+)/ ){
$ori_seq_name=$1;
if($seq=~/\S/ and $seq_name=~/\S/){
my ($overwrite_by_age, $existing_msp_file);
$seq_file_name="$seq_name\.fa";
$seq_file_msp_name="$seq_name\.msp";
$seq_file_msp_gz_name="$seq_name\.msp\.gz";
$first_char= substr("\U$seq_name", 0, 1);
if(-s "$first_char\/$seq_file_msp_name"){ $existing_msp_file="$first_char\/$seq_file_msp_name"; }
elsif(-s "$first_char\/$seq_file_msp_gz_name"){ $existing_msp_file="$first_char\/$seq_file_msp_gz_name"; }
if( (localtime(time- (stat($existing_msp_file))[9]))[3] > $age_in_days_of_out_file ) {
$overwrite_by_age='o';
print "\n# interm_lib_search: $seq_file_msp_name is older than $age_in_days_of_out_file days, ovrwrting\n";
}
if( !$over_write and (-s "$first_char\/$seq_file_msp_name" or -s "$first_char\/$seq_file_msp_gz_name")
and !$overwrite_by_age ){
print "\n# interm_lib_search: $first_char\/$seq_file_msp_name already exists or newer than $age_in_days_of_out_file \n";
$seq='';
}else{
&do_sequence_search({"$seq_name", "$seq"},
"DB=$input_db_file" ,
"FILE_AGE=$age_in_days_of_out_file",
"File=$seq_file_name",
$single_msp, $over_write,
"u=$upper_expect_limit",
"$make_gz_in_sub_dir_opt",
"l=$lower_expect_limit",
"k=$k_tuple",
$make_msp_in_sub_dir_opt );
$seq='';
}
}
$seq_name=$ori_seq_name;
$first_char="\U$2"; # for storing output
}elsif(eof){
$seq.=$_;
if($seq=~/\S/ and $seq_name=~/\S/){
my ($overwrite_by_age, $existing_msp_file);
$seq_file_name="$seq_name\.fa";
$seq_file_msp_name="$seq_name\.msp";
$seq_file_msp_gz_name="$seq_name\.msp\.gz";
$first_char= substr("\U$seq_name", 0, 1);
if(-s "$first_char\/$seq_file_msp_name"){ $existing_msp_file="$first_char\/$seq_file_msp_name"; }
elsif(-s "$first_char\/$seq_file_msp_gz_name"){ $existing_msp_file="$first_char\/$seq_file_msp_gz_name"; }
if( (localtime(time- (stat($existing_msp_file))[9]))[3] > $age_in_days_of_out_file ) {
$overwrite_by_age='o';
print "\n# interm_lib_search : $seq_file_msp_name is older than $age_in_days_of_out_file days, ovrwrting\n";
}
if( !$over_write and (-s "$first_char\/$seq_file_msp_name" or -s "$first_char\/$seq_file_msp_gz_name")
and !$overwrite_by_age ){
print "\n# $first_char\/$seq_file_msp_name already exists. (interm_lib_search) \n";
}else{
&do_sequence_search({"$seq_name", "$seq"},
"DB=$input_db_file" ,
"FILE_AGE=$age_in_days_of_out_file",
"File=$seq_file_name",
$single_msp, $over_write,
"u=$upper_expect_limit",
"$make_gz_in_sub_dir_opt",
"l=$lower_expect_limit",
"k=$k_tuple",
$make_msp_in_sub_dir_opt );
$seq='';
}
}
}elsif(/^(\w+)$/){
$seq.=$1;
}
}
close FASTA;
}
}
}
#______________________________________________________________________
# Title : geanfammer
# Usage : &geanfammer(\@your_genome_or_db_to_analyse_file,
# $verbose);
#
# Function : Creates a domain level clustering file from a given
# FASTA format sequence DB. It has been used for complete
# genome sequence analysis. Can use PSI-blat, fasta, ssearch
#
# ------------ USAGE INFORMATION -------------------
# The parameters you put are important for the meaningful
# protein family maker.
# The most important one is the E and e options (Mostly,
# they can have same value).
# Large E is for setting the threshold for the single
# linkage clustering.
# This means, any sequence hit BELOW the threshold
# (which is good ) will be linked.
# For example, if Seq1 matched with Seq2 with E value
# of FASTA search:
# 0.001, and you set the threshold 0.1, then YOU
# ordered the geanfammer to regard them a family.
#
# The second small e option is for the dividing a complex
# and wrong cluster into correct more correct
# duplication modules. This is necessary as a
# lot of multidomain proteins can be clustered together
# WRONGLY by single linkage.
#____________________________________________________________
# Title : read_any_dir_for_dir
# Function : read any dir and REMOVES the '.' and '..' entries. And
# then put in array.
# Usage : @file_list = @{&read_any_dir(\$absolute_path_dir_name, ....)};
# Argument : takes one or more scaler references.
# Returns : one ref. of array.
# Keywords : read_any_dir_for_dir_command
# Warning : This does not report '.', '..', '#xxxx', ',xxxx', etc. only legitimate
# Warning : file and dir names are reported.
# Version : 1.2
#----------------------------------------------------------------------
sub read_any_dir_for_dir{
my ($i,%big_files, $in_dir, $size_sum, @final_files, @in,
@target_file_names, $in_dir, $k, @possible_dirs, @read_files, @stat);
@in=@_;
for($k=0; $k < @in; $k++){
if ( ($in[$k] eq '.') || !(defined($in[$k]))){ $in_dir='.'; splice(@in, $k, 1); $k-- }
elsif(!(ref($in[$k]))){ $in_dir=$in[$k]; }
elsif(ref($in[$k]) eq 'SCALAR'){
$in_dir =${$in[$k]};
splice(@in, $k, 1); $k--;next; }
elsif(ref($in[$k]) eq 'ARRAY'){
@target_file_names= @{$in[$k]}; splice(@in, $k, 1); $k--; }
if($in_dir!~/^\.$/ and $in_dir =~ /^([\w\-\.]+)$/){ $in_dir="\.\/$in_dir"; # if it is a short dir name
unless(-d $in_dir){ $in_dir=${&dir_search_special(\$in_dir, \@target_file_names)} }
}
print "\n# $0: read_any_dir_for_dir: \$in_dir is $in_dir\n";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# This is to search the input dir names in your whole dirs
#____________________________________________________________
sub dir_search_special{
my($in_dir)=${$in[0]};
my(@ENV_dir, @probable_dir_list, @dirs,@possible_dirs, $final_dir);
if($in_dir =~ /\/([\w\.\-]+)$/){ $in_dir = $1; }
@probable_dir_list=('ALIGN', 'PDB', 'PATH', 'HOME', 'JPO', 'PIRDIR', 'PDBSST','PDBENT',
'BLASTDB', 'PIRDIR', 'SWDIR');
for (@probable_dir_list){ @dirs=split(':', $ENV{$_});
for (@dirs){ if (/$in_dir$/){ $final_dir = $_; } } }
if(@possible_dirs <1){ # goes up one level and tries to find dir.
my($pwd)=`pwd`;
chomp($pwd);
my(@temp)=split('/', $pwd);
pop(@temp);
my($up_pwd)=join('/', @temp);
$in_dir="$up_pwd\/$in_dir";
$final_dir=$in_dir if (-d $in_dir);
}
return(\$final_dir);
}#~~~~~~~ End of sub ~~~~~~~~~~~
@read_files = @{&read_file_names_only(\$in_dir, \@target_file_names)};
for($i=0; $i < @read_files; $i ++){
@stat=stat($read_files[$i]);
$size_sum+=$stat[7];
if($stat[7] > 1000000){ $big_files{$stat[7]} = $read_files[$i]; }
if( ($read_files[$i]=~/^[\W]+$/)||($read_files[$i] =~ / +/)){
splice( @read_files, $i, 1 ); $i-- }
if( ($read_files[$i]=~/\.\.+/)||($read_files[$i] =~ /\#+/)||($read_files[$i]=~/\,+/)){
splice( @read_files, $i, 1 ); $i-- }
if(-d "$in_dir\/$read_files[$i]" ){ push(@read_dirs, $read_files[$i]); next}
}
push(@final_files, @read_files);
return(\@final_files, \%big_files, \$size_sum, \@read_dirs);
}
#________________________________________________________________________________
# Title : produce_random_numbers
# Usage : @rand_nums=@{&produce_random_numbers($how_many, $range)};
# Function :
# Example : @rand_nums=@{&produce_random_numbers($how_many, $range)};
# while $how_many->10, $range->10
# Keywords : generate_random_numbers, make_random_numbers, create_random_numbers
# random_numbers, get_random_numbers
# Options :
# Category :
# Version : 1.1
#--------------------------------------------------------------------------------
sub produce_random_numbers{
my (%hash, $how_many, $i, $range);
unless(@_ > 1){
print "\n# $0: produce_random_numbers needs 2 args. 1st for howmany, 2nd for range\n";
exit;
}
$how_many=${$_[0]} || $_[0];
$range =${$_[1]} || $_[1];
for($i=0; $i<$how_many; $i++){
$hash{$i}=int(rand($range))+1; ## if your range is 10, it will make numbers 1-10
}
return([values %hash]);
}
#________________________________________________________________________________
# Title : read_seq_matrix_files
# Usage : %matrix=%{&read_seq_matrix_files(\@file)};
# Function : Makes similarrity matrix hash(reflexive, so it has AT as well as TA)
# %matrix looks like this: $matrix{X}{Y}= 4
# Example :
# Keywords : get_2D_aa_matrix, read_seq_matrix
# Options :
# $reflexive_combi=r by r -r
# Category :
# Version : 1.2
#--------------------------------------------------------------------------------
sub read_seq_matrix_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
( run in 1.048 second using v1.01-cache-2.11-cpan-39bf76dae61 )