Bioinf
view release on metacpan or search on metacpan
'd1eny__ d1nhp_2', 'd1pbe_1 d2pgd_2',
'd1ldb_1 d1pbe_1', 'd1ldb_1 d1lvl_2',
'd1gesa2 d1hlpa1', 'd1dhr__ d1nhp_2',
'd1hdca_ d1tde_1', 'd1gesa1 d1psda2',
'd1pbe_1 d2cmd_1', 'd1tde_2 d1udpa_',
'd1pbe_1 d2dlda2', 'd1hdca_ d1tde_2',
'd1gesa2 d1ldb_1', 'd1psda2 d2tpra1',
'd1gdha2 d1lvl_2', 'd1tde_1 d2dlda2',
'd1ldm_1 d1pbe_1', 'd1pbe_1 d1scua2',
'd1gesa1 d2ohxa2', 'd1lvl_2 d2naca2',
'd1gd1o1 d1lvl_1', 'd1fvl__ d1kst__',
'd1kst__ d2ech__', 'd1hsaa2 d1std__', ## d1hsaa.. is NOT homol, but to fix a problem in E_100_e_0.0005_j30_segged_2092
'd1afp__ d1hfi__'
);
for($i=0; $i< @correcting_pairs; $i++){
$correcting_pairs{$correcting_pairs[$i]}=$correcting_pairs[$i];
}
return(\%correcting_pairs);
}
#__________________________________________________________________
# Title : get_isearch_result_stat
# Usage : &get_self_isearch_stat(\%stat2, \@pdbg_seqs, \$evalue);
# Function :
# Example : Following input (hash eg: %stat2, input with the first word as key)
# will become columnar output.
#
# d1ash__ d1bam__ d1mba__ d2lhb__
# d1baba_ d1flp__ d1hbg__ d1hlb__ d1mba__ d1mbd__ d2lhb__ d3aaha_ d3sdha_
# d1cpca_ d1cpcb_ d1gof_1 d2ts1_1
#
# Will become:
# ....
# d1ash__ d2lhb__ Homolog: G1 98 0.012
# d1baba_ d1flp__ Homolog: G1 82 0.072
# d1baba_ d1hbg__ Homolog: G1 79 0.13
# d1baba_ d2lhb__ Homolog: G1 228 8e-12
# d1baba_ d3aaha_ Nomolog: G1 74 2
# d1baba_ d3sdha_ Homolog: G1 92 0.012
# d1cola_ d1hbg__ Nomolog: G1 79 0.59
# d1cpca_ d1cpcb_ Homolog: G1 176 4.9e-08
# ....
#
# Keywords : get_stat_interm_search, get_intermediate_search_stat
# Options : _ for debugging.
# # for debugging.
# Package : Bio
# Reference : http://sonja.acad.cai.cam.ac.uk/perl_for_bio.html
# Returns : [$av_correct, $num_enq_seq]
# Tips :
# Argument :
# Todo :
# Author : A Scientist
# Category :
# Version : 2.2
#-----------------------------------------------------------------------------
sub get_isearch_result_stat{
my (@keys, $num_enq_seq, @pdbg_seqs_ori, $c, $d, $i, %correct_pairs,
$sum_correct, $sum_false, $match_seq, $percent_correct, $correct, @correct,
$av_correct, $av_false, $actual_e_value, $correct_matched,
%correcting_pairs, @correcting_pairs, %correct);
my %seqs=%{$_[0]};
my @pdbg_seqs=@{$_[1]};
my $evalue=${$_[2]};
my $pdbg_base=${$_[3]} || $ARGV[3];
my $E_mult_factor1=${$_[4]};
my $E_mult_factor2=${$_[4]};
if(ref($_[5])){ $leng_thresh =${$_[5]} }else{ $leng_thresh=$_[5]; }
my %msp_0=%{$_[6]};
my %msp_00=%{$_[7]};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# %correcting_pairs is a correcting table for old pdb40d file classi
#_____________________________________________________________________
@correcting_pairs=( # should be pairs
'd2kauc1 d2kauc2', 'd1pkya1 d1pkya2',
'd1pvda2 d1trka1', 'd1pbe_1 d1pbe_2',
'd1poxa3 d1pvda2', 'd1efga1 d1efga2',
'd1dsba1 d1dsba2', 'd2gsta1 d2gsta2',
'd1bct__ d1brd__', 'd1qora1 d1qora2',
'd2ohxa1 d2ohxa2', 'd1efga2 d1eft_1',
'd1tada1 d1tada2', 'd1gsea1 d1gsea2',
'd1gesa2 d2tmda3', 'd1lvl_2 d2tmda3',
'd2tmda3 d2tpra2', 'd1tde_1 d2tmda3',
'd1nhp_2 d2tmda3', 'd1gesa1 d2tmda3',
'd1lvl_1 d2tmda3', 'd2tmda3 d2tpra1',
'd1fcda1 d2tmda3', 'd1nhp_1 d2tmda3',
'd1tde_2 d2tmda3', 'd1pbe_1 d2tmda3',
'd1ebha1 d1ebha2', 'd1gesa2 d2dlda2', ## 3.4.1 with 3.15.1
'd1gesa2 d1psda2', 'd1nhp_2 d2dlda2',
'd1ldm_1 d1tde_2', 'd1coy_1 d1ldb_1',
'd1lvl_2 d1psda2', 'd1psda2 d1tde_2',
'd1hyha1 d1tde_2', 'd1fcda1 d1ldm_1',
'd1hdca_ d1nhp_2', 'd1fcda1 d1hlpa1',
'd1llda1 d1lvl_2', 'd2dlda2 d2tpra2',
'd1ldm_1 d1nhp_2', 'd1llda1 d1pbe_1',
'd1gdha2 d2tpra1', 'd1ldb_1 d1nhp_2',
'd1gesa2 d1scua2', 'd1fcda1 d1hyha1',
'd1gesa1 d1hlpa1', 'd1gdha2 d1gesa2',
'd1lvl_2 d2dlda2', 'd1gesa1 d2dlda2',
'd1nhp_2 d2ohxa2', 'd1tde_2 d2dlda2',
'd1nhp_1 d2cmd_1', 'd1fcda1 d1ldb_1',
'd1lvl_1 d2ohxa2', 'd1nhp_2 d2naca2',
'd1pbe_1 d2ohxa2', 'd1gdha2 d1nhp_2',
'd2cmd_1 d2tpra1', 'd1tde_1 d2cmd_1',
'd1llda1 d1nhp_2', 'd1hlpa1 d1nhp_2',
'd1nhp_1 d2dlda2', 'd1hyha1 d1nhp_2',
'd1nhp_2 d1psda2', 'd1fcda1 d2cmd_1',
'd1fcda1 d1llda1', 'd1lvl_2 d1udpa_',
'd1psda2 d2tpra2', 'd1hdca_ d1lvl_2',
'd1gesa2 d1llda1', 'd1nhp_2 d1qora2',
'd1ldm_1 d2tpra1', 'd1coy_1 d2dlda2',
'd2dlda2 d2tpra1', 'd1hdca_ d1pbe_1',
'd1coy_1 d1gdha2', 'd1nhp_2 d2cmd_1',
'd1llda1 d1tde_1', 'd1llda1 d1lvl_1',
'd1bdma1 d2tpra1', 'd1gd1o1 d2tpra2',
'd1ldb_1 d1lvl_1', 'd1hlpa1 d1tde_2',
'd1coy_1 d1psda2', 'd1nhp_2 d1udpa_',
'd1gesa2 d1hlpa1', 'd1dhr__ d1nhp_2',
'd1hdca_ d1tde_1', 'd1gesa1 d1psda2',
'd1pbe_1 d2cmd_1', 'd1tde_2 d1udpa_',
'd1pbe_1 d2dlda2', 'd1hdca_ d1tde_2',
'd1gesa2 d1ldb_1', 'd1psda2 d2tpra1',
'd1gdha2 d1lvl_2', 'd1tde_1 d2dlda2',
'd1ldm_1 d1pbe_1', 'd1pbe_1 d1scua2',
'd1gesa1 d2ohxa2', 'd1lvl_2 d2naca2',
'd1gd1o1 d1lvl_1'
);
for($i=0; $i< @correcting_pairs; $i++){
$correcting_pairs{$correcting_pairs[$i]}=$correcting_pairs[$i];
}
if($E_mult_factor1=~/^ *$/){ $E_mult_factor1=1; };
@keys=sort keys %seqs;
@keys=@{&strip_sequence_ranges(\@keys)};
@keys=@{&remove_dup_in_array(\@keys)};
@pdbg_seqs_ori=@pdbg_seqs;
$num_enq_seq=@pdbg_seqs;
print "\n# In get_isearch_result_stat: PDBG seqs $num_enq_seq \n=> @pdbg_seqs\n\n" if $verbose;
#@pdbg_seqs=@{&strip_sequence_ranges(\@pdbg_seqs)};
#@pdbg_seqs=@{&remove_dup_in_array(\@pdbg_seqs)};
if($num_enq_seq < 2){ print "\n# \$num_enq_seq is less than 2 @pdbg_seqs $base\n"; exit; }
for($c=0; $c < @keys; $c++){
my($enq_seq, $correct, $false_positive);
$num_of_matched=@match_seqs=split(/ +/, $seqs{$keys[$c]});
$enq_seq=$keys[$c];
for($d=0; $d< @match_seqs; $d++){
my($correct_matched, @sorted);
$match_seq=$match_seqs[$d];
$sorted=join(' ', sort ($enq_seq, $match_seq) );
for($i=0; $i< @pdbg_seqs; $i++){
if($match_seq =~/d?$pdbg_seqs[$i]/i or $correcting_pairs{$sorted} ){
print "\n# \$match_seq = $match_seq, \$pdbg_seqs $pdbg_seqs[$i] \$enq_seq: $enq_seq\n" if $verbose;
$correct++;
$correct_matched=1;
unless($correct{$sorted}){
$correct_group{$base} .="Homolog: $sorted $base $msp_0{$sorted}\n";
}
$correct{$sorted} = "Homolog: $base $msp_0{$sorted}";
}
}
if($correct_matched !=1){
$false_positive++;
unless( $correct{$sorted} ){
$correct_group{$base} .="Nomolog: $sorted $base $msp_0{$sorted}\n";
}
$correct{$sorted} = "Nomolog: $base $msp_0{$sorted}";
}
}
if(@match_seqs == 0){ @match_seqs=1; $percent_correct=0; }
$sum_correct += $correct;
$sum_false += $false_positive;
}
$av_correct = $sum_correct/$num_enq_seq;
$av_false = $sum_false /($num_enq_seq);
#### $actual_e_value becomes whatever $E_mult_factor1 defined ~~~~~~~~~~~~
if($E_mult_factor1 != 1){
$actual_e_value=$evalue * $E_mult_factor1;
}elsif($E_mult_factor2 != 1){
$actual_e_value= $evalue * $E_mult_factor2;
}else{ $actual_e_value=$evalue }
$num_enq_seq--;
$sum_correct_for_additional = $num_enq_seq+1;
$match_count=$sum_correct_for_additional * $av_correct;
#$sum_correct= $sum_correct_for_additional;
if($verbose){
printf ("%-10s %-12s %-13f %-13f %-7s %-7s %-7s %-7s %-4s\n", $pdbg_base,
$actual_e_value, $av_correct, $av_false, $num_enq_seq,
$sum_correct_for_additional, $sum_false, $match_count, $leng_thresh);
}
@correct_new=@{&remove_dup_in_array(\@correct_new)};
for($i=0; $i< @correct_new; $i++){
print "\n# correct new: $correct_new[$i]" ;
}
$num_correct=$match_count/2;
print "Num of non-reflective correcct: $num_correct Nomolog: $sum_false \n\n" if $verbose;
return([$av_correct, $sum_correct, $num_enq_seq, \%correct, \%correct_group]);
}
#__________________________________________________________________
# Title : strip_sequence_ranges
# Usage :
# Function :
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : remove_sequence_ranges, remove_sequence_name_ranges,
# remove_ranges_in_sequences, strip_sequence_name_ranges,
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sub strip_sequence_ranges{
my (@out, $i);
my @in=@{$_[0]} or @in=@_;
for($i=0; $i< @in; $i++){
if($in[$i]=~/^(\S+)_\d+\-\d+/){
push(@out, $1);
}else{
push(@out, $in[$i]);
}
system("$default_search_method -T $score_thresh -E $evalue_cutoff $file[$i] $target_DB > $output_hmm_result");
}else{
print "Running: $default_search_method -t $score_thresh $file[$i] $target_DB \> $output_hmm_result\n";
system("$default_search_method -t $score_thresh $file[$i] $target_DB > $output_hmm_result");
}
}else{
print "\n# The $out_hmm_file file already exists. To overwrite use -o opt\n";
}
push(@out_hmm_file_names, $output_hmm_result);
}
if(@out_hmm_file_names > 1){
return(\@out_hmm_file_names);
}else{
return(\$out_hmm_file_names[0]);
}
}
#_______________________________________________________________________
# Title : divide_clusters
# Usage : ÷_clusters(\@file);
# Function : This is the main funciton for divclus.pl
# divides complex single linkage cluster into smaller duplication
# module level sub clusters.
# Example : ÷_clusters(\@file, $verbose, $range, $merge, $sat_file,
# $dindom, $indup, "T=$length_thresh", "e=$evalue", $over_write,
# $optimize, "s=$score", "f=$factor");
#
# Keywords : divicl, divclus, div_clus, divide clusters
# Options : _ for debugging.
# f=<digit> for determing the factor in filtering out non-homologous
# regions, 7 = 70% now!!
# l=<digit> for seqlet(duplication module) length threshold
# t=<digit> for seqlet(duplication module) length threshold
# (same as l opt, confusing, huh? )
# s=<digit> for score threshold
# e=<digit> for evalue threshold
# z for activating remove_similar_sequences, rather than remove_dup....
# o for overwriting
# v for verbose printout (infor)
# D for dynamic factor
# S $short_region= S by S -S # taking shorter region overlap in removing similar reg
# L $large_region= L by L -L # taking larger region overlap in removing similar reg
# A $average_region=A by A -A # taking average region overlap in removing similar reg
# o for $over_write
#
# Version : 2.9
#------------------------------------------------------------------------
sub divide_clusters{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($merge, $verbose, $sat_file, $length_thresh, $factor, $indup, $indup_percent,
$score, @temp_show_sub, $optimize, $file, $evalue, $over_write, $din_dom,
$sum_seq_num, $base_1, $output_clu_file, $short_region, $large_region,
$average_region, $dynamic_factor, @sub_clustering_clu_files);
$factor=7; # default factor is 7 for 70%
$length_thresh=30;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Dealing with options
#_________________________________________
if($char_opt=~/m/){ $merge='m';
}if($char_opt=~/v/){ $verbose='v'; # for showing debugging information
}if($char_opt=~/i/){ $indup='i';
}if($char_opt=~/z/){ $optimize='z';
}if($char_opt=~/o/){ $over_write='o';
}if($char_opt=~/d/){ $din_dom='d';
}if($char_opt=~/s/){ $sat_file='s';
}if($char_opt=~/y/){ $dynamic_factor='y';
}if($char_opt=~/S/){ $short_region ='S';
}if($char_opt=~/L/){ $large_region ='L';
}if($char_opt=~/A/){ $average_region='A';
}if($vars{'T'}=~/\d+/){ $length_thresh= $vars{'T'};
}if($vars{'l'}=~/\d+/){ $length_thresh= $vars{'l'}; ## synonym of 't'
}if($vars{'f'}=~/\S+/){ $factor= $vars{'f'};
}if($vars{'s'}=~/\d+/){ $score = $vars{'s'};
}if($vars{'e'}=~/\d+/){ $evalue= $vars{'e'};
}if($vars{'E'}=~/\d+/){ $evalue= $vars{'E'}; # synonym of e
}
$percent_fac=$factor*10;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (0) When one file input was given (yes, divclus can handle multiple files, Sarah!)
#________________________________________________________________________________
if(@file == 1){ #<=== @file has xxxx.msp, yyyy.msp zzzz.msp ....,
print "\n# (1) divide_clusters: One single file was given=> \"@file\"\n" if $verbose;
$file=$file[0];
$base_1=${&get_base_names($file)};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) Define the output cluster file name: eg, 3-232_cluster_F7.clu , F7 means factor used is 7
#______________________________________________________________________________________________
$output_clu_file="$base_1\_F${factor}\.clu";
if( !$over_write and -s $output_clu_file){
print "\n# $output_clu_file Already EXISTS, skipping. Use \'o\' opt to overwrite\n"; exit;
}
print "# (2) divide_clusters: processing ONE single file \"@file\" with merge_sequence_in_msp_file\n" if $verbose;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (3) merge_sequence_in_msp_file does not do much. Just filtering and producing
# sequences in ISPA_PBS_21-215 VPR_PBS_160-354 format from msp format
#________________________________________________________________________________
@grouped_seq_names=@{&merge_sequence_in_msp_file(\@file, "s=$score", $optimize, $din_dom, $sat_file,
$optimize, "T=$length_thresh", "e=$evalue", "f=$factor", "$range", "$merge", $verbose,
$short_region, $large_region, $average_region, $over_write, $dynamic_factor)};
if($verbose){
print "\n\n# (3) divide_clusters: finished running \"merge_sequence_in_msp_file\"\n ==>";
for($i=0; $i< @grouped_seq_names; $i++){
print "\n-->> $grouped_seq_names[$i]";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (4) This is critical seqlet merging step
#________________________________________________________________________________
@out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, $dynamic_factor,
"f=$factor", $short_region, $large_region, $average_region, $optimize)};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (5) This is showing the result in clu file format
#________________________________________________________________________________
@temp_show_sub=&show_subclusterings(\@out, $file, $sat_file, $dindom, $indup,
"e=$evalue", "p=$percent_fac", "f=$factor" );
$good_bad = $temp_show_sub[0];
$indup_c = $temp_show_sub[1];
$sum_seq_num += $temp_show_sub[2];
push(@sub_clustering_out_files, @{$temp_show_sub[3]});
if($good_bad==1){ push(@good, $file);
}else{ push(@bad, $file); }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (6) Final write up stage (unecessary)
#_______________________________________________________________
&write_good_bad_list_in_divide_clusters(\@good, \@bad);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# when more than one single file input is given
#____________________________________________________________
elsif(@file >1 ){
my (@good, @bad);
if($indup =~/i/i){ open (INDUP, ">indup_stat\.txt"); } # this is not essential.
for($i=0; $i< @file; $i++){
my (@grouped_seq_names, @temp_show_sub);
my $indup_c=0;
my $big_msp_file=$file[$i];
unless(-s $big_msp_file){ print "\n# (E) \$big_msp_file does not exist\n"; exit }
$base_1=${&get_base_names($big_msp_file)};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Define the output cluster file name: eg, 3-232_cluster_F7.clu , F7 means factor used is 7
#______________________________________________________________________________________________
$output_clu_file="$base_1\_F${factor}\.clu";
print "\n# DIVCLUS: just before merge_sequence_in_msp_file, \$output_clu_file is $output_clu_file from input file $big_msp_file" if $verbose;
if( !$over_write and -s $output_clu_file){
print "\n# $output_clu_file Already EXISTS, skipping. Use \'w\' opt to overwrite\n";
next; }
print "\n# (1) divide_clusters: processing file \"$big_msp_file\" for $output_clu_file" if $verbose;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) If clu file(eg 2-1618_ss.clu ) is in pwd, tries to skip
#____________________________________________________________
if((-s $output_clu_file) > 10 and $over_write !~/o/){
print "# $output_clu_file exists, skipping, use \"o\" option to overwrite\n"; next;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (3) merge_sequence_in_msp_file does not do much. Just filtering and producing
# sequences in ISPA_PBS_21-215 VPR_PBS_160-354 format of STRING from msp format
# $big_msp_file is an MSP file
#________________________________________________________________________________
print "\n# (i) divide_clusters : I am merging seq in $big_msp_file file for $output_clu_file\n" if $verbose;
@grouped_seq_names=@{&merge_sequence_in_msp_file(\$big_msp_file, "s=$score", $din_dom, $sat_file, $optimize,
"T=$length_thresh", "e=$evalue", "f=$factor", "$range", "$merge", $verbose, $over_write,
$short_region, $large_region, $average_region, $dynamic_factor )};
if($verbose){
print "\n# \@file has more than one input file\n # The result of \"merge_sequence_in_msp_file\"\n";
print "@grouped_seq_names";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (4) Clustering the sets of merged seqlets => CORE algorithm
#____________________________________________________________
@out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, "f=$factor", $optimize, $dynamic_factor,
$short_region, $large_region, $average_region)};
@temp_show_sub=&show_subclusterings(\@out, $big_msp_file, $sat_file, $dindom, $indup,
"e=$evalue", "p=$percent_fac", "f=$factor");
$good_bad = $temp_show_sub[0];
$indup_c = $temp_show_sub[1];
$sum_seq_num += $temp_show_sub[2];
push(@sub_clustering_out_files, @{$temp_show_sub[3]});
if($good_bad==1){ push(@good, $big_msp_file);
}else{ push(@bad, $big_msp_file); }
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
&write_good_bad_list_in_divide_clusters(\@good, \@bad);
sub write_good_bad_list_in_divide_clusters{
my (@good, @bad, $i); @good=@{$_[0]}; @bad=@{$_[1]};
open(GOODBAD, ">good_bad.list");
print GOODBAD "GOOD: all link : 000\n";
for($i=0; $i< @good; $i++){ print GOODBAD "$good[$i]\n"; }
print GOODBAD "BAD : Not all link: 000\n";
for($i=0; $i< @bad; $i++){ print GOODBAD "$bad[$i]\n"; }
close(GOODBAD);
}
#_______________________________________________________________
}
return(\@sub_clustering_out_files); # contains (xxxx.clu, yyy.clu,, )
}
#______________________________________________________________________________
# Title : remove_file_extension
# Usage :
# Function :
# Example :
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub remove_file_extension{
my (@modified_files, $i, @files);
@files=@_;
for($i=0; $i< @files; $i++){
$base=${&get_base_names($files[$i])};
rename($files[$i], $base);
push(@modified_files, $base);
}
return(\@modified_files);
}
#______________________________________________________________________________
# Title : remove_small_files
# Usage : @files_removed=@{&remove_small_files(@ARGV)};
# Function :
# Example :
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.0
print "\n# new seqlet : $seqlets[$i]\n" if $verbose;
splice(@seqlets, $i+1, 1);
$i--;
}else{
if($start1 < $start2){
$short_start=$start2; $large_start=$start1; ## note that short start should be $start2 if $start2 is bigger
}else{
$short_start=$start1; $large_start=$start2;
}
if($end1 < $end2){
$short_end=$end1; $large_end=$end2;
}else{
$short_end=$end2; $large_end=$end1;
}
if($shorter_region){
$seqlets[$i]="$seq1\_$short_start\-${short_end}$tail1";
}elsif($larger_region){
$seqlets[$i]="$seq1\_$large_start\-${large_end}$tail1";
}
splice(@seqlets, $i+1, 1);
$i--;
}
}
}
}
}
}
print "\n# (3) remove_similar_seqlets: The final out are: @seqlets\n" if $verbose;
return(\@seqlets);
}
#__________________________________________________________________________
# Title : show_subclusterings
# Usage : &show_subclusterings(\@out);
# Function : This is the very final sub of divclus.pl
# Example : @temp_show_sub=&show_subclusterings(\@out, $file, $sat_file, $dindom, $indup);
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : print_subclusterings, sum_subclusterings, write_subclustering
# show_clusterings, display_subclusterings
# Options :
# f for file output, eg: xxxxxxx.sat
# Category :
# Version : 2.7
#-------------------------------------------------------------------------
sub show_subclusterings{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($max_size, $sat_file_name, $clu_file_name,
$ori_cluster_size, $ori_cluster_num, $good_bad, @keys, $percentage_fac,
$indup, @sizes, $sum_seq_num, $indup_percent, $indup_count, %tem4,
@sub_clustering_out_files); # clusall_1e-5_clu_14-324_ss.sat
my @out=@{$array[0]};
$indup_count=0;
if($char_opt=~/d/){ $dindom=1; }
if($char_opt=~/i/){ $indup=1; }
if($vars{'f'}=~/\S+/){ $factor= $vars{'f'}; }
if($vars{'p'}=~/\d+/){ $percentage_fac= int($vars{'p'}); }
if($vars{'s'}=~/\d+/){ $score = $vars{'s'}; }
if($vars{'e'}=~/\d+/){ $evalue= $vars{'e'}; }
#print "\n# (1) show_subclusterings : \@file has : @file\n";
if( $file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.msp/ or
$file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.sat/ ){
$ori_cluster_size=$2;
$ori_cluster_num =$3;
$base=$1;
$sat_file_name="$base\.sat";
$clu_file_name="$base\.clu";
}else{
print "\n# (2) LINE:",__LINE__," The \@file input to show_subclusterings is not the right format, dying\n";
print "\n Sarah!, right format looks like: 13-234.msp or 8-420_cluster.msp \n"; exit;
}
open(CLU, ">$clu_file_name") or die "\n# (ERROR) show_subclusterings failed miserably to open \"$clu_file_name\" \n";
push(@sub_clustering_out_files, $clu_file_name);
@out=@{&sort_string_by_length(\@out)};
for($i=0; $i< @out; $i++){ # @out has ( 'YAL054C_98-695 YBR041W_90-617', 'YBR115C_230-842 YBR222C_16-537 YER015W_121-686', etc)
my $count+=$i+1;
my ( $int_dup_number, $sub_clu_size, $seq_with_range, @sp, $new_clus_NAME,
%tem, %tem2, %tem3, $j, @keys, $num_seq);
if($out[$i]=~/^ *$/){ next }
@sp=sort split(/ +/, $out[$i]);
for($j=0; $j < @sp; $j++){
$seq_with_range=$sp[$j];
if($seq_with_range=~/^((\S+)_((\d+)\-(\d+)))/){
$tem{$2}++;
$tem2{$2}.=sprintf("%-15s ", $1);
$tem3{$2} =$3;
$tem4{$2} =$5-$4;
}
}
@keys=sort keys %tem;
$num_seq=$sub_clu_size=@keys;
if($max_size < $sub_clu_size){
$max_size=$sub_clu_size; ## This is to collect the sizes of clusters to see if it is good.
}
$indup_count= &print_summary_for_divclus(
$count, \%tem2, \%tem,
$ori_cluster_num,
$ori_cluster_size,
$dindom,
$clu_file_name,
\%tem3, \%tem4,
$indup, );
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Local subroutine
#_______________________________________________________________
sub print_summary_for_divclus{ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
my(@keys, $indup_count, $x, $m, $percentage_fac);
my $count=$_[0]; # count of cluster
my %tem2=%{$_[1]}; my $num_seq=@keys=sort keys %tem2;
my %tem=%{$_[2]}; my $ori_cluster_num=$_[3];
my $new_clus_NAME=$ori_cluster_num.'0'.$count.'0'.$num_seq;
my $ori_cluster_size=$_[4];
my $dindom=$_[5]; my %tem3=%{$_[7]};
my $indup=$_[9]; my (%internal_dup);
my %tem4=%{$_[8]};
#~~~~~~~~~~ Domain Inside Domain ~~~~~~~~~~~~~~~~~
if($dindom){
for($x=0; $x <@keys; $x++){
@domain_inside_domain=@{&get_domain_inside_domain($tem2{$keys[$x]})};
@domain_inside_domain=@{&remove_dup_in_array(\@domain_inside_domain)};
for($m=0; $m< @domain_inside_domain; $m++){ print " # Dindom: $m : $domain_inside_domain[$m]\n"; }
print "\n";
}
}
#==========================================================================================
#~~~~~~~~~~ Internal duplication ~~~~~~~~~~~~~~
if($indup==1){
# @keys is the same as sub cluster size,
for($x=0; $x < @keys; $x++){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Checks each sequence for duplication
#___________________________________________________
my %internal_dup=%{&get_internal_dup_in_a_cluster( $tem2{$keys[$x]} )};
my @dup_keys=keys %internal_dup;
if(@dup_keys > 0){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# This calculates the actual duplicated number rather than jus tthe sequences
#______________________________________________________________________________
$indup_count++;
printf ("%-14s %-12s %-4s", $keys[$x], $new_clus_NAME, $num_seq);
for($m=0; $m< @dup_keys; $m++){
printf ("%-19s=> %s\n", $dup_keys[$m], $internal_dup{ $dup_keys[$m] } );
}
}
}
}
#~~~~~~~~~~ Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~
print CLU "Cluster size $num_seq\n";
printf CLU ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
$new_clus_NAME, $evalue, $factor, $percentage_fac,
$ori_cluster_size, $num_seq, $ori_cluster_num);
print "Cluster size $num_seq\n";
printf ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
$new_clus_NAME, $evalue, $factor, $percentage_fac,
$ori_cluster_size, $num_seq, $ori_cluster_num);
for($x=0; $x <@keys; $x++){
printf CLU (" %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
$num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
printf (" %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
$num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
}
return($indup_count);
}
}
close(CLU); ## this is a bug fix
if($max_size == $ori_cluster_size){ $good_bad=1;
}else{ $good_bad=0; }
print "\n# Sarah, Do you think the subclusterings are O.K.?" if $verbose;
print "\n# Tell me, if you feel suspicious, jong\@salts.med.harvard.edu\n\n" if $verbose;
return($good_bad, $indup_count, $ori_cluster_size, \@sub_clustering_out_files);
}
#__________________________________________________________________________
# Title : exchange_query_with_match_in_msp
# Usage : @exchanged_msp=@{&exchange_query_with_match_in_msp(\@file)};
# Function :
# Example :
# Keywords : swap_query_with_match_in_msp, invert_query_with_match_in_msp,
# swap_query_seq_with_match_seq_in_msp,
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.1
#----------------------------------------------------------------------------
sub exchange_query_with_match_in_msp{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(%exchanged_msp, @sorted_by_query_seq_names, @new_msp_lines);
$open_msp_files_x_opt = 'x';
if($char_opt=~/n/){ $names_only='n' }
%exchanged_msp=%{&open_msp_files(@file, $open_msp_files_x_opt, $names_only )};
@new_msp_lines=values %exchanged_msp;
@sorted_by_query_seq_names=
map{ $_->[0] } sort {$a->[1] cmp $b->[1]} map {/^\d+ +\S+ +\d+ +\d+ +(\S+)/ && [$_, $1] } @new_msp_lines;
return(\@sorted_by_query_seq_names);
}
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub Richardson_alpha_matrix{
%Richardson_alpha_matrix = ( ## ref: Protein Eng. V.8, no9, pp905-913, 1995
'A', 1.80,
'C', 0.70,
'D', 1.00,
'E', 0.80,
'F', 1.30,
'G', 0., ## <<----
'H', 0.69,
'I', 1.14,
'K', 0.94,
'L', 1.14,
'M', 1.20,
'N', 0.78,
'P', 0.19,
'Q', 0.98,
'R', 1.03,
'S', 0.76,
'T', 0.82,
'V', 0.95,
'W', 1.11,
'Y', 1.02
);
return(%Richardson_alpha_matrix);
}
#________________________________________________________________________
# Title : get_segment_shift_rate
# Usage : &get_segment_shift_rate(\%hash_for_errors, \%hash_for_sec_str);
# Function : calculates the secondary structure segment shift rate.
# Example : <input example> First block is for the first hash input
# and Second is for the second hash input.
#
# 1cdg_6taa 00000442222222222242222222222777700000007000000000
# 1cdg_2aaa 00000442222222222242222222222777700000007000000000
# 2aaa_6taa 00000000000000000000000000000000000000000000000000
#
# 1cdg_6taa -------EEE-----------EE--EEEE------EE---------EEE-
# 1cdg_2aaa -------EEE-----------EE--EEEE------EE---------EEE-
# 2aaa_6taa -------EEEEE------EE-EEEEEEEE----EEEE-------EEEEE-
#
# <intermediate output example>
# 2aaa_6taa -------00000---------00000000----0000-------00000-
# 1cdg_6taa -------442---------------2222-----------------000-
# 1cdg_2aaa -------222---------------2222-----------------000-
#
# <Final output>
# 2aaa_6taa 0%
# 1cdg_6taa 67%
# 1cdg_2aaa 67%
#
# Warning :
# Keywords :
# Options : 'p' or 'P' for percentage term(default)
# 'r' or 'R' for ratio term (0.0 - 1.0), where 1 means all the
# segments were wrongly aligned.
# 's' or 'S' for Shift rate (it actually caculates the position shift
# rate for the secondary structure segment.
# 'h' or 'H' for position Shift rate (it actually caculates the position
# shift rate for helical segments). If this is the only option, it
# will show the default percentage term rate for helical segments.
# If used with 'r', it will give you ratio (0.0 - 1.0) for helical
# segment. If used with 's' option, it will give you position shift
# rate for only helical segments.
# 'e' or 'E' for position Shift rate (it actually caculates the position
# shift rate for beta segments). If this is the only option, it will
# show the default percentage term rate for beta segments. If used
# with 'r', it will give you ratio (0.0 - 1.0) for beta. If used
# with 's' option, it will give you position shift rate for only
# beta segments.
# Returns :
# Argument : Two references of hashes. One for error rate the other for sec.
# assignment.
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub get_segment_shift_rate{
my($i, $k, $j, @hash, $option_string, %h, %superposed_hash,
$name, %out, $gap_chr, @str1, @str2, %temp, %hash_error, %hash_secondary);
#"""""""""""""""""""""""""""""""""""""""""
# general argument handling #
#"""""""""""""""""""""""""""""""""""""""""
for($k=0; $k< @_ ;$k++){
if( ( !ref($_[$k]) )&&($_[$k]=~ /^(\w)$/) ){
$option_string .= $1; }
elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(\w)$/) ){
$option_string .= $1; }
elsif(ref($_[$k]) eq "HASH") {
%temp = %{$_[$k]};
my(@keys)= sort keys (%temp);
my($temp_seq) = $temp{$keys[0]};
if($temp_seq=~/\d\d+/){
%hash_error = %temp; }
else{ %hash_secondary = %temp; }
}
}#### OUTPUT are : %hash_error & %hash_secondary
#"""""""""""""""""""""""""""""""""""""""""
# general argument handling end #
#"""""""""""""""""""""""""""""""""""""""""
%hash_secondary =%{&tidy_secondary_structure_segments(\%hash_secondary)};
%superposed_hash =%{&superpose_seq_hash(\%hash_error, \%hash_secondary)};
%h=%{&get_wrong_segment_rate(\%superposed_hash)};
return(\%h);
}
#________________________________________________________________________
# Title : get_wrong_segment_rate
# Usage : print_seq_in_block( &get_wrong_segment_rate(\%superposed_hash) );
# Function : Treats the segment as one single big error.
# calculates the wrong segment number compared to the correct ones.
# Example : <input example> hash of 3 keys and values.
# 2aaa_6taa -------00000---------00000000----0000-------00000-
# 1cdg_6taa -------442---------------2222-----------------000-
# 1cdg_2aaa -------222---------------2222-----------------000-
#
# In the above there are two segments wrong in 3 segment blocks = 2/3
# <output example> hash of 3 percentage rates.
#
# 2aaa_6taa 0 %
# 1cdg_6taa 66.6666666666667 %
# 1cdg_2aaa 66.6666666666667 %
#
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_wrong_segment_rate{
my($a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, $q, $r,
$s, $t, $u, $v, $w, $x, $y, $z, %h, $seg_min,
%hash, @keys, @array, @hash, $option_string, $string,
$name, %out, $gap_chr, @str1, @str2, $seg, $len, $wrong_seg, $correct_seg
);
%hash=%{$_[0]};
$seg_min =$_[1];
if($seg_min !~/\d+/){ $seg_min = 3; } ### Default segmin is 3
@keys = sort keys (%hash);
for $k (@keys){
my($string) = $hash{$k}; $string =~s/\,//g;
my(@segments) = split(/[\-\.\ ]+/, $string);
for $seg (@segments){
$len=length($seg);
if( $len >= $seg_min){
if($seg =~/[1-9]/){
$wrong_seg ++; }
else{ $correct_seg ++; }
}
}
$h{$k}= ($wrong_seg/($wrong_seg + $correct_seg)*100).' %';
$wrong_seg=$correct_seg='';
}
\%h;
}
#________________________________________________________________________
# Title : tidy_secondary_structure_segments
# Usage : print_seq_in_block(&tidy_secondary_structure_segments(\%hash, 'e4', 'h4'), 's');
#
# Function : receives any secondary structure assignment hashes and
# tidys up them. That is removes very shoft secondary structure
# regions like( --HH--, -E-, -EE- ) according to the given minimum
# lengths(threshold) of segments by you.
# Example : print_seq_in_block(&tidy_secondary_structure_segments(\%hash, 'e4', 'h4'), 's');
# <makes following into the next block>
#
# 1cdg_2aaa -------EEE-----------EE--EEEE------EE---------EEE-
# 1cdg_6taa -------EEE-----------EE--EEEE------EE---------EEE-
# 2aaa_6taa -------EEEEE------EE-EEEEEEEE----EEEE-------EEEEE-
#
# <example output>
#
# 1cdg_6taa -------------------------EEEE---------------------
# 1cdg_2aaa -------------------------EEEE---------------------
}
elsif( (length($2)- length($c)) == -2){
$ATOM=$1; $RES=$3;
chop($ATOM); chop($RES); chop($ATOM); chop($RES);
print F2 "$ATOM$c$RES$c$5\n";
next;
}
}
}
}
close F2;
##################################################################################
# Final result
##################################################################################
print "\n Files $file1, $file2 are created \n\n\n";
}
#________________________________________________________________________
# Title : tally_2_hashes (used for get_cs_rate_for_pairs_stat.pl )
# Usage : ($ref1, $ref2) = &tally_2_hashes(\%hash1, \%hash2, ['n', 'a', 'p', 'i']);
# %tally_addedup=%{$ref1}; '0' position had addedup value of 1000
# %tally_occurances=%{$ref2}; '0' position had occurred 100 times,
# '0' on average had 10 in its
# corresponding hash positions
# Function : Makes hashes of tallied occurances and summed up values for disits in
# positions.
# calculates the occurances or occurance rates of CS rate positions.
# The hashes should have numbers.
# Example : you put two hash refs. (ass. array) as args (\%hash1, \%hash2)
# The hashes are like; hash1 (name1, 0000011111, name2, 0000122222 );
# hash2 (name3, 1324..1341, name4, 13424444.. );
#
# 1) The resulting 1st hash output is (0, 20, 1, 13, 2, 12)
# which means that 0 added up to 24 in the second arg hash positions
# 1 added up to 15 in the second arg hash positions
# 2 added up to 18 in the second arg hash positions
# 'p' option only works with 'n' or 'a'
# 2) The resulting 2nd hash output is (0, 5, 1, 5)
# which means that 0 occurred 5 times in the first input hash
# 1 occurred 5 times in the first input hash
# 'p' option only works with 'n' or 'a'
# Warning :
# Keywords : tally two hashes of numbers.
# Options : [a n i p]
# Returns : ($ref1, $ref2), ie, two references of hash
# averaging option causes division of 20(added up value)
# by 9(occurance) in the above
# for '0' of the first hash, so (0, 2.222, 1, 2.1666, 2, 2.4 )
# Average is the average of numbers
# average value in 0-9 scale (or 0-100 with 'p' option)
# So, if there are
# seq1 00111110000, The 'a' value of 0 and 1 as in the seq2
# seq2 33000040000 is 0-> 6/6, 1-> 4/5, while the 'n'
# calc would be, 0-> 6 (60%), 1-> 4(40%)
#
# Argument : (\%hash1, \%hash2) or optionally (\%hash1, \%hash2, ['n', 'i', 'p', 'a'])
# 'n' => normalizing, 'p' => percentage out, 'i' => make int out, 'a'=> averaged
# Category :
# Version : 1.2
#--------------------------------------------------------------------
sub tally_2_hashes{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
%hash0 = %{$hash[0]};
%hash1 = %{$hash[1]};
@keys1= keys %hash0; ### No need to sort here as you will return hash at the end
@keys2= keys %hash1;
if($char_opt =~ /p/i ){ $factor =100; }
for($i=0; $i < @keys1; $i++ ){
###################################################
## Gap char detection
###################################################
if($hash0{$keys1[$i]} =~ /([\,\-])\S+[\,\-]/){ $gap_char1 = $1; }else{ $gap_char1=''; }
if($hash1{$keys2[$i]} =~ /([\,\-])\S+[\,\-]/){ $gap_char2 = $1; }else{ $gap_char2=''; }
###################################################
## Split the value string by gap char
###################################################
@string1=split(/$gap_char1/, $hash0{$keys1[$i]});
@string2=split(/$gap_char2/, $hash1{$keys2[$i]});
### @string1 => (0,0,0,0,1,1,1,1,1) @string2 => (3,4,2,13,2,1,23,3)
################################################################
## Main calc part, you get %tally_all_occur and %tally_occur
################################################################
for($j=0; $j < @string1; $j++){
$tally_all_occur{$string1[$j]}++ ; ## <-- number of all the positions
if( ($string2[$j]=~/[\d\^]+/)&&($string1[$j]=~/[\d\^]+/) ){
$tally_occur{$string1[$j]}+=$string2[$j] ; # %tally_occur is for added up counts
} # %tally_all_occur is for only the position
} # occurances of '0', '1' or whatever. To know
# how many '0' entry were you should use this.
####################################################################################
## When options were put, do more calc on %tally_all_occur and %tally_occur
####################################################################################
if($char_opt =~ /a/i ){
print "\n $char_opt ";
my(@cs_rates) = sort keys %tally_all_occur;
for($k=0; $k < @cs_rates; $k++){
if($tally_all_occur{$cs_rates[$k]} == 0){
$tally{$cs_rates[$k]} =0; next;}
if($char_opt =~ /i/i ){
$tally{$cs_rates[$k]}=int($tally_occur{$cs_rates[$k]}/$tally_all_occur{$cs_rates[$k]}); }
$out_hash{$title}=join(",", @out_rate);
}
}
return( \%out_hash );
}
#________________________________________________________________________
# Title : get_windows_sc_rate_array
# Usage : @out_rate = @{&get_windows_compos_and_seqid_rate_array(\@seq, \$win_size)};
# Function : actual working part of scan_windows_and_get_compos_seqid_rate
# Example :
# Warning :
# Class :
# Keywords :
# Options :
# Reference :
# Returns : \@ratio_array, \$ratio_whole_seq
# Tips :
# Argument : (\@input, \$window_size); @input => ('ABCDEFG.HIK', 'DFD..ASDFAFS', 'ASDFASDFASAS');
# Input ar => ( 'ABCDEFG
# 'DFD..ASDFAFS'
# 'ASDFASDFASAS' ) as the name of @sequences.
# Todo :
# Author : A Biomatic
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_windows_sc_rate_array{
my($base_level)=1; my($scale)=1; my($window_size, $show_calculation, $redu_window,
@input, @input0, @input1, $variable_win_size, $apply_factor,
@ratio_array, @array_of_2_seq, $seq_id, $offset, $half_of_w_size, $length,
$compos_id, $seq_id, $window_2, $window_1, $compos_whole_seq, $seq_id_whole_seq,
$ratio_whole_seq, $win_rate_div_by_whole_rate, $normalizing_factor, $lowest_rate,
$winsize_reduc_factor, $largest_win_reached, $ori_win_size);
#""""""""""""""""""""""< handle_arguments{ head Ver 1.3 >"""""""""""""""""""
my(@A )=&handle_arguments( @_ );my( $num_opt )=${$A[7]};my( $char_opt )=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};my(@raw_string)=@{$A[9]};
my($i, $j, $c, $d, $e, $f, $g, $h, $k, $l, $p, $q, $r, $s, $t, $u, $v, $w, $x,$y,$z);
if($debug==1){ print "\n \@hash has \"@hash\"\n \@raw_string has \"@raw_string\"
\@array has \"@array\"\n \@char_opt has \"@char_opt\"\n \@file has \"@file\"
\@string has \"@string\""; }
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if( $num_opt =~/^(d+)/){ $window_size= $1; }
if( $char_opt=~/v/i){ $variable_win_size= 'v' }
if( $char_opt=~/s/i){ $show_calculation = 's'}
if( $char_opt=~/r/i){ $redu_window = 'r'}
if( $char_opt=~/f/i){ $apply_factor = 'f'}
if( $char_opt=~/d/i){ $make_gap_dot = 'd'}
if( $char_opt=~/m/i){ $minus_whole_cs = 'm'}
@input = @{$array[0]};
if(defined(${$_[2]})){ $base_level =${$_[2]}; }
if(defined(${$_[3]})){ $scale =${$_[3]}; }
for ($t=0; $t< @input; $t++){ $length=length($input[$t]) if(length($input[$t])>$length);}
if ($length < $window_size){ $window_size = $length; }
#___________ getting ratio for the whole sequence ___________
$compos_whole_seq=${&main::compos_id_percent_array(\@input)}; ## for whole composition rate
$seq_id_whole_seq=${&main::seq_id_percent_array(\@input)};
print "\nComposition ID of the alignment: $compos_whole_seq\%\n";
print "Sequence ID of the alignment: $seq_id_whole_seq\%\n";
if ($seq_id_whole_seq == 0){ $ratio_whole_seq =0; }
else{ $ratio_whole_seq = $compos_whole_seq/$seq_id_whole_seq; }
print "Composition and Sequ. ID Ratio: $ratio_whole_seq\n";
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
########## Initial Window size setting ##############################
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if ( $ratio_whole_seq >9 ){ $window_size= 28; } # $ratio_whole_seq is
elsif( $ratio_whole_seq >8 ){ $window_size= 26; } # a whole CS rate !!
elsif( $ratio_whole_seq >7 ){ $window_size= 24; }
elsif( $ratio_whole_seq >6 ){ $window_size= 22; }
elsif( $ratio_whole_seq >5 ){ $window_size= 20; }
elsif( $ratio_whole_seq >4 ){ $window_size= 18; }
elsif( $ratio_whole_seq >3 ){ $window_size= 16; }
elsif( $ratio_whole_seq >2 ){ $window_size= 12; }
elsif( $ratio_whole_seq >0 ){ $window_size= 8; }
print "Window size used is : $window_size\n";
#$window_size = 10;
$largest_win_reached = 24;
$ori_win_size = $window_size;
#----------- Spliting the seq. into arrays to enable $make_gap_dot var -----
if( $make_gap_dot =~ /^[dD]+/){
$input[0] =~s/,//g; $input[1] =~s/,//g;
@input0 = split('', $input[0]); @input1 = split('', $input[1]);
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# MAIN Calc part
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
for ($w=0; $w < $length; $w++){
$largest_win_reached = $window_size if $window_size > $largest_win_reached;
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
### FACTOR calculation ##
#####################################################################
$factor= ($window_size/40)*2 if ($apply_factor eq 'f');
#####################################################################
## Turn RATES to DOTs when there are gaps ##
#####################################################################
# @input0, @input1 are the whole length sequence splited.
if($make_gap_dot =~ /^[dD]+/){
if( ($input0[$w] eq '.') || ($input1[$w] eq '.') ){
$ratio_compos_vs_seqid = '.';
push(@ratio_array, $ratio_compos_vs_seqid);
next;
}
}
#####################################################################
### Getting small windows ##
#####################################################################
$offset = $w - int($window_size/2); # $offset starts from -5 when window_size is 10.
$offset = 0 if ($offset < 0);
if($variable_win_size ne 'v'){ $window_size = $ori_win_size;}
$window_1=substr($input[0], $offset, $window_size); # window_1 is one segment
$window_2=substr($input[1], $offset, $window_size); # of defined length(size)
@array_of_2_seq=($window_1, $window_2);
#####################################################################
## This is to remove the common gaps in the two windows ##
#####################################################################
($win_no_gap1, $win_no_gap2) = @{&main::remov_com_column(\@array_of_2_seq)};
#####################################################################
## Go back if the window size is too small due to gaps ##
#####################################################################
if( (length($win_no_gap1) < $ori_win_size)&&($variable_win_size ==1) )
{
$window_size+=1; $w--; next;
}
#####################################################################
## Getting Compos and Seq ids ##
#####################################################################
$compos_id=${&main::compos_id_percent_array(\@array_of_2_seq)};
$seq_id =${&main::seq_id_percent_array(\@array_of_2_seq)};
#####################################################################
#### Go back if the Seq id is bigger than Compos id #######
#####################################################################
if(($variable_win_size eq 'v') && ($seq_id > $compos_id))
{
$window_size+=1; $w--; next;
}
#####################################################################
#### Special ID value handling #######
#####################################################################
if (($seq_id == 0 ) || ($compos_id == 0)){ $compos_id = 1;}
#####################################################################
#### The actual calculation #######
#####################################################################
if( $minus_whole_cs eq 'm' ){ ### this substracts rates with whole CS rate
$ratio_compos_vs_seqid = int( $seq_id/$compos_id*10 - $ratio_whole_seq );
if( $ratio_compos_vs_seqid <= 0){ $ratio_compos_vs_seqid =0; }
if( $ratio_compos_vs_seqid > 9){ $ratio_compos_vs_seqid = 9; }
if( ($apply_factor eq 'f')&&($variable_win_size eq 'v') ){
$ratio_compos_vs_seqid =int($ratio_compos_vs_seqid * $factor); } }
else{
$ratio_compos_vs_seqid = int($seq_id/$compos_id*10);
if( ($apply_factor eq 'f')&&($variable_win_size eq 'v') ){
$ratio_compos_vs_seqid =int($ratio_compos_vs_seqid * $factor); }
if( $ratio_compos_vs_seqid > 9){ $ratio_compos_vs_seqid = 9; }
}
#$ratio_compos_vs_seqid =int(($seq_id/$compos_id)*10-$factor);}
#$seq_id/abs($seq_id-$compos_id+0.1)*
#####################################################################
####### OUT of the loop (at the near to the end ##############
#####################################################################
if( ($w + $window_size/3) > $length ){ $ratio_compos_vs_seqid='.'; }
#####################################################################
#### When 's'how option is set(defined at prompt) #######
#####################################################################
if( $show_calculation eq 's' ){
printf ("SC=%-4s %-45s Seq=%-3.2s Compos=%-3.2s W=%-2s F=%-2s\n",
$ratio_compos_vs_seqid, $win_no_gap1,$seq_id, $compos_id, $window_size, $factor);
printf (" %-45s \n\n", $win_no_gap2);
}
#####################################################################
# Reducing increased window size according to SC rate (option 'r') ##
#####################################################################
if( ($variable_win_size eq 'v')&&($redu_window eq 'r') ){
if( $window_size > $ori_win_size ){
if( $window_size > ($length/2)){ print chr(7);
print "\n The increased window size is over half of seq. suspicious !! \n";
print "\n Disable 'v' (for variable window size), at prompt and run again\n\n";
}
# Options :
# Returns : a reference of a hash.
# Argument : One ref. for hash, one ref. for a scalar.
# Category :
# Version :
#--------------------------------------------------------------------
sub scan_windows_and_get_compos_seqid_rate{
my($base_l)=1;
my($scale)=1; # these are default params.
my(%input)=%{$_[0]};
my($window_size)=${$_[1]};
if(defined(${$_[2]})){ $base_l=${$_[2]}; } ### <---$base_c is the baseline controller for sensitivity.
if(defined(${$_[3]})){ $scale =${$_[3]}; } ### <---$base_c is the baseline controller for sensitivity.
my(@sequences,@out_rate,$i,$title,$window_1,$window_2,$ratio_compos_vs_seqid,@array_of_2_seq,%out_hash );
my(@keys)= keys %input;
my($whole_rate, $whole_rate_ref ,$out_rate_arr_ref);
for ($i=0; $i<=$#keys; $i++){
$sequences[$i]= $input{$keys[$i]}; }
($out_rate_arr_ref,$whole_rate_ref)=&get_windows_compos_and_seqid_rate_array(\@sequences,\$window_size,\$base_l,\$scale);
@out_rate=@{$out_rate_arr_ref}; $whole_rate=${$whole_rate_ref};
$title="CS_rate\($whole_rate\)";
$out_hash{$title}=join(",", @out_rate);
return( \%out_hash );
}
#________________________________________________________________________
# Title : get_windows_cs_rate_array
# Usage : @out_rate = @{&get_windows_cs_rate_array(\@seq, \$win_size)};
# Function : actual working part of scan_windows_and_get_compos_seqid_rate
# Example :
# Warning :
# Keywords :
# Options :
# Returns : \@ratio_array, \$ratio_whole_seq
# Argument : (\@input, \$window_size); @input => ('ABCDEFG.HIK', 'DFD..ASDFAFS', 'ASDFASDFASAS');
# Input ar => ( 'ABCDEFG
# 'DFD..ASDFAFS'
# 'ASDFASDFASAS' ) as the name of @sequences.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_windows_cs_rate_array{
my(@input)=@{$_[0]};
my($base_level)=1;
my($scale)=1;
my($window_size)=${$_[1]};
if(defined(${$_[2]})){ $base_level =${$_[2]}; } if(defined(${$_[3]})){ $scale =${$_[3]}; }
my(@ratio_array, @array_of_2_seq, $seq_id, $offset, $half_of_w_size, $t, $length, $w,
$compos_id, $seq_id, $window_2, $window_1, $compos_whole_seq, $seq_id_whole_seq,
$ratio_whole_seq, $win_rate_div_by_whole_rate, $normalizing_factor, $lowest_rate );
for ($t=0; $t < @input; $t++){
$length=length($input[$t]) if (length($input[$t])>$length); }
if ($length < $window_size){ $window_size = $length; }
#___________ getting ratio for the whole sequence ___________
$compos_whole_seq=${&compos_id_percent_array(\@input)};
$seq_id_whole_seq=${&seq_id_percent_array(\@input)};
if ($seq_id_whole_seq == 0){ $ratio_whole_seq =$compos_whole_seq/10; }
else{ $ratio_whole_seq =$compos_whole_seq/$seq_id_whole_seq; }
#___________ getting ratio for each window sequence ___________
for ($w=0; $w < $length; $w++){
$offset = $w - int($window_size/2); # $offset starts from -5 when window_size is 10.
$offset=0 if ($offset < 0);
$window_1=substr($input[0], $offset, $window_size); # window_1 is one segment
$window_2=substr($input[1], $offset, $window_size); # of defined length(size)
@array_of_2_seq=($window_1, $window_2); # making an array like this = ('ABCDE', 'BDESA')
$compos_id=${&compos_id_percent_array(\@array_of_2_seq)};
$seq_id =${&seq_id_percent_array(\@array_of_2_seq)};
#print "\n offset = $offset Wind1 = $window_1 Wind2 = $window_2 ";
#print " Compos1 = $compos_id Seqid = $seq_id \n";
#______ Handle special case when $seqid is zero > the rate becomes $compos_id/10 ______
if (($seq_id == 0) && ($compos_id != 0)){ $ratio_compos_vs_seqid = $compos_id/10; }
elsif(($seq_id == $compos_id)&&($seq_id == 0)){ $ratio_compos_vs_seqid = 0;}
elsif(($seq_id == $compos_id)&&($seq_id == 100)){ $ratio_compos_vs_seqid = 0;}
else{ $ratio_compos_vs_seqid=($compos_id/$seq_id); }
push(@ratio_array, $ratio_compos_vs_seqid); }
$lowest_rate = ${&min_elem_array(\@ratio_array)};
if($lowest_rate ==0){ $normalizing_factor=1; $ratio_whole_seq=0; }else{
$normalizing_factor=($ratio_whole_seq/$lowest_rate);
}
for (@ratio_array){ # the minimum value becomes equal to the whole seq. rate.
$_ = int($scale*($_*$normalizing_factor - ($ratio_whole_seq*$base_level))); #<<<----
$_= '^' if($_ > 9); $_= '_' if($_ < 0);
}
$ratio_whole_seq=int($ratio_whole_seq);
return( \@ratio_array, \$ratio_whole_seq);
}
#________________________________________________________________________
# Title : read_any_seq_files
# Usage : %out_seq=%{&read_any_seq_files(\$input_file_name)};
# Function : Tries to find given input regardless it is full pathname, with or
# without extension. If not in pwd, it searches the dirs exhaustively.
# Example : (*out1, *out2) =&read_any_seq_files(\$input1, \$input2);
# : (@out_ref_array)=@{&read_any_seq_files(\$input1, \$input2)};
# : (%one_hash_out) =%{&read_any_seq_files(\$input1)};
# Warning :
# Keywords : open_any_seq_files,
# Options :
# Returns : 1 ref. for a HASH of sequence ONLY if there was one hash input
# 1 array (not REF.) of references for multiple hashes.
# Argument : one of more ref. for scalar.
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub read_any_seq_files{
my(@out_hash_ref_list, $sub, $o, $ext );
my(@in)=@_;
for($o=0; $o< @in; $o++){
my($found, %out, @file_ext_accepted, $found_file, $sub);
if(ref($_[$o])){
@file_ext_accepted=('msf', 'fasta','jp','aln','ali','pir',
'slx', 'dna','fas','pdb','rms','brk', 'dssp');
if( ! -e ${$in[$o]} or -B ${$in[$o]} or -z ${$in[$o]} ){
print "\n#SUB read_any_seq_files: ${$in[$o]} no seq file exists(or not passed at all) for $0 \n\n",
chr(7);
exit;
}
$found_file=${&find_seq_files($in[$o])};
print "# in read_any_seq_files, \$found_file => $found_file\n";
for $ext(@file_ext_accepted){
$sub ="open\_$ext\_files";
);
%array1 = %{&hash_common(\%array1, \%array2)};
%array2 = %{&hash_common(\%array2, \%array1)};
%array1 = %{&remov_com_column(\%array1)}; # this removes wrong gaps(in '.' form, in MSF)
%array2 = %{&remov_com_column(\%array2)};
@names= keys %array2;
for $name (@names){
@string1 =split('', $array1{$name});
@string2 =split('', $array2{$name}); # ! @string2 is the structural. ! (used)
@seq_position1 = @{&get_posi_sans_gaps(\$array1{$name})};
@seq_position2 = @{&get_posi_sans_gaps(\$array2{$name})}; # @seq_position2 is structural
$len_of_seq =($#seq_position2+1);
push(@whole_length, $len_of_seq);
@position_diffs = @{&get_posi_diff(\@seq_position1, \@seq_position2)};
@position_corrected1 = @{&put_position_back_to_str_seq(\@string2, \@position_diffs)};
#print "@position_corrected1";
$array3{$name}=join(",", @position_corrected1); # array3 is for disply of seq.
} # !! split and join char is ',';
# %array3 has the form. These numbers are position differences between the same sequences
# one from str. one from seq.
# seq1 1,1,2,3,.,2,3,.,1,.,0,0,0,1,1,1,1,1,2
# seq2 1,1,2,1,.,1,3,.,1,.,0,0,1,0,1,1,1,3,2
# seq3 1,1,2,3,.,2,3,.,1,.,1,1,0,0,1,1,1,3,2
my(%final_posi_diffs) =%{&get_posi_diff_hash(\%array3)};
my($sum_of_posi_diffs)=${&sum_hash(\%final_posi_diffs)};
my($av_of_posi_diffs) =$sum_of_posi_diffs/($#names); # dividing by seq number.
my($sum_seq_length) =${&sum_array(\@whole_length)};
my($av_rate) =$av_of_posi_diffs/($sum_seq_length);
&print_seq_in_block(\%final_posi_diffs); # <--- leave this
for (values %final_posi_diffs){
my(@splited) = split(',', $_);
for (@splited){
$out{$_}++ if ($_ =~ /\d+/); }
}
# the final result is %out which has accumulated entries with occurances
}
#________________________________________________________________________
# Title : get_occurances_of_char
# Usage : %occurances_shft_type=%{&get_occurances_of_char(\%final_posi_diffs)};
# %char_occur=%{&get_occurances_of_char(\@ref_array_of_chars)};
# %char_occur=%{&get_occurances_of_char(\$ref_string_of_chars)};
# %char_occur=%{&get_occurances_of_char($string_of_chars)};
#
# Function : gets the numbers of occurances for 1, 2, 3 ... position shifts.
# If hash is given, it only looks at the values.
# If multiple string, array, hash or combinations of these
# are given, it will add up to one single result
# Example :
# Warning :
# Keywords : composition of chars, composition table making,
# make_composition, make composition table
# occurances_of_char, get_char_occurances, occurances
# get_percentage_occurances_of_char, percentage_occurances_of_char
# Options : 'p' for percentage output of the char among others
# 'n' for NO name option when HASH input is given
# Returns : one ref. of hash (a =>5, b=>6, c=>4,,,,,)
# Argument : one ref. of hash (seq1 alsdfjlsj
# seq2 asldfjsld
# seq3 owiurouou);
# Category :
# Version : 1.4
#--------------------------------------------------------------------
sub get_occurances_of_char{
my ($i, %H, $no_name, %out, $N,@splited, $val,$percentage_out,
$split, $sum);
for($i=0; $i< @_; $i++){
if($_[$i]=~/^[\-]?p$/i){
$percentage_out=1; splice(@_, $i, 1); $i--;
}elsif($_[$i]=~/^[\-]?n$/i){
$no_name=1; splice(@_, $i, 1); $i--;
}
}
for($i=0; $i< @_; $i++){
if( ref($_[$i]) eq 'HASH'){
my %H=%{$_[$i]};
my @names=keys %H;
for $key (@names){
for $split ( split(//, $H{$key}) ){
if($no_name==1){ $N=$split
}else{ $N="$key\_$split"; }
$out{$N}++; $sum++
}
}
}elsif(ref($_[$i]) eq 'ARRAY'){
@splited=@{$_[$i]};
for $split (@splited){ $out{$split}++; $sum++ }
}elsif(ref($_[$i]) eq 'SCALAR'){
@splited = split(//, ${$_[$i]});
for $split (@splited){ $out{$split}++; $sum++ }
}elsif( !(ref($_[$i])) ){
@splited = split(//, $_[$i]);
for $split (@splited){ $out{$split}++; $sum++ }
}
}
if($percentage_out==1){
my @keys=keys %out;
my %percent;
for($i=0; $i< @keys; $i++){
$percent{$keys[$i]} = $out{$keys[$i]}/$sum*100;
}
return(\%percent);
}else{
return(\%out);
}
}
#________________________________________________________________________
# Title : make_composition_table
# Usage : %occurances=%{&make_compos_table(\%key_and_value_for_seq)};
# Function : gets the numbers of occurances for 1, 2, 3 ... position shifts.
# Example :
# Warning :
# Keywords : composition of chars, composition table making, make composition table
# make_composition_table, get_composition, get_amino_acid_composition
# protein_composition, make_aa_composition_tablem, aa_composition
# Options :
# Returns : one ref. of hash (a =>5, b=>6, c=>4,,,,,)
# Argument : one ref. of hash (seq1 alsdfjlsj
# seq2 asldfjsld
# seq3 owiurouou);
# Category :
# Version : 1.2
#--------------------------------------------------------------------
sub make_composition_table{
my %input = %{$_[0]};
my (@splited, $split, %out );
for (values %input){
@splited = split(//, $_);
for $split (@splited){ $out{$split}++; }
}
return(\%out);
}
#________________________________________________________________________
# Title : make_composition_ratio_table_simple
# Usage : %occurances=%{&make_compos_ratio_table(\%final_posi_diffs)};
# Function : gets ratio of the numbers of occurances for any chars.
# Example :
# Warning : This pools all the sequences, to not distinct seq composition if
# you put more than one seq.
# Keywords : composition table, composition of chars, composition table making,
# make composition table, make_composition_table
# Options :
# Returns : one ref. of hash (a =>0.05, b=>0.06, c=>0.04,,,,,)
# Argument : one ref. of hash (seq1 alsdfjlsj
# seq2 asldfjsld
# seq3 owiurouou);
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub make_composition_ratio_table_simple{
my %input = %{$_[0]};
my %out;
my (@keys, $i, %ratio_out, $each_char_occur, @splited, $split, $all_occur );
for (values %input){
@splited = split(//, $_);
for $split (@splited){ $out{$split}++; $all_occur ++; }
}
@keys = keys %out;
for ($i=0; $i < @keys; $i ++){
#--------------------------------------------------------------------
sub three_to_one_letter{ my(%aa);
$aa{"ala"} = "a"; $aa{"met"} = "m"; $aa{"asp"} = "d"; $aa{"pro"} = "p";
$aa{"cys"} = "c"; $aa{"asn"} = "n"; $aa{"glu"} = "e"; $aa{"gln"} = "q";
$aa{"phe"} = "f"; $aa{"arg"} = "r"; $aa{"gly"} = "g"; $aa{"ser"} = "s";
$aa{"his"} = "h"; $aa{"thr"} = "t"; $aa{"ile"} = "i"; $aa{"val"} = "v";
$aa{"lys"} = "k"; $aa{"trp"} = "w"; $aa{"leu"} = "l"; $aa{"tyr"} = "y";
return(\%aa);
}
#________________________________________________________________________
# Title : convert_3_to_1_letter
# Usage : %three_letter = &three_to_one_letter ; # takes no arguments (void).
# Function : a hash of one letter to 3 letter amino acid code , returns a hash
# Example :
# Warning :
# Keywords : 321, 3to1 3_to_1 THREE_TO_ONE_LETTER Three_To_One_Letter
# convert_3_to_1, convert_3_to_1_aa_name
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub convert_3_to_1_letter{ my(%aa);
$aa{"ala"} = "a"; $aa{"met"} = "m"; $aa{"asp"} = "d"; $aa{"pro"} = "p";
$aa{"cys"} = "c"; $aa{"asn"} = "n"; $aa{"glu"} = "e"; $aa{"gln"} = "q";
$aa{"phe"} = "f"; $aa{"arg"} = "r"; $aa{"gly"} = "g"; $aa{"ser"} = "s";
$aa{"his"} = "h"; $aa{"thr"} = "t"; $aa{"ile"} = "i"; $aa{"val"} = "v";
$aa{"lys"} = "k"; $aa{"trp"} = "w"; $aa{"leu"} = "l"; $aa{"tyr"} = "y";
return(\%aa);
}
#________________________________________________________________________
# Title : convert_1_to_3_letter
# Usage : %three_letter = &three_to_one_letter ; # takes no arguments (void).
# Function : a hash of one letter to 3 letter amino acid code , returns a hash
# Example :
# Warning :
# Keywords : 123, 1to3 1_to_3 one_TO_three_LETTER One_To_Three_Letter
# convert_1_to_3, convert_1_to_3_aa_name
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub convert_1_to_3_letter{ my(%aa);
$aa{"a"} = "ala"; $aa{"m"} = "met"; $aa{"d"} = "asp"; $aa{"p"} = "pro";
$aa{"c"} = "cys"; $aa{"n"} = "asn"; $aa{"e"} = "glu"; $aa{"q"} = "gln";
$aa{"f"} = "phe"; $aa{"r"} = "arg"; $aa{"g"} = "gly"; $aa{"s"} = "ser";
$aa{"h"} = "his"; $aa{"t"} = "thr"; $aa{"i"} = "ile"; $aa{"v"} = "val";
$aa{"k"} = "lys"; $aa{"w"} = "trp"; $aa{"l"} = "leu"; $aa{"y"} = "tyr";
return(\%aa);
}
#________________________________________________________________________
# Title : amino_acid_compos_id_percent
# Usage : $percent = &amino_acid_compos_id_percent (%any_hash_with_sequences);
# The way identity(composition) is derived is;
#
# Function : gets amino acid composition identity of any given
# number of sequences(at least 2).
# Example :
# Warning :
# Keywords : get_amino_acid_composition, get_protein_composition, composition
# Options :
# Argument : hash of at least 2 sequences.
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub amino_acid_compos_id_percent{
my(%input)= %{$_[0]};
my(@names)=keys (%input);
my(@temp, $i, $j, $iden, @all_pairs_id, $iden_sum);
my(%compos_table1, %compos_table2, $final_iden, $larger);
for ($i=0; $i < @names; $i ++){ # putting seqs in arrays.
$input{$names[$i]}=~ tr/a-z/A-Z/;
$input{$names[$i]}=~ s/\W//g;
@{"string$i"}= split('', $input{$names[$i]});
$larger = @{"string$i"} if @{"string$i"} > $larger;
}
for ($i=0; $i < @names; $i++){ # to make permutated pairs.
for ($j=$i; $j < @names; $j ++){
if ($j == $i){ next; }
for ($k=0; $k < $larger; $k ++){ # getting composition tables for two seqs.
$compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);
$compos_table2{${"string$j"}[$k]}++ if (${"string$j"}[$k] =~ /\w/);
}
$iden = ${&common_compos_id_hash(\%compos_table1, \%compos_table2)};
%compos_table1=(); %compos_table2=();
push (@all_pairs_id, $iden );
}
}
for $iden (@all_pairs_id){ $iden_sum+= $iden; }
if(@all_pairs_id == 0){ @all_pairs_id =1; }
$final_iden=$iden_sum/@all_pairs_id;
\$final_iden;
}
#________________________________________________________________________
# Title : seq_id_percent_array (more than 2 elements array)
# Usage : $percent = &seq_id_percent_array(@any_array_sequences);
# The way identity(pairwise) is derived is;
#
# Function : produces amino acid composition identity of any given number of sequences.
# Example :
# Warning : This can handle 'common gaps' in the sequences
# Keywords : get_percent_composition_identity, seq_composition_identity,
# percent_sequence_composition_id
#
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub seq_id_percent_array{
my(@input, $denominator,@all_pairs_id, $percent_id);
my($largest,$p,$i,$j,$k,$iden_residue_num,$iden,@temp,$iden_sum,$gap_num,$final_iden);
for($d=0; $d<@_; $d++){
if(ref($_[$d]) eq 'ARRAY'){ @input=@{$_[$d]}; }
elsif( (ref($_[$d]) eq 'SCALAR') &&( ${$_[$d]}=~/^[aA]/) ){ $average_len_opt =1 }
elsif( !(ref($_[$d])) && ( $_[$d] =~/^[aA]/) ){ $average_len_opt =1 } }
if ((@input== 1)||( @input== 0)){
print "\n\n \" $0 \" requires at least 2 sequences\n\n";
print "\n Abnormally dying at seq_id_percent_array in $0 \n\n";
print chr(7); exit;}
$shortest=length($input[0]);
my($sans_gap_seq, $length_sum, $average_seq_len);
for($p=0; $p < @input; $p++){
$input[$p]=~ tr/a-z/A-Z/;
$sans_gap_seq=$input[$p];
$sans_gap_seq=~s/\W//g;
$input[$p]=~ s/\W/./g;
(@{"string$p"})=split('', $input[$p]);
$largest = length($input[$p]) if length($input[$p]) > $largest;
$shortest = length($sans_gap_seq) if length($sans_gap_seq) < $shortest;
$length_sum += length($sans_gap_seq);
}
$average_seq_len = $length_sum/@input;
for($i=0; $i< @input; $i++){
for($j=$i+1; $j< @input; $j++){
for ($k=0; $k < $largest; $k ++){ # getting composition tables for two seqs.
if ((${"string$i"}[$k] !~ /\W/)&&(${"string$i"}[$k] eq ${"string$j"}[$k])){
$iden_residue_num++; }
elsif((${"string$i"}[$k] =~ /\W/)&&(${"string$i"}[$k]=~ /\W/)){ $gap_num++; }}
if( $average_len_opt == 1){ $denominator = $average_seq_len; }
else{ $denominator = $shortest; }
if($denominator == 0){ $denominator=1; } # in the above it is 50% rather than 0.07%
$percent_id=($iden_residue_num/($denominator))*100;
push(@all_pairs_id, $percent_id);
undef ($iden_residue_num, $gap_num);
}
}
for (@all_pairs_id){ $iden_sum+=$_; }
$final_iden=$iden_sum/($#all_pairs_id+1);
return( \$final_iden );
}
#________________________________________________________________________
# Title : compos_id_percent_array (more than 2 elements array)
# Usage : $percent = &compos_id_percent_array(@any_array_sequences);
# The way identity(composition) is derived is;
# Function : produces amino acid composition identity of any given number of sequences.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub compos_id_percent_array{
my(@input)=@{$_[0]};
my($largest,$iden,@temp,$iden_sum,$final_iden, @all_pairs_id);
for($i=0; $i<=$#input; $i++){ $input[$i]=~ tr/a-z/A-Z/; $input[$i]=~ s/[\.\-\s]//g;
@temp = split('', $input[$i]); (@{"string$i"})= @temp;
$largest = @{"string$i"} if @{"string$i"} > $largest; }
for($i=0; $i< @input; $i++){ #_________ permutating ___________
for($j=$i+1; $j<=$#input; $j++){
for ($k=0; $k <= $largest; $k ++){ # getting composition tables for two seqs.
$compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);
$compos_table2{${"string$j"}[$k]}++ if (${"string$j"}[$k] =~ /\w/); }
$iden =${&calc_compos_id_hash(\%compos_table1, \%compos_table2)};
push(@all_pairs_id, $iden); %compos_table1=(); %compos_table2=(); } }
for $iden (@all_pairs_id){ $iden_sum+=$iden; }
$final_iden=$iden_sum/(@all_pairs_id);
#-----------------------------------------------------
# Input here is like : %hash1= (A,3,B,3,C,4,D,4), %hash2= (A,4,B,1,C,4)
sub calc_compos_id_hash{ # input is like this;
my(%hash1)=%{$_[0]};
my(%hash2)=%{$_[1]};
my(%common_of_the_2)=();
my($common, $compos_id, $sum_residues, $sum_of_the_common_residue_no);
my(@values1) = values (%hash1);
my(@values2) = values (%hash2);
my(@combined_values)=(@values1,@values2);
for $elem (@combined_values){ $sum_residues += $elem; }
if($sum_residues == 0){ $compos_id =0; } # to prevent Illegal division error.
else{ for $key1(keys %hash1){
$common=&smaller_one($hash1{$key1}, $hash2{$key1});
sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
$sum_of_the_common_residue_no += $common; }
$compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100; }
\$compos_id;
}
#-----------------------------------------------------
return ( \$final_iden ); # final identity for any given set of strings(seq).
}
#________________________________________________________________________________
# Title : compos_id_percent_hash (synonym of amino_acid_compos_id_percent)
# Usage : $percent = &compos_id_percent_hash(%any_hash_with_sequences);
# The way identity(composition) is derived is;
#
# Function : gets amino acid composition identity of any given number of sequences.
# Example :
# Warning :
# Keywords : get_amino_acid_composiiton
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub compos_id_percent_hash{ my(%input, @names);
if(ref($_[0]) eq 'HASH'){ %input= %{$_[0]}; @names= keys %input; }
else{ print "\n hash ref was not passed to compos_id_percent_hash\n"; exit; }
my(@temp, $iden, @all_pairs_id, $i, $j, $k,$iden_sum);
my(%compos_table1, %compos_table2, $final_iden, $larger);
for ($i=0; $i < @names; $i ++){ $input{$names[$i]}=~ tr/a-z/A-Z/;
$input{$names[$i]}=~ s/\W//g; @temp = split('', $input{$names[$i]});
(@{"string$i"})=@temp; $larger = @{"string$i"} if @{"string$i"}>$larger;}
for ($i=0; $i < @names; $i++){
for ($j=$i; $j < @names; $j ++){
if ($j == $i){ next; }
for ($k=0; $k < $larger; $k ++){
$compos_table1{${"string$i"}[$k]}++ if (${"string$i"}[$k] =~ /\w/);
$compos_table2{${"string$j"}[$k]}++ if (${"string$j"}[$k] =~ /\w/); }
$iden = ${&common_compos_id_hash(\%compos_table1, \%compos_table2)};
%compos_table1=(); %compos_table2=(); push (@all_pairs_id, $iden); }}
for $iden (@all_pairs_id){ $iden_sum+=$iden; }
$final_iden=$iden_sum/(@all_pairs_id);
return(\$final_iden);
}
#________________________________________________________________________
# Title : common_compos_id_hash (BUG free)
# Usage : %hash = &common_compos_hash(\%any_hash1, \%any_hash1);
# Function : actual calculation of identity
# Example : ('A', 200, 'C', 191, D, 99)
# ('A', 290, 'C', 199, D, 100)
# uses only two sequences.
# Warning :
# Keywords :
# Options :
# Returns : ref. of a scaler (in percent) eg) 95
# Argument : two references of hash of seqeunces.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub common_compos_id_hash{
my(%hash1)=%{$_[0]};
my(%hash2)=%{$_[1]};
my(%common_of_the_2)=(); my($common, $compos_id, $sum_of_the_common_residue_no);
my(@values1) = values (%hash1); my(@values2) = values (%hash2);
my(@combined_values)=(@values1, @values2);
my($sum_residues) = ${&sum_array(\@combined_values)};
for $key1(keys %hash1){
$common=&smaller_one($hash1{$key1}, $hash2{$key1});
sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
$sum_of_the_common_residue_no += $common;
}
if( $sum_residues == 0){ $sum_residues =1 }
$compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100;
\$compos_id;
}
#________________________________________________________________________
# Title : calc_compos_id_hash (the same as 'common_compos_hash')
# Usage : %hash = &calc_compos_hash(\%any_hash1, \%any_hash1);
# Function : actual calculation of identity
# Example : ('A', 200, 'C', 191, D, 99)
# ('A', 290, 'C', 199, D, 100)
# uses only two sequences.
# Warning :
# Keywords :
# Options :
# Returns : ref. of a scaler (in percent) eg) 95
# Argument : two references of hash of seqeunces.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub calc_compos_id_hash{ my(%hash1)=%{$_[0]}; my(%hash2)=%{$_[1]}; my(%common_of_the_2)=();
my($common, $compos_id, $sum_residues, $sum_of_the_common_residue_no);
my(@values1) = values (%hash1); my(@values2) = values (%hash2);
my(@combined_values)=(@values1,@values2);
for $elem (@combined_values){
$sum_residues += $elem; }
if($sum_residues == 0){ $compos_id =0; } # to prevent Illegal division error.
else{
for $key1(keys %hash1){
$common=&smaller_one($hash1{$key1}, $hash2{$key1});
sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
$sum_of_the_common_residue_no += $common; }
$compos_id = $sum_of_the_common_residue_no/($sum_residues/2)*100; }
\$compos_id;
}
#________________________________________________________________________
# Title : get_percentage
# Usage : %out= %{&get_percentage(\%result, '1')};
# Function : calculates the percentage content of any single char over the whole
# length of strings in it.
# Example : if the string is 'seq ABCDEEEEEFFEFE' given in a hash
# if you put 'A' as one argument, it counts the occurances of 'A'
# and gets the percentage of it.
# Warning : This converts array and string input as ref. into arbitrary hash and
# returns hash
# programmed by A Biomatic
# Keywords : get_percentage_of_char
# Options : None yet.
# Returns : Numerical Percentage
# Argument : ref. for Scalar string or Array of chars or Hash AND 'the target char'
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_percentage{
my(@in, $k, $sort, $numerator, $residue, @out_hash_ref, %hash_out );
for($k=0; $k< @_ ;$k++){
if( !ref($_[$k])&& (length($_[$k]) == 1 )){
$numerator = $_[$k];
}
elsif( (ref($_[$k]) eq 'SCALAR') && (length(${$_[$k]}) == 1 )){
$numerator = ${$_[$k]};
}
elsif(ref($_[$k]) eq "HASH") { push(@in, $_[$k]); }
elsif(ref($_[$k]) eq "ARRAY") { push(@in, &convert_arr_and_str_2_hash($_[$k], $k));} #<-- conv array to hash.
elsif(ref($_[$k]) eq "SCALAR"){ push(@in, &convert_arr_and_str_2_hash($_[$k], $k));} #<-- conv array to hash.
}
####### final output is => @in of hash ref elements #######
for (@in){ my(%H) = %{$_}; my(@keys)= sort keys %H;
for $name(@keys){
my($numerator_count);
my($seq_len) = length($H{$name}); print "\n $name Sequence length: ", $seq_len, "\n";
my(@string) = split(//, $H{$name});
for $residue (@string){ if($residue =~/^$numerator$/){ $numerator_count ++; }}
my($percent) = $numerator_count/$seq_len *100;
$hash_out{$name}=$percent; }
push(@out_hash_ref, \%hash_out); }
if(@out_hash_ref ==1){ return($out_hash_ref[0]); }
elsif( @out_hash_ref > 1){ return(@out_hash_ref); }
}
#________________________________________________________________________
# Title : pairwise_percent_id (pairwise sequence identity in percentage)
# Usage : $identity = ${&pairwise_percent_id(%arrayinput)};
#
# Function : takes a ref. of a hash of names and sequences, returns
# percent identity.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub pairwise_percent_id{
my($i,$j,$k, @iden_array_ref);
for($i=0; $i< @_; $i++){ my %input= %{$_[$i]}; my @names= sort keys %input;
my(@temp, $iden, @all_pairs_id, $whole_seq_len, $residue_sum1,$residue_sum2);
my($final_av_iden, $larger, $percent_for_pair,@percent_for_pair, $iden_sum);
for ($i=0; $i < @names; $i ++){ $input{$names[$i]}=~ tr/a-z/A-Z/;
@temp = split('', $input{$names[$i]}); (@{"string$i"})=@temp;
$larger = @{"string$i"} if @{"string$i"} > $larger; }
for ($i=0; $i < @names; $i++){ # to make permutated pairs.
for ($j=$i+1; $j < @names; $j ++){
for ($k=0; $k < $larger; $k ++){ # getting composition tables for two seqs.
$iden+=2 if ((${"string$i"}[$k] eq ${"string$j"}[$k])&&(${"string$i"}[$k] =~ /\w/));
$residue_sum1++ if (${"string$i"}[$k] =~ /\w/);
$residue_sum2++ if (${"string$j"}[$k] =~ /\w/); }
$whole_seq_len =($residue_sum1+$residue_sum2);
$percent_for_pair = $iden/$whole_seq_len*100;
push(@percent_for_pair,$percent_for_pair);
$residue_sum1=0; $residue_sum2=0; $iden=0; } }
for $iden (@percent_for_pair){ $iden_sum+=$iden;}
$final_av_iden=$iden_sum/( @percent_for_pair );
push(@iden_array_ref, \$final_av_iden); }
if(@iden_array_ref ==1){ return($iden_array_ref[0]);}else{ return(@iden_array_ref);}
}
#________________________________________________________________________
# Title : get_seq_identity
# Usage : $identity = ${&get_seq_identity(%arrayinput)};
#
# Function : takes a ref. of a hash of names and sequences, returns
# percent identity. NOT composition identity.
# Example :
# Warning :
# Keywords : get_sequence_identity
# Options :
# Returns :
# Argument : hash(es) of sequences.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_seq_identity{
my($i,$j,$k, $c, @iden_array_ref);
for($c=0; $c< @_; $c++){
my %input= %{$_[$c]};
my @names= sort keys %input;
my(@temp, $iden, @all_pairs_id, $whole_seq_len, $residue_sum1,$residue_sum2);
my($final_av_iden, $larger, $percent_for_pair,@percent_for_pair, $iden_sum);
for ($i=0; $i < @names; $i ++){
$input{$names[$i]}=~ tr/a-z/A-Z/;
@temp = split(//, $input{$names[$i]});
@{"string$i"}=@temp;
$larger = @{"string$i"} if @{"string$i"} > $larger; }
for ($i=0; $i < @names; $i++){ # to make permutated pairs.
for ($j=$i+1; $j < @names; $j ++){
for ($k=0; $k < $larger; $k ++){
if ( ${"string$i"}[$k] eq ${"string$j"}[$k] and ${"string$i"}[$k] =~ /\w/){
$iden+=2 ;
}
$residue_sum1++ if (${"string$i"}[$k] =~ /\w/);
$residue_sum2++ if (${"string$j"}[$k] =~ /\w/);
}
$whole_seq_len =($residue_sum1+$residue_sum2);
$percent_for_pair = $iden/$whole_seq_len*100;
push(@percent_for_pair,$percent_for_pair);
$residue_sum1=0;
$residue_sum2=0;
$iden=0;
}
}
for $iden (@percent_for_pair){
$iden_sum+=$iden;
}
if(@percent_for_pair <1){ @percent_for_pair=(1); }
$final_av_iden=$iden_sum/( @percent_for_pair );
push(@iden_array_ref, \$final_av_iden);
}
if(@iden_array_ref ==1){
return($iden_array_ref[0]);
}else{
return(@iden_array_ref);
}
}
#________________________________________________________________________
# Title : get_correct_percent_alignment_rate (made for Bissan)
# Usage : &get_correct_percent_alignment_rate(\$file1, \$file2);
# Function : accepts two files and prints out the sequence identities of the alignment.
# Example :
# Warning : Alpha version, A Biomatic , made for Bissan
# Keywords :
# Options : h # for help
# v # for verbose printouts(prints actual sequences)
# Returns : reference of Scalar for percentage correct alignment(for already
# aligned sequences)
# Argument : two sequence files which have identical sequence names.
# Category :
# Version :
#--------------------------------------------------------------------
sub get_correct_percent_alignment_rate{
my($i, $j, $k, $verbose, @string1, @string2, $larger, $seq_pair_id, @seq_pair_ids );
my(%inputhash1) = %{&read_any_seq_files($_[0])};
my(%inputhash2) = %{&read_any_seq_files($_[1])};
my(@names)= sort keys %inputhash1;
######################################
####### Sub argument handling ########
######################################
for($k=0; $k< @_ ;$k++){
if( !ref($_[$k])&& (length(${$_[$k]}) < 5)){ # when inputs are not ref.
if($_[$k]=~ /^[\-vV]$/){ $verbose = 1; next;}
}
elsif((ref($_[$k]) eq "SCALAR")&&(length(${$_[$k]})<5)){ # when inputs are ref.
if(${$_[$k]}=~ /^[\-vV]$/){$verbose = 1;next;} # should shorter than 5
}
}
for($i =0; $i < @names; $i++){
print "\n\n==== Processing structural $names[$i] against artificial $names[$i]\n";
$inputhash1{$names[$i]} =~ tr/a-z/A-Z/;
$inputhash2{$names[$i]} =~ tr/a-z/A-Z/;
@string1=split(//, $inputhash1{$names[$i]});
@string2=split(//, $inputhash2{$names[$i]});
print "\n The string1 is ",@string1,"\n" if $verbose ==1;
print "\n The string2 is ",@string2,"\n" if $verbose ==1;
(@string2 > @string1) ? ($larger=@string2, $smaller=@string1) : ($larger=@string1, $smaller=@string2);
$true_seq=$inputhash1{$names[$i]};
$true_seq=~s/\W//g;
$true_len=length($true_seq);
print "\n True seq length: $true_len , Whole length inc gap: $larger\n";
for($j = 0; $j < $larger; $j++){
$iden_sum++ if ($string1[$j] eq $string2[$j])&& !($string1[$j]=~/\W/); }
$seq_pair_id =($iden_sum/$true_len) * 100;
print "\nID between structural and artifical alignment is $seq_pair_id \%" , "\n";
push(@seq_pair_ids, $seq_pair_id);
undef( $iden_sum, $seq_pair_id );
}
print "\n", "_"x88, "\n";
my($whole_average_of_the_id)=${&array_average(\@seq_pair_ids)};
print "The whole average is; $whole_average_of_the_id\n";
if(@seq_pair_ids == 1){ return( \$seq_pair_ids[0] ); }
elsif(@seq_pair_ids > 1){ return( \@seq_pair_ids ); }
}
#________________________________________________________________________
# Title : amino_acid_compos_id_percent_trend
# Usage :
# Function :
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub amino_acid_compos_id_percent_trend{
my(%input) = %{$_[0]};
my(@common, @string,@accumu_percent_iden)=(); my(%common_so_far, %compos_table);
my($percent_id_so_far, $length_of_one_seq,$length_of_all_seq, $seq_no)=0;
for $key(keys %input){
$input{$key}=~s/[. \d-]//g;
@string= split(//, $input{$key});
print @string; print "\n";
$length_of_one_seq = $#string+1;
$length_of_all_seq +=$length_of_one_seq;
$seq_no += 1;
%compos_table = &composition_table(@string);
@check = keys (%common_so_far);
if ($#check < 0){ %common_so_far = %compos_table; }
else{ %common_so_far= %{&common_compos_2_hash(\%common_so_far,\%compos_table)};}
for $value(values %common_so_far){ $common_residue_sum +=$value; }
$final_percent_id = $common_residue_sum/($length_of_all_seq/$seq_no)*100;
$common_residue_sum =0; }
for $value(values %common_so_far){ $common_residue_sum +=$value; }
$final_percent_id = $common_residue_sum/($length_of_all_seq/$seq_no)*100;
return(\$final_percent_id);
}
#________________________________________________________________________
# Title : composition_table (can handle both nucleic and protein seq)
# Usage : %output = %{&compos_table(@input_array1, @input_array2,,,,)};
# example input
#
# Function : returns a table of alphabet with occurances.
# can handle any char, this converts char to upper case.
# Example :
# Warning : converts all SMALL letters to Capital letters before counting!!
# Keywords :
# Options :
# Returns : %hash1 = ('A',3, 'C',2, 'D',1, 'Q',2, 'S',1), %hash2,,,
# Argument :
# Category :
# Version :
#--------------------------------------------------------------------
sub composition_table{
my($i, @input,%input,$input,$j,@ref_out);
for($i=0;$i<@_; $i++){
my(%alphabet)=();
if( ref($_[$i]) eq 'ARRAY'){ @input=@{$_[$i]}; undef(%alphabet);
for ($j=0; $j<=$#input; $j++){ $input[$j] =~ tr/a-y/A-Y/;
$alphabet{$input[$j]}+=1 if ($input[$j] =~/[A-Y]/) }
push(@ref_out, \%alphabet); }
elsif( ref($_[$i]) eq 'HASH'){ %input=%{$_[$i]};@input=keys %input;undef(%alphabet);
for ($j=0; $j< @input; $j++){ $input[$j] =~ tr/a-y/A-Y/;
$alphabet{$input[$j]}+=1 if ($input[$j] =~/[A-Y]/) }
push(@ref_out, \%alphabet); }
elsif( ref($_[$i]) eq 'SCALAR'){ $input=${$_[$i]}; $input=s/\,//g if $input=~/\,/;
@input=split('', $input); undef(%alphabet);
for ($j=0; $j<@input; $j++){ $input[$j] =~ tr/a-y/A-Y/;
$alphabet{$input[$j]}+=1; }
push(@ref_out, \%alphabet); } }
if(@ref_out ==1){
return($ref_out[0]);
}else{ return(@ref_out); }
}
#________________________________________________________________________
# Title : common_compos_2_hash
# Usage : %hash = &common_compos_hash(\%any_hash1, \%any_hash1);
# Function :
# Example : common gaps means only '.' (dots, not alphabets!!)
# AAA....BBCB
# AAAB..B.BCC --> A.A.....BC. (as in an array)
# A.AAA...BCA
# Warning :
# Keywords :
# Options :
# Returns : a hash (string1, number1, string2, number2, string3, number3, ...)
# Argument : two references of hash of seqeunces.
# Category :
# Version :
#--------------------------------------------------------------------
sub common_compos_2_hash{ my(%hash1)=%{$_[0]}; my(%hash2)=%{$_[1]};
my(%common_of_the_2)=(); my($common)=0;
for $key1(keys %hash1){
$common=&smaller_one($hash1{$key1}, $hash2{$key1});
if ($common =~ /\d+/){
$common_of_the_2{$key1}=$common; } }
\%common_of_the_2;
}
#________________________________________________________________________
# Title : pair_percent_id_trend
# Usage : @array = &pair_percent_id_trend (%arrayinput);
# Function :
# Example : common gaps means only '.' (dots, not alphabets!!)
# AAA....BBCB
# AAAB..B.BCC --> A.A.....BC. (as in an array)
# A.AAA...BCA
# The resulting array XXXXX..XXXX is literally like so.
# This is to detect absurd gaps in the above.
#
#
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version :
#--------------------------------------------------------------------
sub pair_percent_id_trend{
my(%input) = %{$_[0]};
my(@common, @string,@accumu_percent_iden)=();
my($percent_id_so_far)=0;
for $key(keys %input){
my($len) = &smaller_one($#common, $#string) unless $#common < 0;
$input{$key}=~s/ //g;
@string= split(//, $input{$key});
$length_of_one_seq = $#string+1;
$length_of_all_seq +=$length_of_one_seq;
$seq_no += 1;
for ($k=0; $k <= $len;$k++){
if($#common == -1){
@common = @string;
last;
}
elsif (($string[$k] =~ /^(\W)/)&&($1 ne $previous_non_char)){
$non_char_count +=1;
$previous_non_char=$1;
}
elsif ( $string[$k] eq $common[$k] ){
$common[$k] = $string[$k];
$identical_count +=1;
}elsif( $string[$k] ne $common[$k]){
$common[$k]='.';
}
}
$num_of_iden_char = &count_num_of_char(\@common);
$av_seq_no = $length_of_all_seq/$seq_no;
$percent_id_so_far = $num_of_iden_char/$av_seq_no*100;
print "\n percent_id so far = $percent_id_so_far \n";
push(@accumu_percent_iden,$percent_id_so_far);
} # end of for (after all sequences have been run).
$num_of_iden_char = &count_num_of_char(@common);
$av_seq_no = $length_of_all_seq/$seq_no;
$percent_id_so_far = $num_of_iden_char/$av_seq_no*100;
print "\n percent_id so far = $percent_id_so_far \n";
\@accumu_percent_iden; # final ids array.
}
#________________________________________________________________________
# Title : smaller_one
# Usage : $smaller = & smaller_one($var, $var2);
#
# Function : gets smaller value of the two inputs
# Example : will return 5 with &smaller_one(5, 50);
# Warning : gets only digits!!
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version :
#--------------------------------------------------------------------
sub smaller_one{
if (($_[0], $_[1]=~/\d+/)||($_[0] > $_[1])){
return $_[1];
}elsif(($_[0], $_[1]=~/\d+/)||($_[0] <= $_[1])){
return $_[0];
}else{
print "\n I am sub 'smaller_one', the input were not digits \n";
}
}
#________________________________________________________________________
# Title : count_num_of_char
# Usage : $num_char = &count_num_of_char(@input_array_of_single_char);
# Function : takes only ARRAY and counts the number of char. Each elem should be
# a single char.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version :
#--------------------------------------------------------------------
sub count_num_of_char{
my(@input)={$_[0]};
my($num_of_char)=0;
for $elem(@input){ # this is for the percentage of TWO seqs.
if ($elem =~ /\w/){
$num_of_char +=1;
}
}
$num_of_char;
}
#________________________________________________________________________
# Title : remov_com_column2 (this is the older and slower version)
# Usage : %new_string = %{&remov_com_column2(\%input_hash)};
# Function :
# Example : seq1 ABCDE------DDD seq1 ABCDE--DDD
# seq2 ABCDEE-----DD- ==> seq2 ABCDEE-DD-
# seq3 ---DEE----DDE- seq3 ---DEEDDE-
# ^^^^
# from above the 4 columns of gap will be removed
# To remove absurd gaps in multiple sequence alignment
# Warning :
# Keywords :
# Options :
# Returns : a ref. of a hash.
#
# <input hash> <out hash>
#
# Argument : accepts reference for a hash.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub remov_com_column2{
my(%input) = %{$_[0]};
my(@common)=();
my($len)=0;
my(@string)=();
my(@new_string)=();
my(@string2)=();
my(%new_string);
########## Finds common gaps ###########
for $key(keys %input){
$len = &smaller_one($#common, $#string) unless $#common < 0;
sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
@string = split(/|| /, $input{$key});
for ($k=0; $k <= $len;$k++){
if($#common == -1){
@common = (@string);
last;
}
if (($string[$k] eq '.')&&($common[$k] eq '.')){
$common[$k]='.';
}else{
$common[$k]='X';
}
}
}
########## removes gaps ###########
for $key2 (keys %input){
@string2 = split(//, $input{$key2});
for ($i=0; $i <= $#string2; $i++){
if ($common[$i] eq $string2[$i]){
print;
}else{ ### If not option is set, just write
%TEM = @in;
$LEN = ${&max_elem_string_array_show_hash( keys %TEM)};
if($LEN > $KL){ $KL = $LEN + $GAP +2};
printf ("%-${KL}s ", $in[$k]); $k++; # print $in[$k], "\t"; $k++;
printf ("%-${VL}s\n",$in[$k]); # print $in[$k], "\n";
}
}
#________________________________________________________
# Title : max_elem_string_array_show_hash
# Keywords : largest string length of array
# Function : gets the largest string length of element of any array of numbers.
# Usage : ($out1, $out2)=@{&max_elem_array(\@array1, \@array2)};
# ($out1) =${&max_elem_array(\@array1) };
# Argument : numerical arrays
# returns : one or more ref. for scalar numbers.
# Version : 1.1
#-------------------------------------------------------
sub max_elem_string_array_show_hash{
my(@input, $i, $max_elem);
@input = @{$_[0]} || @_;
for($i=0; $i< @input ; $i++){
$max_elem = length($input[0]);
if (length($input[$i]) > $max_elem){
$max_elem = length($input[$i]);
}
}
\$max_elem;
}
#####################################insert_gaps_in_seq_hash
}
}
}
#_______________________________________________________________
# Title : open_predator_files
# Usage :
# Function : gets sec. str. prediction of predator and puts in hash
# If 's' option is given, it also gives sequence hash ref
# as the second output ref. This can handle the 2 types
# of output format of predator. So, the output can will
# be different according to inputs.
# Example :
# There are 2 types of output. The short output:>
#
# > MOZ_HUMAN_part
# . . . . .
# 1 LDHKTLYYDVEPFLFYVLTQNDVKGCHLVGYFSKEKHCQQKYNVSCIMIL 50
# ___EEEEEE__HHHHHHH_______EEE____________EEEEEEEEE_
#
# ((-l option for long output )
# NAME MOZ_HUMAN_part
# HEADER |- Residue -| Pred Rel NAli Asn
# PRED 1 MET M c 0.000 0 ?
# PRED 2 ALA A c 0.000 0 ?
#
# Warning :
# Keywords : open_prd_files, open_pred_files, predator, open_prdl_files
# open_pre_files, secondary structure prediction file
# Options : 's' for sequence output as well (\%sec_str, \%seq)
# 'p' for percentage of the sec. str.
# 'a' for accumulated percentage. This will
# set 'p' automatically
# 'n' for NO name when outputing Percentage of chars with
# HASH input to get_occurances_of_char sub.
# $reverse_residue_order=r by r
# Returns :
# Argument :
# Category :
# Version : 1.8
#-----------------------------------------------------------
sub open_predator_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my( @out_ref, $seq_out, %sec_str, %seq, $percent_out, $NO_name_out,
$short_form_out_detected, $long_form_out_detected, $accumulate,
$reverse_residue_order, %rev_sec_str);
if($char_opt=~/s/i){ $seq_out=1 }
if($char_opt=~/a/i){ $accumulate=1 }
if($char_opt=~/p/i){ $percent_out=1 }
if($char_opt=~/n/i){ $NO_name_out='n' }
if($char_opt=~/r/){ $reverse_residue_order='r' }
for($i=0; $i< @file; $i++){
my (%sec_str, %seq) if($accumulate !=1);
open(PREDATOR_FILE, "$file[$i]");
print "\n# (INFO) open_predator_files: opening $file[$i]";
while(<PREDATOR_FILE>){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Simple sec str input form
#________________________________________________
if(/\> *(\S+)/){
$name=$1; $short_form_out_detected=1; print "\n# (INFO) \"$name\" was found in $file[$i]";
}elsif($short_form_out_detected and /^[\t ]+([CHE_]+) *$/i){
$sec_str{$name}.=$1;
}elsif($short_form_out_detected and $seq_out==1 and /[ \t]*\d+[ \t]*(\w+)[\t ]*\d+/){
$seq{$name}.=$1;
}elsif($short_form_out_detected and /^[\t ]+\d+ +\S/){
next;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Sec str form HASH input. Complex form
#________________________________________________
}elsif(/^ *NAME +(\S+)/){
$name=$1; $long_form_out_detected=1;
}elsif($long_form_out_detected and /^ *\S+ +(\d+) +(\S+) +(\S+) +(\S) +(\S+) +\S+ +\S+/){
$position=$1;
$residue_3_letter=$2;
$residue_1_letter=$3;
$sec_str_predicted=$4;
$reliability=$5;
$sec_str{$position}=[$residue_1_letter, $sec_str_predicted, $reliability];
}
}
close (PREDATOR_FILE);
print "\n \%sec_str is: ", %sec_str, "\n" if ($debug == 1);
if($seq_out==1){ push(@out_ref, \%sec_str, \%seq);
}elsif($percent_out==1 ){
push(@out_ref, [%{&get_occurances_of_char(\%sec_str, $NO_name_out, 'p')}] );
}elsif($percent_out !=1){ push(@out_ref, \%sec_str) }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# If -r option is set (for long form, this does not affect
#____________________________________________________________
if($short_form_out_detected and $reverse_residue_order){
@keys=keys %sec_str;
for($r=0; $r<@keys; $r++){
$rev_sec_str{$keys[$r]}=reverse($sec_str{$keys[$r]});
}
%sec_str=%rev_sec_str;
}
}
print "\n# (INFO) \$long_form_out_detected is returned from open_predator_files\n" if $long_form_out_detected;
print "\n# (INFO) \$short_form_out_detected is returned from open_predator_files\n" if $short_form_out_detected;
if(@out_ref==1){
return($out_ref[0]);
}elsif(@out_ref>1){
return(@out_ref);
}
}
#_______________________________________________________________________________
# Title : open_phd_files
# Usage : &open_phd_files(\$file_name, $options,,,,,);
# Function : open phd files and put sequences in a hash(s) (run open_phd_files.pl to
# get some ideas on how this works. type 'open_phd_files.pl xxx.phdo s',
# it will produce 5 different hashes of secondary structure pred.
# Example :
# Warning : All the spaces are converted to '_'
# Keywords :
# Options : $secondary, $access, $PHD_sec, $Rel_sec, $prH_sec, $prE_sec, $prL_sec,
# $prL_sec, $SUB_sec, $P_3_acc, $PHD_acc, $Rel_acc, $SUB_acc);
# $attach_class_info_in_seq_name=c by c ## this makes seq_name seq_name_PHD_s
# $simple_seq_with_name_hash=s by s
#
# Returns : one or more hashes(ref.) secondary structure prediction of PHD server
# --- The PHD secondary server output which are read by open_phd_files -----
# 1 => PHD sec | HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH HHHHHHH|
# 2 => Rel sec |987544342178899999999987678999998478999999999995679771688999|
# 3 => prH sec |001222323478899999999987778999998678999999999986110115788999
# 4 => prE sec |000010000101000000000000010000000000000000000000000000010000
# 5 => prL sec |987666565410000000000001110000001211000000000002789774100000
# 6 => SUB sec |LLLL
# 7 => P_3 acc |eeeeeeeeee bbeeebbbebbbbebeeee b bbebbebb eebeebe eee eebbeb|
# 8 => PHD acc |988787787630066600060000606667515007007005760671847885760160
# 9 => Rel acc |979685546222352421667053233245604127749164753790316552446141
# 0 => SUB acc |eeeeeeeee
# types of PHD output, like 1 for 'PHD sec', 2 for 'Rel sec' etc.
# Argument : one or more file names and options. Files should be PHD server's result.
# Version : 1.6
#-------------------------------------------------------------------------------
sub open_phd_files{
my(@names, $i, $j,$n, $s, @in, @out_hash_ref_list, $base, @option);
my($secondary, $access, %PHD_sec, %Rel_sec, %prH_sec, %prE_sec, %prL_sec, %prL_sec,
%SUB_sec, %P_3_acc, %PHD_acc, %Rel_acc, %SUB_acc, $simple_seq_with_name_hash,
$PHD_sec_on, $Rel_sec_on, $prH_sec_on, $prE_sec_on, $prL_sec_on, $SUB_sec_on,
$P_3_acc_on, $PHD_acc_on, $Rel_acc_on, $SUB_acc_on, $residues,
$PHD_sec, $Rel_sec, $prH_sec, $prE_sec, $prL_sec, $SUB_sec, $P_3_acc,
}else{
if(@out_hash_ref_list == 1){ return($out_hash_ref_list[0]); }
elsif(@out_hash_ref_list > 1){ return(@out_hash_ref_list); }
}
}
#________________________________________________________________________
# Title : open_dna_files (genbank file opener)
# Usage : ($out, $out2) = @{&open_dna_files(\$inputfile1, \$inputfile2)};
# : (@out) = @{&open_dna_files(\$inputfile1, \$inputfile2)};
# ---------- Example of dna file --- dna files are genbank file format
#
#
# 1 ggatcttgct gaatacatgg tggcacaatt gaaattagat ccgcgaattt
# tcatcaaaac
# 61 agcgggatta tggtcaacaa atccgtaaaa atgaaaagcc tgtcttgcga
# caggcttttt
# 121 tatttgaatg taatcctcac tggtaaacgt ttaacgccaa agacaaaggg
# actagggatc
# 181 gcttcaagct tttcatcatg agcagctttt tcgatacaag ctgacattga
#
# Function : open dna files and put sequences in a hash(s)
# Example :
# Warning :
# Keywords :
# Options :
# Returns : (@out_array_of_refs)
# Argument : (\$inputfile1, \$inputfile2, .... )};
# Category :
# Version :
#--------------------------------------------------------------------
sub open_dna_files{ my(@in)=@_; my(@names, $i,$n, $s, %hash,@out_hash_ref_list);
for($i=0; $i<=$#in; $i++){
if(ref($in[$i])){ unless (-e ${$_[$i]}){ next; }
open(FILE_1,"${$_[$i]}"); undef(%hash);
while(<FILE_1>){ # file1 needs to be xxxx.msf for the moment, automatic later
if(/Name\:\s+(\S+)\s+/){ $n=$1; $n=~s/\,//g; }
if((/\s+\d+\s([acgt ]+)$/)||(/\s\s\s\s+([acgt ]+)$/)){
$s=$1; $s=~s/ //g; $s=~tr/a-z/A-Z/; $hash{$n}.=$s; } }
push(@out_hash_ref_list, \%hash); } }
if(@out_hash_ref_list == 1 ){ return(\%hash); }
elsif(@out_hash_ref_list > 1){ return(@out_hash_ref_list); } # <-- contains (\%out_seq0, \%out_seq1, \%out_seq2, .... )
}
#________________________________________________________________________
# Title : open_tem_files
# Usage : ($r1, $r2, $r3, $r4, $r5)=&open_tem_files(\$infile1, \$inputfile2..)};
# ---------- Example of xxxx
# >P1;1cdg
# sequence
# APDTSVSNKQNFSTDVIYQIFTDRFSDGNPANNPTGAAFDGTCTN-LRLYCGGDWQGIINKINDGYLTGMGVTAI
# >P1;1cdg
# secondary structure and phi angle
# CCCCCCCCCCCCCCCCEEECCHHHHCCCCHHHCCCPHHCCCCPCC-CCCCCPCCHHHHHHHHHCPHHHHHPCCEE
# >P1;1cdg
# solvent accessibility
# TTTTTTTTTTTFFFFFFFFFFFFFFTTTTTTTTTTTTTTTTTFTT-TTTTFFFFFTFFTTTFTTTFFTTFTFTFF
# >P1;1cdg
# DSSP
# CCCCCCCCCCCCCCCCEEECCHHHHCCCCGGGCCCGGGCCCCCCC-CCCCCCCCHHHHHHHHHCCHHHHHCCCEE
# >P1;1cdg
# percentage accessibility
# 67523272360000000000000002213792129b722248085-14110000030015105660028040200
# 2ltn ----TETTSFLITKFSPDQQNLIFQGDGYTT-KEKLTLTK------AVKNTVGRALYSSP
# 1loe ----TETTSFSITKFGPDQQNLIFQGDGYTT-KERLTLTK------AVRNTVGRALYSSP
#
# 2ltn ----CEEEEEEECCCCCCCCCEEEEPCCEEP-PPCEEEEC------CCCPCEEEEEECCC
# 1loe ----CEEEEEEECCCCCCCCCEEEEPCCEEE-PPEEEEEC------CCCPCEEEEEECCC
#
# 2ltn ----TTTTTTTTTTFTTTTTTFTTTTTFTFT-TTTFTFFT------TTTTTTFFFFTTTT
# 1loe ----TTTTTTTTTTFTTTTTTFTTTTTFTFT-TTTFFFFT------TTTTTTFFFFTTTT
#
# 2ltn ----CEEEEEEECCCCCCCCCEEEEECCEEC-CCCEEEEC------CCCCCEEEEEECCC
# 1loe ----CEEEEEEECCCCCCCCCEEEEECCEEE-CCEEEEEC------CCCCCEEEEEECCC
#
# 2ltn ----543251b16504681c50422650502-75201006------35681200001453
# 1loe ----6532e1508a07981b50422750404-8a200006------36672200001453
# Function : opens JPO's xxxx.tem file, stores in 5 hashes. (usually one tem file)
# Example :
# Warning :
# Keywords :
# Options : -n, n, or N for removing any gaps in the sequences.
# -s, s, or S for getting only the sequences.
# Returns : ($r1, $r2, $r3, $r4, $r5) <= these are references for hashes.
# Argument : (\$inputfile1, \$inputfile2, .... )};
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub open_tem_files{
my($a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, $q, $r,
$s, $t, $u, $v, $w, $x, $y, $z, $pwd, $file, $dir, $output, $in_dir,
%hash, @keys, @array, @hash, $option_string, $string, @in, $line,
$name, %out, $gap_chr, @str1, @str2, $num_opt, @file, @dir,
$char_opt, $char_opt_given, $num_opt_given,
@char_options, @file, $original_dir, @read_files, %array_msf, %array_jp,
$jp_file, $error_rate, $id_compos, @dir, @names, $name, $name_found,
@outref, %sequence, %secondary,%solvent_access, %DSSP, %percent_accessibility,
$name_found,$type_seq, $type_secon, $type_sol, $type_DSSP, $type_acc
);
##################################################
##### Start of general argument handling ######
##################################################
for($k=0; $k< @_ ;$k++){
if( !ref($_[$k]) ){
if($_[$k]=~ /^[\-]*(\w)$/){
$char_opt .= "\,$1";
$char_opt_given =1;
}elsif($_[$k]=~ /^\-(\w\w+)$/){ ## When multiple option is given,
my(@char_options) = split(/|\,/, $1); ## '-' should be used. eg. '-HEGI'
$char_opt .= join("\,", @char_options); ## as an option string.
$char_opt_given = 1;
}elsif($_[$k]=~ /^([\-]*\d)$/){
$num_opt .= "\,$1"; ## delimiter is ','
$num_opt_given = 1;
}elsif(-f $_[$k]){ ## When file is given,
push(@file, \$_[$k] );
}elsif(-d $_[$k]){ ## When dir is given,
push(@dir, \$_[$k] ); }
}elsif( ref($_[$k]) ){
if(ref($_[$k]) eq "SCALAR")
{if(${$_[$k]} =~ /^[\-]*(\w)$/){ ## check if it has '-' before option char
$char_opt .= "\,$1"; ## delimiter for option char is ','
$char_opt_given = 1;
}elsif(${$_[$k]}=~ /^\-(\w\w+)$/){ ## When multiple option is given,
my(@char_options) = split(/|\,/, $1); ## '-' should be used. for eg. '-HEGI'
$char_opt .= join("\,", @char_options); ## as an option string.
$char_opt_given =1;
}elsif(${$_[$k]}=~ /^([\-]*\d)$/){
$num_opt .= "\,$1"; ## delimiter is ','
$num_opt_given = 1;
}elsif(-f ${$_[$k]}){ ## When file is given,
push(@file, $_[$k] );
}elsif(-d ${$_[$k]}){ ## When dir is given,
push(@dir, $_[$k] ); }
}elsif(ref($_[$k]) eq "ARRAY"){ ## When ARRAY is given,
push(@array, $_[$k]);
}elsif(ref($_[$k]) eq "HASH"){ ## When HASH is given,
push(@hash, $_[$k]);
}
}
###################################################
## The output of this option handling section is
## one or combination of these:
## $char_opt_given ##<<-- Simple boolean '1' or none
## $num_opt_given ##<<-- Simple boolean '1' or none
## $char_opt, as ('A,B,C')
## $num_opt, as ('1,-2,3')
## @file as (\file1, \file2,...)
## @dir as (\dir1, \dir2,...)
## @array as (\array1, \array2,,,)
## @hash as (\hash1, \hash2,,,,)
###################################################
}
################################################
##### END of general argument handling ######
################################################
for($i=0; $i < @file; $i++){
if(ref($file[$i])){ unless(-T ${$file[$i]}){ next; }
open(FILE_1, "${$file[$i]}");
while(<FILE_1>){
if(/^\>P1\;([\w\-]+)/){ $name=$1; #=================== SEQUENCE
($type_seq, $type_secon, $type_sol, $type_DSSP, $type_acc)=();
}elsif(/^sequence/){ $type_seq = 1;
}elsif(($type_seq ==1)&&(/^([\w\-]+)[\*]*$/)){
my($line) = $1;
if( $char_opt =~ /n/i){ ## to remove the gaps etc.
$line=~s/\W//g;
$sequence{$name}.=$line;
}else{
$sequence{$name}.=$line;
} #from below ============== SECONDARY
}elsif(/^secondary structure and phi angle/){ $type_secon = 1;
}elsif(($type_secon ==1)&&(/^([\w\-]+)[\*]*$/)){
$secondary{$name}.=$1; #from below============= SOLVENT ACCESSIBILITY
}elsif(/^solvent accessibility/){ $type_sol = 1;
}elsif(($type_sol ==1)&&(/^([\w\-]+)[\*]*$/)){
$solvent_access{$name}.=$1; #from below========= DSSP
}elsif(/^DSSP/){ $type_DSSP = 1;
}elsif(($type_DSSP ==1)&&(/^([\w\-]+)[\*]*$/)){
$DSSP{$name}.=$1; #from below=================== PERCENTAGE ACCESSIBILITY
}elsif(/^percentage accessibility/){ $type_acc = 1;
}elsif(($type_acc ==1)&&(/^([\w\-]+)[\*]*$/)){
$percent_accessibility{$name}.=$1; } }
push(@outref,\%sequence,\%secondary,\%solvent_access,\%DSSP,\%percent_accessibility);
} }
if( ($char_opt =~ /s/i) || ( @outref == 1 ) ){
return(\%sequence); }
elsif( @outref > 1){ return(@outref); } # <-- contains (\%sequence,\%secondary,....)
}
#________________________________________________________________________
# Title : open_hlx_files
# Usage :
# Function :
# Example of hlx file (For Bo Nielson)
# Residue Frame Score Probability
# 1 M a 1.00563E+00 2.05479E-03
# 2 T b 1.01814E+00 2.52053E-03
# 3 R c 1.01814E+00 2.52053E-03
# Example :
# Warning :
# Keywords :
# Options :
# Returns : list of ref. for hash(es)
# Argument :
# Category :
# Version :
#--------------------------------------------------------------------
sub open_hlx_files{ my(@in)=@_; my(@names, $i,$n, $s, %hash,@out_hash_ref_list);
for($i=0; $i< @in; $i++){
if(ref($in[$i])){ unless(-e ${$in[$i]}){ next; }}
open(FILE_1, "${$in[$i]}");
while(<FILE_1>){
if(/^[\s]+([\d]+)\s+(\w+)\s+(\w+)\s+\S+\s+(\S+)$/){
$hash_residue{$1}=($2); # residue num is key, residue is value.
$hash_frame{$1}=($3); # residue num is key, frame is value.
$hash_prob{$1}=($4); # residue num is key, probability is value.
}
}
push(@out_hash_ref_list, \%hash_residue, \%hash_frame, \%hash_prob);
}
if($#out_hash_ref_list == 0 ){ return(\%hash_residue); }
elsif($#out_hash_ref_list > 0){ return(@out_hash_ref_list); }
}
#________________________________________________________________________
# Title : open_jp_files (bug free!!)
# Usage : %out_hash=%{&open_jp_files(\$file_name)};
# Function : reads jp files and stores results in a hash.
# Example :
# Warning : All the spaces '-' !!!
# Keywords :
# Options :
# Returns : a reference of a hash for names and their sequences.
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub open_jp_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
if ($i % 400 == 0){
$julian += 365;
} elsif ($i % 4 == 0){
$julian += 366;
} else {
$julian += 365;
}
}
for ($i = 1; $i < $mo; ++$i){
if ($i == 1 || $i == 3 || $i == 5 ||
$i == 7 || $i == 8 || $i == 10 ||
$i == 12) {
$julian += 31;
} elsif ( $i == 4 || $i == 6 ||
$i == 9 || $i == 11) {
$julian += 30;
} elsif ($leapflag eq "TRUE"){
$julian += 29;
} else {
$julian += 28;
}
}
$julian += $dy;
}
#________________________________________________________________________
# Title : opendir_and_go
# Usage : &opendir_and_go($input_dir); #$inputdir='/nfs/ind4/ccpe1/people/A Biomatic /jpo/align';
# Function : open dir and process all files if you wish, and then go in any sub
# dir of it. Using recursion. created by A Biomatic
# if any file is linked, it skips that file.
# Example : as in my 'indexing.pl' for perl file indexer.
# Warning : Seems to work fine.
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub opendir_and_go{
my($original_dir)=$_[0];
my(@read_files)=&read_any_dir($original_dir);
foreach $file(@read_files){
my($dir)=$split_path[$#split_path];
if (-l $realfile1){
next;
}elsif (-d $realfile1){
&opendir_and_go($realfile1);
}elsif (-f $realfile1){ #<<------ This is where things match
chdir($original_dir);
if($realfile1 =~/(\d+\-$no\.msf)$/){
@dir=split(/\//, $realfile1);
$dir=$dir[($#dir-1)]; # where am I ?
# $jp_file = $original_dir.'/'.$dir.'.jp';
# %array_msf =&open_msf_files($realfile1);
# %array_jp =&open_jp_files ($jp_file);
# $array_ref_msf = \%array_msf;
# $array_ref_jp = \%array_jp;
# $error_rate =&get_posi_shift_hash($array_ref_msf, $array_ref_jp);
# $id_compos =&amino_acid_compos_id_percent($array_ref_jp);
# push(@rates_accumu,$error_rate);
# push(@compos_id,$id_compos);
}
}
else
{
next;
}
}
}
#________________________________________________________________________
# Title : occurances
# Usage : sort occurances (@any_array_with_repeating_element);
# Function : this is for sort, to sort things according to the higher num. of occu.
# Example :
# Warning : This is from 21 DAYS book, page 373.
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub occurances{
$occurance_hash{$a} <=> $occurance_hash{$b};
}
#________________________________________________________________________
# Title : extract_ori_seq
# nt5
# Usage : &extract_ori_seq($input_file, $output_file, $out_seq_no, *array2);
# Function : extract seqs. which are from struc. alignment only. to be analysed.
# after mul. alignment with added seq. you can extract original str.
# sequ. by using this. The output always has ...msff ext.
# *array_ali is the JPO's or true alignment hash.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub extract_ori_seq{
local($input_file, $output_file, $out_seq_no, *array1, *array2) = @_; # something like $dir.$mul_factor.msf
local(%array_ext) = &open_msf_files("$input_file");
%array_ext = &hash_substract(*array_ext, *array2); # getting rid of added seq.
%array_ext = &hash_common(*array_ext, *array1);
open(OUTPUT, ">$output_file"); # this is different from $dir.msff
printf OUTPUT "PileUp\n\n\n";
printf OUTPUT " MSF:%5d Type: P Check: 0 ..\n",$ls;
print OUTPUT "\n\n";
my(@keys3) = ( keys %array_ext );
$max = &max_str_value_hash(%array_ext);
$ls = $max;
$seq3 = ($#keys3+1);
for($j=0; $j < $seq3 ;$j++){
$name3=$keys3[$j];
printf OUTPUT " Name: %10s Len: %5d Check: 0 Weight: 1.00\n",$name3,$ls;
while($is < $ie){
$iee=$is+10;
$iee=$ls if $iee > $ls;
$out.=' ';
while($is < $iee){
$char=substr($string,$is,1);
if($char ne ' '){
$char =~ tr/a-z/A-Z/;
$out.=$char;
$char='.' if $char eq '-';
}
$is++;
}
}
print OUTPUT "$out\n";
}
print OUTPUT "\n"; # open(OUTPUT, ">$dir.msff");
}
} # end of extract_ori_seq
#________________________________________________________________________
# Title : get_pair_homol_array
# Usage : $hom_out_count = ${&get_pair_homol_array(\@any_array_of_2_elem)};= @ar=(ABCDE..., CDEGA..)
# Function : get pair wise seq. !! Number of pair identical residues.
# Example :
# Warning : reliable, but input seq. strings shouldn't contain spaces.
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_pair_homol_array{
my(@input)=@{$_[0]};
$input[0] =~ tr/a-z/A-Z/; # capitalizing.
$input[1] =~ tr/a-z/A-Z/; # capitalizing.
my(@string1)= split(//,$input[0]);
my(@string2)= split(//,$input[1]);
if (($#string1 == -1) || ($#string2 == -1)){
print "\n One of the string is empty O.K. ? \n";
}
my($larger)= &max($#string1, $#string2);
my($id_counter, $gap_counter, $non_equal_counter, $sum)=0;
for ($i = 0; $i<=$larger; $i++){
if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
$gap_counter+=1;
}elsif ($string1[$i] eq $string2[$i]){
$id_counter +=1;
}else{
$non_equal_counter += 1;
}
}
$sum = ($id_counter + $gap_counter + $non_equal_counter);
if ($sum != ($larger+1)){
print "\n There is something wrong in getting homology in get_pair_homol \n";
&caller_info;
}
\$id_counter; # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title : get_percent_homol_arr
# Usage : $homology_out = ${&get_pair_homol(\@any_array_of_2_elem)};= @ar=(ABCDE..., CDEGA..)
# Function : get pair wise seq. identity of any two strings, outputs a scalar (%)
# Example :
# Warning : reliable, but input seq. strings shouldn't contain spaces.
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_percent_homol_arr{
my(@input)=@{$_[0]};
$input[0] =~ tr/a-z/A-Z/; # capitalizing.
$input[1] =~ tr/a-z/A-Z/; # capitalizing.
my(@string1)= split(//,$input[0]);
my(@string2)= split(//,$input[1]);
if (($#string1 == -1) || ($#string2 == -1)){
print "\n One of the string is empty O.K. ? \n";
}
my($larger)= &max($#string1, $#string2);
my($id_counter, $gap_counter, $non_equal_counter, $sum,$percent_homol)=0;
for ($i = 0; $i<=$larger; $i++){
if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
$gap_counter+=1;
}elsif ($string1[$i] eq $string2[$i]){
$id_counter +=1;
}else{
$non_equal_counter += 1;
}
}
$sum = ($id_counter + $gap_counter + $non_equal_counter);
if ($sum != ($larger+1)){
print "\n There is something wrong in getting homology in get_pair_homol \n";
&caller_info;
}else{
$percent_homol=($id_counter/$sum)*100;
}
return(\$percent_homol); # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title : get_pair_homol_hash
# Usage : $homology_out = & get_pair_homol (%any_hash); , eg) %hash = (name1, ABCDE..., name2, CDEGA..)
# Function : get pair wise seq. identity as a scalar count
# Example :
# Warning : reliable, but input seq. strings shouldn't contain spaces.
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_pair_homol_hash{
my(%input)=@_;
&hash_chk(\%input);
my(@keys_input)= keys (%input);
my(@values_input) = values (%input);
$values_input[0] =~ tr/a-z/A-Z/; # capitalizing.
$values_input[1] =~ tr/a-z/A-Z/; # capitalizing.
my(@string1)= split(//,$values_input[0]);
my(@string2)= split(//,$values_input[1]);
if (($#string1 == -1) || ($#string2 == -1)){
print "\n One of the string is empty O.K. ? \n";
}
my($larger)= &max($#string1, $#string2);
my($id_counter, $gap_counter, $non_equal_counter)=0;
for ($i = 0; $i<=$larger; $i++){
if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
$gap_counter+=1;
}elsif ($string1[$i] eq $string2[$i]){
$id_counter +=1;
}else{
$non_equal_counter += 1;
}
}
my($sum) = ($id_counter + $gap_counter + $non_equal_counter);
if ($sum != ($larger+1)){
print "\n There is something wrong in getting homology in get_pair_homol \n";
&caller_info;
}
return ($id_counter); # $id_counter is the homology counter;
}
#________________________________________________________________________
# Title : get_percent_homo_hash
# Usage : $homology_out = &get_pair_homol_hash(%any_hash); , eg) %hash = (name1, ABCDE..., name2, CDEGA..)
# Function : get pair wise seq. identity(%) of any two strings put in as a hash
# Example :
# Warning : reliable, but input seq. strings shouldn't contain spaces.
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_percent_homo_hash{
my(%input)=@_;
&hash_chk(\%input);
my(@keys_input)= keys (%input);
my(@values_input) = values (%input);
$values_input[0] =~ tr/a-z/A-Z/; # capitalizing.
$values_input[1] =~ tr/a-z/A-Z/; # capitalizing.
my(@string1)= split(//,$values_input[0]);
my(@string2)= split(//,$values_input[1]);
if (($#string1 == -1) || ($#string2 == -1)){
print "\n One of the string is empty O.K. ? \n";
}
my($larger)= &max($#string1, $#string2);
my($id_counter, $gap_counter, $non_equal_counter,$percent_homol,)=0;
for ($i = 0; $i<=$larger; $i++){
if (($string1[$i] eq '.')|| ($string2[$i] eq '.')){
$gap_counter+=1;
}elsif ($string1[$i] eq $string2[$i]){
$id_counter +=1;
}else{
$non_equal_counter += 1;
}
}
my($sum) = ($id_counter + $gap_counter + $non_equal_counter);
if ($sum != ($larger+1)){
print "\n There is something wrong in getting homology in get_pair_homol \n";
&caller_info;
}else{
$percent_homol=($id_counter/$sum)*100;
}
return ($percent_homol);
}
#________________________________________________________________________
# Title : file_size
# Usage : $outputfilesize = &file_size($input_file_name);
# Function : returns the size of any single testing file
# Example :
# Warning : Q is for quality of this sub. This can't be wrong.
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub file_size { my($infile)=$_[0];
if ( $size=(-s "$infile")){ return $size; }
}
#________________________________________________________________________
# Title : seq_comp_percent2
# Usage : @outarray = &seq_comp_percent2(@any_input_string_array);
# Function : get string seq COMPOSITION identities(a to z). gets array
# of strings and outs array of % numbers
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub seq_comp_percent2{ # simple and basic seq. id. eg. ABC on ABCABC is 50 % identical.
my(@input)=@_;
my(@array_of_ids2, $id2, @char1, @char2);
&array_chk(sort @input);
my($longest_str_size) = &get_long_str_size (@input), "\n";
my($shortest_str_size) = &get_short_str_size(@input), "\n";
print "longest_str_size",$longest_str_size;
print "shortest_str_size",$shortest_str_size;
if (($longest_str_size/$shortest_str_size) > 4){
print "\n The shortest string is less than 1/4 of the longest\n";
print " This is quite meaningless, but will go on\n";
}
for ($i = 0; $i <= $#input ; $i++){
if ($input[$i]=~/(\W)/){
&remove_non_char($input[$i]);
}
@char1 = split(/|\s+|\.+|\-+/, $input[$i]); # splitting into char.
foreach $char (@char1){
if ($char eq ' '){
next;
}
$charcount1{$char} +=1; # making array of ['A' => 6, 'B'=>2...]
}
for($j = $i+1 ; $j <= $#input; $j++){
if ($input[$j]=~/(\W)/){
&remove_non_char($input[$j]);
}
@char2 = split(/|\s+|\.+|\-+/, $input[$j]); # splitting into char.
for $char (@char2){
$charcount2{$char} +=1; # making ary of ['A' => 6, ..]
}
$id2 = &get_id_among_2_2(*charcount1, *charcount2); # gets % id.
push (@array_of_ids2, $id2);
%charcount2=();
}
%charcount1=();
}
@array_of_ids2;
}
######################################################################################
########### file and dir handling stuff #################
###############################################################################
#________________________________________________________________________
# Title : get_full_file_name
# Usage : $any_path = ${&get_full_dir_path($any_directory)}; or &dir_path('.') for pwd.
# Function : returns full directory path (= pwd ), eg. /nfs/ind4/ccpe1/people/A Biomatic
# Example :
# Warning :
# Keywords : get_long_path_name, get_complete_path_name
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_full_file_name{
my($pwd)=`pwd`;
chomp($pwd);
\$pwd;
# Title : hash_output_chk
# Usage : &hash_output_chk(\%outing_hash);
# Function : checks hash output of any subroutine.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub hash_output_chk{
for($i=0; $i<= $#_; $i++){
my(%tem)=%{$_[$i]}; my(@keys)=keys %tem;
for ($j =0; $j<@keys; $j++){
unless(($keys[$j]=~/[\s\S]+/)&&($tem{$keys[$j]}=~/[\s\S]+/)){
print "\n Err. at Hash_output_chk at $0 \n", chr(7); exit;
}
}
}
}
#________________________________________________________________________
# Title : n
# Usage : &n;
# Function : puts one single new line
# Example :
# Warning :
# Keywords : put_new_line, new_line
# Options :
# Returns :
# Argument :
# Category :
# Version :
#--------------------------------------------------------------------
sub n{
print "\n";
}
#________________________________________________________________________
# Title : cls
# Usage : &cls;
# Function : clears screen
# Example :
# Warning :
# Keywords : clear_screen
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub cls{ my($cls) = `clear`;
print $cls;
}
#________________________________________________________________________
# Title : seq_comp_percent1
# Usage : @outarray = &seq_comp_percent1(@any_input_string_array);
# Function : get string seq identities(a to z). gets array of strings and outs array of % numbers
# Example :
# Warning :
# Keywords :
# Options :
# Returns : one ref. of an array
# Argument : one ref. of an array
# Category :
# Version :
#--------------------------------------------------------------------
sub seq_comp_percent1{ # this is affected by seq. length
my(@input)=@{$_[0]};
my(@array_of_ids1, $id1, @char1, @char2);
&array_chk(\@input);
@input = sort (@input);
$longest_str_size = &get_long_str_size (@input), "\n";
$shortest_str_size = &get_short_str_size(@input), "\n";
if (($longest_str_size/$shortest_str_size) > 4){
print "\n The shortest string is less than 1/4 of the longest\n";
print " This is quite meaningless, but will go on\n";
}
for ($i = 0; $i <= $#input ; $i++){
if ($input[$i]=~/(\W)/){
print "\n Warn: seq($input[$i] contains non char\n";
&remove_non_char($input[$i]);
}
@char1 = split(/|\s+|\.+|\-+/, $input[$i]); # splitting into char.
foreach $char (@char1){
$charcount1{$char} +=1; # making array of ['A' => 6, 'B'=>2...]
}
for($j = $i+1 ; $j <= $#input; $j++){
if ($input[$j]=~/(\W)/){
print "\n Warn: seq($input[$i] contains non char\n";
&remove_non_char($input[$j]);
}
@char2 = split(/|\s+|\.+|\-+/, $input[$j]); # splitting into
foreach $char (@char2){
$charcount2{$char} +=1; # making array of ['A' => 6, 'B'=>2...]
}
$id1 = &get_id_among_2_1(*charcount1, *charcount2); # gets % id.
# print %charcount1,"\n";
push (@array_of_ids1, $id1);
%charcount2=();
}
%charcount1=();
}
\@array_of_ids1;
}
#________________________________________________________________________
# Title : get_id_among_2_1
# Usage : $id = &get_id_among_2(*charcount1, *charcount2) <- hashes
# Function : gets the % id of any two sequences, returns in 100.0% format.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version :
#--------------------------------------------------------------------
sub get_id_among_2_1{ # 66.67 % if ABC with ABCABC (due to diff. seq. length.)
local(*hash1, *hash2)= @_;
my($identity, $no_of_same, $sum_of_same, $av);
my(@num_char1)=values %hash1;
my(@num_char2)=values %hash2;
for $key1 (sort keys %hash1){
for $key2 (sort keys %hash2){
if ($key1 eq $key2){
$no_of_same = &min($hash1{$key1},$hash2{$key2});
$sum_of_same += $no_of_same;
last;
}
}
}
$identity = $sum_of_same*2/(&sum_array(@num_char1,@num_char2))*100;
# print "percent iden = ", $identity, "\n";
}
#________________________________________________________________________
# Title : get_id_among_2_2
# Usage : $id = &get_id_among_2(*charcount1, *charcount2) <- hashes
# Function : gets the % id of any two sequences, returns in 100.0% format.
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_id_among_2_2{ # eg) 50% if ABC with AABBCC or ABCABC
local(*hash1, *hash2)= @_;
my($identity, $no_of_same, $sum_of_same, $av);
#print %hash1,"\n";
#print %hash2,"\n";
my(@num_char1)=values %hash1;
my(@num_char2)=values %hash2;
for $key1 (sort keys %hash1){
for $key2 (sort keys %hash2){
if ($key1 eq $key2){
$no_of_same = &min($hash1{$key1},$hash2{$key2});
$sum_of_same += $no_of_same;
last;
}
}
}
$seq1=&sum_array(@num_char1);
$seq2=&sum_array(@num_char2);
$longer_seq = &max($seq1, $seq2);
$identity = $sum_of_same/$longer_seq*100;
#print "percent iden = ", $identity, "\n";
}
#________________________________________________________________________
# Title : array_average
# Usage : $output = &array_average(\@any_array);
# Function : (the same as average_array)
# Example :
# Warning : If divided by 0, it will automatically replace it with 1
# Keywords : get_array_average, av_array, average_array, get_average_array
# average_of_array, average_array
# Options :
# Returns : single scaler digit.
# Argument : takes one array reference.
# Category :
# Version : 1.2
#--------------------------------------------------------------------
sub array_average{
my(@input)= @{$_[0]};
my $int_option = ${$_[1]} || $_[1];
my($item,$average,$num,$sum);
my $num_of_elem = @input;
for $item(@input){
if( $item =~ /^$/ ){ ## If it matches nothing. '$item == 0' does not work !!!
$num_of_elem --; ## This is to make sure that the denominator does not
} ## count blank element. (to get correct element number)
else{ $sum += $item; }
}
if($num_of_elem ==0){ $num_of_elem =1; } ## To prevent 'Division by 0' error
if($int_option =~ /[\-]*i[nt]*/){
$average= int( $sum/$num_of_elem );
}else{ $average = $sum/$num_of_elem }
return(\$average);
}
#________________________________________________________________________
# Title : average_array
# Usage : $output = &average_array(\@any_array);
# Function : (the same as array_average)
# Example :
# Warning : If divided by 0, it will automatically replace it with 1
# Keywords :
# Options :
# Returns : single scaler digit.
# Argument : takes one array reference.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub average_array{
my(@input)= @{$_[0]};
my $int_option = ${$_[1]} if ref($_[1]);
my $int_option = $_[1] if !ref($_[1]);
my($item,$average,$num,$sum);
my $num_of_elem = @input;
for $item(@input){
if( $item =~ /^$/ ){ ## If it matches nothing. '$item == 0' does not work !!!
$num_of_elem --; ## This is to make sure that the denominator does not
} ## count blank element. (to get correct element number)
if($_[2]=~/E2=(\S+)/){
$E_value2=$1;
}
for($i=0; $i< @input_interm_lines; $i++){
if($input_interm_lines[$i]=~/\S+ +\d+ +(\S+) +\S+ +\d+ +(\S+) +\S+/){
if($1 < $E_value1 and $2 < $E_value2){
push(@filtered_lines, $input_interm_lines[$i]);
}
}
}
return(\@filtered_lines);
}
#___________________________________________________________________________________
# Title : make_intermediate_sequence_library
# Usage : &make_intermediate_sequence_library(\@files, "FASTA_DB=$owl_db_fasta");
# while @files have either pdbs or pdbg file (PDB grouping file)
# Function : extracts intermediate sequences from OWL fasta database to
# make intermediate seq library
# This looks for /gn0/jong/DB/PDB/PDB95D_against_OWL/E/$msp_file_gz
# and /gn0/jong/DB/PDB/PDB95D_against_OWL/D/$msp_file_gz
# Example :
# Keywords : make_interm_library_for_each_group, make_interm_lib,
# make_intermediate_library, compile_interm_library, create_interm_library,
# Options :
# 'FASTA_DB' for sequence source fasta file eg: "FASTA_DB=$source_db_fasta"
# o for overwrite option (overwrites 1.2.3.fa like file)
# MSP_DIR= for msp seq file result directory
# m= for msp seq file result direc (same as MSP_DIR)
# e= for E value thresh
# $pdbg_file= by p=
# E= for E value thresh
# s= for score thresh
#
# Returns :
# Argument :
# Category :
# Version : 2.0
#-----------------------------------------------------------------------------------
sub make_intermediate_sequence_library{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(%hash, @pdbd_seqs, @superfamily, @members, @files, $over_write, $msp_file_gz,
$msp_seq_file_dir, $msp_file, $msp_file_long, $msp_file_gz_long,
$pdbd_seq_long, $final_merged_ISL_fasta,$pdbg_file, $pwd , $range_start,
$range_stop, @files_NOT_processed);
my $source_db_fasta=$ENV{'NRDB_FASTA'}; ## general default
my $evalue_thresh=0.001; # default
my $score_thresh=70; # default
my $range_thresh=10;
my $percent_id_thresh=0.95;
print "\n# (i) Running sub of: make_intermediate_sequence_library\n";
if($vars{'FASTA_DB'}=~/\S+/){ $source_db_fasta=$vars{'FASTA_DB'} }
if($vars{'MSP_DIR'}=~/\S+/){ $msp_seq_file_dir=$vars{'MSP_DIR'} }
if($vars{'p'}=~/\S+/){ $pdbg_file=$vars{'p'}; print "\n# (i) $vars{'p'} is given \n"; } ## PDBG file input
if($vars{'m'}=~/\S+/){ $msp_seq_file_dir=$vars{'m'} }
if($vars{'DB'}=~/\S+/){ $source_db_fasta=$vars{'DB'} }
if($vars{'E'}=~/\S+/){ $evalue_thresh=$vars{'E'} }
if($vars{'e'}=~/\S+/){ $evalue_thresh=$vars{'e'} }
if($vars{'s'}=~/\S+/){ $score_thresh=$vars{'s'} }
if($char_opt=~/o/){ $over_write='o' }
if( !(-s $pdbg_file )){
print "\n# (W) Is your pdbg file given to me? \n\n";
if(-s $file[0]){ $pdbg_file=$file[0];
}else{ die "\n# (E) I can not find \$pdbg_file \n"; }
}else{
print "\n# (i) \$pdbg_file $file[0] does exist, GOOD! \n";
}
print "\n# (1) make_intermediate_sequence_library: \$evalue_thresh is $evalue_thresh, opening $pdbg_file file\n";
open(PDBG, "$pdbg_file");
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Collecting sequence members for superfamilies
#_______________________________________________
while(<PDBG>){
if(/\>(\S+) +(\d+\.\d+\.\d+)\.\d+\.\d+/){
$super_family=$2;
$hash{$super_family} .=" $1"; ## %hash has 'd1cus_ d2eng_ ...'
}
}
close (PDBG);
@superfamily=sort keys %hash;
print "\n# (i) make_intermediate_sequence_library: \@superfamily has @superfamily\n\n" if $verbose;
unless(@superfamily){ die "\n# (E) \@superfamily is too small to go on \n\n"; }
$pwd=`pwd`; chomp($pwd);
chdir($msp_seq_file_dir);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# collecting superfamily member sequences to a hash
#____________________________________________________
for($i=0; $i< @superfamily; $i++){
my (%interm_hash, $out_fasta_file );
my $superfamily=$superfamily[$i];
my @pdbd_seqs=split(/ +/, $hash{$superfamily});
$out_fasta_file="$superfamily[$i]\.fa"; #<----------------- final output file name like 1.2.1.fa
if(-s $out_fasta_file and !$over_write){
print "\n# (1) make_intermediate_sequence_library: $out_fasta_file Already EXISTS and no o opt. skipping\n";
next;
}
if(@pdbd_seqs){ print "\n# (i) \@pdbd_seqs for $superfamily is: @pdbd_seqs\n";
}else{ print "\n# (E) \@pdbd_seqs is empty, strange, error in making $out_fasta_file \n\n";
}
for($j=0; $j< @pdbd_seqs; $j++){
$pdbd_seq=$pdbd_seqs[$j];
$pdbd_seq_long="pdb\_$pdbd_seqs[$j]";
my (@msp_content, $evalue, $sub_dir, $percentage_id, $range_length);
if($pdbd_seq=~/^ *$/){ next; }
$msp_file="$pdbd_seq\.msp";
$msp_file_gz="$pdbd_seq\.msp\.gz";
$msp_file_long="pdb\_$pdbd_seq\.msp"; ## this is to handle Sarah's pdb_ prefixed pdbd files
$msp_file_gz_long="pdb\_$pdbd_seq\.msp\.gz";
print "# (i) I am processing $msp_file <- subroutine:make_intermediate_sequence_library\n";
if($msp_file_gz=~/^([dec])\S/ or $msp_file_long=~/pdb_([dec])\S/){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`
# Trying to locate the search result file as like 'd1hlb__.msp.gz' in various dirctory
#_______________________________________________________________________________________
$sub_dir1="\U$1";
$sub_dir2="$1"; ## just in case the subdir name was not in capital
if(-s $msp_file){
open(MSP_FILE, "$msp_file"); @msp_content=<MSP_FILE>; close (MSP_FILE);
}elsif(-s $msp_file_long){
open(MSP_FILE, "$msp_file_long"); @msp_content=<MSP_FILE>; close (MSP_FILE);
print "\n# (i) \@msp_content is read \n";
}elsif( -s "$msp_seq_file_dir\/$sub_dir1/$msp_file_long"){
open(MSP_FILE, "$msp_seq_file_dir\/$sub_dir1/$msp_file_long"); @msp_content=<MSP_FILE>; close (MSP_FILE);
print "\n# (i) \@msp_content is read \n";
}elsif( -s "$msp_seq_file_dir\/$sub_dir2/$msp_file_long"){
open(MSP_FILE, "$msp_seq_file_dir\/$sub_dir2/$msp_file_long"); @msp_content=<MSP_FILE>; close (MSP_FILE);
print "\n# (i) \@msp_content is read \n";
}elsif( -s "$msp_seq_file_dir\/$msp_file_long"){
open(MSP_FILE, "$msp_seq_file_dir\/$msp_file_long"); @msp_content=<MSP_FILE>; close (MSP_FILE);
print "\n# (i) \@msp_content is read \n";
}elsif( -s "$msp_seq_file_dir\/$msp_file_long"){
open(MSP_FILE, "$msp_seq_file_dir\/$msp_file_long"); @msp_content=<MSP_FILE>; close (MSP_FILE);
print "\n# (i) \@msp_content is read \n";
}elsif( -s $msp_file_gz){ @msp_content=`gunzip -c $msp_file_gz`;
}elsif( -s "./$sub_dir1/$msp_file_gz"){ @msp_content=`gunzip -c ./$sub_dir1/$msp_file_gz`;
}elsif( -s "./$sub_dir2/$msp_file_gz"){ @msp_content=`gunzip -c ./$sub_dir2/$msp_file_gz`;
}elsif( -s $msp_file_gz_long){ @msp_content=`gunzip -c $msp_file_gz_long`;
}elsif( -s "./$sub_dir1/$msp_file_gz_long"){ @msp_content=`gunzip -c ./$sub_dir1/$msp_file_gz_long`;
}elsif( -s "./$sub_dir2/$msp_file_gz_long"){ @msp_content=`gunzip -c ./$sub_dir2/$msp_file_gz_long`;
}elsif( -s "$msp_seq_file_dir\/$sub_dir1/$msp_file_gz"){
@msp_content=`gunzip -c $msp_seq_file_dir\/$sub_dir1/$msp_file_gz`;
}elsif( -s "$msp_seq_file_dir\/$sub_dir2/$msp_file_gz"){
@msp_content=`gunzip -c $msp_seq_file_dir\/$sub_dir2/$msp_file_gz`;
}elsif( -s "$msp_seq_file_dir\/$sub_dir1/$msp_file_gz_long"){
@msp_content=`gunzip -c $msp_seq_file_dir\/$sub_dir1/$msp_file_gz_long`;
}elsif( -s "$msp_seq_file_dir\/$sub_dir2/$msp_file_gz_long"){
@msp_content=`gunzip -c $msp_seq_file_dir\/$sub_dir2/$msp_file_gz_long`;
}else{
print "\n# Error Error Error Error Error Error Error Error Error Error \n";
print "# (E) PWD is ", `pwd`, "$msp_seq_file_dir\/$sub_dir2, $msp_seq_file_dir\/$sub_dir1\n";
print "# (E) make_intermediate_sequence_library ($superfamily): cant find $msp_file_gz or $msp_file\n";
push(@files_NOT_processed, $msp_file);
print chr(7), chr(7), chr(7);
next;
}
print "\n# (i) Now I have finished reading in ONE msp file for $pdbd_seq ($superfamily) into \@msp_content";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`
# Processing each MSP line
#_______________________________________________________________________________________
my ($identical_one_added, $interm_seq);
for($k=0; $k< @msp_content; $k++){ ## NEW MSP format
if($msp_content[$k]=~/^ *(\S+) +(\S+) +(\S+) +\d+ +\d+ +[pdb_]*$pdbd_seq[_\d+\-\d+]* +(\d+) +(\d+) +(\S+)/){ # [pdb_]* is for Sarah's pdb_ prefix
#if($pdbd_seq eq $6){ next } ## this is to exclude the same seq match
$evalue=$2; $percentage_id=$3;
$range_length=$5-$4; $score =$1; $interm_seq=$6; $range_start=$4, $range_stop=$5;
if($interm_seq=~/(\S+)_\d+\-\d+/){ $interm_seq=$1; }
if($evalue < $evalue_thresh and $range_length > $range_thresh and $score > $score_thresh){
if($percentage_id <= $percent_id_thresh){ ## to keep one pdb seq ($percentage_id=1)
unless($interm_seq=~/^d\d\S/){ # to remove entries like: d1abc_10-20_d2acb__ (two pdb seqs)
$interm_hash{$superfamily} .=" $interm_seq\_$range_start\-$range_stop\_$pdbd_seq";
}
}elsif($percentage_id ==1 and $identical_one_added < 1){ ## this is to prevent more than 1 100% id seq
$interm_hash{$superfamily} .=" $interm_seq\_$range_start\-$range_stop\_$pdbd_seq";
$identical_one_added++;
}
}
}elsif($msp_content[$k]=~/^ *(\d+) +(\S+) +\d+ +\d+ +$pdbd_seq +(\d+) +(\d+) +(\S+)/){
$evalue=$2; $score=$1;
$range_length=$4-$3; $interm_seq=$5;
if($evalue < $evalue_thresh and $range_length > $range_thresh
and $score > $score_thresh){
$interm_hash{$superfamily} .=" $interm_seq\_$3\-$4\_$pdbd_seq";
}
}
}
}
}
chdir($pwd);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
#__________________________________
$family_name=$superfamily[$i];
@members=@{ &remove_similar_seqlets($interm_hash{$superfamily[$i]}) };
%seq=%{&fetch_sequence_from_db($source_db_fasta, \@members)};
&write_fasta(\%seq, $out_fasta_file);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Merging (Compiling) the superfamily fasta files into one
#_________________________________________________
$final_merged_ISL_fasta=${&merge_superfam_fasta_files_for_ISL}; ## returns the final compiled_interm_lib.fa file name
print "\n# (E) Following files could not be processed\n @files_NOT_processed \n";
return( \$final_merged_ISL_fasta);
}
#________________________________________________________________________________
# Title : read_machine_readable_sso_lines
# Usage : @out_refs=@{&read_machine_readable_sso_lines(\@SSO, $get_alignment,
# $create_sso, $upper_expect_limit,$new_format, $lower_expect_limit,
# $attach_range_in_names, $attach_range_in_names2)};
# Function :
# Example :
# Keywords : read_m10_sso_lines read_msso_lines
# Options : a c r r2 n
# u= for upper E value limit
# l= for lower E value limit
# Category :
# Version : 1.5
#--------------------------------------------------------------------------------
sub read_machine_readable_sso_lines{
my ($upper_expect_limit, $lower_expect_limit)=(50,0);
my (%match, @out_refs, $query_found, $query_sq_stop, $query_sq_statrt, $match_found,
$match_seq, $match_found2, $i, $j,$match_found3, $overlap, $sw_score,
$match_sq_stop, $match_seq2, $sw_ident, $name_range, $query_seq,
$al_display_start, $match_seq_count);
for($i=0; $i< @_; $i++){
if($_[$i]=~/u=(\S+)/){ $upper_expect_limit=$1 }
( run in 1.667 second using v1.01-cache-2.11-cpan-39bf76dae61 )