Bioinf
view release on metacpan or search on metacpan
if($comm_col =~ /C/i){
%DSSP_common=%{&open_dssp_files( @names, $H, $S, $E, $T, $I, $G, $B, $simplify, 'C')};
@keys_common= keys %DSSP_common;
@stringDSSP_common = split(/|\,/, $DSSP_common{$keys_common[0]});
if($debug2 eq 1){ print __LINE__," \$comm_col is set to: $comm_col \n";
print __LINE__," \@stringDSSP_common is :@stringDSSP_common \n";
}
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Comparing two hashes
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
for $name (@names){
#"""""""""""""""" Splitting the sequence string
if($arraySEQ{$name}=~/\,\S+\,/){
@stringSEQ =split(/\,/, $arraySEQ{$name});
@stringSTR=split(/\,/, $arraySTR{$name}); }
else{
@stringSEQ =split(//, $arraySEQ{$name});
@stringSTR=split(//, $arraySTR{$name});
}
print "\n",__LINE__, " \@stringSEQ is @stringSEQ \n" if $debug2 eq 1;
print "\n",__LINE__, " \@stringSTR is @stringSTR \n" if $debug2 eq 1;
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Contracting the SEQ.
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@seq_positionSEQ = @{&get_posi_sans_gaps(\$arraySEQ{$name})};
@seq_positionSTR = @{&get_posi_sans_gaps(\$arraySTR{$name})};
#"""""""""""""""" To get secondary structure only calc """"""""""""""""""""""""""""
# It superposes the NON sec. region on @seq_positionSTR to nullify positions.
# get_posi_diff ignores non char positions in calc.
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if( ($ss_opt =~ /ss$/i) && ($comm_col !~ /C/i) ){
%DSSP=%{&open_dssp_files($name, $H, $S, $E, $T, $I, $G, $B, $simplify, $comm_col)};
if($debug1 eq 1){
print "\n",__LINE__," open_dssp_files has options \$H ->$H \$S->$S \$E->$E \n";
print "\n",__LINE__," \$T->$T \$I->$I \$G->$B \$simplify->$simplify \$comm_col ->$comm_col\n";
&show_hash( \%DSSP );
}
if(ref(\%DSSP) eq 'HASH'){ # to check it %DSSP was valid, If not it skips overlaying
@stringDSSP = split(/|\,/, $DSSP{$name});
$size_of_stringDSSP = @stringDSSP;
$size_of_seq_positionSTR = @seq_positionSTR;
if($debug2 eq 1){
print "\n",__LINE__," \@stringDSSP is \n @stringDSSP\n";
print "\n",__LINE__," Size of \@stringDSSP is $size_of_stringDSSP\n" ;
print "\n",__LINE__," Size of \@seq_positionSTR is $size_of_seq_positionSTR\n";
print "\n",__LINE__," \$gap_char is \"$gap_char\" \n" ;
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# When the sec. str is not defined in DSSP, I delete the position of
# @stringDSSP to gap(ie. make it blank to exclude error rate calc)
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
for($i=0; $i < @stringDSSP; $i++){
if($stringDSSP[$i] =~ /\W/){ $seq_positionSTR[$i]= $gap_char;}
}
}
}elsif( $comm_col =~ /C/i){
print __LINE__, " Replacing position with \gap_char \"$gap_char\"\n" if $debug2 eq 1;
$ss_opt = 'ss'; # whether it was set or not, make it 'ss'
for($i=0; $i < @stringDSSP_common; $i++){
if($stringDSSP_common[$i] =~ /\W/){ $seq_positionSTR[$i]= $gap_char;}
}
}
if($debug2 eq 1){
print __LINE__,
print " \@seq_positionSTR is @seq_positionSTR\n";
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# getting Position differences.
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@position_diffs = @{&get_posi_diff(\@seq_positionSEQ, \@seq_positionSTR)};
if($debug2 eq 1){
print __LINE__,
print " \@position_diffs is @position_diffs\n";
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# You can have two types of output according to which alignment you compare your
# error rates. (1) Compare to @stringSEQ (2) @stringSTR
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@position_corrected1 = @{&put_position_back_to_str_seq(\@stringSEQ, \@position_diffs)};
$array3{$name}=join(",", @position_corrected1);
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# The final Step for error rate, $LIMIT is to confine error rate in one digit (ie, less than 10)
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
%final_posi_diffs =%{&get_residue_error_rate(\%array3, $LIMIT)};
undef(@whole_length, $len_of_seq);
return(\%final_posi_diffs);
}
#________________________________________________________________________
# Title : get_posi_rates_hash_out (derived from 'get_posi_shift_hash' )
# Usage : %rate_hash = %{&get_posi_shift_hash(\%hash_msf, \%hash_jp)};
# Function : This is to get position specific error rate for line display rather than
# actual final error rate for the alignment.
# Output >>
# seq1_seq2 1110...222...2222
# seq2_seq3 1111....10...1111
# seq1_seq3 1111....0000.0000
#
# Example :
# Warning : split and join char is ','; (space)
# Keywords :
# Options :
# Returns : \%final_posi_diffs;
# Argument : %{&get_posi_rates_hash_out(\%msfo_file, \%jpo_file)};
# Whatever the names, it takes one TRUE structral and one ALIGNED hash.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub get_posi_rates_hash_out{
my(%array1)=%{$_[0]};
#----------------------------------------------------------------------------
sub open_sso_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my (@out_refs, @SSO, $create_sso, $parseable, @OUT, @temp_sso_lines,
%match, $attach_range_in_names, $margin, $uppercase_seq_name,
$lowercase_seq_name, $target_seq, $new_format, $get_alignment,
$pvm_version_fasta_out, $original_target_seq, $big_msp_out_file);
my ($upper_expect_limit, $lower_expect_limit)=(50,0);
if($char_opt=~/R/){ $attach_range_in_names2=1; };
if($char_opt=~/r2/){ $attach_range_in_names =1; $attach_range_in_names2=1 };
if($char_opt=~/r/){ $attach_range_in_names =1; };
if($char_opt=~/c/){ $create_sso ='c' };
if($char_opt=~/n/){ $new_format ='n' };
if($char_opt=~/a/){ $get_alignment='a' };
if($char_opt=~/U[pperPPER]*/){ $uppercase_seq_name='U' };
if($char_opt=~/L[owerOWER]*/){ $lowercase_seq_name='L' };
if($vars{'u'}=~/(\.?\d+)/){ $upper_expect_limit = $vars{'u'} };
if($vars{'l'}=~/(\.?\d+)/){ $lower_expect_limit = $vars{'l'} };
if($vars{'m'}=~/\d+/){ $margin = $vars{'m'} };
$attach_range_in_names2=$attach_range_in_names=1;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# opening file input (can handle .gz files)
#_______________________________________________
if(@file < 1 and @array > 0){
for($i=0; $i< @array; $i++){
@sso=@{$array[$i]};
}
print "\n# (INFO) \@sso has ", scalar(@sso), " lines. \n" if $verbose;
if(@sso > 3000){ # if @sso is very big, I remove the useless contents
print "\n# (INFO) open_sso_files: size of \@sso for $file[$i] exceeds 3000 lines, ", scalar(@sso), " !!! \n";
}
push(@OUT, &read_sso_lines(\@sso, $create_sso,
"u=$upper_expect_limit",
"l=$lower_expect_limit",
$attach_range_in_names,
$attach_range_in_names2,
$new_format, $get_alignment) );
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Opening input FILE!
#_______________________________________________
else{
print "\n# open_sso_files : processing @file \n\n";
for($i=0; $i< @file; $i++){
if($file[$i]=~/\S+\.msp *$/){ $big_msp_out_file=$file[$i]; splice (@file, $i, 1); $i--;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Opening zipped file
#_______________________________________________
}elsif($file[$i]=~/\S+\.\gz$/ or -B $file[$i]){ ## if file has xxxx.gz extension
my (@sso);
@sso=`gunzip -c $file[$i]`;
if(@sso < 30){ @sso=`zcat $file[$i]`; } # if zcat fails to produce output use gunzip -c
if(@sso > 3000){ # if @sso is very big, I remove the useless contents
print "\n# open_sso_files: size of \@sso for $file[$i] exceeds 3000 lines, ", scalar(@sso), " !!! \n";
}
push(@OUT, &read_sso_lines(\@sso, $create_sso,
"u=$upper_expect_limit",
"l=$lower_expect_limit",
$attach_range_in_names,
$attach_range_in_names2,
$new_format, $get_alignment) );
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Opening plain file(not zipped)
#_______________________________________________
elsif($file[$i]=~/\S+\.[fsm]?sso/ or $file[$i]=~/\S+\.out/ or $file[$i]=~/\S+\.fso/){
print "\n# (INFO) openning text File format xxxx.[fms]sso $file[$i]";
open(SSO, "$file[$i]") or die "\n# (ERROR) open_sso_files: Failed to open $file[$i]\n";
my @sso=<SSO>;
if(@sso < 30){ @sso=`zcat $file[$i]`; } # if zcat fails to produce output use gunzip -c
if(@sso > 3000){ # if @sso is very big, I remove the useless contents
print "\n# (INFO) open_sso_files: size of \@sso is for $file[$i] exceeds 3000 lines, ",
scalar(@sso), " !!! \n";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Calling read_sso_lines sub
#_____________________________________
push(@OUT, &read_sso_lines([@sso], $create_sso,
"u=$upper_expect_limit",
"l=$lower_expect_limit",
$attach_range_in_names,
$attach_range_in_names2,
$new_format, $get_alignment) );
close SSO;
}
}
}
print "\n# \@OUT has ", scalar(@OUT), " elements \n" if $verbose;
return(\@OUT); # @OUT has refs of hashes (\%xxx, \%YYY, \%XXX,,,,)
}
#_____________________________________________________________________________
# Title : open_msp_files
# Usage : %seq=%{&open_msp_files(@file, $names_only)};
# Function : opens Erik Sonhammer's MSPcrunch file output(default).
# This looks up xxxxx.fa files in the pwd (with S opt) and see
# if it can get the sequences as well.
# With 'n' option you can just get the matched sequence
# names with ranges.
# Example : Example output(with 'n' opt):
# d1bi6h1 d1bi6h1_1-24 IBR1_ANACO_20-42 IBR2_ANACO_19-42
# e1bi6.1h1 IBR1_ANACO_38-52 e1bi6.1h1_1-18 IBR2_ANACO_38-52
#
# Keywords : exchange_msp_file_columns,
# Options :
# s -s for size return only
# S -S for the sequences are fetched if equivalent xxxx.fa files are in pwd
# n -n for matched seq NAMEs with ranges only (eg: HI0001_1-12,,), hash ref is out
( run in 0.876 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )