Bioinf
view release on metacpan or search on metacpan
$helix_min=3;
$beta_strand_min=3;
########################################################################
##### general argument handling for options of segment length ######
########################################################################
for($k=0; $k< @_ ;$k++){
if( ( !ref($_[$k]) )&&($_[$k]=~ /^[Hh](\d+)$/) ){
$helix_min = $1; }
elsif( ( !ref($_[$k]) )&&($_[$k]=~ /^[Ee](\d+)$/) ){
$beta_strand_min = $1; }
elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^[Hh](\d+)$/) ){
$helix_min = $1; }
elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^[EeBb](\d+)$/) ){
$beta_strand_min = $1; }
elsif(ref($_[$k]) eq "HASH") { push(@hash, $_[$k]); } }
for($i=0; $i < @hash; $i++){
my(%hash) = %{$hash[$i]};
@keys = sort keys( %hash );
for($j=0; $j < @keys; $j++){
my(@string_segH, @string_segE, @stringout);
$string1=$hash{$keys[$j]};
$gap_char = $1 if ($string1=~ /(\W)/);
##### actual cleaning ####
my(@string) = split(//, $string1);
for($a = 0; $a < @string; $a++){
if($string[$a] !~/[HE]/){ ### if the splited element doesn't match 'H' or 'E'
##### If any of the HH or EE counter is over the given minimum($helix_min,,)
if((@string_segH >= $helix_min)||( @string_segE >=$beta_strand_min)){
push(@stringout, @string_segH, @string_segE, '-');
@string_segH=@string_segE=(); } ## just resetting.
else{ ### if the accumulated 'HH' or 'EE' is smaller than the minimum
for(0.. (@string_segH + @string_segE) ){
push(@stringout, '-'); ### replace the short 'EE' etc with '-'
}
@string_segH=@string_segE=(); ## just resetting.
}
}
elsif($string[$a] =~ /^([Hh])$/){
push(@string_segH, $1); }
elsif($string[$a] =~ /^([Ee])$/){
push(@string_segE, $1); }
}
$hash{$keys[$j]}=join("", @stringout);
}
push(@hash_out, \%hash);
}
if(@hash_out == 1){ return($hash_out[0]);
}elsif( @hash_out > 1 ){ return(@hash_out); }
}
#________________________________________________________________________
# Title : overlay_seq_by_certain_chars
# Usage : %out =%{&overlay_seq_by_certain_chars(\%hash1, \%hash2, 'HE')};
# Function : (name1 000000112324)+(name1 ABC..AD..EFDK ) => (name1 000..00..12324)
# (name2 000000112324)+(name2 --HHH--EEEE-- ) => (name1 ---000--1123--)
# uses the second hash a template for the first sequences. gap_char is
# '-' or '.' or any given char or symbol.
# To insert gaps rather than overlap, use insert_gaps_in_seq_hash
# Example : %out =%{&overlay_seq_by_certain_chars(\%hash1, \%hash2, 'E')};
# output> with 'E' option >>> "name1 --HHH--1232-"
# Warning : If gap_chr ('H',,,) is not given, it replaces all the
# non-gap chars (normal alphabet), ie,
# it becomes 'superpose_seq_hash'
# Keywords : Overlap, superpose hash, overlay, superpose_seq_hash
# Options : E for replacing All 'E' occurrances in ---EEEE--HHHH----, etc.
# : H for replacing all 'H' " " "
# Returns : one hash ref.
# Argument : 2 ref for hash of identical keys and value length.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub overlay_seq_by_certain_chars{
my($i, $k,$j, $name, @in, %out, $gap_chr, @str1, @str2);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
####### Sub argument handling ######## $gap_chr here can be 'HE' etc.
#______________________________________
for($k=0; $k< @_ ;$k++){
if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.+)$/) ){
$gap_chr .= $1;
}elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)+$/) ){
$gap_chr .= $1;
}elsif(ref($_[$k]) eq "HASH") { push(@in, $_[$k]); }
}
if($#in < 1){
print "\n overlay_seq_by_certain_chars needs 2 hashes. Error \n"; exit; }
my(%hash1)=%{$in[0]};
my(%hash2)=%{$in[1]};
my(@names1)= sort keys %hash1;
my(@names2)= sort keys %hash2;
(@names1 > @names2)? $bigger=@names1 : $bigger=@names2;
for ($j=0; $j < $bigger; $j++){
@str1=split(//, $hash1{$names1[$j]});
@str2=split(//, $hash2{$names2[$j]});
if( ($gap_chr eq '') && ($hash2{$names2[$j]}=~/(\W)/) ){
$gap_chr=$1;
for($i=0; $i < @str2; $i++){
if($str2[$i] =~ /$gap_chr/){ $str1[$i]=$gap_chr;} }
$out{$names1[$j]}=join(",", @str1);
}else{
for($i=0; $i < @str2; $i++){
if($gap_chr =~ /$str2[$i]/){ $str2[$i]=$str1[$i];} }
$out{$names1[$j]}=join(",", @str2); }
}
return(\%out);
}
#________________________________________________________________________
# Title : rev_lines_pdb
# Usage : &rev_lines_pdb(\$ARGV[0]);
# Function : reorders the lines of any pdb files, but takes only C alpha positions.
# Example :
# The INPUT example >
#
# ATOM 191 CA ALA 195 -2.566 8.099 42.827 1.00 12.42 1ENG 256
# ATOM 192 CA ARG 196 -1.401 11.546 41.629 1.00 8.63 1ENG 257
# ATOM 193 CA THR 197 -4.073 13.846 43.107 1.00 9.93 1ENG 258
#
# The OUTPUT example > <first file, called xxxx1.atm >
#
# ATOM 1 CA ALA 1 -2.566 8.099 42.827 1.00 12.42 1ENG 256
# ATOM 2 CA ARG 2 -1.401 11.546 41.629 1.00 8.63 1ENG 257
# ATOM 3 CA THR 3 -4.073 13.846 43.107 1.00 9.93 1ENG 258
#
# <2nd file, called xxxx2.atm >
# ATOM 1 CA THR 1 -4.073 13.846 43.107 1.00 9.93 1ENG 258
# ATOM 2 CA ARG 2 -1.401 11.546 41.629 1.00 8.63 1ENG 257
# ATOM 3 CA ALA 3 -2.566 8.099 42.827 1.00 12.42 1ENG 256
#
# Warning : A Biomatic
# Keywords :
# Options : None
# Returns : directly writes two output files xxxx1.atm xxxx2.atm
# Argument : one pdb coordinate file reference
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub rev_lines_pdb{
my(@lines, $i, $c, @line_rev, $ATOM, $RES );
my($input_file_name) = ${$_[0]};
open(INPUT, "$input_file_name");
while(<INPUT>){ push(@lines, $_); $whole++; }
my($base_name) =${&get_base_name(\$input_file_name)};
my($file1) = "${base_name}1\.atm";
@string1=split(/$gap_char1/, $hash0{$keys1[$i]});
@string2=split(/$gap_char2/, $hash1{$keys2[$i]});
### @string1 => (0,0,0,0,1,1,1,1,1) @string2 => (3,4,2,13,2,1,23,3)
################################################################
## Main calc part, you get %tally_all_occur and %tally_occur
################################################################
for($j=0; $j < @string1; $j++){
$tally_all_occur{$string1[$j]}++ ; ## <-- number of all the positions
if( ($string2[$j]=~/[\d\^]+/)&&($string1[$j]=~/[\d\^]+/) ){
$tally_occur{$string1[$j]}+=$string2[$j] ; # %tally_occur is for added up counts
} # %tally_all_occur is for only the position
} # occurances of '0', '1' or whatever. To know
# how many '0' entry were you should use this.
####################################################################################
## When options were put, do more calc on %tally_all_occur and %tally_occur
####################################################################################
if($char_opt =~ /a/i ){
print "\n $char_opt ";
my(@cs_rates) = sort keys %tally_all_occur;
for($k=0; $k < @cs_rates; $k++){
if($tally_all_occur{$cs_rates[$k]} == 0){
$tally{$cs_rates[$k]} =0; next;}
if($char_opt =~ /i/i ){
$tally{$cs_rates[$k]}=int($tally_occur{$cs_rates[$k]}/$tally_all_occur{$cs_rates[$k]}); }
elsif($char_opt !~ /i/i ){
$tally{$cs_rates[$k]}= $tally_occur{$cs_rates[$k]}/$tally_all_occur{$cs_rates[$k]};
}
}
}
elsif($char_opt =~ /[np]/i){
my($big_sum, @cs_digits);
@cs_digits = sort keys %tally_occur; # @cs_digits are (0, 1, and 2 )
for(@cs_digits){ $big_sum+=$tally_occur{$_}/$tally_all_occur{$_}; }
for($t=0; $t < @cs_digits; $t++){
if($big_sum ==0){ $tally{$cs_digits[$t]}=0; next; }
else{
if($char_opt =~ /i/i){
$tally{$cs_digits[$t]}= int(($tally_occur{$cs_digits[$t]}/$tally_all_occur{$cs_digits[$t]}/$big_sum*$factor)+0.4999);}
elsif($char_opt !~ /i/i ){
$tally{$cs_digits[$t]}= $tally_occur{$cs_digits[$t]}/$tally_all_occur{$cs_digits[$t]}/$big_sum*$factor;}
}
}
}
}
if($char_opt =~ /[an]/i){
print "\n $char_opt ";
return(\%tally, \%tally_all_occur);}
else{ return(\%tally_occur, \%tally_all_occur);}
}
#________________________________________________________________________
# Title : superpose_seq_hash ( first to second hash) ## the oldest version.
# Usage : %out =%{&superpose_seq_hash(\%hash1, \%hash2)};
# Function : (name1 000000112324)+(name1 ABC..AD..EFD ) => (name1 000..01..324)
# uses the second hash a template for the first sequences. gap_char is
# '-' or '.'
# To insert gaps rather than overlap, use insert_gaps_in_seq_hash
# Example :
# Warning : Accepts only two HASHes and many possible gap_chr. Default gap is '-'
# Keywords : overlay sequence, overlay alphabet, superpose sequence,
# Options :
# Returns : one hash ref.
# Argument : 2 refs. for hash of identical keys and value length and gap_chr.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub superpose_seq_hash{
if($debug eq 1){ print __LINE__, " # superpose_seq_hash : \n"; }
my($gap_chr)='-';
my($i, $j, %hash1, %hash2, $name, %out, @str1, @str2);
if((ref($_[0]) eq 'HASH')&&(ref($_[1]) eq 'HASH')){
%hash1=%{$_[0]}; %hash2=%{$_[1]}; }
else{ print "\n superpose_seq_hash needs hash ref\n"; print chr(007); exit; }
my(@names1)=sort keys %hash1; my(@names2)=sort keys %hash2;
(@names1 > @names2)? $bigger=@names1 : $bigger=@names2;
for ($j=0; $j < $bigger; $j++){
if($hash2{$names2[$j]}=~/(\W)/){ $gap_chr = $1; }
@str1=split(/|\,/, $hash1{$names1[$j]});
@str2=split(/|\,/, $hash2{$names2[$j]});
for($i=0; $i < @str2; $i++){
if($str2[$i] ne $gap_chr){ $str2[$i]=$str1[$i]; } }
$out{$names1[$j]}=join(",", @str2);
}
return(\%out);
}
#________________________________________________________________________
# Title : overlay_seq_hash ( first to second hash) ## the oldest version.
# Usage : %out =%{&overlay_seq_hash(\%hash1, \%hash2)};
# Function : (name1 000000112324)+(name1 ABC..AD..EFD ) => (name1 000..01..324)
# uses the second hash a template for the first sequences. gap_char is
# '-' or '.'
# To insert gaps rather than overlap, use insert_gaps_in_seq_hash
# Example :
# Warning : Accepts only two HASHes and many possible gap_chr. Default gap is '-'
# Keywords :
# Options :
# Returns : one hash ref.
# Argument : 2 refs. for hash of identical keys and value length and gap_chr.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub overlay_seq_hash{
my($gap_chr)='-'; my($i, $j, $name, %out, @str1, @str2);
if((ref($_[0]) eq 'HASH')&&(ref($_[1]) eq 'HASH')){
my(%hash1)=%{$_[0]}; my(%hash2)=%{$_[1]}; }
else{ print "\n overlay_seq_hash needs hash ref\n"; print chr(007); exit; }
my(@names1)=keys %hash1; my(@names2)=keys %hash2;
(@names1 > @names2)? $bigger=@names1 : $bigger=@names2;
for ($j=0; $j < $bigger; $j++){
if($hash2{$names2[$j]}=~/(\W)/){ $gap_chr = $1; }
@str1=split(//, $hash1{$names1[$j]}); @str2=split(//, $hash2{$names2[$j]});
for($i=0; $i < @str2; $i++){
if(($str2[$i] =~ /\W/)||($str2[$i] =~ //)){ $str1[$i]="$gap_chr";}}
$out{$names1[$j]}=join(",", @str1);
}
\%out;
}
#________________________________________________________________________
# Title : insert_gaps_in_seq_hash ( first to second hash)
# Usage : %out_extended_seq =%{&insert_gaps_in_seq_hash(\%hash1, \%hash2)};
# Function : superpose two hashes of the same sequence or same seq. length sequences,
# but unlike 'superpose_seq_hash', this inserts gaps and extend the
# sequences.
# (name1_sec hHHHHHH EEEEEEE) +
# (name1_seq .CDEABC..AD..EFD..EKST) => (name1_ext .hHHHHH..H...EEE..EEEE)
# In the example, the undefined sec. str. position is replaced as gaps('.')
# Uses the second hash a template for the first sequences. gap_char is
# '-' or '.'
# One rule is that the SECOND hash contains gaps!!
# There are two types of hash input. One is simple seq hash(both args)
# The other is from secondary structure prediction. The hash has contents
# like: $averaged{$position}=[$residue1, $sec_str2, $dif_reliability];
# Example :
# Warning : coded by A Biomatic
# Keywords : superposing sequences with gaps, interpolate_sequences, interpolate_gaps
# Options :
# Returns : one hash ref.
# Argument : 2 ref for hash of identical keys and value length.
# Category :
# Version : 1.3
#--------------------------------------------------------------------
sub insert_gaps_in_seq_hash{
my($gap_char)='-';
my($g, $i, $j, $t, %hash1, %hash2, $bigger, $name, %new_hash_with_gap, @str1,
@posi, @str2, $char_from_gapped_hash, $GAP_CHAR);
my($join_char) =','; ## <<-- This is for the final output joined by this var.
$GAP_CHAR='-';
if((ref($_[0]) eq 'HASH')&&(ref($_[1]) eq 'HASH')){
%hash1=%{$_[0]};
%hash2=%{$_[1]};
}else{
print "\n superpose_seq_hash needs hash ref\n";
print chr(007); exit;
}
my(@names1)=keys %hash1;
my(@names2)=keys %hash2;
if($hash1{$names1[0]}->[0] =~/\w/){ # this means that the input hash is sec str hash type
print "\n# (INFO) insert_gaps_in_seq_hash: You have put sec str pred hash as an input.\n\n";
@posi=split(//, $hash2{$names2[0]});
$gap_char='.';
%arraySTR = %{&hash_common_by_keys(\%arraySTR, \%arraySEQ)};
%arraySEQ = %{&hash_common_by_keys(\%arraySEQ, \%arraySTR)};
%arraySEQ = %{&remov_com_column(\%arraySEQ)};
%arraySTR = %{&remov_com_column(\%arraySTR)};
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if($debug eq 1){
print __LINE__,
" ## sorting sequence names. To make things constant. \n\n"; }
@names= sort keys %arraySTR;
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# If common column of secondary structure representation option $comm_col is set
# open_dssp_files sub routine will get the common seq parts of all the sequences.
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if($comm_col =~ /C/i){
%DSSP_common=%{&open_dssp_files( @names, $H, $S, $E, $T, $I, $G, $B, $simplify, 'C')};
@keys_common= keys %DSSP_common;
@stringDSSP_common = split(/|\,/, $DSSP_common{$keys_common[0]});
if($debug2 eq 1){ print __LINE__," \$comm_col is set to: $comm_col \n";
print __LINE__," \@stringDSSP_common is :@stringDSSP_common \n";
}
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Comparing two hashes
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
for $name (@names){
#"""""""""""""""" Splitting the sequence string
if($arraySEQ{$name}=~/\,\S+\,/){
@stringSEQ =split(/\,/, $arraySEQ{$name});
@stringSTR=split(/\,/, $arraySTR{$name}); }
else{
@stringSEQ =split(//, $arraySEQ{$name});
@stringSTR=split(//, $arraySTR{$name});
}
print "\n",__LINE__, " \@stringSEQ is @stringSEQ \n" if $debug2 eq 1;
print "\n",__LINE__, " \@stringSTR is @stringSTR \n" if $debug2 eq 1;
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Contracting the SEQ.
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@seq_positionSEQ = @{&get_posi_sans_gaps(\$arraySEQ{$name})};
@seq_positionSTR = @{&get_posi_sans_gaps(\$arraySTR{$name})};
#"""""""""""""""" To get secondary structure only calc """"""""""""""""""""""""""""
# It superposes the NON sec. region on @seq_positionSTR to nullify positions.
# get_posi_diff ignores non char positions in calc.
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if( ($ss_opt =~ /ss$/i) && ($comm_col !~ /C/i) ){
%DSSP=%{&open_dssp_files($name, $H, $S, $E, $T, $I, $G, $B, $simplify, $comm_col)};
if($debug1 eq 1){
print "\n",__LINE__," open_dssp_files has options \$H ->$H \$S->$S \$E->$E \n";
print "\n",__LINE__," \$T->$T \$I->$I \$G->$B \$simplify->$simplify \$comm_col ->$comm_col\n";
&show_hash( \%DSSP );
}
if(ref(\%DSSP) eq 'HASH'){ # to check it %DSSP was valid, If not it skips overlaying
@stringDSSP = split(/|\,/, $DSSP{$name});
$size_of_stringDSSP = @stringDSSP;
$size_of_seq_positionSTR = @seq_positionSTR;
if($debug2 eq 1){
print "\n",__LINE__," \@stringDSSP is \n @stringDSSP\n";
print "\n",__LINE__," Size of \@stringDSSP is $size_of_stringDSSP\n" ;
print "\n",__LINE__," Size of \@seq_positionSTR is $size_of_seq_positionSTR\n";
print "\n",__LINE__," \$gap_char is \"$gap_char\" \n" ;
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# When the sec. str is not defined in DSSP, I delete the position of
# @stringDSSP to gap(ie. make it blank to exclude error rate calc)
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
for($i=0; $i < @stringDSSP; $i++){
if($stringDSSP[$i] =~ /\W/){ $seq_positionSTR[$i]= $gap_char;}
}
}
}elsif( $comm_col =~ /C/i){
print __LINE__, " Replacing position with \gap_char \"$gap_char\"\n" if $debug2 eq 1;
$ss_opt = 'ss'; # whether it was set or not, make it 'ss'
for($i=0; $i < @stringDSSP_common; $i++){
if($stringDSSP_common[$i] =~ /\W/){ $seq_positionSTR[$i]= $gap_char;}
}
}
if($debug2 eq 1){
print __LINE__,
print " \@seq_positionSTR is @seq_positionSTR\n";
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# getting Position differences.
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@position_diffs = @{&get_posi_diff(\@seq_positionSEQ, \@seq_positionSTR)};
if($debug2 eq 1){
print __LINE__,
print " \@position_diffs is @position_diffs\n";
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# You can have two types of output according to which alignment you compare your
# error rates. (1) Compare to @stringSEQ (2) @stringSTR
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@position_corrected1 = @{&put_position_back_to_str_seq(\@stringSEQ, \@position_diffs)};
$array3{$name}=join(",", @position_corrected1);
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# The final Step for error rate, $LIMIT is to confine error rate in one digit (ie, less than 10)
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
%final_posi_diffs =%{&get_residue_error_rate(\%array3, $LIMIT)};
undef(@whole_length, $len_of_seq);
return(\%final_posi_diffs);
}
#________________________________________________________________________
# Title : get_posi_rates_hash_out (derived from 'get_posi_shift_hash' )
# Usage : %rate_hash = %{&get_posi_shift_hash(\%hash_msf, \%hash_jp)};
# seq2 ABCDEE-----DD- ==> seq2 ABCDEE-DD-
# seq3 ---DEE----DDE- seq3 ---DEEDDE-
# ^^^^
# from above the 4 columns of gap will be removed
# To remove absurd gaps in multiple sequence alignment
# Warning :
# Keywords :
# Options :
# Returns : a ref. of a hash.
#
# <input hash> <out hash>
#
# Argument : accepts reference for a hash.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub remov_com_column2{
my(%input) = %{$_[0]};
my(@common)=();
my($len)=0;
my(@string)=();
my(@new_string)=();
my(@string2)=();
my(%new_string);
########## Finds common gaps ###########
for $key(keys %input){
$len = &smaller_one($#common, $#string) unless $#common < 0;
sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
@string = split(/|| /, $input{$key});
for ($k=0; $k <= $len;$k++){
if($#common == -1){
@common = (@string);
last;
}
if (($string[$k] eq '.')&&($common[$k] eq '.')){
$common[$k]='.';
}else{
$common[$k]='X';
}
}
}
########## removes gaps ###########
for $key2 (keys %input){
@string2 = split(//, $input{$key2});
for ($i=0; $i <= $#string2; $i++){
if ($common[$i] eq $string2[$i]){
print;
}else{
push(@new_string, $string2[$i]);
}
}
$new_string{$key2}= join("", @new_string);
@new_string = ();
}
\%new_string;
}
#________________________________________________________________________
# Title : get_common_column (similar to overlay_seq_for_identical_chars )
# Usage : %out =%{&get_common_column(\%hash1, \%hash2, '-')};
# Function : (name1 --EHH--HHEE-- )
# (name2 --HHH--EEEE-- ) ==> result is;
#
# (name1_name2 -- HH-- EE-- )
# to get the identical chars in hash strings of sequences.
#
# Example : %out =%{&get_common_column(\%hash1, \%hash2, '-')};
# output> with 'E' option >>> "name1 --HHH--1232-"
# Following input will give;
# %hash1 = ('s1', '--EHH-CHHEE----EHH--HHEE----EHH--HHEE----EHH-CHHEE--');
# %hash2 = ('s2', '--EEH-CHHEE----EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
# %hash3 = ('s3', '-KEEH-CHHEE-XX-EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
# %hash4 = ('s4', '-TESH-CHEEE-XX-EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
#
# s1_s2_s3_s4 --E-H-CH-EE----E-H--HHEE----E-H--HHEE----E-H-CHHEE--
#
# Warning : This gets more than 2 hashes. Not more than that!
#
# Class : get_common_column, get_common_column_in_seq, get common column in sequence
# for secondary structure only representation.
# Keywords : Overlap, superpose hash, overlay identical chars, superpose_seq_hash
# get_common_column, get_com_column, get_common_sequence,
# get_common_seq_region, multiply_seq_hash, get_common_column_in_sequence
# Options :
# Reference :
# Returns : one hash ref. of the combined key name (i.e., name1_name2). Combined by '_'
# Tips :
# Argument : 2 or more ref for hash of identical keys and value length.
# One optional arg for replacing space char to the given one.
# Author : jong@salt2.med.harvard.edu
# Category :
# Version : 1.6
#--------------------------------------------------------------------
sub get_common_column{
my($i, $k,$j, $name1, $name2, $sorted_name, @in, %out, @out_chars,
$gap_chr, @str1, @str2);
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Sub argument handling $gap_chr here can be 'HE' etc.
#_______________________________________________________
for($k=0; $k< @_ ;$k++){
if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.)$/) ){
$gap_chr .= $1; }
elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)$/) ){
$gap_chr .= $1; }
elsif(ref($_[$k]) eq "HASH") { push(@in, $_[$k]); } }
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Checking if the input hashes were right
#_______________________________________________________
if( (@in < 2) && ( ref($in[0]) eq 'HASH') ){
print "\n", __LINE__, " # get_common_column usually needs 2 hashes. Error \n";
print "\n", __LINE__, " # get_common_column : Anyway, I will just return the single input hash: @in. \n";
%out=%{$in[0]}; # <--- This is essential to return the single input hash!!
goto FINAL;
}
if(@in ==2){ print "\n# (INFO) \@in has 2 elements for ref. of hash, good!\n\n"; }
%out = %{$in[0]}; ## Initializing %out
print "\n",__LINE__, " # get_common_column hashes given are: @in \n" if $debug eq 1;
for( $k=1; $k < @in; $k++){
my(@out_chars); ## <-- Necessary to prevent concatenation.
my(%hash1)=%out;
my(%hash2)=%{$in[$k]};
my(@names1)= sort keys %hash1;
my(@names2)= sort keys %hash2;
$name1 = $names1[0];
$name2 = $names2[0];
@str1=split(/\||\,/, $hash1{$names1[0]});
@str2=split(/\||\,/, $hash2{$names2[0]});
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Trying to guess the correct gap char
#_____________________________________________________
if($hash2{$names2[0]}=~/_/){
$gap_chr='_';
}elsif($hash2{$names2[0]}=~/(\W)\W/){
$gap_chr=$1;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If I fail to split string by ',', then split by ''
#_____________________________________________________
if(@str1 == 1){ @str1=split(//, $hash1{$names1[0]}); }
if(@str2 == 1){ @str2=split(//, $hash2{$names2[0]}); }
for($i=0; $i < @str1; $i++){
if($str1[$i] eq $str2[$i] ){
push(@out_chars, $str1[$i]);
}else{
if( defined($gap_chr) ){
push(@out_chars, $gap_chr);
}else{
push(@out_chars, ' ');
}
}
}
$sorted_name=join('', ($name1, $name2));
%out='';
$out{"$sorted_name"}= join("", @out_chars);
}
FINAL:
if ($debug eq 1){
print "\n",__LINE__, " # get_common_column Final res. \%out :\n",
&show_hash(%out);
}
return(\%out);
}
#________________________________________________________________________
# Title : overlay_seq_for_identical_chars
# Usage : %out =%{&overlay_seq_for_identical_chars(\%hash1, \%hash2, '-')};
# Function : (name1 --EHH--HHEE-- )
# (name2 --HHH--EEEE-- ) ==> result is;
#
# (name1_name2 -- HH-- EE-- )
# to get the identical chars in hash strings of sequences.
#
# Example : %out =%{&overlay_seq_for_identical_chars(\%hash1, \%hash2, '-')};
# output> with 'E' option >>> "name1 --HHH--1232-"
# Warning : Works only for 2 sequence hashes.
# Keywords : Overlap, superpose hash, overlay identical chars, superpose_seq_hash
# Options :
# Returns : one hash ref. of the combined key name (i.e., name1_name2). Combined by '_'
# Argument : 2 ref for hash of identical keys and value length. One optional arg for
# replacing space char to the given one.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub overlay_seq_for_identical_chars{
my($i, $k,$j, $name1, $name2, @in, %out, @out_chars, $gap_chr, @str1, @str2);
######################################
####### Sub argument handling ######## $gap_chr here can be 'HE' etc.
######################################
for($k=0; $k< @_ ;$k++){
if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.)$/) ){
$gap_chr .= $1; }
elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)$/) ){
$gap_chr .= $1; }
elsif(ref($_[$k]) eq "HASH") { push(@in, $_[$k]); } }
if(@in < 2){ print "\n overlay_seq_for_identical_chars needs 2 hashes. Error \n"; exit; }
my(%hash1)=%{$in[0]}; my(%hash2)=%{$in[1]};
my(@names1)=sort keys %hash1; my(@names2)= sort keys %hash2;
$name1 = $names1[0]; $name2 = $names2[0];
@str1=split(/|\,/, $hash1{$names1[0]}); @str2=split(/|\,/, $hash2{$names2[0]});
for($i=0; $i < @str1; $i++){
if($str1[$i] eq $str2[$i] ){
push(@out_chars, $str1[$i]); }
elsif( defined($gap_chr) ){ push(@out_chars, $gap_chr); }
else{ push(@out_chars, ' '); }
}
if( $name2 gt $name1){
$out{"$name1\_$name2"}= join(",", @out_chars); }
else{ $out{"$name2\_$name1"}= join(",", @out_chars); }
\%out;
}
#________________________________________________________________________
# Title : remov_com_column
# Usage : %new_string = %{&remov_com_column(\%hashinput)};
# @out=@{&remov_com_column(\@array3)};
# Function : removes common gap column in seq.
# Example :
# Warning :
# Keywords : remove_com_column, remove_common_column,
# remove_common_gap_column, remov_common_gap_column,
# remove com column
# Options :
# Returns : a ref. of hash(es) and array(s).
#
# name1 ABCDE....DDD name1 ABCDE..DDD
# name2 ABCDEE..DD.. --> name2 ABCDEEDD..
# name3 ...DEE..DDE. name3 ...DEEDDE.
#
# (ABC....CD, ABCD...EE) --> (ABC.CD, ABCDEE)
# from above the two column of dot will be removed
# To remove absurd gaps in multiple sequence alignment. for nt6-hmm.pl
# Argument : accepts reference for hash(es) and array(s).
# Category :
# Version :
#--------------------------------------------------------------------
sub remov_com_column{
my(@hash_ref_out, $d, $gap_char);
for($d=0; $d < @_; $d++){
if(ref($_[$d]) eq 'HASH'){
my($len,@string,@new_string,@string2);
my(%input)=%{$_[$d]};
my(@common);
for (keys %input){
@string = split('', $input{$_});
if(!(defined(@common))){ @common = (@string); }
else{ for ($k=0; $k < @string; $k++){
if (($string[$k] =~ /\W/ )&&($common[$k] =~ /(\W)/)){ $common[$k]= $1;}
elsif(($string[$k] =~ /(\W)/)&&(!(defined($common[$k])))){ $common[$k]=$1;}
else{ $common[$k]='X';} } } }
for (keys %input){ @string2 = split(//, $input{$_});
for ($i=0; $i < @string2; $i++){
if ($common[$i] ne $string2[$i]){ push(@new_string, $string2[$i]); } }
$new_string{$_}= join('', @new_string); @new_string = (); }
push(@hash_ref_out, \%new_string);
}
( run in 0.562 second using v1.01-cache-2.11-cpan-39bf76dae61 )