Bioinf

 view release on metacpan or  search on metacpan

Bioinf.pl  view on Meta::CPAN

	$helix_min=3;
	$beta_strand_min=3;

	########################################################################
	#####   general argument handling  for options of segment length  ######
	########################################################################
	for($k=0; $k< @_ ;$k++){
	  if( ( !ref($_[$k]) )&&($_[$k]=~ /^[Hh](\d+)$/) ){
		  $helix_min  = $1;    }
	  elsif( ( !ref($_[$k]) )&&($_[$k]=~ /^[Ee](\d+)$/) ){
		  $beta_strand_min  = $1;    }
	  elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^[Hh](\d+)$/) ){
		  $helix_min  = $1;    }
	  elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^[EeBb](\d+)$/) ){
		  $beta_strand_min  = $1;    }
	  elsif(ref($_[$k]) eq "HASH") { push(@hash,  $_[$k]); }    }

	for($i=0; $i < @hash; $i++){
	  my(%hash) = %{$hash[$i]};
	  @keys = sort keys( %hash );
	  for($j=0; $j < @keys; $j++){
		  my(@string_segH, @string_segE, @stringout);
		  $string1=$hash{$keys[$j]};
		  $gap_char = $1 if ($string1=~ /(\W)/);

		  ##### actual cleaning ####
		  my(@string) = split(//, $string1);
		  for($a = 0; $a < @string; $a++){
			 if($string[$a] !~/[HE]/){ ### if the splited element doesn't match 'H' or 'E'

				 ##### If any of the HH or EE counter is over the given minimum($helix_min,,)
				 if((@string_segH >= $helix_min)||( @string_segE >=$beta_strand_min)){
					 push(@stringout, @string_segH, @string_segE, '-');
					 @string_segH=@string_segE=();     }   ## just resetting.
				 else{  ### if the accumulated 'HH' or 'EE' is smaller than the minimum
					 for(0.. (@string_segH + @string_segE) ){
						push(@stringout, '-'); ### replace the short 'EE' etc with '-'
					 }
					 @string_segH=@string_segE=();  ## just resetting.
				 }
			 }
			 elsif($string[$a] =~ /^([Hh])$/){
				 push(@string_segH, $1); }
			 elsif($string[$a] =~ /^([Ee])$/){
				 push(@string_segE, $1); }
		  }
		  $hash{$keys[$j]}=join("", @stringout);
	  }
	  push(@hash_out, \%hash);
	}
	if(@hash_out == 1){ return($hash_out[0]);
	}elsif(  @hash_out > 1 ){ return(@hash_out); }
}






#________________________________________________________________________
# Title     : overlay_seq_by_certain_chars
# Usage     : %out =%{&overlay_seq_by_certain_chars(\%hash1, \%hash2, 'HE')};
# Function  : (name1 000000112324)+(name1  ABC..AD..EFDK ) => (name1 000..00..12324)
#             (name2 000000112324)+(name2  --HHH--EEEE-- ) => (name1 ---000--1123--)
#             uses the second hash a template for the first sequences. gap_char is
#             '-' or '.' or any given char or symbol.
#             To insert gaps rather than overlap, use insert_gaps_in_seq_hash
# Example   : %out =%{&overlay_seq_by_certain_chars(\%hash1, \%hash2, 'E')};
#             output> with 'E' option >>> "name1     --HHH--1232-"
# Warning   : If gap_chr ('H',,,) is not given, it replaces all the
#             non-gap chars (normal alphabet), ie,
#             it becomes 'superpose_seq_hash'
# Keywords  : Overlap, superpose hash, overlay, superpose_seq_hash
# Options   : E for replacing All 'E' occurrances in ---EEEE--HHHH----, etc.
#             : H for replacing all 'H'  "     " "
# Returns   : one hash ref.
# Argument  : 2 ref for hash of identical keys and value length.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub overlay_seq_by_certain_chars{
	my($i, $k,$j, $name, @in, %out, $gap_chr, @str1, @str2);
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	####### Sub argument handling ########  $gap_chr here can be 'HE' etc.
	#______________________________________
	for($k=0; $k< @_ ;$k++){
	  if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.+)$/) ){
		  $gap_chr  .= $1;
	  }elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)+$/) ){
		  $gap_chr  .= $1;
	  }elsif(ref($_[$k]) eq "HASH") { push(@in,  $_[$k]); }
	}

	if($#in < 1){
	  print "\n overlay_seq_by_certain_chars needs 2 hashes. Error \n"; exit; }
	my(%hash1)=%{$in[0]};
	my(%hash2)=%{$in[1]};
	my(@names1)= sort keys %hash1;
	my(@names2)= sort keys %hash2;
	(@names1 > @names2)? $bigger=@names1 : $bigger=@names2;
	for ($j=0; $j < $bigger; $j++){
	  @str1=split(//, $hash1{$names1[$j]});
	  @str2=split(//, $hash2{$names2[$j]});
	  if( ($gap_chr eq '') && ($hash2{$names2[$j]}=~/(\W)/) ){
		  $gap_chr=$1;
		  for($i=0; $i < @str2; $i++){
			  if($str2[$i] =~ /$gap_chr/){ $str1[$i]=$gap_chr;}     }
		  $out{$names1[$j]}=join(",", @str1);
	  }else{
		  for($i=0; $i < @str2; $i++){
			  if($gap_chr =~ /$str2[$i]/){ $str2[$i]=$str1[$i];}    }
		  $out{$names1[$j]}=join(",", @str2);    }
	}
	return(\%out);
}



#________________________________________________________________________
# Title     : rev_lines_pdb
# Usage     : &rev_lines_pdb(\$ARGV[0]);
# Function  : reorders the lines of any pdb files, but takes only C alpha positions.
# Example   :
#             The INPUT example >
#
#             ATOM    191  CA  ALA   195      -2.566   8.099  42.827  1.00 12.42      1ENG 256
#             ATOM    192  CA  ARG   196      -1.401  11.546  41.629  1.00  8.63      1ENG 257
#             ATOM    193  CA  THR   197      -4.073  13.846  43.107  1.00  9.93      1ENG 258
#
#             The OUTPUT example >             <first file, called  xxxx1.atm >
#
#             ATOM      1  CA  ALA     1      -2.566   8.099  42.827  1.00 12.42      1ENG 256
#             ATOM      2  CA  ARG     2      -1.401  11.546  41.629  1.00  8.63      1ENG 257
#             ATOM      3  CA  THR     3      -4.073  13.846  43.107  1.00  9.93      1ENG 258
#
#                                           <2nd file, called  xxxx2.atm >
#             ATOM      1  CA  THR     1      -4.073  13.846  43.107  1.00  9.93      1ENG 258
#             ATOM      2  CA  ARG     2      -1.401  11.546  41.629  1.00  8.63      1ENG 257
#             ATOM      3  CA  ALA     3      -2.566   8.099  42.827  1.00 12.42      1ENG 256
#
# Warning   : A Biomatic
# Keywords  :
# Options   : None
# Returns   : directly writes two output files  xxxx1.atm  xxxx2.atm
# Argument  : one pdb coordinate file reference
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub rev_lines_pdb{
	 my(@lines, $i, $c, @line_rev, $ATOM, $RES );
	 my($input_file_name) = ${$_[0]};
	 open(INPUT, "$input_file_name");
	 while(<INPUT>){  push(@lines, $_); $whole++;  }
	 my($base_name) =${&get_base_name(\$input_file_name)};
	 my($file1) = "${base_name}1\.atm";

Bioinf.pl  view on Meta::CPAN

	  @string1=split(/$gap_char1/, $hash0{$keys1[$i]});
	  @string2=split(/$gap_char2/, $hash1{$keys2[$i]});
	  ### @string1 => (0,0,0,0,1,1,1,1,1) @string2 => (3,4,2,13,2,1,23,3)


	  ################################################################
	  ##  Main calc part, you get %tally_all_occur and %tally_occur
	  ################################################################
	  for($j=0; $j < @string1; $j++){
		  $tally_all_occur{$string1[$j]}++ ; ## <-- number of all the positions
		  if( ($string2[$j]=~/[\d\^]+/)&&($string1[$j]=~/[\d\^]+/) ){
			  $tally_occur{$string1[$j]}+=$string2[$j] ; # %tally_occur is for added up counts
		  }                                             # %tally_all_occur is for only the position
	  }                                                #  occurances of '0', '1' or whatever. To know
																		#  how many '0' entry were you should use this.
	  ####################################################################################
	  ##  When options were put, do more calc on %tally_all_occur and %tally_occur
	  ####################################################################################
	  if($char_opt =~ /a/i ){
		 print "\n           $char_opt ";
		 my(@cs_rates) = sort keys %tally_all_occur;
		 for($k=0; $k < @cs_rates; $k++){
			 if($tally_all_occur{$cs_rates[$k]} == 0){
				 $tally{$cs_rates[$k]} =0; next;}
			 if($char_opt =~ /i/i ){
				 $tally{$cs_rates[$k]}=int($tally_occur{$cs_rates[$k]}/$tally_all_occur{$cs_rates[$k]}); }
			 elsif($char_opt !~ /i/i ){
				 $tally{$cs_rates[$k]}= $tally_occur{$cs_rates[$k]}/$tally_all_occur{$cs_rates[$k]};
			 }
		 }
	  }
	  elsif($char_opt =~ /[np]/i){
		 my($big_sum, @cs_digits);
		 @cs_digits = sort keys %tally_occur;  # @cs_digits are (0, 1, and 2 )
		 for(@cs_digits){ $big_sum+=$tally_occur{$_}/$tally_all_occur{$_};   }
		 for($t=0; $t < @cs_digits; $t++){
			if($big_sum ==0){ $tally{$cs_digits[$t]}=0; next; }
			else{
			  if($char_opt =~ /i/i){
				 $tally{$cs_digits[$t]}= int(($tally_occur{$cs_digits[$t]}/$tally_all_occur{$cs_digits[$t]}/$big_sum*$factor)+0.4999);}
			  elsif($char_opt !~ /i/i ){
				 $tally{$cs_digits[$t]}= $tally_occur{$cs_digits[$t]}/$tally_all_occur{$cs_digits[$t]}/$big_sum*$factor;}
			}
		 }
	  }
	}
	if($char_opt =~ /[an]/i){
		 print "\n           $char_opt ";
	  return(\%tally, \%tally_all_occur);}
	else{ return(\%tally_occur, \%tally_all_occur);}
}
#________________________________________________________________________
# Title     : superpose_seq_hash   ( first to second hash) ## the oldest version.
# Usage     : %out =%{&superpose_seq_hash(\%hash1, \%hash2)};
# Function  : (name1 000000112324)+(name1  ABC..AD..EFD ) => (name1 000..01..324)
#             uses the second hash a template for the first sequences. gap_char is
#             '-' or '.'
#             To insert gaps rather than overlap, use insert_gaps_in_seq_hash
# Example   :
# Warning   : Accepts only two HASHes and many possible gap_chr. Default gap is '-'
# Keywords  : overlay sequence, overlay alphabet, superpose sequence,
# Options   :
# Returns   : one hash ref.
# Argument  : 2 refs. for hash of identical keys and value length and gap_chr.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub superpose_seq_hash{
	if($debug eq 1){ print __LINE__, " # superpose_seq_hash : \n"; }
	my($gap_chr)='-';
	my($i, $j, %hash1, %hash2, $name, %out, @str1, @str2);

	if((ref($_[0]) eq 'HASH')&&(ref($_[1]) eq 'HASH')){
	  %hash1=%{$_[0]}; %hash2=%{$_[1]}; }
	else{ print "\n superpose_seq_hash needs hash ref\n"; print chr(007); exit; }

	my(@names1)=sort keys %hash1; my(@names2)=sort keys %hash2;
	(@names1 > @names2)? $bigger=@names1 : $bigger=@names2;

	for ($j=0; $j < $bigger; $j++){
	 if($hash2{$names2[$j]}=~/(\W)/){ $gap_chr = $1; }
		@str1=split(/|\,/, $hash1{$names1[$j]});
		@str2=split(/|\,/, $hash2{$names2[$j]});
		for($i=0; $i < @str2; $i++){
		  if($str2[$i] ne $gap_chr){ $str2[$i]=$str1[$i];  } }
		$out{$names1[$j]}=join(",", @str2);
	}
	return(\%out);
}


#________________________________________________________________________
# Title     : overlay_seq_hash   ( first to second hash) ## the oldest version.
# Usage     : %out =%{&overlay_seq_hash(\%hash1, \%hash2)};
# Function  : (name1 000000112324)+(name1  ABC..AD..EFD ) => (name1 000..01..324)
#             uses the second hash a template for the first sequences. gap_char is
#             '-' or '.'
#             To insert gaps rather than overlap, use insert_gaps_in_seq_hash
# Example   :
# Warning   : Accepts only two HASHes and many possible gap_chr. Default gap is '-'
# Keywords  :
# Options   :
# Returns   : one hash ref.
# Argument  : 2 refs. for hash of identical keys and value length and gap_chr.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub overlay_seq_hash{
	my($gap_chr)='-'; my($i, $j, $name, %out, @str1, @str2);

	if((ref($_[0]) eq 'HASH')&&(ref($_[1]) eq 'HASH')){
	  my(%hash1)=%{$_[0]}; my(%hash2)=%{$_[1]}; }
	else{ print "\n overlay_seq_hash needs hash ref\n"; print chr(007); exit; }

	my(@names1)=keys %hash1; my(@names2)=keys %hash2;
	(@names1 > @names2)? $bigger=@names1 : $bigger=@names2;

	for ($j=0; $j < $bigger; $j++){
	 if($hash2{$names2[$j]}=~/(\W)/){ $gap_chr = $1; }
		@str1=split(//, $hash1{$names1[$j]}); @str2=split(//, $hash2{$names2[$j]});
		for($i=0; $i < @str2; $i++){
		  if(($str2[$i] =~ /\W/)||($str2[$i] =~ //)){ $str1[$i]="$gap_chr";}}
		$out{$names1[$j]}=join(",", @str1);
	}
	\%out;
}


#________________________________________________________________________
# Title     : insert_gaps_in_seq_hash  ( first to second hash)
# Usage     : %out_extended_seq =%{&insert_gaps_in_seq_hash(\%hash1, \%hash2)};
# Function  : superpose two hashes of the same sequence or same seq. length sequences,
#             but unlike 'superpose_seq_hash', this inserts gaps and extend the
#             sequences.
#             (name1_sec  hHHHHHH EEEEEEE) +
#             (name1_seq  .CDEABC..AD..EFD..EKST) => (name1_ext  .hHHHHH..H...EEE..EEEE)
#             In the example, the undefined sec. str. position is replaced as gaps('.')
#             Uses the second hash a template for the first sequences. gap_char is
#             '-' or '.'
#             One rule is that the SECOND hash contains gaps!!
#             There are two types of hash input. One is simple seq hash(both args)
#              The other is from secondary structure prediction. The hash has contents
#              like: $averaged{$position}=[$residue1, $sec_str2, $dif_reliability];
# Example   :
# Warning   : coded by A Biomatic
# Keywords  : superposing sequences with gaps, interpolate_sequences, interpolate_gaps
# Options   :
# Returns   : one hash ref.
# Argument  : 2 ref for hash of identical keys and value length.
# Category  :
# Version   : 1.3
#--------------------------------------------------------------------
sub insert_gaps_in_seq_hash{
    my($gap_char)='-';
    my($g, $i, $j, $t, %hash1, %hash2, $bigger, $name, %new_hash_with_gap, @str1,
       @posi, @str2, $char_from_gapped_hash, $GAP_CHAR);
    my($join_char) =',';  ## <<-- This is for the final output joined by this var.
    $GAP_CHAR='-';

    if((ref($_[0]) eq 'HASH')&&(ref($_[1]) eq 'HASH')){
       %hash1=%{$_[0]};
       %hash2=%{$_[1]};
    }else{
       print "\n superpose_seq_hash needs hash ref\n";
       print chr(007); exit;
    }
    my(@names1)=keys %hash1;
    my(@names2)=keys %hash2;

    if($hash1{$names1[0]}->[0] =~/\w/){ # this means that the input hash is sec str hash type
        print "\n# (INFO) insert_gaps_in_seq_hash: You have put sec str pred hash as an input.\n\n";
        @posi=split(//, $hash2{$names2[0]});

Bioinf.pl  view on Meta::CPAN


	$gap_char='.';

	%arraySTR = %{&hash_common_by_keys(\%arraySTR, \%arraySEQ)};
	%arraySEQ = %{&hash_common_by_keys(\%arraySEQ, \%arraySTR)};
	%arraySEQ = %{&remov_com_column(\%arraySEQ)};
	%arraySTR = %{&remov_com_column(\%arraySTR)};

	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	if($debug eq 1){
		print __LINE__,
		" ## sorting sequence names. To make things constant. \n\n";  }
	@names= sort keys %arraySTR;
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	#  If common column of secondary structure representation option $comm_col is set
	#  open_dssp_files sub routine will get the common seq parts of all the sequences.
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	if($comm_col =~ /C/i){
		%DSSP_common=%{&open_dssp_files( @names, $H, $S, $E, $T, $I, $G, $B, $simplify, 'C')};
		@keys_common= keys %DSSP_common;
		@stringDSSP_common = split(/|\,/, $DSSP_common{$keys_common[0]});
		if($debug2 eq 1){ print __LINE__," \$comm_col is set to: $comm_col \n";
			print __LINE__," \@stringDSSP_common is :@stringDSSP_common \n";
		}
	}

	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	# Comparing two hashes
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	for $name (@names){
		#"""""""""""""""" Splitting the sequence string
		if($arraySEQ{$name}=~/\,\S+\,/){
			@stringSEQ =split(/\,/, $arraySEQ{$name});
			@stringSTR=split(/\,/, $arraySTR{$name});  }
		else{
			@stringSEQ =split(//, $arraySEQ{$name});
			@stringSTR=split(//, $arraySTR{$name});
		}
		print "\n",__LINE__, " \@stringSEQ  is  @stringSEQ \n" if $debug2 eq 1;
		print "\n",__LINE__, " \@stringSTR  is  @stringSTR \n" if $debug2 eq 1;

		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#   Contracting  the SEQ.
		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		@seq_positionSEQ = @{&get_posi_sans_gaps(\$arraySEQ{$name})};
		@seq_positionSTR = @{&get_posi_sans_gaps(\$arraySTR{$name})};

		#"""""""""""""""" To get secondary structure only calc  """"""""""""""""""""""""""""
		# It superposes the NON sec. region on  @seq_positionSTR to nullify positions.
		#  get_posi_diff ignores non char positions in calc.
		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		if( ($ss_opt =~ /ss$/i) && ($comm_col !~ /C/i) ){
			%DSSP=%{&open_dssp_files($name, $H, $S, $E, $T, $I, $G, $B, $simplify, $comm_col)};
			if($debug1 eq 1){
			   print "\n",__LINE__," open_dssp_files has options \$H ->$H \$S->$S \$E->$E \n";
			   print "\n",__LINE__," \$T->$T \$I->$I \$G->$B \$simplify->$simplify \$comm_col ->$comm_col\n";
			   &show_hash( \%DSSP );
			}
			if(ref(\%DSSP) eq 'HASH'){ # to check it %DSSP was valid, If not it skips overlaying
				@stringDSSP = split(/|\,/, $DSSP{$name});
				$size_of_stringDSSP = @stringDSSP;
				$size_of_seq_positionSTR = @seq_positionSTR;
				if($debug2 eq 1){
					  print "\n",__LINE__," \@stringDSSP is \n @stringDSSP\n";
					  print "\n",__LINE__," Size of \@stringDSSP      is $size_of_stringDSSP\n" ;
					  print "\n",__LINE__," Size of \@seq_positionSTR is $size_of_seq_positionSTR\n";
					  print "\n",__LINE__," \$gap_char is \"$gap_char\" \n" ;
				}
				#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
				#   When the sec. str is not defined in DSSP, I delete the position of
				#   @stringDSSP to gap(ie. make it blank to exclude error rate calc)
				#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
				for($i=0; $i < @stringDSSP; $i++){
					if($stringDSSP[$i] =~ /\W/){ $seq_positionSTR[$i]= $gap_char;}
				}
			}
		}elsif( $comm_col =~ /C/i){
				print __LINE__, " Replacing position with \gap_char \"$gap_char\"\n" if $debug2 eq 1;
				$ss_opt = 'ss'; # whether it was set or not, make it 'ss'
				for($i=0; $i < @stringDSSP_common; $i++){
					if($stringDSSP_common[$i] =~ /\W/){ $seq_positionSTR[$i]= $gap_char;}
				}
		}

		if($debug2 eq 1){
			print __LINE__,
			print " \@seq_positionSTR is  @seq_positionSTR\n";
		}

		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#   getting Position differences.
		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		@position_diffs  = @{&get_posi_diff(\@seq_positionSEQ, \@seq_positionSTR)};

		if($debug2 eq 1){
			print __LINE__,
			print " \@position_diffs is  @position_diffs\n";
		}

		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#  You can have two types of output according to which alignment you compare your
		#   error rates. (1) Compare to @stringSEQ   (2) @stringSTR
		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		@position_corrected1 = @{&put_position_back_to_str_seq(\@stringSEQ, \@position_diffs)};
		$array3{$name}=join(",", @position_corrected1);

	}
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	# The final Step for error rate, $LIMIT is to confine error rate in one digit (ie, less than 10)
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	%final_posi_diffs =%{&get_residue_error_rate(\%array3, $LIMIT)};

	undef(@whole_length, $len_of_seq);
	return(\%final_posi_diffs);
}

#________________________________________________________________________
# Title     : get_posi_rates_hash_out (derived from 'get_posi_shift_hash' )
# Usage     : %rate_hash = %{&get_posi_shift_hash(\%hash_msf, \%hash_jp)};

Bioinf.pl  view on Meta::CPAN

#             seq2  ABCDEE-----DD-  ==>    seq2  ABCDEE-DD-
#             seq3  ---DEE----DDE-         seq3  ---DEEDDE-
#                         ^^^^
#             from above the 4 columns of gap will be removed
#             To remove absurd gaps in multiple sequence alignment
# Warning   :
# Keywords  :
# Options   :
# Returns   : a ref. of a hash.
#
#               <input hash>                   <out hash>
#
# Argument  : accepts reference for a hash.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub remov_com_column2{
	my(%input) = %{$_[0]};
	my(@common)=();
	my($len)=0;
	my(@string)=();
	my(@new_string)=();
	my(@string2)=();
	my(%new_string);

	########## Finds common gaps ###########
	for $key(keys %input){
	  $len  = &smaller_one($#common, $#string) unless $#common < 0;
		  sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
	  @string = split(/|| /, $input{$key});
	  for ($k=0; $k <= $len;$k++){
		  if($#common == -1){
			  @common = (@string);
			  last;
		  }
		  if (($string[$k] eq '.')&&($common[$k] eq '.')){
			  $common[$k]='.';
		  }else{
			  $common[$k]='X';
		  }
	  }
	}

	########## removes gaps ###########
	for $key2 (keys %input){
		@string2 = split(//, $input{$key2});
		for ($i=0; $i <= $#string2; $i++){
		  if ($common[$i] eq $string2[$i]){
			  print;
		  }else{
			  push(@new_string, $string2[$i]);
		  }
		}
		$new_string{$key2}= join("", @new_string);
		@new_string = ();
	}
	\%new_string;
}

#________________________________________________________________________
# Title     : get_common_column   (similar to overlay_seq_for_identical_chars )
# Usage     : %out =%{&get_common_column(\%hash1, \%hash2, '-')};
# Function  : (name1         --EHH--HHEE-- )
#             (name2         --HHH--EEEE-- ) ==> result is;
#
#             (name1_name2   -- HH--  EE-- )
#             to get the identical chars in hash strings of sequences.
#
# Example   : %out =%{&get_common_column(\%hash1, \%hash2, '-')};
#             output> with 'E' option >>> "name1     --HHH--1232-"
#   Following input will give;
#  %hash1 = ('s1', '--EHH-CHHEE----EHH--HHEE----EHH--HHEE----EHH-CHHEE--');
#  %hash2 = ('s2', '--EEH-CHHEE----EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
#  %hash3 = ('s3', '-KEEH-CHHEE-XX-EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
#  %hash4 = ('s4', '-TESH-CHEEE-XX-EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
#
#    s1_s2_s3_s4    --E-H-CH-EE----E-H--HHEE----E-H--HHEE----E-H-CHHEE--
#
# Warning   : This gets more than 2 hashes. Not more than that!
#
# Class     : get_common_column, get_common_column_in_seq, get common column in sequence
#             for secondary structure only representation.
# Keywords  : Overlap, superpose hash, overlay identical chars, superpose_seq_hash
#             get_common_column, get_com_column, get_common_sequence,
#             get_common_seq_region, multiply_seq_hash, get_common_column_in_sequence
# Options   :
# Reference :
# Returns   : one hash ref. of the combined key name (i.e., name1_name2). Combined by '_'
# Tips      :
# Argument  : 2 or more ref for hash of identical keys and value length.
#             One optional arg for replacing space char to the given one.
# Author    : jong@salt2.med.harvard.edu
# Category  :
# Version   : 1.6
#--------------------------------------------------------------------
sub get_common_column{
  my($i, $k,$j, $name1, $name2, $sorted_name, @in, %out, @out_chars,
     $gap_chr, @str1, @str2);
  #"""""""""""""""""""""""""""""""""""""""""""""""""""""""
  #  Sub argument handling     $gap_chr here can be 'HE' etc.
  #_______________________________________________________
  for($k=0; $k< @_ ;$k++){
	  if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.)$/) ){
		  $gap_chr  .= $1;    }
	  elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)$/) ){
		  $gap_chr  .= $1;    }
          elsif(ref($_[$k]) eq "HASH") { push(@in,  $_[$k]); }    }

  #"""""""""""""""""""""""""""""""""""""""""""""""""""""""
  #  Checking if the input hashes were right
  #_______________________________________________________
  if( (@in < 2) && ( ref($in[0]) eq 'HASH') ){
	  print "\n", __LINE__, " # get_common_column usually needs 2 hashes. Error \n";
	  print "\n", __LINE__, " # get_common_column : Anyway, I will just return the single input hash:  @in. \n";
	  %out=%{$in[0]}; # <--- This is essential to return the single input hash!!
	  goto FINAL;
  }
  if(@in ==2){  print "\n# (INFO) \@in has 2 elements for ref. of hash, good!\n\n"; }

  %out = %{$in[0]};  ## Initializing %out
  print "\n",__LINE__, " # get_common_column hashes given are: @in \n" if $debug eq 1;

  for( $k=1; $k < @in; $k++){
      my(@out_chars);   ## <-- Necessary to prevent concatenation.
      my(%hash1)=%out;
      my(%hash2)=%{$in[$k]};
      my(@names1)= sort keys %hash1;
      my(@names2)= sort keys %hash2;
      $name1 = $names1[0];
      $name2 = $names2[0];
      @str1=split(/\||\,/, $hash1{$names1[0]});
      @str2=split(/\||\,/, $hash2{$names2[0]});

      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # Trying to guess the correct gap char
      #_____________________________________________________
      if($hash2{$names2[0]}=~/_/){
          $gap_chr='_';
      }elsif($hash2{$names2[0]}=~/(\W)\W/){
          $gap_chr=$1;
      }

      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      #  If I fail to split string by ',', then split by ''
      #_____________________________________________________
      if(@str1 == 1){  @str1=split(//, $hash1{$names1[0]}); }
      if(@str2 == 1){  @str2=split(//, $hash2{$names2[0]}); }
      for($i=0; $i < @str1; $i++){
          if($str1[$i] eq $str2[$i] ){
              push(@out_chars, $str1[$i]);
          }else{
              if( defined($gap_chr) ){
                   push(@out_chars, $gap_chr);
              }else{
                   push(@out_chars, ' ');
              }
          }
      }
      $sorted_name=join('', ($name1, $name2));
      %out='';
      $out{"$sorted_name"}= join("", @out_chars);

  }
  FINAL:
  if ($debug eq 1){
	  print "\n",__LINE__, " # get_common_column Final res. \%out :\n",
	  &show_hash(%out);
  }
  return(\%out);
}

#________________________________________________________________________
# Title     : overlay_seq_for_identical_chars
# Usage     : %out =%{&overlay_seq_for_identical_chars(\%hash1, \%hash2, '-')};
# Function  : (name1         --EHH--HHEE-- )
#             (name2         --HHH--EEEE-- ) ==> result is;
#
#             (name1_name2   -- HH--  EE-- )
#             to get the identical chars in hash strings of sequences.
#
# Example   : %out =%{&overlay_seq_for_identical_chars(\%hash1, \%hash2, '-')};
#             output> with 'E' option >>> "name1     --HHH--1232-"
# Warning   : Works only for 2 sequence hashes.
# Keywords  : Overlap, superpose hash, overlay identical chars, superpose_seq_hash
# Options   :
# Returns   : one hash ref. of the combined key name (i.e., name1_name2). Combined by '_'
# Argument  : 2 ref for hash of identical keys and value length. One optional arg for
#             replacing space char to the given one.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub overlay_seq_for_identical_chars{
	my($i, $k,$j, $name1, $name2, @in, %out, @out_chars, $gap_chr, @str1, @str2);
	######################################
	####### Sub argument handling ########  $gap_chr here can be 'HE' etc.
	######################################
	for($k=0; $k< @_ ;$k++){
	  if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.)$/) ){
		  $gap_chr  .= $1;    }
	  elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)$/) ){
		  $gap_chr  .= $1;    }
	  elsif(ref($_[$k]) eq "HASH") { push(@in,  $_[$k]); }    }

	if(@in < 2){ print "\n overlay_seq_for_identical_chars needs 2 hashes. Error \n"; exit; }
	my(%hash1)=%{$in[0]}; my(%hash2)=%{$in[1]};
	my(@names1)=sort keys %hash1; my(@names2)= sort keys %hash2;
	$name1 = $names1[0]; $name2 = $names2[0];
	@str1=split(/|\,/, $hash1{$names1[0]}); @str2=split(/|\,/, $hash2{$names2[0]});
	for($i=0; $i < @str1; $i++){
	  if($str1[$i] eq $str2[$i] ){
		  push(@out_chars, $str1[$i]); }
	  elsif( defined($gap_chr) ){ push(@out_chars, $gap_chr); }
	  else{ push(@out_chars, ' '); }
	}
	if( $name2 gt $name1){
	  $out{"$name1\_$name2"}= join(",", @out_chars); }
	else{  $out{"$name2\_$name1"}= join(",", @out_chars); }
	\%out;
}

#________________________________________________________________________
# Title     : remov_com_column
# Usage     : %new_string = %{&remov_com_column(\%hashinput)};
#             @out=@{&remov_com_column(\@array3)};
# Function  : removes common gap column in seq.
# Example   :
# Warning   :
# Keywords  : remove_com_column, remove_common_column,
#             remove_common_gap_column, remov_common_gap_column,
#             remove com column
# Options   :
# Returns   : a ref. of  hash(es) and array(s).
#
#             name1   ABCDE....DDD       name1  ABCDE..DDD
#             name2   ABCDEE..DD..  -->  name2  ABCDEEDD..
#             name3   ...DEE..DDE.       name3  ...DEEDDE.
#
#             (ABC....CD, ABCD...EE) --> (ABC.CD, ABCDEE)
#             from above the two column of dot will be removed
#             To remove absurd gaps in multiple sequence alignment. for nt6-hmm.pl
# Argument  : accepts reference for hash(es) and array(s).
# Category  :
# Version   :
#--------------------------------------------------------------------
sub remov_com_column{
	my(@hash_ref_out, $d, $gap_char);
	for($d=0; $d < @_; $d++){
	  if(ref($_[$d]) eq 'HASH'){
	      my($len,@string,@new_string,@string2);
		  my(%input)=%{$_[$d]};
		  my(@common);
		  for (keys %input){
			  @string = split('', $input{$_});
			  if(!(defined(@common))){ @common = (@string);  }
			  else{ for ($k=0; $k < @string; $k++){
				 if (($string[$k] =~ /\W/ )&&($common[$k] =~ /(\W)/)){ $common[$k]= $1;}
				 elsif(($string[$k] =~ /(\W)/)&&(!(defined($common[$k])))){ $common[$k]=$1;}
				 else{ $common[$k]='X';} } } }
		  for (keys %input){ @string2 = split(//, $input{$_});
			  for ($i=0; $i < @string2; $i++){
				 if ($common[$i] ne $string2[$i]){ push(@new_string, $string2[$i]); } }
			  $new_string{$_}= join('', @new_string); @new_string = ();      }
		  push(@hash_ref_out, \%new_string);
	  }



( run in 0.562 second using v1.01-cache-2.11-cpan-39bf76dae61 )