Bioinf

 view release on metacpan or  search on metacpan

Bioinf.pm  view on Meta::CPAN

        filter_seq_DB_by_seq_length find_central_seq_msp_chunk find_central_sequence find_low_complexity_region
        find_program_in_path find_seq_file_old find_seq_files find_source_perl_library
        follow_seqlet_link fromJulian full_pwd_path geanfammer
        geanfammer_main get_added_matched_regions_in_msp get_all_dirs_from_ENV get_all_msp_files
        get_av_and_sd_seq_length get_av_seq_length get_average_sequence_size get_averaged_prediction
        get_base_names get_column get_common_array_entry get_common_column
        get_common_hash_keys get_correct_percent_alignment_rate get_date get_dir_names_only
        get_domain_inside_domain get_each_posi_diff_hash get_extension_names get_false_positive_seq_matches
        get_file_dir_names get_file_extensions get_first_seq_in_alignment get_full_dir_names
        get_full_file_name get_full_path_dir_names get_full_pwd_path get_gap_positions
        get_hash_value_average get_high_score_blocks get_homology_info_of_seq_pairs get_host_by_addr
        get_host_by_name get_id_among_2 get_id_among_2_1 get_id_among_2_2
        get_internal_dup_in_a_cluster get_isearch_result_stat get_largest_element get_largest_file
        get_linked_sequence get_linux_kernel_version get_longest_str_size get_max_hash_by_value
        get_median get_median get_msp_enquiry_sequence get_msp_matched_sequence
        get_msp_range get_multiple_array_entry get_occurances_of_char get_occurances_of_shift_type_hash
        get_occurances_of_shift_type_hash_all get_overlapping_range get_overlapping_seq_match_size get_pair_homol_array
        get_pair_homol_hash get_path_dirs_from_ENV get_pdb_file_start_number get_peptide_occurance
        get_percent_homo_hash get_percent_homol_arr get_percentage get_perl_keywords
        get_posi_diff get_posi_diff_abs get_posi_diff_and_rms_hash get_posi_diff_hash
        get_posi_rates_hash_out get_posi_rates_hash_out get_posi_rates_hash_out_compact get_posi_rates_hash_out_jp
        get_posi_rates_hash_out_msf get_posi_sans_gaps get_posi_shift_hash get_posi_shift_hash_rms
        get_posi_shift_rate get_posi_shift_rms_hash get_posi_shift_rms_whole get_position_shift_rate
        get_probable_half get_pwd_dir get_pwd_dir_name get_residue_error_rate
        get_scop_correcting_pairs get_sd_of_length_diff get_segment_shift_rate get_seq_fragments
        get_seq_hash_sans_gaps get_seq_identity get_seqblock get_sequence_complexity
        get_sequence_number get_shortest_str_size get_smallest_file get_stat_FASTA_search_result_in_msp_0_files
        get_sub_hash get_subdir_names get_subroutine_calls get_time
        get_total_memory_size_in_linux get_unix_shell_name get_whole_pwd_path get_windows_cs_rate_array
        get_windows_sc_rate_array get_wrong_segment_rate handle_arguments handle_arguments_old
        hash_average hash_catenate hash_chk hash_common
        hash_common2 hash_common_by_keys hash_no_common hash_output_chk
        hash_stat_for_all hash_substract_by_keys hostname if_file_older_than_x_days
        import_ENV_vars initialize_code insert_gaps_in_seq_hash insert_lines_anywhere
        interm_lib_search is_html isroman key_ready
        link_ranges load_mount_info mail_it make_2D_aa_residue_matrix_array
        make_2D_identity_matrix make_2D_identity_matrix_array make_6_frame_dna_sequences make_cdf_file
        make_clustering_summary make_composition_ratio_table make_composition_ratio_table_simple make_composition_table
        make_fasta_files_from_msp_1_files make_filtered_list make_hmm_from_alignment make_intermediate_sequence_library
        make_one_array make_pairs_from_hash make_random_sequence make_reverse_seq_database
        make_scrambled_seq_database make_seq_alignment_length_even make_seq_index_file make_sequence_match_table
        make_singlet_list_from_pdb_entries make_standalone_subroutines make_swiss_index make_template_from_sec_str
        max max_elem_array max_elem_string_array max_elem_string_array
        max_str_key_hash max_str_value_hash maximum merge_array
        merge_arrays_by_common_elements merge_hash merge_many_arrays merge_sequence_alignments
        merge_sequence_in_msp_chunk merge_sequence_in_msp_file merge_similar_ranges merge_similar_seqlets
        merge_superfam_fasta_files_for_ISL min min_elem_array min_str_key_hash
        min_str_value_hash minimum msf_permute_array_write msf_permute_hash_write
        msp_single_link_hash mv n normalize_numbers
        numerically occurances one_to_three_letter open_ali_files
        open_aln_files open_brk_files open_cel_files open_clu_files
        open_dna_files open_dssp_files open_embl_files open_fasta_files
        open_fil_file open_hlx_files open_hmmfs_files open_hmmls_files
        open_jp_files open_lottery_file open_msf_files open_msf_jp_files
        open_msp_files open_out_files open_pdb_files open_pdbg_files
        open_phd_files open_pir_files open_predator_files open_rms_files
        open_rms_files2 open_sdb_files open_self open_seq_alignment_files
        open_seq_files open_sequence_index_files open_slx_files open_sso_files
        open_sst_files open_sst_files_with_gap open_stride_dat_files open_stride_dat_files
        open_subdir_and_go_in_and_do open_swissprot_seq_files open_tem_files opendir_and_go
        opendir_and_go_in_and_do_something opendir_and_go_rand_fasta opendir_and_go_rand_fasta_and_clustal overlay_seq_by_certain_chars
        overlay_seq_for_identical_chars overlay_seq_hash pair_percent_id_trend pairwise_iden_pos
        pairwise_percent_id parse_arguments permute permute_binary
        pick_random_files pick_random_hash_pairs pir_permute_array_write pir_permute_hash_write
        pir_write pir_write plot_histogram_horizontally plot_vertically
        print_clusfile_from_hash print_in_block print_seq_in_block print_seq_in_block_old
        print_seq_in_block_with_print print_seq_in_columns produce_random_numbers push_if_not_already
        push_if_not_already put_gaps_every_x_position_in_string put_gaps_every_x_position_in_string_special put_gaps_in_hash
        put_msp_lines_to_hash_from_bla put_position_back_to_str_seq put_slash_before_special_chars pwd_dir_name
        pwd_path rand_DNA_seq_generate rand_RNA_seq_generate rand_sequence_mul_array
        rand_sequence_one_array rand_sequence_one_string rand_word randomise_file_contents
        randomise_lines read_all_head_boxes read_any_dir read_any_dir2
        read_any_dir_for_dir read_any_dir_simple read_any_seq_files read_blast_hits
        read_correct_head_box read_dir_names_only read_file_extension_names_only read_file_names_only
        read_first_head_box read_fssp_files read_full_dir_names read_head_box
        read_head_box2 read_head_boxes read_hssp_no_inserts read_machine_readable_sso_lines
        read_machine_unreadable_sso_lines read_option_table read_seq_matrix_files read_sso_lines
        read_sst_files read_subroutines remov_com_column remov_com_column2
        remov_common_gap remove_dup_in_array remove_dup_in_hash remove_dup_match_in_msp_files
        remove_dup_seq_entry remove_elements_by_name remove_elements_by_pattern remove_elements_by_position
        remove_file_extension remove_mail_header_in_files remove_non_char remove_repetitives_in_array
        remove_similar_seqlets remove_small_files remove_text replace_lines
        replace_subroutines replace_text reset_all_the_vars reset_shell_environment
        rev_abs_numerically rev_lines_pdb rev_sequence_mul_array reverse_hash
        reverse_sequences roman rotate_seq round
        round_number round_numbers run_fasta_sequence_search scale_for_horizontal_histogram
        scan_win_and_get_cs_rate_pairs scan_win_and_get_sc_rate_pairs scan_win_get_average scan_window_and_calc_average
        scan_window_and_calc_something scan_windows_and_get_compos_seqid_rate scoring scramble_array
        scramble_sequences sd se search_files_in_subdir
        search_palindromes search_self self_self_search send_mail
        sep seq_comp_percent1 seq_comp_percent2 seq_id_percent_array
        seq_to_regexp set_debug set_debug_option set_special_options
        shift_word shift_word_recursively show_array show_default_help
        show_hash show_in_fasta show_options show_subclusterings
        smaller_one sort_by_E_values sort_by_cluster_size sort_by_column
        sort_by_column_bigger_first sort_by_digits_in_string sort_by_hash_values sort_by_keys
        sort_files_by_size sort_files_by_time sort_hash_by_keys sort_hash_by_value
        sort_hash_by_value_and_make_array sort_hash_value_by_column sort_string_by_length sort_words_in_string
        split_fasta_files split_file_by_string split_files split_sequence
        sqrt_array square_array sso_to_msp ssp_permute_array_write
        ssp_permute_hash_write ssp_write steve_permute_array strip_rotated_seq
        strip_sequence_ranges substract_array substract_hash_by_keys substract_hash_by_values
        sum_array sum_digits_in_string sum_hash_values sum_hash_values_of_string
        sum_of_array sum_of_squared_array sum_x_mul_y_arrays superpose_hash
        superpose_seq_hash take_file_name takeout_subroutines tally_2_hashes
        tell_seq_length tempname three_to_one_letter tidy_secondary_structure_segments
        time_date toJulian transform_values trim_numbers
        update_subroutines weighted_av weighted_average word_wrap
        write_aln_files write_ardf_files write_c3al_files write_c3ss_files
        write_dof_files write_evss_files write_fasta write_fasta_array
        write_fasta_seq_by_seq write_gcg_file write_gcg_genbank_file write_genbank_file
        write_good_bad_list_in_divide_clusters write_head_box write_html_headbox write_iss_file
        write_jp write_modeller_ali_file write_modeller_top_file write_mprf_files
        write_msf write_msp3_files write_msp_files write_nhco_files
        write_parf_files write_pdbg_files write_pir_file write_prdl_files
        write_pred_files write_primer_file write_rdif2_files write_rdif_files
        write_reverse_seq_files write_sdb_file write_seq_files write_staden_file
        write_subroutines x_mul_y_arrays );

#!/usr/bin/perl
#______________________________________________________________________
# Title     : Bio::Bioinf (for Bio Perlogical) or bio_lib.pl

Bioinf.pm  view on Meta::CPAN

	$helix_min=3;
	$beta_strand_min=3;

	########################################################################
	#####   general argument handling  for options of segment length  ######
	########################################################################
	for($k=0; $k< @_ ;$k++){
	  if( ( !ref($_[$k]) )&&($_[$k]=~ /^[Hh](\d+)$/) ){
		  $helix_min  = $1;    }
	  elsif( ( !ref($_[$k]) )&&($_[$k]=~ /^[Ee](\d+)$/) ){
		  $beta_strand_min  = $1;    }
	  elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^[Hh](\d+)$/) ){
		  $helix_min  = $1;    }
	  elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^[EeBb](\d+)$/) ){
		  $beta_strand_min  = $1;    }
	  elsif(ref($_[$k]) eq "HASH") { push(@hash,  $_[$k]); }    }

	for($i=0; $i < @hash; $i++){
	  my(%hash) = %{$hash[$i]};
	  @keys = sort keys( %hash );
	  for($j=0; $j < @keys; $j++){
		  my(@string_segH, @string_segE, @stringout);
		  $string1=$hash{$keys[$j]};
		  $gap_char = $1 if ($string1=~ /(\W)/);

		  ##### actual cleaning ####
		  my(@string) = split(//, $string1);
		  for($a = 0; $a < @string; $a++){
			 if($string[$a] !~/[HE]/){ ### if the splited element doesn't match 'H' or 'E'

				 ##### If any of the HH or EE counter is over the given minimum($helix_min,,)
				 if((@string_segH >= $helix_min)||( @string_segE >=$beta_strand_min)){
					 push(@stringout, @string_segH, @string_segE, '-');
					 @string_segH=@string_segE=();     }   ## just resetting.
				 else{  ### if the accumulated 'HH' or 'EE' is smaller than the minimum
					 for(0.. (@string_segH + @string_segE) ){
						push(@stringout, '-'); ### replace the short 'EE' etc with '-'
					 }
					 @string_segH=@string_segE=();  ## just resetting.
				 }
			 }
			 elsif($string[$a] =~ /^([Hh])$/){
				 push(@string_segH, $1); }
			 elsif($string[$a] =~ /^([Ee])$/){
				 push(@string_segE, $1); }
		  }
		  $hash{$keys[$j]}=join("", @stringout);
	  }
	  push(@hash_out, \%hash);
	}
	if(@hash_out == 1){ return($hash_out[0]);
	}elsif(  @hash_out > 1 ){ return(@hash_out); }
}






#________________________________________________________________________
# Title     : overlay_seq_by_certain_chars
# Usage     : %out =%{&overlay_seq_by_certain_chars(\%hash1, \%hash2, 'HE')};
# Function  : (name1 000000112324)+(name1  ABC..AD..EFDK ) => (name1 000..00..12324)
#             (name2 000000112324)+(name2  --HHH--EEEE-- ) => (name1 ---000--1123--)
#             uses the second hash a template for the first sequences. gap_char is
#             '-' or '.' or any given char or symbol.
#             To insert gaps rather than overlap, use insert_gaps_in_seq_hash
# Example   : %out =%{&overlay_seq_by_certain_chars(\%hash1, \%hash2, 'E')};
#             output> with 'E' option >>> "name1     --HHH--1232-"
# Warning   : If gap_chr ('H',,,) is not given, it replaces all the
#             non-gap chars (normal alphabet), ie,
#             it becomes 'superpose_seq_hash'
# Keywords  : Overlap, superpose hash, overlay, superpose_seq_hash
# Options   : E for replacing All 'E' occurrances in ---EEEE--HHHH----, etc.
#             : H for replacing all 'H'  "     " "
# Returns   : one hash ref.
# Argument  : 2 ref for hash of identical keys and value length.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub overlay_seq_by_certain_chars{
	my($i, $k,$j, $name, @in, %out, $gap_chr, @str1, @str2);
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	####### Sub argument handling ########  $gap_chr here can be 'HE' etc.
	#______________________________________
	for($k=0; $k< @_ ;$k++){
	  if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.+)$/) ){
		  $gap_chr  .= $1;
	  }elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)+$/) ){
		  $gap_chr  .= $1;
	  }elsif(ref($_[$k]) eq "HASH") { push(@in,  $_[$k]); }
	}

	if($#in < 1){
	  print "\n overlay_seq_by_certain_chars needs 2 hashes. Error \n"; exit; }
	my(%hash1)=%{$in[0]};
	my(%hash2)=%{$in[1]};
	my(@names1)= sort keys %hash1;
	my(@names2)= sort keys %hash2;
	(@names1 > @names2)? $bigger=@names1 : $bigger=@names2;
	for ($j=0; $j < $bigger; $j++){
	  @str1=split(//, $hash1{$names1[$j]});
	  @str2=split(//, $hash2{$names2[$j]});
	  if( ($gap_chr eq '') && ($hash2{$names2[$j]}=~/(\W)/) ){
		  $gap_chr=$1;
		  for($i=0; $i < @str2; $i++){
			  if($str2[$i] =~ /$gap_chr/){ $str1[$i]=$gap_chr;}     }
		  $out{$names1[$j]}=join(",", @str1);
	  }else{
		  for($i=0; $i < @str2; $i++){
			  if($gap_chr =~ /$str2[$i]/){ $str2[$i]=$str1[$i];}    }
		  $out{$names1[$j]}=join(",", @str2);    }
	}
	return(\%out);
}



#________________________________________________________________________
# Title     : rev_lines_pdb
# Usage     : &rev_lines_pdb(\$ARGV[0]);
# Function  : reorders the lines of any pdb files, but takes only C alpha positions.
# Example   :
#             The INPUT example >
#
#             ATOM    191  CA  ALA   195      -2.566   8.099  42.827  1.00 12.42      1ENG 256
#             ATOM    192  CA  ARG   196      -1.401  11.546  41.629  1.00  8.63      1ENG 257
#             ATOM    193  CA  THR   197      -4.073  13.846  43.107  1.00  9.93      1ENG 258
#
#             The OUTPUT example >             <first file, called  xxxx1.atm >
#
#             ATOM      1  CA  ALA     1      -2.566   8.099  42.827  1.00 12.42      1ENG 256
#             ATOM      2  CA  ARG     2      -1.401  11.546  41.629  1.00  8.63      1ENG 257
#             ATOM      3  CA  THR     3      -4.073  13.846  43.107  1.00  9.93      1ENG 258
#
#                                           <2nd file, called  xxxx2.atm >
#             ATOM      1  CA  THR     1      -4.073  13.846  43.107  1.00  9.93      1ENG 258
#             ATOM      2  CA  ARG     2      -1.401  11.546  41.629  1.00  8.63      1ENG 257
#             ATOM      3  CA  ALA     3      -2.566   8.099  42.827  1.00 12.42      1ENG 256
#
# Warning   : A Biomatic
# Keywords  :
# Options   : None
# Returns   : directly writes two output files  xxxx1.atm  xxxx2.atm
# Argument  : one pdb coordinate file reference
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub rev_lines_pdb{
	 my(@lines, $i, $c, @line_rev, $ATOM, $RES );
	 my($input_file_name) = ${$_[0]};
	 open(INPUT, "$input_file_name");
	 while(<INPUT>){  push(@lines, $_); $whole++;  }
	 my($base_name) =${&get_base_name(\$input_file_name)};
	 my($file1) = "${base_name}1\.atm";

Bioinf.pm  view on Meta::CPAN

	  @string1=split(/$gap_char1/, $hash0{$keys1[$i]});
	  @string2=split(/$gap_char2/, $hash1{$keys2[$i]});
	  ### @string1 => (0,0,0,0,1,1,1,1,1) @string2 => (3,4,2,13,2,1,23,3)


	  ################################################################
	  ##  Main calc part, you get %tally_all_occur and %tally_occur
	  ################################################################
	  for($j=0; $j < @string1; $j++){
		  $tally_all_occur{$string1[$j]}++ ; ## <-- number of all the positions
		  if( ($string2[$j]=~/[\d\^]+/)&&($string1[$j]=~/[\d\^]+/) ){
			  $tally_occur{$string1[$j]}+=$string2[$j] ; # %tally_occur is for added up counts
		  }                                             # %tally_all_occur is for only the position
	  }                                                #  occurances of '0', '1' or whatever. To know
																		#  how many '0' entry were you should use this.
	  ####################################################################################
	  ##  When options were put, do more calc on %tally_all_occur and %tally_occur
	  ####################################################################################
	  if($char_opt =~ /a/i ){
		 print "\n           $char_opt ";
		 my(@cs_rates) = sort keys %tally_all_occur;
		 for($k=0; $k < @cs_rates; $k++){
			 if($tally_all_occur{$cs_rates[$k]} == 0){
				 $tally{$cs_rates[$k]} =0; next;}
			 if($char_opt =~ /i/i ){
				 $tally{$cs_rates[$k]}=int($tally_occur{$cs_rates[$k]}/$tally_all_occur{$cs_rates[$k]}); }
			 elsif($char_opt !~ /i/i ){
				 $tally{$cs_rates[$k]}= $tally_occur{$cs_rates[$k]}/$tally_all_occur{$cs_rates[$k]};
			 }
		 }
	  }
	  elsif($char_opt =~ /[np]/i){
		 my($big_sum, @cs_digits);
		 @cs_digits = sort keys %tally_occur;  # @cs_digits are (0, 1, and 2 )
		 for(@cs_digits){ $big_sum+=$tally_occur{$_}/$tally_all_occur{$_};   }
		 for($t=0; $t < @cs_digits; $t++){
			if($big_sum ==0){ $tally{$cs_digits[$t]}=0; next; }
			else{
			  if($char_opt =~ /i/i){
				 $tally{$cs_digits[$t]}= int(($tally_occur{$cs_digits[$t]}/$tally_all_occur{$cs_digits[$t]}/$big_sum*$factor)+0.4999);}
			  elsif($char_opt !~ /i/i ){
				 $tally{$cs_digits[$t]}= $tally_occur{$cs_digits[$t]}/$tally_all_occur{$cs_digits[$t]}/$big_sum*$factor;}
			}
		 }
	  }
	}
	if($char_opt =~ /[an]/i){
		 print "\n           $char_opt ";
	  return(\%tally, \%tally_all_occur);}
	else{ return(\%tally_occur, \%tally_all_occur);}
}
#________________________________________________________________________
# Title     : superpose_seq_hash   ( first to second hash) ## the oldest version.
# Usage     : %out =%{&superpose_seq_hash(\%hash1, \%hash2)};
# Function  : (name1 000000112324)+(name1  ABC..AD..EFD ) => (name1 000..01..324)
#             uses the second hash a template for the first sequences. gap_char is
#             '-' or '.'
#             To insert gaps rather than overlap, use insert_gaps_in_seq_hash
# Example   :
# Warning   : Accepts only two HASHes and many possible gap_chr. Default gap is '-'
# Keywords  : overlay sequence, overlay alphabet, superpose sequence,
# Options   :
# Returns   : one hash ref.
# Argument  : 2 refs. for hash of identical keys and value length and gap_chr.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub superpose_seq_hash{
	if($debug eq 1){ print __LINE__, " # superpose_seq_hash : \n"; }
	my($gap_chr)='-';
	my($i, $j, %hash1, %hash2, $name, %out, @str1, @str2);

	if((ref($_[0]) eq 'HASH')&&(ref($_[1]) eq 'HASH')){
	  %hash1=%{$_[0]}; %hash2=%{$_[1]}; }
	else{ print "\n superpose_seq_hash needs hash ref\n"; print chr(007); exit; }

	my(@names1)=sort keys %hash1; my(@names2)=sort keys %hash2;
	(@names1 > @names2)? $bigger=@names1 : $bigger=@names2;

	for ($j=0; $j < $bigger; $j++){
	 if($hash2{$names2[$j]}=~/(\W)/){ $gap_chr = $1; }
		@str1=split(/|\,/, $hash1{$names1[$j]});
		@str2=split(/|\,/, $hash2{$names2[$j]});
		for($i=0; $i < @str2; $i++){
		  if($str2[$i] ne $gap_chr){ $str2[$i]=$str1[$i];  } }
		$out{$names1[$j]}=join(",", @str2);
	}
	return(\%out);
}


#________________________________________________________________________
# Title     : overlay_seq_hash   ( first to second hash) ## the oldest version.
# Usage     : %out =%{&overlay_seq_hash(\%hash1, \%hash2)};
# Function  : (name1 000000112324)+(name1  ABC..AD..EFD ) => (name1 000..01..324)
#             uses the second hash a template for the first sequences. gap_char is
#             '-' or '.'
#             To insert gaps rather than overlap, use insert_gaps_in_seq_hash
# Example   :
# Warning   : Accepts only two HASHes and many possible gap_chr. Default gap is '-'
# Keywords  :
# Options   :
# Returns   : one hash ref.
# Argument  : 2 refs. for hash of identical keys and value length and gap_chr.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub overlay_seq_hash{
	my($gap_chr)='-'; my($i, $j, $name, %out, @str1, @str2);

	if((ref($_[0]) eq 'HASH')&&(ref($_[1]) eq 'HASH')){
	  my(%hash1)=%{$_[0]}; my(%hash2)=%{$_[1]}; }
	else{ print "\n overlay_seq_hash needs hash ref\n"; print chr(007); exit; }

	my(@names1)=keys %hash1; my(@names2)=keys %hash2;
	(@names1 > @names2)? $bigger=@names1 : $bigger=@names2;

	for ($j=0; $j < $bigger; $j++){
	 if($hash2{$names2[$j]}=~/(\W)/){ $gap_chr = $1; }
		@str1=split(//, $hash1{$names1[$j]}); @str2=split(//, $hash2{$names2[$j]});
		for($i=0; $i < @str2; $i++){
		  if(($str2[$i] =~ /\W/)||($str2[$i] =~ //)){ $str1[$i]="$gap_chr";}}
		$out{$names1[$j]}=join(",", @str1);
	}
	\%out;
}


#________________________________________________________________________
# Title     : insert_gaps_in_seq_hash  ( first to second hash)
# Usage     : %out_extended_seq =%{&insert_gaps_in_seq_hash(\%hash1, \%hash2)};
# Function  : superpose two hashes of the same sequence or same seq. length sequences,
#             but unlike 'superpose_seq_hash', this inserts gaps and extend the
#             sequences.
#             (name1_sec  hHHHHHH EEEEEEE) +
#             (name1_seq  .CDEABC..AD..EFD..EKST) => (name1_ext  .hHHHHH..H...EEE..EEEE)
#             In the example, the undefined sec. str. position is replaced as gaps('.')
#             Uses the second hash a template for the first sequences. gap_char is
#             '-' or '.'
#             One rule is that the SECOND hash contains gaps!!
#             There are two types of hash input. One is simple seq hash(both args)
#              The other is from secondary structure prediction. The hash has contents
#              like: $averaged{$position}=[$residue1, $sec_str2, $dif_reliability];
# Example   :
# Warning   : coded by A Biomatic
# Keywords  : superposing sequences with gaps, interpolate_sequences, interpolate_gaps
# Options   :
# Returns   : one hash ref.
# Argument  : 2 ref for hash of identical keys and value length.
# Category  :
# Version   : 1.3
#--------------------------------------------------------------------
sub insert_gaps_in_seq_hash{
    my($gap_char)='-';
    my($g, $i, $j, $t, %hash1, %hash2, $bigger, $name, %new_hash_with_gap, @str1,
       @posi, @str2, $char_from_gapped_hash, $GAP_CHAR);
    my($join_char) =',';  ## <<-- This is for the final output joined by this var.
    $GAP_CHAR='-';

    if((ref($_[0]) eq 'HASH')&&(ref($_[1]) eq 'HASH')){
       %hash1=%{$_[0]};
       %hash2=%{$_[1]};
    }else{
       print "\n superpose_seq_hash needs hash ref\n";
       print chr(007); exit;
    }
    my(@names1)=keys %hash1;
    my(@names2)=keys %hash2;

    if($hash1{$names1[0]}->[0] =~/\w/){ # this means that the input hash is sec str hash type
        print "\n# (INFO) insert_gaps_in_seq_hash: You have put sec str pred hash as an input.\n\n";
        @posi=split(//, $hash2{$names2[0]});

Bioinf.pm  view on Meta::CPAN


	$gap_char='.';

	%arraySTR = %{&hash_common_by_keys(\%arraySTR, \%arraySEQ)};
	%arraySEQ = %{&hash_common_by_keys(\%arraySEQ, \%arraySTR)};
	%arraySEQ = %{&remov_com_column(\%arraySEQ)};
	%arraySTR = %{&remov_com_column(\%arraySTR)};

	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	if($debug eq 1){
		print __LINE__,
		" ## sorting sequence names. To make things constant. \n\n";  }
	@names= sort keys %arraySTR;
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	#  If common column of secondary structure representation option $comm_col is set
	#  open_dssp_files sub routine will get the common seq parts of all the sequences.
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	if($comm_col =~ /C/i){
		%DSSP_common=%{&open_dssp_files( @names, $H, $S, $E, $T, $I, $G, $B, $simplify, 'C')};
		@keys_common= keys %DSSP_common;
		@stringDSSP_common = split(/|\,/, $DSSP_common{$keys_common[0]});
		if($debug2 eq 1){ print __LINE__," \$comm_col is set to: $comm_col \n";
			print __LINE__," \@stringDSSP_common is :@stringDSSP_common \n";
		}
	}

	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	# Comparing two hashes
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	for $name (@names){
		#"""""""""""""""" Splitting the sequence string
		if($arraySEQ{$name}=~/\,\S+\,/){
			@stringSEQ =split(/\,/, $arraySEQ{$name});
			@stringSTR=split(/\,/, $arraySTR{$name});  }
		else{
			@stringSEQ =split(//, $arraySEQ{$name});
			@stringSTR=split(//, $arraySTR{$name});
		}
		print "\n",__LINE__, " \@stringSEQ  is  @stringSEQ \n" if $debug2 eq 1;
		print "\n",__LINE__, " \@stringSTR  is  @stringSTR \n" if $debug2 eq 1;

		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#   Contracting  the SEQ.
		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		@seq_positionSEQ = @{&get_posi_sans_gaps(\$arraySEQ{$name})};
		@seq_positionSTR = @{&get_posi_sans_gaps(\$arraySTR{$name})};

		#"""""""""""""""" To get secondary structure only calc  """"""""""""""""""""""""""""
		# It superposes the NON sec. region on  @seq_positionSTR to nullify positions.
		#  get_posi_diff ignores non char positions in calc.
		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		if( ($ss_opt =~ /ss$/i) && ($comm_col !~ /C/i) ){
			%DSSP=%{&open_dssp_files($name, $H, $S, $E, $T, $I, $G, $B, $simplify, $comm_col)};
			if($debug1 eq 1){
			   print "\n",__LINE__," open_dssp_files has options \$H ->$H \$S->$S \$E->$E \n";
			   print "\n",__LINE__," \$T->$T \$I->$I \$G->$B \$simplify->$simplify \$comm_col ->$comm_col\n";
			   &show_hash( \%DSSP );
			}
			if(ref(\%DSSP) eq 'HASH'){ # to check it %DSSP was valid, If not it skips overlaying
				@stringDSSP = split(/|\,/, $DSSP{$name});
				$size_of_stringDSSP = @stringDSSP;
				$size_of_seq_positionSTR = @seq_positionSTR;
				if($debug2 eq 1){
					  print "\n",__LINE__," \@stringDSSP is \n @stringDSSP\n";
					  print "\n",__LINE__," Size of \@stringDSSP      is $size_of_stringDSSP\n" ;
					  print "\n",__LINE__," Size of \@seq_positionSTR is $size_of_seq_positionSTR\n";
					  print "\n",__LINE__," \$gap_char is \"$gap_char\" \n" ;
				}
				#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
				#   When the sec. str is not defined in DSSP, I delete the position of
				#   @stringDSSP to gap(ie. make it blank to exclude error rate calc)
				#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
				for($i=0; $i < @stringDSSP; $i++){
					if($stringDSSP[$i] =~ /\W/){ $seq_positionSTR[$i]= $gap_char;}
				}
			}
		}elsif( $comm_col =~ /C/i){
				print __LINE__, " Replacing position with \gap_char \"$gap_char\"\n" if $debug2 eq 1;
				$ss_opt = 'ss'; # whether it was set or not, make it 'ss'
				for($i=0; $i < @stringDSSP_common; $i++){
					if($stringDSSP_common[$i] =~ /\W/){ $seq_positionSTR[$i]= $gap_char;}
				}
		}

		if($debug2 eq 1){
			print __LINE__,
			print " \@seq_positionSTR is  @seq_positionSTR\n";
		}

		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#   getting Position differences.
		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		@position_diffs  = @{&get_posi_diff(\@seq_positionSEQ, \@seq_positionSTR)};

		if($debug2 eq 1){
			print __LINE__,
			print " \@position_diffs is  @position_diffs\n";
		}

		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#  You can have two types of output according to which alignment you compare your
		#   error rates. (1) Compare to @stringSEQ   (2) @stringSTR
		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		@position_corrected1 = @{&put_position_back_to_str_seq(\@stringSEQ, \@position_diffs)};
		$array3{$name}=join(",", @position_corrected1);

	}
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	# The final Step for error rate, $LIMIT is to confine error rate in one digit (ie, less than 10)
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	%final_posi_diffs =%{&get_residue_error_rate(\%array3, $LIMIT)};

	undef(@whole_length, $len_of_seq);
	return(\%final_posi_diffs);
}

#________________________________________________________________________
# Title     : get_posi_rates_hash_out (derived from 'get_posi_shift_hash' )
# Usage     : %rate_hash = %{&get_posi_shift_hash(\%hash_msf, \%hash_jp)};

Bioinf.pm  view on Meta::CPAN

#             seq2  ABCDEE-----DD-  ==>    seq2  ABCDEE-DD-
#             seq3  ---DEE----DDE-         seq3  ---DEEDDE-
#                         ^^^^
#             from above the 4 columns of gap will be removed
#             To remove absurd gaps in multiple sequence alignment
# Warning   :
# Keywords  :
# Options   :
# Returns   : a ref. of a hash.
#
#               <input hash>                   <out hash>
#
# Argument  : accepts reference for a hash.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub remov_com_column2{
	my(%input) = %{$_[0]};
	my(@common)=();
	my($len)=0;
	my(@string)=();
	my(@new_string)=();
	my(@string2)=();
	my(%new_string);

	########## Finds common gaps ###########
	for $key(keys %input){
	  $len  = &smaller_one($#common, $#string) unless $#common < 0;
		  sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
	  @string = split(/|| /, $input{$key});
	  for ($k=0; $k <= $len;$k++){
		  if($#common == -1){
			  @common = (@string);
			  last;
		  }
		  if (($string[$k] eq '.')&&($common[$k] eq '.')){
			  $common[$k]='.';
		  }else{
			  $common[$k]='X';
		  }
	  }
	}

	########## removes gaps ###########
	for $key2 (keys %input){
		@string2 = split(//, $input{$key2});
		for ($i=0; $i <= $#string2; $i++){
		  if ($common[$i] eq $string2[$i]){
			  print;
		  }else{
			  push(@new_string, $string2[$i]);
		  }
		}
		$new_string{$key2}= join("", @new_string);
		@new_string = ();
	}
	\%new_string;
}

#________________________________________________________________________
# Title     : get_common_column   (similar to overlay_seq_for_identical_chars )
# Usage     : %out =%{&get_common_column(\%hash1, \%hash2, '-')};
# Function  : (name1         --EHH--HHEE-- )
#             (name2         --HHH--EEEE-- ) ==> result is;
#
#             (name1_name2   -- HH--  EE-- )
#             to get the identical chars in hash strings of sequences.
#
# Example   : %out =%{&get_common_column(\%hash1, \%hash2, '-')};
#             output> with 'E' option >>> "name1     --HHH--1232-"
#   Following input will give;
#  %hash1 = ('s1', '--EHH-CHHEE----EHH--HHEE----EHH--HHEE----EHH-CHHEE--');
#  %hash2 = ('s2', '--EEH-CHHEE----EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
#  %hash3 = ('s3', '-KEEH-CHHEE-XX-EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
#  %hash4 = ('s4', '-TESH-CHEEE-XX-EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
#
#    s1_s2_s3_s4    --E-H-CH-EE----E-H--HHEE----E-H--HHEE----E-H-CHHEE--
#
# Warning   : This gets more than 2 hashes. Not more than that!
#
# Class     : get_common_column, get_common_column_in_seq, get common column in sequence
#             for secondary structure only representation.
# Keywords  : Overlap, superpose hash, overlay identical chars, superpose_seq_hash
#             get_common_column, get_com_column, get_common_sequence,
#             get_common_seq_region, multiply_seq_hash, get_common_column_in_sequence
# Options   :
# Reference :
# Returns   : one hash ref. of the combined key name (i.e., name1_name2). Combined by '_'
# Tips      :
# Argument  : 2 or more ref for hash of identical keys and value length.
#             One optional arg for replacing space char to the given one.
# Author    : jong@salt2.med.harvard.edu
# Category  :
# Version   : 1.6
#--------------------------------------------------------------------
sub get_common_column{
  my($i, $k,$j, $name1, $name2, $sorted_name, @in, %out, @out_chars,
     $gap_chr, @str1, @str2);
  #"""""""""""""""""""""""""""""""""""""""""""""""""""""""
  #  Sub argument handling     $gap_chr here can be 'HE' etc.
  #_______________________________________________________
  for($k=0; $k< @_ ;$k++){
	  if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.)$/) ){
		  $gap_chr  .= $1;    }
	  elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)$/) ){
		  $gap_chr  .= $1;    }
          elsif(ref($_[$k]) eq "HASH") { push(@in,  $_[$k]); }    }

  #"""""""""""""""""""""""""""""""""""""""""""""""""""""""
  #  Checking if the input hashes were right
  #_______________________________________________________
  if( (@in < 2) && ( ref($in[0]) eq 'HASH') ){
	  print "\n", __LINE__, " # get_common_column usually needs 2 hashes. Error \n";
	  print "\n", __LINE__, " # get_common_column : Anyway, I will just return the single input hash:  @in. \n";
	  %out=%{$in[0]}; # <--- This is essential to return the single input hash!!
	  goto FINAL;
  }
  if(@in ==2){  print "\n# (INFO) \@in has 2 elements for ref. of hash, good!\n\n"; }

  %out = %{$in[0]};  ## Initializing %out
  print "\n",__LINE__, " # get_common_column hashes given are: @in \n" if $debug eq 1;

  for( $k=1; $k < @in; $k++){
      my(@out_chars);   ## <-- Necessary to prevent concatenation.
      my(%hash1)=%out;
      my(%hash2)=%{$in[$k]};
      my(@names1)= sort keys %hash1;
      my(@names2)= sort keys %hash2;
      $name1 = $names1[0];
      $name2 = $names2[0];
      @str1=split(/\||\,/, $hash1{$names1[0]});
      @str2=split(/\||\,/, $hash2{$names2[0]});

      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # Trying to guess the correct gap char
      #_____________________________________________________
      if($hash2{$names2[0]}=~/_/){
          $gap_chr='_';
      }elsif($hash2{$names2[0]}=~/(\W)\W/){
          $gap_chr=$1;
      }

      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      #  If I fail to split string by ',', then split by ''
      #_____________________________________________________
      if(@str1 == 1){  @str1=split(//, $hash1{$names1[0]}); }
      if(@str2 == 1){  @str2=split(//, $hash2{$names2[0]}); }
      for($i=0; $i < @str1; $i++){
          if($str1[$i] eq $str2[$i] ){
              push(@out_chars, $str1[$i]);
          }else{
              if( defined($gap_chr) ){
                   push(@out_chars, $gap_chr);
              }else{
                   push(@out_chars, ' ');
              }
          }
      }
      $sorted_name=join('', ($name1, $name2));
      %out='';
      $out{"$sorted_name"}= join("", @out_chars);

  }
  FINAL:
  if ($debug eq 1){
	  print "\n",__LINE__, " # get_common_column Final res. \%out :\n",
	  &show_hash(%out);
  }
  return(\%out);
}

#________________________________________________________________________
# Title     : overlay_seq_for_identical_chars
# Usage     : %out =%{&overlay_seq_for_identical_chars(\%hash1, \%hash2, '-')};
# Function  : (name1         --EHH--HHEE-- )
#             (name2         --HHH--EEEE-- ) ==> result is;
#
#             (name1_name2   -- HH--  EE-- )
#             to get the identical chars in hash strings of sequences.
#
# Example   : %out =%{&overlay_seq_for_identical_chars(\%hash1, \%hash2, '-')};
#             output> with 'E' option >>> "name1     --HHH--1232-"
# Warning   : Works only for 2 sequence hashes.
# Keywords  : Overlap, superpose hash, overlay identical chars, superpose_seq_hash
# Options   :
# Returns   : one hash ref. of the combined key name (i.e., name1_name2). Combined by '_'
# Argument  : 2 ref for hash of identical keys and value length. One optional arg for
#             replacing space char to the given one.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub overlay_seq_for_identical_chars{
	my($i, $k,$j, $name1, $name2, @in, %out, @out_chars, $gap_chr, @str1, @str2);
	######################################
	####### Sub argument handling ########  $gap_chr here can be 'HE' etc.
	######################################
	for($k=0; $k< @_ ;$k++){
	  if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.)$/) ){
		  $gap_chr  .= $1;    }
	  elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)$/) ){
		  $gap_chr  .= $1;    }
	  elsif(ref($_[$k]) eq "HASH") { push(@in,  $_[$k]); }    }

	if(@in < 2){ print "\n overlay_seq_for_identical_chars needs 2 hashes. Error \n"; exit; }
	my(%hash1)=%{$in[0]}; my(%hash2)=%{$in[1]};
	my(@names1)=sort keys %hash1; my(@names2)= sort keys %hash2;
	$name1 = $names1[0]; $name2 = $names2[0];
	@str1=split(/|\,/, $hash1{$names1[0]}); @str2=split(/|\,/, $hash2{$names2[0]});
	for($i=0; $i < @str1; $i++){
	  if($str1[$i] eq $str2[$i] ){
		  push(@out_chars, $str1[$i]); }
	  elsif( defined($gap_chr) ){ push(@out_chars, $gap_chr); }
	  else{ push(@out_chars, ' '); }
	}
	if( $name2 gt $name1){
	  $out{"$name1\_$name2"}= join(",", @out_chars); }
	else{  $out{"$name2\_$name1"}= join(",", @out_chars); }
	\%out;
}

#________________________________________________________________________
# Title     : remov_com_column
# Usage     : %new_string = %{&remov_com_column(\%hashinput)};
#             @out=@{&remov_com_column(\@array3)};
# Function  : removes common gap column in seq.
# Example   :
# Warning   :
# Keywords  : remove_com_column, remove_common_column,
#             remove_common_gap_column, remov_common_gap_column,
#             remove com column
# Options   :
# Returns   : a ref. of  hash(es) and array(s).
#
#             name1   ABCDE....DDD       name1  ABCDE..DDD
#             name2   ABCDEE..DD..  -->  name2  ABCDEEDD..
#             name3   ...DEE..DDE.       name3  ...DEEDDE.
#
#             (ABC....CD, ABCD...EE) --> (ABC.CD, ABCDEE)
#             from above the two column of dot will be removed
#             To remove absurd gaps in multiple sequence alignment. for nt6-hmm.pl
# Argument  : accepts reference for hash(es) and array(s).
# Category  :
# Version   :
#--------------------------------------------------------------------
sub remov_com_column{
	my(@hash_ref_out, $d, $gap_char);
	for($d=0; $d < @_; $d++){
	  if(ref($_[$d]) eq 'HASH'){
	      my($len,@string,@new_string,@string2);
		  my(%input)=%{$_[$d]};
		  my(@common);
		  for (keys %input){
			  @string = split('', $input{$_});
			  if(!(defined(@common))){ @common = (@string);  }
			  else{ for ($k=0; $k < @string; $k++){
				 if (($string[$k] =~ /\W/ )&&($common[$k] =~ /(\W)/)){ $common[$k]= $1;}
				 elsif(($string[$k] =~ /(\W)/)&&(!(defined($common[$k])))){ $common[$k]=$1;}
				 else{ $common[$k]='X';} } } }
		  for (keys %input){ @string2 = split(//, $input{$_});
			  for ($i=0; $i < @string2; $i++){
				 if ($common[$i] ne $string2[$i]){ push(@new_string, $string2[$i]); } }
			  $new_string{$_}= join('', @new_string); @new_string = ();      }
		  push(@hash_ref_out, \%new_string);
	  }



( run in 1.074 second using v1.01-cache-2.11-cpan-39bf76dae61 )