Bioinf
view release on metacpan or search on metacpan
filter_seq_DB_by_seq_length find_central_seq_msp_chunk find_central_sequence find_low_complexity_region
find_program_in_path find_seq_file_old find_seq_files find_source_perl_library
follow_seqlet_link fromJulian full_pwd_path geanfammer
geanfammer_main get_added_matched_regions_in_msp get_all_dirs_from_ENV get_all_msp_files
get_av_and_sd_seq_length get_av_seq_length get_average_sequence_size get_averaged_prediction
get_base_names get_column get_common_array_entry get_common_column
get_common_hash_keys get_correct_percent_alignment_rate get_date get_dir_names_only
get_domain_inside_domain get_each_posi_diff_hash get_extension_names get_false_positive_seq_matches
get_file_dir_names get_file_extensions get_first_seq_in_alignment get_full_dir_names
get_full_file_name get_full_path_dir_names get_full_pwd_path get_gap_positions
get_hash_value_average get_high_score_blocks get_homology_info_of_seq_pairs get_host_by_addr
get_host_by_name get_id_among_2 get_id_among_2_1 get_id_among_2_2
get_internal_dup_in_a_cluster get_isearch_result_stat get_largest_element get_largest_file
get_linked_sequence get_linux_kernel_version get_longest_str_size get_max_hash_by_value
get_median get_median get_msp_enquiry_sequence get_msp_matched_sequence
get_msp_range get_multiple_array_entry get_occurances_of_char get_occurances_of_shift_type_hash
get_occurances_of_shift_type_hash_all get_overlapping_range get_overlapping_seq_match_size get_pair_homol_array
get_pair_homol_hash get_path_dirs_from_ENV get_pdb_file_start_number get_peptide_occurance
get_percent_homo_hash get_percent_homol_arr get_percentage get_perl_keywords
get_posi_diff get_posi_diff_abs get_posi_diff_and_rms_hash get_posi_diff_hash
get_posi_rates_hash_out get_posi_rates_hash_out get_posi_rates_hash_out_compact get_posi_rates_hash_out_jp
get_posi_rates_hash_out_msf get_posi_sans_gaps get_posi_shift_hash get_posi_shift_hash_rms
get_posi_shift_rate get_posi_shift_rms_hash get_posi_shift_rms_whole get_position_shift_rate
get_probable_half get_pwd_dir get_pwd_dir_name get_residue_error_rate
get_scop_correcting_pairs get_sd_of_length_diff get_segment_shift_rate get_seq_fragments
get_seq_hash_sans_gaps get_seq_identity get_seqblock get_sequence_complexity
get_sequence_number get_shortest_str_size get_smallest_file get_stat_FASTA_search_result_in_msp_0_files
get_sub_hash get_subdir_names get_subroutine_calls get_time
get_total_memory_size_in_linux get_unix_shell_name get_whole_pwd_path get_windows_cs_rate_array
get_windows_sc_rate_array get_wrong_segment_rate handle_arguments handle_arguments_old
hash_average hash_catenate hash_chk hash_common
hash_common2 hash_common_by_keys hash_no_common hash_output_chk
hash_stat_for_all hash_substract_by_keys hostname if_file_older_than_x_days
import_ENV_vars initialize_code insert_gaps_in_seq_hash insert_lines_anywhere
interm_lib_search is_html isroman key_ready
link_ranges load_mount_info mail_it make_2D_aa_residue_matrix_array
make_2D_identity_matrix make_2D_identity_matrix_array make_6_frame_dna_sequences make_cdf_file
make_clustering_summary make_composition_ratio_table make_composition_ratio_table_simple make_composition_table
make_fasta_files_from_msp_1_files make_filtered_list make_hmm_from_alignment make_intermediate_sequence_library
make_one_array make_pairs_from_hash make_random_sequence make_reverse_seq_database
make_scrambled_seq_database make_seq_alignment_length_even make_seq_index_file make_sequence_match_table
make_singlet_list_from_pdb_entries make_standalone_subroutines make_swiss_index make_template_from_sec_str
max max_elem_array max_elem_string_array max_elem_string_array
max_str_key_hash max_str_value_hash maximum merge_array
merge_arrays_by_common_elements merge_hash merge_many_arrays merge_sequence_alignments
merge_sequence_in_msp_chunk merge_sequence_in_msp_file merge_similar_ranges merge_similar_seqlets
merge_superfam_fasta_files_for_ISL min min_elem_array min_str_key_hash
min_str_value_hash minimum msf_permute_array_write msf_permute_hash_write
msp_single_link_hash mv n normalize_numbers
numerically occurances one_to_three_letter open_ali_files
open_aln_files open_brk_files open_cel_files open_clu_files
open_dna_files open_dssp_files open_embl_files open_fasta_files
open_fil_file open_hlx_files open_hmmfs_files open_hmmls_files
open_jp_files open_lottery_file open_msf_files open_msf_jp_files
open_msp_files open_out_files open_pdb_files open_pdbg_files
open_phd_files open_pir_files open_predator_files open_rms_files
open_rms_files2 open_sdb_files open_self open_seq_alignment_files
open_seq_files open_sequence_index_files open_slx_files open_sso_files
open_sst_files open_sst_files_with_gap open_stride_dat_files open_stride_dat_files
open_subdir_and_go_in_and_do open_swissprot_seq_files open_tem_files opendir_and_go
opendir_and_go_in_and_do_something opendir_and_go_rand_fasta opendir_and_go_rand_fasta_and_clustal overlay_seq_by_certain_chars
overlay_seq_for_identical_chars overlay_seq_hash pair_percent_id_trend pairwise_iden_pos
pairwise_percent_id parse_arguments permute permute_binary
pick_random_files pick_random_hash_pairs pir_permute_array_write pir_permute_hash_write
pir_write pir_write plot_histogram_horizontally plot_vertically
print_clusfile_from_hash print_in_block print_seq_in_block print_seq_in_block_old
print_seq_in_block_with_print print_seq_in_columns produce_random_numbers push_if_not_already
push_if_not_already put_gaps_every_x_position_in_string put_gaps_every_x_position_in_string_special put_gaps_in_hash
put_msp_lines_to_hash_from_bla put_position_back_to_str_seq put_slash_before_special_chars pwd_dir_name
pwd_path rand_DNA_seq_generate rand_RNA_seq_generate rand_sequence_mul_array
rand_sequence_one_array rand_sequence_one_string rand_word randomise_file_contents
randomise_lines read_all_head_boxes read_any_dir read_any_dir2
read_any_dir_for_dir read_any_dir_simple read_any_seq_files read_blast_hits
read_correct_head_box read_dir_names_only read_file_extension_names_only read_file_names_only
read_first_head_box read_fssp_files read_full_dir_names read_head_box
read_head_box2 read_head_boxes read_hssp_no_inserts read_machine_readable_sso_lines
read_machine_unreadable_sso_lines read_option_table read_seq_matrix_files read_sso_lines
read_sst_files read_subroutines remov_com_column remov_com_column2
remov_common_gap remove_dup_in_array remove_dup_in_hash remove_dup_match_in_msp_files
remove_dup_seq_entry remove_elements_by_name remove_elements_by_pattern remove_elements_by_position
remove_file_extension remove_mail_header_in_files remove_non_char remove_repetitives_in_array
remove_similar_seqlets remove_small_files remove_text replace_lines
replace_subroutines replace_text reset_all_the_vars reset_shell_environment
rev_abs_numerically rev_lines_pdb rev_sequence_mul_array reverse_hash
reverse_sequences roman rotate_seq round
round_number round_numbers run_fasta_sequence_search scale_for_horizontal_histogram
scan_win_and_get_cs_rate_pairs scan_win_and_get_sc_rate_pairs scan_win_get_average scan_window_and_calc_average
scan_window_and_calc_something scan_windows_and_get_compos_seqid_rate scoring scramble_array
scramble_sequences sd se search_files_in_subdir
search_palindromes search_self self_self_search send_mail
sep seq_comp_percent1 seq_comp_percent2 seq_id_percent_array
seq_to_regexp set_debug set_debug_option set_special_options
shift_word shift_word_recursively show_array show_default_help
show_hash show_in_fasta show_options show_subclusterings
smaller_one sort_by_E_values sort_by_cluster_size sort_by_column
sort_by_column_bigger_first sort_by_digits_in_string sort_by_hash_values sort_by_keys
sort_files_by_size sort_files_by_time sort_hash_by_keys sort_hash_by_value
sort_hash_by_value_and_make_array sort_hash_value_by_column sort_string_by_length sort_words_in_string
split_fasta_files split_file_by_string split_files split_sequence
sqrt_array square_array sso_to_msp ssp_permute_array_write
ssp_permute_hash_write ssp_write steve_permute_array strip_rotated_seq
strip_sequence_ranges substract_array substract_hash_by_keys substract_hash_by_values
sum_array sum_digits_in_string sum_hash_values sum_hash_values_of_string
sum_of_array sum_of_squared_array sum_x_mul_y_arrays superpose_hash
superpose_seq_hash take_file_name takeout_subroutines tally_2_hashes
tell_seq_length tempname three_to_one_letter tidy_secondary_structure_segments
time_date toJulian transform_values trim_numbers
update_subroutines weighted_av weighted_average word_wrap
write_aln_files write_ardf_files write_c3al_files write_c3ss_files
write_dof_files write_evss_files write_fasta write_fasta_array
write_fasta_seq_by_seq write_gcg_file write_gcg_genbank_file write_genbank_file
write_good_bad_list_in_divide_clusters write_head_box write_html_headbox write_iss_file
write_jp write_modeller_ali_file write_modeller_top_file write_mprf_files
write_msf write_msp3_files write_msp_files write_nhco_files
write_parf_files write_pdbg_files write_pir_file write_prdl_files
write_pred_files write_primer_file write_rdif2_files write_rdif_files
write_reverse_seq_files write_sdb_file write_seq_files write_staden_file
write_subroutines x_mul_y_arrays );
#!/usr/bin/perl
#______________________________________________________________________
# Title : Bio::Bioinf (for Bio Perlogical) or bio_lib.pl
$helix_min=3;
$beta_strand_min=3;
########################################################################
##### general argument handling for options of segment length ######
########################################################################
for($k=0; $k< @_ ;$k++){
if( ( !ref($_[$k]) )&&($_[$k]=~ /^[Hh](\d+)$/) ){
$helix_min = $1; }
elsif( ( !ref($_[$k]) )&&($_[$k]=~ /^[Ee](\d+)$/) ){
$beta_strand_min = $1; }
elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^[Hh](\d+)$/) ){
$helix_min = $1; }
elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^[EeBb](\d+)$/) ){
$beta_strand_min = $1; }
elsif(ref($_[$k]) eq "HASH") { push(@hash, $_[$k]); } }
for($i=0; $i < @hash; $i++){
my(%hash) = %{$hash[$i]};
@keys = sort keys( %hash );
for($j=0; $j < @keys; $j++){
my(@string_segH, @string_segE, @stringout);
$string1=$hash{$keys[$j]};
$gap_char = $1 if ($string1=~ /(\W)/);
##### actual cleaning ####
my(@string) = split(//, $string1);
for($a = 0; $a < @string; $a++){
if($string[$a] !~/[HE]/){ ### if the splited element doesn't match 'H' or 'E'
##### If any of the HH or EE counter is over the given minimum($helix_min,,)
if((@string_segH >= $helix_min)||( @string_segE >=$beta_strand_min)){
push(@stringout, @string_segH, @string_segE, '-');
@string_segH=@string_segE=(); } ## just resetting.
else{ ### if the accumulated 'HH' or 'EE' is smaller than the minimum
for(0.. (@string_segH + @string_segE) ){
push(@stringout, '-'); ### replace the short 'EE' etc with '-'
}
@string_segH=@string_segE=(); ## just resetting.
}
}
elsif($string[$a] =~ /^([Hh])$/){
push(@string_segH, $1); }
elsif($string[$a] =~ /^([Ee])$/){
push(@string_segE, $1); }
}
$hash{$keys[$j]}=join("", @stringout);
}
push(@hash_out, \%hash);
}
if(@hash_out == 1){ return($hash_out[0]);
}elsif( @hash_out > 1 ){ return(@hash_out); }
}
#________________________________________________________________________
# Title : overlay_seq_by_certain_chars
# Usage : %out =%{&overlay_seq_by_certain_chars(\%hash1, \%hash2, 'HE')};
# Function : (name1 000000112324)+(name1 ABC..AD..EFDK ) => (name1 000..00..12324)
# (name2 000000112324)+(name2 --HHH--EEEE-- ) => (name1 ---000--1123--)
# uses the second hash a template for the first sequences. gap_char is
# '-' or '.' or any given char or symbol.
# To insert gaps rather than overlap, use insert_gaps_in_seq_hash
# Example : %out =%{&overlay_seq_by_certain_chars(\%hash1, \%hash2, 'E')};
# output> with 'E' option >>> "name1 --HHH--1232-"
# Warning : If gap_chr ('H',,,) is not given, it replaces all the
# non-gap chars (normal alphabet), ie,
# it becomes 'superpose_seq_hash'
# Keywords : Overlap, superpose hash, overlay, superpose_seq_hash
# Options : E for replacing All 'E' occurrances in ---EEEE--HHHH----, etc.
# : H for replacing all 'H' " " "
# Returns : one hash ref.
# Argument : 2 ref for hash of identical keys and value length.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub overlay_seq_by_certain_chars{
my($i, $k,$j, $name, @in, %out, $gap_chr, @str1, @str2);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
####### Sub argument handling ######## $gap_chr here can be 'HE' etc.
#______________________________________
for($k=0; $k< @_ ;$k++){
if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.+)$/) ){
$gap_chr .= $1;
}elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)+$/) ){
$gap_chr .= $1;
}elsif(ref($_[$k]) eq "HASH") { push(@in, $_[$k]); }
}
if($#in < 1){
print "\n overlay_seq_by_certain_chars needs 2 hashes. Error \n"; exit; }
my(%hash1)=%{$in[0]};
my(%hash2)=%{$in[1]};
my(@names1)= sort keys %hash1;
my(@names2)= sort keys %hash2;
(@names1 > @names2)? $bigger=@names1 : $bigger=@names2;
for ($j=0; $j < $bigger; $j++){
@str1=split(//, $hash1{$names1[$j]});
@str2=split(//, $hash2{$names2[$j]});
if( ($gap_chr eq '') && ($hash2{$names2[$j]}=~/(\W)/) ){
$gap_chr=$1;
for($i=0; $i < @str2; $i++){
if($str2[$i] =~ /$gap_chr/){ $str1[$i]=$gap_chr;} }
$out{$names1[$j]}=join(",", @str1);
}else{
for($i=0; $i < @str2; $i++){
if($gap_chr =~ /$str2[$i]/){ $str2[$i]=$str1[$i];} }
$out{$names1[$j]}=join(",", @str2); }
}
return(\%out);
}
#________________________________________________________________________
# Title : rev_lines_pdb
# Usage : &rev_lines_pdb(\$ARGV[0]);
# Function : reorders the lines of any pdb files, but takes only C alpha positions.
# Example :
# The INPUT example >
#
# ATOM 191 CA ALA 195 -2.566 8.099 42.827 1.00 12.42 1ENG 256
# ATOM 192 CA ARG 196 -1.401 11.546 41.629 1.00 8.63 1ENG 257
# ATOM 193 CA THR 197 -4.073 13.846 43.107 1.00 9.93 1ENG 258
#
# The OUTPUT example > <first file, called xxxx1.atm >
#
# ATOM 1 CA ALA 1 -2.566 8.099 42.827 1.00 12.42 1ENG 256
# ATOM 2 CA ARG 2 -1.401 11.546 41.629 1.00 8.63 1ENG 257
# ATOM 3 CA THR 3 -4.073 13.846 43.107 1.00 9.93 1ENG 258
#
# <2nd file, called xxxx2.atm >
# ATOM 1 CA THR 1 -4.073 13.846 43.107 1.00 9.93 1ENG 258
# ATOM 2 CA ARG 2 -1.401 11.546 41.629 1.00 8.63 1ENG 257
# ATOM 3 CA ALA 3 -2.566 8.099 42.827 1.00 12.42 1ENG 256
#
# Warning : A Biomatic
# Keywords :
# Options : None
# Returns : directly writes two output files xxxx1.atm xxxx2.atm
# Argument : one pdb coordinate file reference
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub rev_lines_pdb{
my(@lines, $i, $c, @line_rev, $ATOM, $RES );
my($input_file_name) = ${$_[0]};
open(INPUT, "$input_file_name");
while(<INPUT>){ push(@lines, $_); $whole++; }
my($base_name) =${&get_base_name(\$input_file_name)};
my($file1) = "${base_name}1\.atm";
@string1=split(/$gap_char1/, $hash0{$keys1[$i]});
@string2=split(/$gap_char2/, $hash1{$keys2[$i]});
### @string1 => (0,0,0,0,1,1,1,1,1) @string2 => (3,4,2,13,2,1,23,3)
################################################################
## Main calc part, you get %tally_all_occur and %tally_occur
################################################################
for($j=0; $j < @string1; $j++){
$tally_all_occur{$string1[$j]}++ ; ## <-- number of all the positions
if( ($string2[$j]=~/[\d\^]+/)&&($string1[$j]=~/[\d\^]+/) ){
$tally_occur{$string1[$j]}+=$string2[$j] ; # %tally_occur is for added up counts
} # %tally_all_occur is for only the position
} # occurances of '0', '1' or whatever. To know
# how many '0' entry were you should use this.
####################################################################################
## When options were put, do more calc on %tally_all_occur and %tally_occur
####################################################################################
if($char_opt =~ /a/i ){
print "\n $char_opt ";
my(@cs_rates) = sort keys %tally_all_occur;
for($k=0; $k < @cs_rates; $k++){
if($tally_all_occur{$cs_rates[$k]} == 0){
$tally{$cs_rates[$k]} =0; next;}
if($char_opt =~ /i/i ){
$tally{$cs_rates[$k]}=int($tally_occur{$cs_rates[$k]}/$tally_all_occur{$cs_rates[$k]}); }
elsif($char_opt !~ /i/i ){
$tally{$cs_rates[$k]}= $tally_occur{$cs_rates[$k]}/$tally_all_occur{$cs_rates[$k]};
}
}
}
elsif($char_opt =~ /[np]/i){
my($big_sum, @cs_digits);
@cs_digits = sort keys %tally_occur; # @cs_digits are (0, 1, and 2 )
for(@cs_digits){ $big_sum+=$tally_occur{$_}/$tally_all_occur{$_}; }
for($t=0; $t < @cs_digits; $t++){
if($big_sum ==0){ $tally{$cs_digits[$t]}=0; next; }
else{
if($char_opt =~ /i/i){
$tally{$cs_digits[$t]}= int(($tally_occur{$cs_digits[$t]}/$tally_all_occur{$cs_digits[$t]}/$big_sum*$factor)+0.4999);}
elsif($char_opt !~ /i/i ){
$tally{$cs_digits[$t]}= $tally_occur{$cs_digits[$t]}/$tally_all_occur{$cs_digits[$t]}/$big_sum*$factor;}
}
}
}
}
if($char_opt =~ /[an]/i){
print "\n $char_opt ";
return(\%tally, \%tally_all_occur);}
else{ return(\%tally_occur, \%tally_all_occur);}
}
#________________________________________________________________________
# Title : superpose_seq_hash ( first to second hash) ## the oldest version.
# Usage : %out =%{&superpose_seq_hash(\%hash1, \%hash2)};
# Function : (name1 000000112324)+(name1 ABC..AD..EFD ) => (name1 000..01..324)
# uses the second hash a template for the first sequences. gap_char is
# '-' or '.'
# To insert gaps rather than overlap, use insert_gaps_in_seq_hash
# Example :
# Warning : Accepts only two HASHes and many possible gap_chr. Default gap is '-'
# Keywords : overlay sequence, overlay alphabet, superpose sequence,
# Options :
# Returns : one hash ref.
# Argument : 2 refs. for hash of identical keys and value length and gap_chr.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub superpose_seq_hash{
if($debug eq 1){ print __LINE__, " # superpose_seq_hash : \n"; }
my($gap_chr)='-';
my($i, $j, %hash1, %hash2, $name, %out, @str1, @str2);
if((ref($_[0]) eq 'HASH')&&(ref($_[1]) eq 'HASH')){
%hash1=%{$_[0]}; %hash2=%{$_[1]}; }
else{ print "\n superpose_seq_hash needs hash ref\n"; print chr(007); exit; }
my(@names1)=sort keys %hash1; my(@names2)=sort keys %hash2;
(@names1 > @names2)? $bigger=@names1 : $bigger=@names2;
for ($j=0; $j < $bigger; $j++){
if($hash2{$names2[$j]}=~/(\W)/){ $gap_chr = $1; }
@str1=split(/|\,/, $hash1{$names1[$j]});
@str2=split(/|\,/, $hash2{$names2[$j]});
for($i=0; $i < @str2; $i++){
if($str2[$i] ne $gap_chr){ $str2[$i]=$str1[$i]; } }
$out{$names1[$j]}=join(",", @str2);
}
return(\%out);
}
#________________________________________________________________________
# Title : overlay_seq_hash ( first to second hash) ## the oldest version.
# Usage : %out =%{&overlay_seq_hash(\%hash1, \%hash2)};
# Function : (name1 000000112324)+(name1 ABC..AD..EFD ) => (name1 000..01..324)
# uses the second hash a template for the first sequences. gap_char is
# '-' or '.'
# To insert gaps rather than overlap, use insert_gaps_in_seq_hash
# Example :
# Warning : Accepts only two HASHes and many possible gap_chr. Default gap is '-'
# Keywords :
# Options :
# Returns : one hash ref.
# Argument : 2 refs. for hash of identical keys and value length and gap_chr.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub overlay_seq_hash{
my($gap_chr)='-'; my($i, $j, $name, %out, @str1, @str2);
if((ref($_[0]) eq 'HASH')&&(ref($_[1]) eq 'HASH')){
my(%hash1)=%{$_[0]}; my(%hash2)=%{$_[1]}; }
else{ print "\n overlay_seq_hash needs hash ref\n"; print chr(007); exit; }
my(@names1)=keys %hash1; my(@names2)=keys %hash2;
(@names1 > @names2)? $bigger=@names1 : $bigger=@names2;
for ($j=0; $j < $bigger; $j++){
if($hash2{$names2[$j]}=~/(\W)/){ $gap_chr = $1; }
@str1=split(//, $hash1{$names1[$j]}); @str2=split(//, $hash2{$names2[$j]});
for($i=0; $i < @str2; $i++){
if(($str2[$i] =~ /\W/)||($str2[$i] =~ //)){ $str1[$i]="$gap_chr";}}
$out{$names1[$j]}=join(",", @str1);
}
\%out;
}
#________________________________________________________________________
# Title : insert_gaps_in_seq_hash ( first to second hash)
# Usage : %out_extended_seq =%{&insert_gaps_in_seq_hash(\%hash1, \%hash2)};
# Function : superpose two hashes of the same sequence or same seq. length sequences,
# but unlike 'superpose_seq_hash', this inserts gaps and extend the
# sequences.
# (name1_sec hHHHHHH EEEEEEE) +
# (name1_seq .CDEABC..AD..EFD..EKST) => (name1_ext .hHHHHH..H...EEE..EEEE)
# In the example, the undefined sec. str. position is replaced as gaps('.')
# Uses the second hash a template for the first sequences. gap_char is
# '-' or '.'
# One rule is that the SECOND hash contains gaps!!
# There are two types of hash input. One is simple seq hash(both args)
# The other is from secondary structure prediction. The hash has contents
# like: $averaged{$position}=[$residue1, $sec_str2, $dif_reliability];
# Example :
# Warning : coded by A Biomatic
# Keywords : superposing sequences with gaps, interpolate_sequences, interpolate_gaps
# Options :
# Returns : one hash ref.
# Argument : 2 ref for hash of identical keys and value length.
# Category :
# Version : 1.3
#--------------------------------------------------------------------
sub insert_gaps_in_seq_hash{
my($gap_char)='-';
my($g, $i, $j, $t, %hash1, %hash2, $bigger, $name, %new_hash_with_gap, @str1,
@posi, @str2, $char_from_gapped_hash, $GAP_CHAR);
my($join_char) =','; ## <<-- This is for the final output joined by this var.
$GAP_CHAR='-';
if((ref($_[0]) eq 'HASH')&&(ref($_[1]) eq 'HASH')){
%hash1=%{$_[0]};
%hash2=%{$_[1]};
}else{
print "\n superpose_seq_hash needs hash ref\n";
print chr(007); exit;
}
my(@names1)=keys %hash1;
my(@names2)=keys %hash2;
if($hash1{$names1[0]}->[0] =~/\w/){ # this means that the input hash is sec str hash type
print "\n# (INFO) insert_gaps_in_seq_hash: You have put sec str pred hash as an input.\n\n";
@posi=split(//, $hash2{$names2[0]});
$gap_char='.';
%arraySTR = %{&hash_common_by_keys(\%arraySTR, \%arraySEQ)};
%arraySEQ = %{&hash_common_by_keys(\%arraySEQ, \%arraySTR)};
%arraySEQ = %{&remov_com_column(\%arraySEQ)};
%arraySTR = %{&remov_com_column(\%arraySTR)};
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if($debug eq 1){
print __LINE__,
" ## sorting sequence names. To make things constant. \n\n"; }
@names= sort keys %arraySTR;
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# If common column of secondary structure representation option $comm_col is set
# open_dssp_files sub routine will get the common seq parts of all the sequences.
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if($comm_col =~ /C/i){
%DSSP_common=%{&open_dssp_files( @names, $H, $S, $E, $T, $I, $G, $B, $simplify, 'C')};
@keys_common= keys %DSSP_common;
@stringDSSP_common = split(/|\,/, $DSSP_common{$keys_common[0]});
if($debug2 eq 1){ print __LINE__," \$comm_col is set to: $comm_col \n";
print __LINE__," \@stringDSSP_common is :@stringDSSP_common \n";
}
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Comparing two hashes
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
for $name (@names){
#"""""""""""""""" Splitting the sequence string
if($arraySEQ{$name}=~/\,\S+\,/){
@stringSEQ =split(/\,/, $arraySEQ{$name});
@stringSTR=split(/\,/, $arraySTR{$name}); }
else{
@stringSEQ =split(//, $arraySEQ{$name});
@stringSTR=split(//, $arraySTR{$name});
}
print "\n",__LINE__, " \@stringSEQ is @stringSEQ \n" if $debug2 eq 1;
print "\n",__LINE__, " \@stringSTR is @stringSTR \n" if $debug2 eq 1;
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Contracting the SEQ.
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@seq_positionSEQ = @{&get_posi_sans_gaps(\$arraySEQ{$name})};
@seq_positionSTR = @{&get_posi_sans_gaps(\$arraySTR{$name})};
#"""""""""""""""" To get secondary structure only calc """"""""""""""""""""""""""""
# It superposes the NON sec. region on @seq_positionSTR to nullify positions.
# get_posi_diff ignores non char positions in calc.
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if( ($ss_opt =~ /ss$/i) && ($comm_col !~ /C/i) ){
%DSSP=%{&open_dssp_files($name, $H, $S, $E, $T, $I, $G, $B, $simplify, $comm_col)};
if($debug1 eq 1){
print "\n",__LINE__," open_dssp_files has options \$H ->$H \$S->$S \$E->$E \n";
print "\n",__LINE__," \$T->$T \$I->$I \$G->$B \$simplify->$simplify \$comm_col ->$comm_col\n";
&show_hash( \%DSSP );
}
if(ref(\%DSSP) eq 'HASH'){ # to check it %DSSP was valid, If not it skips overlaying
@stringDSSP = split(/|\,/, $DSSP{$name});
$size_of_stringDSSP = @stringDSSP;
$size_of_seq_positionSTR = @seq_positionSTR;
if($debug2 eq 1){
print "\n",__LINE__," \@stringDSSP is \n @stringDSSP\n";
print "\n",__LINE__," Size of \@stringDSSP is $size_of_stringDSSP\n" ;
print "\n",__LINE__," Size of \@seq_positionSTR is $size_of_seq_positionSTR\n";
print "\n",__LINE__," \$gap_char is \"$gap_char\" \n" ;
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# When the sec. str is not defined in DSSP, I delete the position of
# @stringDSSP to gap(ie. make it blank to exclude error rate calc)
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
for($i=0; $i < @stringDSSP; $i++){
if($stringDSSP[$i] =~ /\W/){ $seq_positionSTR[$i]= $gap_char;}
}
}
}elsif( $comm_col =~ /C/i){
print __LINE__, " Replacing position with \gap_char \"$gap_char\"\n" if $debug2 eq 1;
$ss_opt = 'ss'; # whether it was set or not, make it 'ss'
for($i=0; $i < @stringDSSP_common; $i++){
if($stringDSSP_common[$i] =~ /\W/){ $seq_positionSTR[$i]= $gap_char;}
}
}
if($debug2 eq 1){
print __LINE__,
print " \@seq_positionSTR is @seq_positionSTR\n";
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# getting Position differences.
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@position_diffs = @{&get_posi_diff(\@seq_positionSEQ, \@seq_positionSTR)};
if($debug2 eq 1){
print __LINE__,
print " \@position_diffs is @position_diffs\n";
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# You can have two types of output according to which alignment you compare your
# error rates. (1) Compare to @stringSEQ (2) @stringSTR
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@position_corrected1 = @{&put_position_back_to_str_seq(\@stringSEQ, \@position_diffs)};
$array3{$name}=join(",", @position_corrected1);
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# The final Step for error rate, $LIMIT is to confine error rate in one digit (ie, less than 10)
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
%final_posi_diffs =%{&get_residue_error_rate(\%array3, $LIMIT)};
undef(@whole_length, $len_of_seq);
return(\%final_posi_diffs);
}
#________________________________________________________________________
# Title : get_posi_rates_hash_out (derived from 'get_posi_shift_hash' )
# Usage : %rate_hash = %{&get_posi_shift_hash(\%hash_msf, \%hash_jp)};
# seq2 ABCDEE-----DD- ==> seq2 ABCDEE-DD-
# seq3 ---DEE----DDE- seq3 ---DEEDDE-
# ^^^^
# from above the 4 columns of gap will be removed
# To remove absurd gaps in multiple sequence alignment
# Warning :
# Keywords :
# Options :
# Returns : a ref. of a hash.
#
# <input hash> <out hash>
#
# Argument : accepts reference for a hash.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub remov_com_column2{
my(%input) = %{$_[0]};
my(@common)=();
my($len)=0;
my(@string)=();
my(@new_string)=();
my(@string2)=();
my(%new_string);
########## Finds common gaps ###########
for $key(keys %input){
$len = &smaller_one($#common, $#string) unless $#common < 0;
sub smaller_one{if ($_[0] > $_[1]){ $_[1];}else{$_[0];}}
@string = split(/|| /, $input{$key});
for ($k=0; $k <= $len;$k++){
if($#common == -1){
@common = (@string);
last;
}
if (($string[$k] eq '.')&&($common[$k] eq '.')){
$common[$k]='.';
}else{
$common[$k]='X';
}
}
}
########## removes gaps ###########
for $key2 (keys %input){
@string2 = split(//, $input{$key2});
for ($i=0; $i <= $#string2; $i++){
if ($common[$i] eq $string2[$i]){
print;
}else{
push(@new_string, $string2[$i]);
}
}
$new_string{$key2}= join("", @new_string);
@new_string = ();
}
\%new_string;
}
#________________________________________________________________________
# Title : get_common_column (similar to overlay_seq_for_identical_chars )
# Usage : %out =%{&get_common_column(\%hash1, \%hash2, '-')};
# Function : (name1 --EHH--HHEE-- )
# (name2 --HHH--EEEE-- ) ==> result is;
#
# (name1_name2 -- HH-- EE-- )
# to get the identical chars in hash strings of sequences.
#
# Example : %out =%{&get_common_column(\%hash1, \%hash2, '-')};
# output> with 'E' option >>> "name1 --HHH--1232-"
# Following input will give;
# %hash1 = ('s1', '--EHH-CHHEE----EHH--HHEE----EHH--HHEE----EHH-CHHEE--');
# %hash2 = ('s2', '--EEH-CHHEE----EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
# %hash3 = ('s3', '-KEEH-CHHEE-XX-EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
# %hash4 = ('s4', '-TESH-CHEEE-XX-EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
#
# s1_s2_s3_s4 --E-H-CH-EE----E-H--HHEE----E-H--HHEE----E-H-CHHEE--
#
# Warning : This gets more than 2 hashes. Not more than that!
#
# Class : get_common_column, get_common_column_in_seq, get common column in sequence
# for secondary structure only representation.
# Keywords : Overlap, superpose hash, overlay identical chars, superpose_seq_hash
# get_common_column, get_com_column, get_common_sequence,
# get_common_seq_region, multiply_seq_hash, get_common_column_in_sequence
# Options :
# Reference :
# Returns : one hash ref. of the combined key name (i.e., name1_name2). Combined by '_'
# Tips :
# Argument : 2 or more ref for hash of identical keys and value length.
# One optional arg for replacing space char to the given one.
# Author : jong@salt2.med.harvard.edu
# Category :
# Version : 1.6
#--------------------------------------------------------------------
sub get_common_column{
my($i, $k,$j, $name1, $name2, $sorted_name, @in, %out, @out_chars,
$gap_chr, @str1, @str2);
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Sub argument handling $gap_chr here can be 'HE' etc.
#_______________________________________________________
for($k=0; $k< @_ ;$k++){
if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.)$/) ){
$gap_chr .= $1; }
elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)$/) ){
$gap_chr .= $1; }
elsif(ref($_[$k]) eq "HASH") { push(@in, $_[$k]); } }
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Checking if the input hashes were right
#_______________________________________________________
if( (@in < 2) && ( ref($in[0]) eq 'HASH') ){
print "\n", __LINE__, " # get_common_column usually needs 2 hashes. Error \n";
print "\n", __LINE__, " # get_common_column : Anyway, I will just return the single input hash: @in. \n";
%out=%{$in[0]}; # <--- This is essential to return the single input hash!!
goto FINAL;
}
if(@in ==2){ print "\n# (INFO) \@in has 2 elements for ref. of hash, good!\n\n"; }
%out = %{$in[0]}; ## Initializing %out
print "\n",__LINE__, " # get_common_column hashes given are: @in \n" if $debug eq 1;
for( $k=1; $k < @in; $k++){
my(@out_chars); ## <-- Necessary to prevent concatenation.
my(%hash1)=%out;
my(%hash2)=%{$in[$k]};
my(@names1)= sort keys %hash1;
my(@names2)= sort keys %hash2;
$name1 = $names1[0];
$name2 = $names2[0];
@str1=split(/\||\,/, $hash1{$names1[0]});
@str2=split(/\||\,/, $hash2{$names2[0]});
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Trying to guess the correct gap char
#_____________________________________________________
if($hash2{$names2[0]}=~/_/){
$gap_chr='_';
}elsif($hash2{$names2[0]}=~/(\W)\W/){
$gap_chr=$1;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If I fail to split string by ',', then split by ''
#_____________________________________________________
if(@str1 == 1){ @str1=split(//, $hash1{$names1[0]}); }
if(@str2 == 1){ @str2=split(//, $hash2{$names2[0]}); }
for($i=0; $i < @str1; $i++){
if($str1[$i] eq $str2[$i] ){
push(@out_chars, $str1[$i]);
}else{
if( defined($gap_chr) ){
push(@out_chars, $gap_chr);
}else{
push(@out_chars, ' ');
}
}
}
$sorted_name=join('', ($name1, $name2));
%out='';
$out{"$sorted_name"}= join("", @out_chars);
}
FINAL:
if ($debug eq 1){
print "\n",__LINE__, " # get_common_column Final res. \%out :\n",
&show_hash(%out);
}
return(\%out);
}
#________________________________________________________________________
# Title : overlay_seq_for_identical_chars
# Usage : %out =%{&overlay_seq_for_identical_chars(\%hash1, \%hash2, '-')};
# Function : (name1 --EHH--HHEE-- )
# (name2 --HHH--EEEE-- ) ==> result is;
#
# (name1_name2 -- HH-- EE-- )
# to get the identical chars in hash strings of sequences.
#
# Example : %out =%{&overlay_seq_for_identical_chars(\%hash1, \%hash2, '-')};
# output> with 'E' option >>> "name1 --HHH--1232-"
# Warning : Works only for 2 sequence hashes.
# Keywords : Overlap, superpose hash, overlay identical chars, superpose_seq_hash
# Options :
# Returns : one hash ref. of the combined key name (i.e., name1_name2). Combined by '_'
# Argument : 2 ref for hash of identical keys and value length. One optional arg for
# replacing space char to the given one.
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub overlay_seq_for_identical_chars{
my($i, $k,$j, $name1, $name2, @in, %out, @out_chars, $gap_chr, @str1, @str2);
######################################
####### Sub argument handling ######## $gap_chr here can be 'HE' etc.
######################################
for($k=0; $k< @_ ;$k++){
if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.)$/) ){
$gap_chr .= $1; }
elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)$/) ){
$gap_chr .= $1; }
elsif(ref($_[$k]) eq "HASH") { push(@in, $_[$k]); } }
if(@in < 2){ print "\n overlay_seq_for_identical_chars needs 2 hashes. Error \n"; exit; }
my(%hash1)=%{$in[0]}; my(%hash2)=%{$in[1]};
my(@names1)=sort keys %hash1; my(@names2)= sort keys %hash2;
$name1 = $names1[0]; $name2 = $names2[0];
@str1=split(/|\,/, $hash1{$names1[0]}); @str2=split(/|\,/, $hash2{$names2[0]});
for($i=0; $i < @str1; $i++){
if($str1[$i] eq $str2[$i] ){
push(@out_chars, $str1[$i]); }
elsif( defined($gap_chr) ){ push(@out_chars, $gap_chr); }
else{ push(@out_chars, ' '); }
}
if( $name2 gt $name1){
$out{"$name1\_$name2"}= join(",", @out_chars); }
else{ $out{"$name2\_$name1"}= join(",", @out_chars); }
\%out;
}
#________________________________________________________________________
# Title : remov_com_column
# Usage : %new_string = %{&remov_com_column(\%hashinput)};
# @out=@{&remov_com_column(\@array3)};
# Function : removes common gap column in seq.
# Example :
# Warning :
# Keywords : remove_com_column, remove_common_column,
# remove_common_gap_column, remov_common_gap_column,
# remove com column
# Options :
# Returns : a ref. of hash(es) and array(s).
#
# name1 ABCDE....DDD name1 ABCDE..DDD
# name2 ABCDEE..DD.. --> name2 ABCDEEDD..
# name3 ...DEE..DDE. name3 ...DEEDDE.
#
# (ABC....CD, ABCD...EE) --> (ABC.CD, ABCDEE)
# from above the two column of dot will be removed
# To remove absurd gaps in multiple sequence alignment. for nt6-hmm.pl
# Argument : accepts reference for hash(es) and array(s).
# Category :
# Version :
#--------------------------------------------------------------------
sub remov_com_column{
my(@hash_ref_out, $d, $gap_char);
for($d=0; $d < @_; $d++){
if(ref($_[$d]) eq 'HASH'){
my($len,@string,@new_string,@string2);
my(%input)=%{$_[$d]};
my(@common);
for (keys %input){
@string = split('', $input{$_});
if(!(defined(@common))){ @common = (@string); }
else{ for ($k=0; $k < @string; $k++){
if (($string[$k] =~ /\W/ )&&($common[$k] =~ /(\W)/)){ $common[$k]= $1;}
elsif(($string[$k] =~ /(\W)/)&&(!(defined($common[$k])))){ $common[$k]=$1;}
else{ $common[$k]='X';} } } }
for (keys %input){ @string2 = split(//, $input{$_});
for ($i=0; $i < @string2; $i++){
if ($common[$i] ne $string2[$i]){ push(@new_string, $string2[$i]); } }
$new_string{$_}= join('', @new_string); @new_string = (); }
push(@hash_ref_out, \%new_string);
}
( run in 1.074 second using v1.01-cache-2.11-cpan-39bf76dae61 )