Bioinf
view release on metacpan or search on metacpan
}elsif($file[$i]=~/^(\S+)\.out\.gz/ or $file[$i]=~/^(\S+)\.[mfs]?sso\.gz/){
$big_out_msp1="\U$1".".msp";
$big_out_msp2="\U$1"."_all".".msp";
}
}
}
if(defined($big_out_msp)){
$big_out_msp1=$big_out_msp2=$big_out_msp;
print "\n# \$big_out_msp is defined as \'$big_out_msp\'\n";
}else{
print "\n# convert_sso_to_msp: You did not define the big MSP file out format, so $big_out_msp1 \n";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) When File was given to this sub routine
#__________________________________________
if(@file == 1){ ## ONE single file input??
print "# one file @file is given, OUT will be: $big_out_msp1 \n";
@sso=@{&open_sso_files(@file, $add_range, $add_range2,
"u=$upper_expect_limit",
"l=$lower_expect_limit",
"m=$margin",
$new_format,
"s=$big_out_msp")};
push(@final_out, &write_msp_files(@sso, $big_out_msp1,
$single_out_opt, $add_range) );
}elsif(@file > 1){ ## MOre than 1 file input??
@sso=@{&open_sso_files(@file, $add_range, $add_range2,
"l=$lower_expect_limit",
"u=$upper_expect_limit",
"m=$margin",
$new_format)};
push(@final_out, @{&write_msp_files(@sso, $big_out_msp2,
$single_out_opt, $add_range)} ); ## concatenates all the hash ref to one
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) When NO File but ARRAY is given
# Here, you can have SSO files created
#__________________________________________
elsif(@array >=1){
print "\n# In convert_sso_to_msp, \@array is given rather than \@file";
@sso=@{&open_sso_files(@array, "u=$upper_expect_limit", $add_range2,
"l=$lower_expect_limit", $add_range, $create_sso,
"m=$margin", $new_format)};
push(@final_out, @{&write_msp_files(@sso, $big_out_msp,
$single_out_opt, $add_range)} );
}
return(\@final_out);
}
#________________________________________________________________________________
# Title : bla_to_msf (this is not used. Use convert_bla_to_msf)
# Usage : @msf_file_made=@{&bla_to_msf(\@bla_file)};
# Function : matched each query seq name and if the E value is lower than
# my arbitrary threshold, I put the subject and target pair
# alignment into a hash.
# In later iterations, the latest is replaced
# Example :
# Keywords : convert_bla_to_msf
# Options :
# Author :
# Category :
# Version : 1.1
#--------------------------------------------------------------------------------
sub bla_to_msf{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($e_val_threshold)=0.0005;
my(@template_query_seq, @keys, %alignment_hash, %alignment_hash_query,
%alignment_hash_subject);
$choose_iteration=1;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Opening file
#______________________________________________
for($i=0; $i< @file; $i++){
$file_base_name=${&get_base_names($file[$i])};
open(BLAST_OUTPUT, $file[$i]);
while(<BLAST_OUTPUT>){
if(/^Query=(\S+)/){
$query_seq=$1; last;
}
}
close(BLAST_OUTPUT);
open(BLAST_OUTPUT, $file[$i]);
while(<BLAST_OUTPUT>){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Finds the query sequence, resets $start_point and next line
#____________________________________________
if(/^Searching\.\.\.\.\.\.\.\.\.\.\./){
$present_iteration++;
if($present_iteration > $choose_iteration){
last
}else{
%alignment_hash_subject=%alignment_hash_query=();
}
}elsif(/^\> *(\S+)/){
$subject_seq=$1;
$start_point='';
if($alignment_hash_subject{$subject_seq}){
$seq_already_in=1;
$subject_seq='';
next;
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# If $subject_seq defined, match the line to get expectation value
$alignment_hash_subject{$subject_name}=~/^(\S+) +(\S+)/;
@splited_subject_seq=split(//, $2);
$evalue=$1;
for($t=0; $t< @template_query_seq; $t++){
if($template_query_seq[$t] ne '-' ){
next
}elsif($template_query_seq[$t] eq '-'){
$char_of_the_position=$splited_query_seq[$t];
if($char_of_the_position ne '-' and $char_of_the_position ne '_'){
#print "\n# \$t is $t";
#print "\n# \$evalue is $evalue\n ==>";
#print @splited_query_seq, "\n ==>";
#print @splited_subject_seq, "\n ==>";
splice(@splited_subject_seq, $t, 0, '-');
splice(@splited_query_seq, $t, 0, '-');
#print @splited_query_seq, "\n ==>";
#print @splited_subject_seq, "\n";
next;
}elsif($char_of_the_position eq '_'){
splice(@splited_subject_seq, 0, 0, '_');
splice(@splited_query_seq, 0, 0, '_');
}elsif($char_of_the_position eq '-'){
next;
}
}
}
$new_subject_seq=join('', @splited_subject_seq);
$new_query_seq =join('', @splited_query_seq);
#$alignment_hash{$keys[$g]}="$evalue $new_subject_seq";
#$alignment_hash{$keys[$g-1]}="$evalue $new_query_seq";
$alignment_hash_subject{$subject_name}="$evalue $new_subject_seq";
$alignment_hash_query{$subject_name} ="$evalue $new_query_seq";
}
print "\n";print @template_query_seq, "\n" if $verbose;
for($h=0; $h< @keys; $h++){
$subject_name=$keys[$h];
$alignment_hash_subject{$subject_name}=~/^(\S+) +(\S+)/;
#print "\n $alignment_hash_query{$subject_name}";
print "\n $alignment_hash_subject{$subject_name}";
$final_seq_out{$subject_name}=$2;
}
&write_msf(\%final_seq_out, \$output_msf);
push(@final_out, $output_msf);
}
return(\@final_out);
}
#________________________________________________________________________________
# Title : convert_bla_to_msf
# Usage : @msf_file_made=@{&convert_bla_to_msf(\@bla_file)};
# Function : matched each query seq name and if the E value is lower than
# my arbitrary threshold, I put the subject and target pair
# alignment into a hash.
# In later iterations, the latest is replaced
# Example :
# Keywords : convert_bla_to_msf
# Options :
# Author :
# Category :
# Version : 1.1
#--------------------------------------------------------------------------------
sub convert_bla_to_msf{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($e_val_threshold)=0.0005;
my(@template_query_seq, @keys, %alignment_hash, %alignment_hash_query,
%alignment_hash_subject);
$choose_iteration=1;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Opening file
#______________________________________________
for($i=0; $i< @file; $i++){
$file_base_name=${&get_base_names($file[$i])};
open(BLAST_OUTPUT, $file[$i]);
while(<BLAST_OUTPUT>){
if(/^Query=(\S+)/){
$query_seq=$1; last;
}
}
close(BLAST_OUTPUT);
open(BLAST_OUTPUT, $file[$i]);
while(<BLAST_OUTPUT>){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Finds the query sequence, resets $start_point and next line
#____________________________________________
if(/^Searching\.\.\.\.\.\.\.\.\.\.\./){
$present_iteration++;
if($present_iteration > $choose_iteration){
last
}else{
%alignment_hash_subject=%alignment_hash_query=();
}
}elsif(/^\> *(\S+)/){
$subject_seq=$1;
$start_point='';
if($alignment_hash_subject{$subject_seq}){
$seq_already_in=1;
$subject_seq='';
next;
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# If $subject_seq defined, match the line to get expectation value
my (@finale_out, $sorted_name, $msp_line, $evalue, $score, $matched, $seq_id, $query_range_start,$accumulative_hits_eval_thresh,
$query_range_stop, $query, $match_string_start, $match_string_stop, $read_point_found, %hash_out, %accumulative_hits, $evalue_thresh);
%hash_out=%{$_[0]}; %accumulative_hits=%{$_[1]};
$query=$_[2]; $matched=$_[3];
$evalue=$_[4]; $score=$_[5];
$seq_id=$_[6]; $sorted_name=$_[7];
$query_range_start=$_[8]; $query_range_stop =$_[9];
$match_string_start=$_[10]; $match_string_stop=$_[11];
$read_point_found=$_[12]; $accumulative_hits_eval_thresh=$_[13];
$take_only_the_last_iteration=$_[14];
$accumulative_hits_eval_thresh=$_[15];
$evalue_thresh=$_[16];
$query ="$query\_$query_range_start\-$query_range_stop";
if($matched !~/^\S+\_\d+\-\d+ *$/){ $matched="$matched\_$match_string_start\-$match_string_stop";
}elsif($matched =~/^(\S+)\_\d+\-\d+ *$/){ $matched="$1\_$match_string_start\-$match_string_stop"; }
if($score=~/\S/ and $evalue=~/\S/ and $match_string_start=~/\S/ and $evalue_thresh > $evalue){
$msp_line=sprintf("%-6s %-9s %-5s %-5s %-5s %-32s %-5s %-5s %-38s %-3s\n",
$score, $evalue, $seq_id, $query_range_start, $query_range_stop,
$query, $match_string_start, $match_string_stop, $matched, $read_point_found);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This is where I really put the matches !!!
#_____________________________________________________
if($hash_out{$sorted_name}=~/^\S+ +(\S+) +/){
if($1 >= $evalue){
print " (1) put_msp_lines_to_hash_from_bla: $1 >= $evalue WRITING to hash. 1\n" if $verbose;
$hash_out{$sorted_name}=$msp_line;
}else{
print " put_msp_lines_to_hash_from_bla: $1 < $evalue_ori NO write to hash\n" if $verbose; }
}else{
print " (2) put_msp_lines_to_hash_from_bla: NO eval >= $evalue WRITING to hash. 2\n" if $verbose;
$hash_out{$sorted_name}=$msp_line;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This part is to rescue the hits dropped by matrix migration
#_________________________________________________________________
if(!$take_only_the_last_iteration and $evalue <= $accumulative_hits_eval_thresh ){
if($accumulative_hits{$sorted_name}){
if($accumulative_hits{$sorted_name}=~/^[\t ]*\S+[\t ]+(\S+)[\t ]/){
if($evalue < $1){
$accumulative_hits{$sorted_name}=$msp_line; } }
}else{ $accumulative_hits{$sorted_name}=$msp_line; }
}
}else{ }
@finale_out=(\%hash_out, \%accumulative_hits, $read_point_found, $query, $matched, $evalue, $score, $seq_id, $sorted_name,
$query_range_start, $query_range_stop, $match_string_start, $match_string_stop );
return(\@finale_out);
}
#________________________________________________________________________________
# Title : convert_bla_multaln_to_msf
# Usage : @msf_file_made=@{&convert_bla_multaln_to_msf(\@bla_file, [i=2])};
# Function : matched each query seq name and if the E value is lower than
# my arbitrary threshold, I put the subject and target pair
# alignment into a hash.
# In later iterations, the latest is replaced,
# when you use m6 option for PSI blast
# this adds '00x' extensions to the repeatedly occurring seq names
#
# Example : @msf_file_made=@{&convert_bla_multaln_to_msf(\@bla_file,
# $verbose, "i=$iteration")};
# Keywords : psi_blast_to_msf, psi_blast_multaln_to_msf
# Options :
# i=$iteration
# v for verbose
# Author :
# Category :
# Version : 1.6
#--------------------------------------------------------------------------------
sub convert_bla_multaln_to_msf{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($e_val_threshold)=0.0005;
my(@template_query_seq, @keys, $present_iteration, $blank_line_counter,
%alignment_hash_subject, $seq_order, $choose_iteration, %final_output_hash,
$seq_name, %seq_names_in_block, $put_alphabet_to_number_only_name);
$choose_iteration=1;
if($vars{'i'}=~/(\d+)/){
$choose_iteration=$1;
}
if($char_opt=~/v/){ $verbose='v' }
if($char_opt=~/a/){ $put_alphabet_to_number_only_name='a' }
print "\n# $0: bla_multaln_to_msf, \$choose_iteration is $choose_iteration\n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Opening file
#______________________________________________
for($i=0; $i< @file; $i++){
$file_base_name=${&get_base_names($file[$i])};
print "\n# bla_multaln_to_msf: processing $file[$i]\n";
my($present_iteration, %seq_names_in_block, $seq_name_ori, $sequence);
open(BLAST_OUTPUT, $file[$i]);
while(<BLAST_OUTPUT>){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Finds the query sequence, resets $start_point and next line
#____________________________________________
if(/^Query= *(\S+)/){
$query_seq=$1;
print "\n# The query sequence is: $query_seq\n";
}elsif(/^Searching\.\.\.\.\.\.\.\./ or eof){ ### to make sure it gets the last one
$present_iteration++;
if($present_iteration > $choose_iteration){
%final_output_hash=%alignment_hash_subject;
last;
}else{
%final_output_hash=%alignment_hash_subject;
%alignment_hash_subject=();
( run in 0.543 second using v1.01-cache-2.11-cpan-96521ef73a4 )