Bioinf

 view release on metacpan or  search on metacpan

Bioinf.pl  view on Meta::CPAN

	if($comm_col =~ /C/i){
		%DSSP_common=%{&open_dssp_files( @names, $H, $S, $E, $T, $I, $G, $B, $simplify, 'C')};
		@keys_common= keys %DSSP_common;
		@stringDSSP_common = split(/|\,/, $DSSP_common{$keys_common[0]});
		if($debug2 eq 1){ print __LINE__," \$comm_col is set to: $comm_col \n";
			print __LINE__," \@stringDSSP_common is :@stringDSSP_common \n";
		}
	}

	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	# Comparing two hashes
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	for $name (@names){
		#"""""""""""""""" Splitting the sequence string
		if($arraySEQ{$name}=~/\,\S+\,/){
			@stringSEQ =split(/\,/, $arraySEQ{$name});
			@stringSTR=split(/\,/, $arraySTR{$name});  }
		else{
			@stringSEQ =split(//, $arraySEQ{$name});
			@stringSTR=split(//, $arraySTR{$name});
		}
		print "\n",__LINE__, " \@stringSEQ  is  @stringSEQ \n" if $debug2 eq 1;
		print "\n",__LINE__, " \@stringSTR  is  @stringSTR \n" if $debug2 eq 1;

		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#   Contracting  the SEQ.
		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		@seq_positionSEQ = @{&get_posi_sans_gaps(\$arraySEQ{$name})};
		@seq_positionSTR = @{&get_posi_sans_gaps(\$arraySTR{$name})};

		#"""""""""""""""" To get secondary structure only calc  """"""""""""""""""""""""""""
		# It superposes the NON sec. region on  @seq_positionSTR to nullify positions.
		#  get_posi_diff ignores non char positions in calc.
		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		if( ($ss_opt =~ /ss$/i) && ($comm_col !~ /C/i) ){
			%DSSP=%{&open_dssp_files($name, $H, $S, $E, $T, $I, $G, $B, $simplify, $comm_col)};
			if($debug1 eq 1){
			   print "\n",__LINE__," open_dssp_files has options \$H ->$H \$S->$S \$E->$E \n";
			   print "\n",__LINE__," \$T->$T \$I->$I \$G->$B \$simplify->$simplify \$comm_col ->$comm_col\n";
			   &show_hash( \%DSSP );
			}
			if(ref(\%DSSP) eq 'HASH'){ # to check it %DSSP was valid, If not it skips overlaying
				@stringDSSP = split(/|\,/, $DSSP{$name});
				$size_of_stringDSSP = @stringDSSP;
				$size_of_seq_positionSTR = @seq_positionSTR;
				if($debug2 eq 1){
					  print "\n",__LINE__," \@stringDSSP is \n @stringDSSP\n";
					  print "\n",__LINE__," Size of \@stringDSSP      is $size_of_stringDSSP\n" ;
					  print "\n",__LINE__," Size of \@seq_positionSTR is $size_of_seq_positionSTR\n";
					  print "\n",__LINE__," \$gap_char is \"$gap_char\" \n" ;
				}
				#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
				#   When the sec. str is not defined in DSSP, I delete the position of
				#   @stringDSSP to gap(ie. make it blank to exclude error rate calc)
				#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
				for($i=0; $i < @stringDSSP; $i++){
					if($stringDSSP[$i] =~ /\W/){ $seq_positionSTR[$i]= $gap_char;}
				}
			}
		}elsif( $comm_col =~ /C/i){
				print __LINE__, " Replacing position with \gap_char \"$gap_char\"\n" if $debug2 eq 1;
				$ss_opt = 'ss'; # whether it was set or not, make it 'ss'
				for($i=0; $i < @stringDSSP_common; $i++){
					if($stringDSSP_common[$i] =~ /\W/){ $seq_positionSTR[$i]= $gap_char;}
				}
		}

		if($debug2 eq 1){
			print __LINE__,
			print " \@seq_positionSTR is  @seq_positionSTR\n";
		}

		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#   getting Position differences.
		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		@position_diffs  = @{&get_posi_diff(\@seq_positionSEQ, \@seq_positionSTR)};

		if($debug2 eq 1){
			print __LINE__,
			print " \@position_diffs is  @position_diffs\n";
		}

		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#  You can have two types of output according to which alignment you compare your
		#   error rates. (1) Compare to @stringSEQ   (2) @stringSTR
		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		@position_corrected1 = @{&put_position_back_to_str_seq(\@stringSEQ, \@position_diffs)};
		$array3{$name}=join(",", @position_corrected1);

	}
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	# The final Step for error rate, $LIMIT is to confine error rate in one digit (ie, less than 10)
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	%final_posi_diffs =%{&get_residue_error_rate(\%array3, $LIMIT)};

	undef(@whole_length, $len_of_seq);
	return(\%final_posi_diffs);
}

#________________________________________________________________________
# Title     : get_posi_rates_hash_out (derived from 'get_posi_shift_hash' )
# Usage     : %rate_hash = %{&get_posi_shift_hash(\%hash_msf, \%hash_jp)};
# Function  : This is to get position specific error rate for line display rather than
#             actual final error rate for the alignment.
#             Output >>
#             seq1_seq2  1110...222...2222
#             seq2_seq3  1111....10...1111
#             seq1_seq3  1111....0000.0000
#
# Example   :
# Warning   : split and join char is ','; (space)
# Keywords  :
# Options   :
# Returns   : \%final_posi_diffs;
# Argument  : %{&get_posi_rates_hash_out(\%msfo_file, \%jpo_file)};
#             Whatever the names, it takes one TRUE structral and one ALIGNED hash.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub get_posi_rates_hash_out{
	my(%array1)=%{$_[0]};

Bioinf.pl  view on Meta::CPAN

#----------------------------------------------------------------------------
sub open_sso_files{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my (@out_refs, @SSO, $create_sso, $parseable, @OUT, @temp_sso_lines,
            %match, $attach_range_in_names, $margin, $uppercase_seq_name,
            $lowercase_seq_name, $target_seq, $new_format, $get_alignment,
            $pvm_version_fasta_out, $original_target_seq, $big_msp_out_file);

    my ($upper_expect_limit, $lower_expect_limit)=(50,0);

    if($char_opt=~/R/){  $attach_range_in_names2=1; };
    if($char_opt=~/r2/){ $attach_range_in_names =1; $attach_range_in_names2=1 };
    if($char_opt=~/r/){  $attach_range_in_names =1; };
    if($char_opt=~/c/){  $create_sso   ='c' };
    if($char_opt=~/n/){  $new_format   ='n' };
    if($char_opt=~/a/){  $get_alignment='a' };
    if($char_opt=~/U[pperPPER]*/){ $uppercase_seq_name='U' };
    if($char_opt=~/L[owerOWER]*/){ $lowercase_seq_name='L' };
    if($vars{'u'}=~/(\.?\d+)/){ $upper_expect_limit = $vars{'u'} };
    if($vars{'l'}=~/(\.?\d+)/){ $lower_expect_limit = $vars{'l'} };
    if($vars{'m'}=~/\d+/){ $margin = $vars{'m'} };
    $attach_range_in_names2=$attach_range_in_names=1;

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # opening file input (can handle .gz  files)
    #_______________________________________________
    if(@file < 1 and @array > 0){
         for($i=0; $i< @array; $i++){
              @sso=@{$array[$i]};
         }
         print "\n# (INFO) \@sso has ", scalar(@sso), " lines. \n"  if $verbose;
         if(@sso > 3000){ # if @sso is very big, I remove the useless contents
             print "\n# (INFO) open_sso_files: size of \@sso for $file[$i] exceeds 3000 lines, ", scalar(@sso), " !!! \n";
         }
         push(@OUT, &read_sso_lines(\@sso, $create_sso,
                    "u=$upper_expect_limit",
                    "l=$lower_expect_limit",
                    $attach_range_in_names,
                    $attach_range_in_names2,
                    $new_format, $get_alignment) );
    }
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Opening input FILE!
    #_______________________________________________
    else{
         print "\n# open_sso_files : processing @file \n\n";
         for($i=0; $i< @file; $i++){
              if($file[$i]=~/\S+\.msp *$/){ $big_msp_out_file=$file[$i]; splice (@file, $i, 1); $i--;
              #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
              # Opening zipped file
              #_______________________________________________
              }elsif($file[$i]=~/\S+\.\gz$/ or -B $file[$i]){  ## if file has xxxx.gz extension
                  my (@sso);
                  @sso=`gunzip -c $file[$i]`;
                  if(@sso < 30){  @sso=`zcat $file[$i]`; }      # if zcat fails to produce output use gunzip -c
                  if(@sso > 3000){ # if @sso is very big, I remove the useless contents
                      print "\n# open_sso_files: size of \@sso for $file[$i] exceeds 3000 lines, ", scalar(@sso), " !!! \n";
                  }
                  push(@OUT, &read_sso_lines(\@sso, $create_sso,
                      "u=$upper_expect_limit",
                      "l=$lower_expect_limit",
                      $attach_range_in_names,
                      $attach_range_in_names2,
                      $new_format, $get_alignment) );
              }
              #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
              # Opening plain file(not zipped)
              #_______________________________________________
              elsif($file[$i]=~/\S+\.[fsm]?sso/ or $file[$i]=~/\S+\.out/ or $file[$i]=~/\S+\.fso/){
                  print "\n# (INFO) openning text File format xxxx.[fms]sso $file[$i]";
                  open(SSO, "$file[$i]") or die "\n# (ERROR) open_sso_files: Failed to open $file[$i]\n";
                  my @sso=<SSO>;
                  if(@sso < 30){  @sso=`zcat $file[$i]`; }      # if zcat fails to produce output use gunzip -c
                  if(@sso > 3000){ # if @sso is very big, I remove the useless contents
                      print "\n# (INFO) open_sso_files: size of \@sso is for $file[$i] exceeds 3000 lines, ",
                             scalar(@sso), " !!! \n";
                  }
                  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                  # Calling read_sso_lines sub
                  #_____________________________________
                  push(@OUT, &read_sso_lines([@sso], $create_sso,
                       "u=$upper_expect_limit",
                       "l=$lower_expect_limit",
                       $attach_range_in_names,
                       $attach_range_in_names2,
                       $new_format, $get_alignment) );
                  close SSO;
              }
         }
    }
    print "\n# \@OUT has ", scalar(@OUT), " elements \n" if $verbose;
    return(\@OUT); # @OUT has refs of hashes  (\%xxx, \%YYY, \%XXX,,,,)
}


#_____________________________________________________________________________
# Title     : open_msp_files
# Usage     : %seq=%{&open_msp_files(@file, $names_only)};
# Function  : opens Erik Sonhammer's MSPcrunch file output(default).
#             This looks up xxxxx.fa files in the pwd (with S opt) and see
#             if it can get the sequences as well.
#             With 'n' option you can just get the matched sequence
#              names with ranges.
# Example   : Example output(with 'n' opt):
#   d1bi6h1         d1bi6h1_1-24     IBR1_ANACO_20-42  IBR2_ANACO_19-42
#   e1bi6.1h1       IBR1_ANACO_38-52 e1bi6.1h1_1-18    IBR2_ANACO_38-52
#
# Keywords  : exchange_msp_file_columns,
# Options   :
#          s -s  for size return only
#          S -S  for the sequences are fetched if equivalent xxxx.fa files are in pwd
#          n -n  for matched seq NAMEs with ranges only (eg: HI0001_1-12,,), hash ref is out



( run in 1.527 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )