Bioinf

 view release on metacpan or  search on metacpan

Bioinf.pl  view on Meta::CPAN

                                               $R_start='';
                                               $sequence='';
                                               $seq_found1='';  ## reset $R_start, $seq_found1,,
                                               next F0;
					  }
				  }
			   }

		  }else{
			   print "\n# Error, the sequence pos for $NAME (from $Keys[$e]) in DB doesnt exist in xxxx.idx file?\n";
		  }
	   }
	   close DB_FASTA;
	}
	#print "\n# (6) fetch_sequence_from_db: counted fetched seqs: $found_seq_count, $acquired_seq_count";
	#print "\n# (7) fetch_sequence_from_db: Fetching seq has finished \n";

	return(\%sequence);
}



#______________________________________________________________
# Title     : fetch_seq
# Usage     : &fetch_seq(@ARGV);
# Function  : fetches swissprot entry or fasta format seq with
#             given seq name(like  SAA_HORSE, SA*HORSE, SAA,..)
#             you can give multi files(SAA*, SAU*) at the same
#             time. This uses ENV setting of 'SWDIR'
# Example   : &fetch_swiss_seq(@ARGV);
# Keywords  : fetch_swissprot_sequence, fetch_sequence,
#             find_swiss_sequence, find_sequence
# Options   : _  for debugging.
#             #  for debugging.
#             -f for fasta format file output
#             -a is for ALL matched seq. (same as using glob=> *YEAST)
#             -c is for Creating seq.idx file
#             -h is for HELP!
#             -g is for GDF file format output
#             -l is for list of match entries(in 1 column)
#             -s is for species option (input name mst be species (YEAST, RAT, HUMAN..)
#             n= is for Number of seq you want to get from swissprot
#             s= is for Size limit. Min seq size in swiss, s=10  -> minimum 11 aa seq.
#             S= is for Size limit. Max seq size in swiss, s=1000 -> get less than 1000
#
# Argument  : swissprot seqname
# Category  :
# Version   : 1.7
#--------------------------------------------------------------
sub fetch_seq{
	 my @in=@_;
	 my ($FASTA_index, $FASTA, $where_index, %index, $question, $i,
	    $s,$t,$fasta,$index_file, $all, $species,$target, $matched, $seq, $gdf, $list, $count, $create);
	 my $SEQ_size_max=100000000;

	 if(@_ < 1){	  &HELP_fetch_seq;
	 }else{
	 F: for($t=0; $t<@in; $t++){ #'''''''''''' PROMPT ARGV processing ''''''''''''''''''
		if($in[$t]=~/^\-c$/i){
		   $create=1; splice(@in, $t, 1); $t--;
		   print "\n You should provide database\(e.g, seq.dat\) file with this opt, I guess you did\n";
		   print "\n If you wanted to make an index with any fasta db, you also have to\n";
		   print "  give the file name. e.g:\n     $0 -c /DB/swiss/seq.dat\n";
		   print "  or $0 -c my_db.fa\n\n";
		   next; }
		if($in[$t]=~/^\-af$/){ $fasta=$all=1; splice(@in, $t, 1); $t--; next; }
		if($in[$t]=~/^\-afs$/){ $species=$fasta=$all=1; splice(@in, $t, 1); $t--; next; }
		if($in[$t]=~/^\-ag$/){ $gdf=$all=1; splice(@in, $t, 1); $t--; next; }
		if($in[$t]=~/^\-g$/){    $gdf=1; splice(@in, $t, 1); $t--; next; }
		if($in[$t]=~/^\-f$/i){   $fasta=1; splice(@in, $t, 1); $t--; next; }
		if($in[$t]=~/^\-a$/i){   $all=1;   splice(@in, $t, 1); $t--; next; }
		if($in[$t]=~/^\-l$/i){   $list=$all=1;   splice(@in, $t, 1); $t--; next; }
		if($in[$t]=~/^\-s$/i){  $species=$all=1; splice(@in, $t, 1); $t--; next; }
		if( ($in[$t]=~/seq\.dat/)&&(-f $in[$t])){ ## if the path for swiss prot is given
		   $DB=$in[$t];  splice(@in, $t, 1); $t--; next;        }
		if( ($in[$t]=~/seq\.idx/)&&(-e $in[$t])){ ## if the path for swiss index is given
		   $index_file=$in[$t];	splice(@in, $t, 1); $t--; next;	}

		#''''''' SWiss prompt input file check ''''''''''''''''''
		if( -f $in[$t]){
		   open(TEMP, "$in[$t]");
		   while(<TEMP>){
			 if(/^ID[\t ]+\w+/){$DB=$in[$t]; splice(@in, $t, 1);$t--;next F;}}
		   close TEMP;
		}

		#'''''''' FASTA prmpt input file check '''''''''''''''''''
		if( (-f $in[$t]) && !(defined($FASTA))){
		   open(TEMP, "$in[$t]");
		   while(<TEMP>){
			 if(/^\> {0,4}\S+/){$FASTA=$in[$t]; $fasta=1;
			 if(-s "$FASTA\.idx"){ $FASTA_index="$FASTA\.idx"; }
		     splice(@in, $t, 1);$t--;next F;}}
		   close TEMP;
		}

		#'''''''' INDEX file automatic check ''''''''''''''''''
		if( -f $in[$t]){
		   open(TEMP2, "$in[$t]");
		   my ($first_pos, $Count, @splited);
		   while(<TEMP2>){
			 $Count++;
			 if( $Count>3 ){
				if(/^ {0,2}\S+ +(\d+)/){
				   if(defined($first_pos) && ($1-$first_pos ) > 1000 ){
					  $index_file=$in[$t];
					  splice(@in, $t, 1);$t--;next F;
				   }elsif( defined($first_pos) && ($1-$first_pos)<1000 ){
					  $FASTA_index=$in[$t]; $fasta=1;
					  if($FASTA_index=~/^(\S+)\.\w+$/){
					     if(-s $1){ $FASTA= $1; }
					  }
					  splice(@in, $t, 1);$t--;next F;
				   }
				   $first_pos=$1;
				}
			 }
		   }
		   close TEMP2;
		}
		if($in[$t]=~/^\-h$/i){ &HELP_fetch_seq; exit;}

Bioinf.pl  view on Meta::CPAN

#  %hash4 = ('s4', '-TESH-CHEEE-XX-EEH-CHHEE----EEH-CHHEE----EEH-CHHEE--');
#
#    s1_s2_s3_s4    --E-H-CH-EE----E-H--HHEE----E-H--HHEE----E-H-CHHEE--
#
# Warning   : This gets more than 2 hashes. Not more than that!
#
# Class     : get_common_column, get_common_column_in_seq, get common column in sequence
#             for secondary structure only representation.
# Keywords  : Overlap, superpose hash, overlay identical chars, superpose_seq_hash
#             get_common_column, get_com_column, get_common_sequence,
#             get_common_seq_region, multiply_seq_hash, get_common_column_in_sequence
# Options   :
# Reference :
# Returns   : one hash ref. of the combined key name (i.e., name1_name2). Combined by '_'
# Tips      :
# Argument  : 2 or more ref for hash of identical keys and value length.
#             One optional arg for replacing space char to the given one.
# Author    : jong@salt2.med.harvard.edu
# Category  :
# Version   : 1.6
#--------------------------------------------------------------------
sub get_common_column{
  my($i, $k,$j, $name1, $name2, $sorted_name, @in, %out, @out_chars,
     $gap_chr, @str1, @str2);
  #"""""""""""""""""""""""""""""""""""""""""""""""""""""""
  #  Sub argument handling     $gap_chr here can be 'HE' etc.
  #_______________________________________________________
  for($k=0; $k< @_ ;$k++){
	  if( ( !ref($_[$k]) )&&($_[$k]=~ /^(.)$/) ){
		  $gap_chr  .= $1;    }
	  elsif((ref($_[$k]) eq "SCALAR")&&(${$_[$k]}=~ /^(.)$/) ){
		  $gap_chr  .= $1;    }
          elsif(ref($_[$k]) eq "HASH") { push(@in,  $_[$k]); }    }

  #"""""""""""""""""""""""""""""""""""""""""""""""""""""""
  #  Checking if the input hashes were right
  #_______________________________________________________
  if( (@in < 2) && ( ref($in[0]) eq 'HASH') ){
	  print "\n", __LINE__, " # get_common_column usually needs 2 hashes. Error \n";
	  print "\n", __LINE__, " # get_common_column : Anyway, I will just return the single input hash:  @in. \n";
	  %out=%{$in[0]}; # <--- This is essential to return the single input hash!!
	  goto FINAL;
  }
  if(@in ==2){  print "\n# (INFO) \@in has 2 elements for ref. of hash, good!\n\n"; }

  %out = %{$in[0]};  ## Initializing %out
  print "\n",__LINE__, " # get_common_column hashes given are: @in \n" if $debug eq 1;

  for( $k=1; $k < @in; $k++){
      my(@out_chars);   ## <-- Necessary to prevent concatenation.
      my(%hash1)=%out;
      my(%hash2)=%{$in[$k]};
      my(@names1)= sort keys %hash1;
      my(@names2)= sort keys %hash2;
      $name1 = $names1[0];
      $name2 = $names2[0];
      @str1=split(/\||\,/, $hash1{$names1[0]});
      @str2=split(/\||\,/, $hash2{$names2[0]});

      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # Trying to guess the correct gap char
      #_____________________________________________________
      if($hash2{$names2[0]}=~/_/){
          $gap_chr='_';
      }elsif($hash2{$names2[0]}=~/(\W)\W/){
          $gap_chr=$1;
      }

      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      #  If I fail to split string by ',', then split by ''
      #_____________________________________________________
      if(@str1 == 1){  @str1=split(//, $hash1{$names1[0]}); }
      if(@str2 == 1){  @str2=split(//, $hash2{$names2[0]}); }
      for($i=0; $i < @str1; $i++){
          if($str1[$i] eq $str2[$i] ){
              push(@out_chars, $str1[$i]);
          }else{
              if( defined($gap_chr) ){
                   push(@out_chars, $gap_chr);
              }else{
                   push(@out_chars, ' ');
              }
          }
      }
      $sorted_name=join('', ($name1, $name2));
      %out='';
      $out{"$sorted_name"}= join("", @out_chars);

  }
  FINAL:
  if ($debug eq 1){
	  print "\n",__LINE__, " # get_common_column Final res. \%out :\n",
	  &show_hash(%out);
  }
  return(\%out);
}

#________________________________________________________________________
# Title     : overlay_seq_for_identical_chars
# Usage     : %out =%{&overlay_seq_for_identical_chars(\%hash1, \%hash2, '-')};
# Function  : (name1         --EHH--HHEE-- )
#             (name2         --HHH--EEEE-- ) ==> result is;
#
#             (name1_name2   -- HH--  EE-- )
#             to get the identical chars in hash strings of sequences.
#
# Example   : %out =%{&overlay_seq_for_identical_chars(\%hash1, \%hash2, '-')};
#             output> with 'E' option >>> "name1     --HHH--1232-"
# Warning   : Works only for 2 sequence hashes.
# Keywords  : Overlap, superpose hash, overlay identical chars, superpose_seq_hash
# Options   :
# Returns   : one hash ref. of the combined key name (i.e., name1_name2). Combined by '_'
# Argument  : 2 ref for hash of identical keys and value length. One optional arg for
#             replacing space char to the given one.
# Category  :
# Version   : 1.0
#--------------------------------------------------------------------
sub overlay_seq_for_identical_chars{
	my($i, $k,$j, $name1, $name2, @in, %out, @out_chars, $gap_chr, @str1, @str2);
	######################################
	####### Sub argument handling ########  $gap_chr here can be 'HE' etc.

Bioinf.pl  view on Meta::CPAN

                 $answer_char='';
                 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 print "\n# (3) write_nhco_files: chaning all the file input extension to 'parf'";
                 #_______________________________________________________________________________
                 for($i=0; $i< @file; $i++){
                     $base=${&get_base_names($file[$i])};
                     $file[$i]="$base\.$file_ext_wanted";
                 }
             }else{
                 $answer_char='';
                 print "\n\n\n# check_input_file_extension: You rudely put rubbish, I am dying. $0\n\n\n";
                 print chr(7);
                 exit;
             }
        }
        return(\@file);
}




#________________________________________________________________________________
# Title     : geanfammer_main
# Usage     : &geanfammer_main;
# Function  : The main sub of geanfammer
# Example   :
# Keywords  : main_geanfammer, geanfammer
# Options   :
# Version   : 1.9
#--------------------------------------------------------------------------------
sub geanfammer_main{
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # All the defaults, Evalues are determined in geanfammer sub
    #__________________________________________________________________
    $|=1;
    unless($algorithm){
        $algorithm='fasta'; # default search algorithm setting(will be overridden
             # by 'a=xxx' prompt argument
    }
    $sub_dir_size=2; # this is the subdirectory name char size
    $machine_readable='M';  # this is to invoke FASTA's m=10 option
    $make_msp_in_sub_dir_opt='m';
    $make_subdir_gzipped='d';
    $make_subdir_out='D';
    $Evalue_cut_single_link=0.01;
    $Evalue_cut_divclus    =0.01;
    $length_thresh=30;  # default

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
    # preprocessing the inputs,  parse_arguments reads in the options in the headerbox
    #__________________________________________________________________________________
    @your_genome_or_db_to_analyse_file=@{&parse_arguments(1)};

    print "\n\n\n# (2) geanfammer_main (with $algorithm): \@your_genome_or_db_to_analyse_file are(is)\n";
    print "\n => @your_genome_or_db_to_analyse_file with $algorithm. Min domain size is \"$length_thresh\"\n\n\n";

    if(@your_genome_or_db_to_analyse_file < 1){
        print "\n# (E) geanfammer_main: ERROR!\n";
        print "\n# Dear $ENV{'USER'}, $0: failed to find input file!\n
              Did you put FASTA format DB file as input?\n
              Or I guess your INPUT file DOES NOT exist in PWD.\n\n";
        print " As like:  $0 MG.fa \n\n\n";
        print chr(7);
        exit;
    }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
    # Checking if they have 'fasta' or 'ssearch' in the path
    #__________________________________________________________________________________
    if($algorithm=~/\/([^\/]+) *$/){
        print "\n# Checking \$algorithm ($algorithm) name\n";
        $algorithm_name=$1;
        if(-s $algorithm){
           print "\n# (i) $0 will use \"$algorithm\"\n";
        }else{
           $result_of_which_run=`which $algorithm_name`;
           if($result_of_which_run=~/^ *(\/\S+\/)[fastassearch]+\d* *$/){ ## after Lily Fu's suggestion
               print "\n# $0: Your $algorithm_name is in $1, good, I am running it\n";
           }else{
               print "\n# (E) \$algorithm value $algorithm is not found\n";
               print "\n# (E) $0 ran \'which\' Linux command and the result is:\n";
               print "  $result_of_which_run\n";
               print "\n# Please check your path for $algorithm\n\n"; print chr(7);
               exit;
           }
        }
    }
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Running the actual big sub
    #______________________________
    @final_clu_files=@{&geanfammer(\@your_genome_or_db_to_analyse_file,
          "a=$algorithm",
          "T=$length_thresh",
           $verbose,
           "d=$sub_dir_size",
           $over_write,
           $dynamic_factor,
           $create_sso_file,
           $reverse_sequence,
           $machine_readable,
           $make_msp_in_sub_dir_opt,
           "E=$Evalue_cut_single_link",
           "e=$Evalue_cut_divclus",
           $make_subdir_gzipped,
           $Lean_output,
         )};
    if($verbose){
        print "\n\n\n#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~";
        print "\n#";
        print "\n# $0  is finished \n\n";
    }
    print "\n# $0 : the final output \'@final_clu_files\' is created " if $make_separate_summary;
    print "\n#__________________________________________________________________\n\n" if $verbose;
    return(\@final_clu_files);
}




#______________________________________________________________________________
# Title     : encrypt_passwd



( run in 1.555 second using v1.01-cache-2.11-cpan-39bf76dae61 )