BioPerl

 view release on metacpan or  search on metacpan

examples/tools/standaloneblast.pl  view on Meta::CPAN

#   vary_masks(): Example demonstrating varying of parameter
#
#  args:
#	$queryseq  - query sequence (can be filename (fasta),  or Bio:Seq object)
#	$maskvalues  - reference to array of values to be used for the mask threshold
#  returns: nothing

# Now we'll perform several blasts, 1 for each value of the mask threshold.
# In the default case, we use thresholds of 25%, 50% and 75%. (Recall the threshold is
# % of resudues which must match the consensus residue before deciding to use the
# position specific scoring matrix rather than the default - BLOSUM or PAM - matrix)
# We then automatically parse the resulting reports to identify which hits
# are found with any of the mask threshold values and which with only one of them.
#

sub vary_masks {

my $queryseq = shift;
my $values = shift;


print "Beginning mask-varying example... \n";

my ($report, $sbjct, $maskvalue);

my $hashhits = { };     # key is hit id, value is string of param values for which hit was found

# Get the alignment file
my $str = Bio::AlignIO->new(-file=> "$queryaln", '-format' => "$queryalnformat", );
my $aln = $str->next_aln();

foreach $maskvalue (@$values)  {

    print "Performing BLAST with mask threshold = $maskvalue % \n";

    # Create the proper mask for 'jumpstarting'
    my $mask = &create_mask($aln, $maskvalue);
    my $report2 = $factory->blastpgp($queryseq, $aln, $mask);
    my $r = $report2->next_result;
    while($sbjct = $r->next_hit) {
	$hashhits->{$sbjct->name} .= "$maskvalue";			
    }
}

&compare_runs( 'mask threshold' , $values , $hashhits);

return 1;

}

#################################################
#  aligned_blast ():
#
#
#  args: 
#	$queryseq  - query sequence (can be filename (fasta),  or Bio:Seq object) 
#	$queryaln  - file containing alignment to be used to "jumpstart" psiblast in "-B mode"
#			$queryaln *must contain $queryseq with the same name and length
#				(psiblast is very picky)
#	$queryalnformat  - format of alignment (can = "fasta", "msf", etc)
#	$jiter  - number of iterations in psiblast run
#	$maskvalue  - threshold indicating how similar residues must be at a sequence location
#		before position-specific-scoring matrix is used
#		: "0" => use position specific matrix at all residues,  or
#			"100" => use default (eg BLOSUM) at all residues
#  returns: nothing  


# For this example, we'll compare the results of psiblast depending on whether psiblast itself is 

#  used to identify an alignment to use for blasting or whether an external alignment is given to 
#  psiblast

sub aligned_blast {


my     $queryseq  =  shift; 
my	$queryaln  =  shift; 
my	$queryalnformat  =  shift;
my	$jiter  =  shift;
my	$maskvalue  =  shift;

my $hashhits = { };
my ($sbjct, $id);

print "\nBeginning aligned blast example... \n";


# First we do a  single-iteration psiblast search but with a specified alignment to
#  "jump start" psiblast


print "\nBeginning jump-start psiblast ... \n";


my $tag1 = 'jumpstart';

# $factory->j('1');    # perform single iteration

# Get the alignment file
my $str = Bio::AlignIO->new(-file=> "$queryaln", '-format' => "$queryalnformat", );
my $aln = $str->next_aln();


# Create the proper mask for 'jumpstarting'
my $mask = &create_mask($aln, $maskvalue);


my $report2 = $factory->blastpgp($queryseq, $aln, $mask);
while($sbjct = $report2->next_result) {
		$hashhits->{$sbjct->name} .= "$tag1";			
}

# Then we do a "plain" psiblast multiple-iteration search

print "\nBeginning multiple-iteration psiblast ... \n";

my $undefineB ;
  $factory->B($undefineB);

my $tag2 = 'iterated';
$factory->j($jiter);    # 'j' is blast parameter for # of iterations
my $report1 = $factory->blastpgp($queryseq);
my $total_iterations = $report1->number_of_iterations;
my $last_iteration = $report1->round($total_iterations);


 while($sbjct = $last_iteration->next_result) {
		$hashhits->{$sbjct->name} .= "$tag2";			
	}

# Now we compare the results of the searches

my $tagtype = 'iterated_or_jumpstart'; 
my $values = [ $tag1, $tag2];

&compare_runs( $tagtype , $values , $hashhits);  
 
return 1;

}


#################################################


# create_mask(): creates a mask for the psiblast jumpstart alignment
#                that determines at what residues position-specific
#                scoring matrices (PSSMs) are used and at what
#                residues default scoring matrices (eg BLOSUM) are
#                used. See psiblast documentation for more details,

#  args: 
#	$aln  -  SimpleAlign object with alignment
#	$maskvalue  -  label describing type of "tags"
#  returns: actual mask, ie a string of 0's and 1's which is the 
#           same length as each sequence in the alignment and has 
#           a "1" at locations where (PSSMs) are to be used
#           and a "0" at all other locations.


sub create_mask {
	my $aln = shift;
	my $maskvalue = shift;
	my $mask = "";

	die "psiblast jumpstart requires all sequences to be same length \n"
	  unless $aln->is_flush();
	my $len = $aln->length();

	if ($maskvalue =~ /^(\d){1,3}$/  ) {
		$mask = $aln->consensus_string($maskvalue) ;
		$mask =~ s/[^\?]/1/g ;
		$mask =~ s/\?/0/g ;
	}
	else { die "maskvalue must be an integer between 0 and 100 \n"; }
	return $mask ;
}

#----------------
sub example_usage {
#----------------

#-----------------------
# Prints usage information for general parameters.

    print STDERR <<"QQ_PARAMS_QQ";

 Command-line accessible script variables and commands:
 -------------------------------
 -h 		:  Display this usage info and exit.
 -in <str>	:  File containing input sequences in fasta format (default = $queryseq) .
 -inaln <str>	:  File containing input alignment for example 3 (default = $queryaln) .
 -alnfmt <str>	:  Format of input alignment for example 3, eg "msf", "fasta", "pfam".
		   (default = $queryalnformat) .
 -do <int>	:  Number of test to be executed ("1" => vary parameters,
		   "3" => compare iterated & jumpstart psiblast.) If omitted,
		   three default tests performed.
 -exec <str>  	:  Blast executable to be used in example 1.  Can be "blastall" or
		   "blastpgp" (default is "blastpgp").
 -param <str>  	:  Parameter to be varied in example 1. Any blast parameter
		   can be varied (default = 'MATRIX')
 -paramvals <str>:  String containing parameter values in example 1, separated
		   by ":"'s. (default = 'BLOSUM62:BLOSUM80:PAM70')
 -iter <int>    :  Maximum number of iterations in psiblast in example 3 (default = 2)
 -maskvals <str>:  String containing mask threshold values (in per-cents) for example 3,
		   separated by ":"'s. (default = '50:75:25')

In addition, any valid Blast parameter can be set using the syntax "parameter=>value" as in "database=>swissprot"

So some typical command lines might be:
 >standaloneblast.pl -do 1 -param expectation -paramvals '1e-10:1e-5'
or
 >standaloneblast.pl -do 1 -exec blastall -param q -paramvals '-1:-7' -in='t/dna1.fa' "pr=>blastn" "d=>ecoli.nt"
or
 >standaloneblast.pl -do 4 -maskvals 0 -iter 3
or
 >standaloneblast.pl -do 3 -maskvals '10:50:90'  -in 't/data/cysprot1.fa' -alnfmt msf -inaln 't/cysprot.msf'



QQ_PARAMS_QQ
}



( run in 1.213 second using v1.01-cache-2.11-cpan-71847e10f99 )