thread results from the CPAN

thread

App-lapidary

view release on metacpan or search on metacpan

#!/usr/bin/perl
###
#	Lapidary.pl
#	Script to determine the read coverage and identity to a protein database using diamond
#	Last modified: 19/03/2024
#	Author: Samuel Bloomfield
###

use warnings;
use strict;
use Getopt::Long;
use List::MoreUtils qw(uniq);
use FindBin qw($RealBin);
use File::Basename;
use File::Spec::Functions qw(catfile);
use LWP::Simple;
use Archive::Extract;


# here check if no arguments were given, then print help message
if (@ARGV == 0) {
    # Print the help message
    print "This script requires some arguments to run.\n";
    print "use --help to see the options\n";
    exit;
}

my $script_location = $RealBin;
my $path_variable = $ENV{'PATH'};
my $read_1;
my $read_2="";
my $db;
my $threads = 1; #Default to 1 thread
my $identity = 70; #Default to 70% identity
my $coverage = 50; #Default to 50% coverage
my $read_type;
my $help;
my $version;
my $sequence_identification = "identity";

# Check if the script location is already in the PATH
if ($path_variable !~ m/$script_location/) {
    # Add the script location to the PATH
    $ENV{'PATH'} = "$path_variable:$script_location";
}

GetOptions (	'read_1:s'		=> \$read_1,
	'read_2:s'		=> \$read_2,
	'db:s'			=> \$db,
	'threads:i'		=> \$threads,
	'identity:i'	=> \$identity,
	'coverage:i'	=> \$coverage,
	'read_type:s'	=> \$read_type,
	'sequence_identification:s'	=> \$sequence_identification,
	'help'			=> \$help,
	'version'		=> \$version);

#Print out help message
if(defined($help)){
    die "\n\nLapidary: a software for identifying amino acid sequences using sequenced reads\n\n
    Options:\n
    read_1\tLocation of first read file (required)\n
    read_2\tLocation of second read file if read files are paired\n
    db\tFull location to fasta file containing amino acid sequences (required)\n
    threads\tNumber of threads to use for Diamond (default: 1)\n
    identity\tDiamond identity percentage cut-off to use (default: 70)\n
    coverage\tDiamond coverage percentage cut-off to use (default: 50)\n
    read_type\tTypes of reads used (required): single or paired\n
    sequence_identification\tMethod for calling most likely sequence: identity (default) or consensus\n
    help\tDisplay help screen\n
    version\tReturn version of Lapidary\n\n";
} 

#Print out version
if(defined($version)){
    die "\n\nLapidary version 0.5.0\n\n";
} 


# Check if we have diamond
# Define the URL to download diamond
my $diamond_url = 'http://github.com/bbuchfink/diamond/releases/download/v2.1.8/diamond-linux64.tar.gz';

# Check if diamond is in the PATH
unless (system("which diamond >/dev/null 2>&1") == 0) {
    print "diamond not found in PATH. Downloading and installing...\n";
    
    # Get the filename from the URL
    my $filename = basename($diamond_url);
    
    # Set the destination path for download and extraction
    my $download_path = catfile($RealBin, $filename);
    my $extraction_path = $RealBin;
    
    # Download the diamond tar.gz file
    my $status = getstore($diamond_url, $download_path);
    
    # Check if download was successful
    unless ($status == 200) {
	die "Failed to download diamond: $status";
    }
    
    # Extract the tar.gz file
    my $extractor = Archive::Extract->new(archive => $download_path);
    my $result = $extractor->extract(to => $extraction_path);
    
    # Check if extraction was successful
    unless ($result) {
	die "Failed to extract diamond: " . $extractor->error;
    }
    
    # Print success message
    print "diamond has been downloaded and installed in $extraction_path\n";
}


#Create diamond database of protein sequences
my $db_name;
if ($db=~m/^.+\/(.*?)\.f.*?$/) {
    $db_name = $1;
    
    system "diamond makedb --in $db --db $db_name"

} else {
    die "File $db not in fasta format $!\n";

script/lapidary view on Meta::CPAN

my @index_sorted;

my $position_depth;

my $identity_proportion;
my $coverage_proportion;
my $mean_read_depth;

my @protein_indexes;
my @protein_index_sorted;

my @position_depth_array;
my @position_cover_array;
my @position_match_array;
my @position_start_array;
my @position_end_array;
my @index_sorted_size;

my $sample_position_current;

my $alignment_start;
my $alignment_end;

my @consensus;
my @uniq_consensus;
my @uniq_consensus_count;
my $variant_count;

my @consensus_positions;
my @consensus_chunks;

my $read_identity;
my @sample_identity;
my @protein_identity;
my @consensus_identity;
my @sample_length;

my @identity_index;

my $adjusted_identity;
my $adjusted_coverage;
my @adjusted_amino_acids;

my $partial_match;
my $partial_count;
my $partial_position_current;  

my $temp;

#Use first read file to name outputs
if ($read_1=~m/^.+\/(.*?)\.f.*?$/) { 
    $concatenated = ($1 . "_concatenated.fastq.gz");
    $diamond_output = ($1 . "_diamond_read_alignments.txt");
    $protein_cut_off = ($1 . "_protein_cut_offs.txt");

    if($read_type eq "paired") {
	#Concaternate read files
	system "cat $read_1 $read_2 > $concatenated";
	
	#Run diamond on concatenated reads
	system "diamond blastx -k 1000000000000 -e 1.0 -k0 --matrix BLOSUM45 -q $concatenated --db $db_name --threads $threads --id $identity --query-cover $coverage -o $diamond_output -f 6 stitle qseqid qseq_translated qstart qend sstart send qcovhsp piden...
	
	#Remove concatenated reads
	system "rm $concatenated";
    } elsif ($read_type eq "single") {
	#Run diamond on single reads
	system "diamond blastx -k 1000000000000 -e 1.0 -k0 --matrix BLOSUM45 -q $read_1 --db $db_name --threads $threads --id $identity --query-cover $coverage -o $diamond_output -f 6 stitle qseqid qseq_translated qstart qend sstart send qcovhsp pident qlen...
    }

    #Read in diamond_output and extract read information
    @sample_proteins = ();	
#	@sample_reads = ();
    @sample_positions = ();
    @sample_amino_acids = ();
    @sample_identity = ();
    @sample_length = ();

    open DIAMOND, $diamond_output or die "File $diamond_output NOT FOUND - $!\n";
    foreach $_ (<DIAMOND>) {
	if ($_=~m/^(.*?)\t(.*?)\t(.*?)\t(\d+\.*\d*)\t(\d+\.*\d*)\t(\d+\.*\d*)\t\d+\.*\d*\t\d+\.*\d*\t(\d+\.*\d*)\t(\d+)\t(\d+)$/) {
	$read_protein = $1;
#		$read_name = $2;
	$read_translate = $3;
	$start_temp = $4;
	$end_temp = $5;
	$read_position = ($6 - 1);
	$read_identity = $7;
	$read_length = $8;
	$protein_length = $9;
	    
	#Make read_start smallest and read_end largest
	if ($start_temp < $end_temp) {
	    $read_start = $start_temp;
	    $read_end = $end_temp;
	} else {
	    $read_start = $end_temp;
	    $read_end = $start_temp;
	}
	    
	    
	#Ignore read if alignment is in the middle of the read - account for two nucleotides due to frame shifts
	$read_alignment_QC = 0;
	    
	if($read_start > 3 ){
	    $read_alignment_QC++;
	}
	    
	if($read_end < ($read_length - 2)){
	    $read_alignment_QC++;
	}					
	    
	if($read_alignment_QC < 2) {
	    push @sample_proteins, $read_protein;
#			push @sample_reads, $read_name;
	    push @sample_positions, $read_position;
	    push @sample_amino_acids, $read_translate;	
	    push @sample_identity, $read_identity;
	    push @sample_length, $read_length;

	    }
	}
    }

    close DIAMOND;
	
    #Identify protein sequences with read matches
    @unique_proteins = uniq @sample_proteins;

( run in 0.947 second using v1.01-cache-2.11-cpan-c966e8aa7e8 )