App-lapidary
view release on metacpan or search on metacpan
script/lapidary view on Meta::CPAN
#!/usr/bin/perl
###
# Lapidary.pl
# Script to determine the read coverage and identity to a protein database using diamond
# Last modified: 19/03/2024
# Author: Samuel Bloomfield
###
use warnings;
use strict;
use Getopt::Long;
use List::MoreUtils qw(uniq);
use FindBin qw($RealBin);
use File::Basename;
use File::Spec::Functions qw(catfile);
use LWP::Simple;
use Archive::Extract;
# here check if no arguments were given, then print help message
if (@ARGV == 0) {
# Print the help message
print "This script requires some arguments to run.\n";
print "use --help to see the options\n";
exit;
}
my $script_location = $RealBin;
my $path_variable = $ENV{'PATH'};
my $read_1;
my $read_2="";
my $db;
my $threads = 1; #Default to 1 thread
my $identity = 70; #Default to 70% identity
my $coverage = 50; #Default to 50% coverage
my $read_type;
my $help;
my $version;
my $sequence_identification = "identity";
# Check if the script location is already in the PATH
if ($path_variable !~ m/$script_location/) {
# Add the script location to the PATH
$ENV{'PATH'} = "$path_variable:$script_location";
}
GetOptions ( 'read_1:s' => \$read_1,
'read_2:s' => \$read_2,
'db:s' => \$db,
'threads:i' => \$threads,
'identity:i' => \$identity,
'coverage:i' => \$coverage,
'read_type:s' => \$read_type,
'sequence_identification:s' => \$sequence_identification,
'help' => \$help,
'version' => \$version);
#Print out help message
if(defined($help)){
die "\n\nLapidary: a software for identifying amino acid sequences using sequenced reads\n\n
Options:\n
read_1\tLocation of first read file (required)\n
read_2\tLocation of second read file if read files are paired\n
db\tFull location to fasta file containing amino acid sequences (required)\n
threads\tNumber of threads to use for Diamond (default: 1)\n
identity\tDiamond identity percentage cut-off to use (default: 70)\n
coverage\tDiamond coverage percentage cut-off to use (default: 50)\n
read_type\tTypes of reads used (required): single or paired\n
sequence_identification\tMethod for calling most likely sequence: identity (default) or consensus\n
help\tDisplay help screen\n
version\tReturn version of Lapidary\n\n";
}
#Print out version
if(defined($version)){
die "\n\nLapidary version 0.5.0\n\n";
}
# Check if we have diamond
# Define the URL to download diamond
my $diamond_url = 'http://github.com/bbuchfink/diamond/releases/download/v2.1.8/diamond-linux64.tar.gz';
# Check if diamond is in the PATH
unless (system("which diamond >/dev/null 2>&1") == 0) {
print "diamond not found in PATH. Downloading and installing...\n";
# Get the filename from the URL
my $filename = basename($diamond_url);
# Set the destination path for download and extraction
my $download_path = catfile($RealBin, $filename);
my $extraction_path = $RealBin;
# Download the diamond tar.gz file
my $status = getstore($diamond_url, $download_path);
# Check if download was successful
unless ($status == 200) {
die "Failed to download diamond: $status";
}
# Extract the tar.gz file
my $extractor = Archive::Extract->new(archive => $download_path);
my $result = $extractor->extract(to => $extraction_path);
# Check if extraction was successful
unless ($result) {
die "Failed to extract diamond: " . $extractor->error;
}
# Print success message
print "diamond has been downloaded and installed in $extraction_path\n";
}
#Create diamond database of protein sequences
my $db_name;
if ($db=~m/^.+\/(.*?)\.f.*?$/) {
$db_name = $1;
system "diamond makedb --in $db --db $db_name"
} else {
die "File $db not in fasta format $!\n";
script/lapidary view on Meta::CPAN
my @index_sorted;
my $position_depth;
my $identity_proportion;
my $coverage_proportion;
my $mean_read_depth;
my @protein_indexes;
my @protein_index_sorted;
my @position_depth_array;
my @position_cover_array;
my @position_match_array;
my @position_start_array;
my @position_end_array;
my @index_sorted_size;
my $sample_position_current;
my $alignment_start;
my $alignment_end;
my @consensus;
my @uniq_consensus;
my @uniq_consensus_count;
my $variant_count;
my @consensus_positions;
my @consensus_chunks;
my $read_identity;
my @sample_identity;
my @protein_identity;
my @consensus_identity;
my @sample_length;
my @identity_index;
my $adjusted_identity;
my $adjusted_coverage;
my @adjusted_amino_acids;
my $partial_match;
my $partial_count;
my $partial_position_current;
my $temp;
#Use first read file to name outputs
if ($read_1=~m/^.+\/(.*?)\.f.*?$/) {
$concatenated = ($1 . "_concatenated.fastq.gz");
$diamond_output = ($1 . "_diamond_read_alignments.txt");
$protein_cut_off = ($1 . "_protein_cut_offs.txt");
if($read_type eq "paired") {
#Concaternate read files
system "cat $read_1 $read_2 > $concatenated";
#Run diamond on concatenated reads
system "diamond blastx -k 1000000000000 -e 1.0 -k0 --matrix BLOSUM45 -q $concatenated --db $db_name --threads $threads --id $identity --query-cover $coverage -o $diamond_output -f 6 stitle qseqid qseq_translated qstart qend sstart send qcovhsp piden...
#Remove concatenated reads
system "rm $concatenated";
} elsif ($read_type eq "single") {
#Run diamond on single reads
system "diamond blastx -k 1000000000000 -e 1.0 -k0 --matrix BLOSUM45 -q $read_1 --db $db_name --threads $threads --id $identity --query-cover $coverage -o $diamond_output -f 6 stitle qseqid qseq_translated qstart qend sstart send qcovhsp pident qlen...
}
#Read in diamond_output and extract read information
@sample_proteins = ();
# @sample_reads = ();
@sample_positions = ();
@sample_amino_acids = ();
@sample_identity = ();
@sample_length = ();
open DIAMOND, $diamond_output or die "File $diamond_output NOT FOUND - $!\n";
foreach $_ (<DIAMOND>) {
if ($_=~m/^(.*?)\t(.*?)\t(.*?)\t(\d+\.*\d*)\t(\d+\.*\d*)\t(\d+\.*\d*)\t\d+\.*\d*\t\d+\.*\d*\t(\d+\.*\d*)\t(\d+)\t(\d+)$/) {
$read_protein = $1;
# $read_name = $2;
$read_translate = $3;
$start_temp = $4;
$end_temp = $5;
$read_position = ($6 - 1);
$read_identity = $7;
$read_length = $8;
$protein_length = $9;
#Make read_start smallest and read_end largest
if ($start_temp < $end_temp) {
$read_start = $start_temp;
$read_end = $end_temp;
} else {
$read_start = $end_temp;
$read_end = $start_temp;
}
#Ignore read if alignment is in the middle of the read - account for two nucleotides due to frame shifts
$read_alignment_QC = 0;
if($read_start > 3 ){
$read_alignment_QC++;
}
if($read_end < ($read_length - 2)){
$read_alignment_QC++;
}
if($read_alignment_QC < 2) {
push @sample_proteins, $read_protein;
# push @sample_reads, $read_name;
push @sample_positions, $read_position;
push @sample_amino_acids, $read_translate;
push @sample_identity, $read_identity;
push @sample_length, $read_length;
}
}
}
close DIAMOND;
#Identify protein sequences with read matches
@unique_proteins = uniq @sample_proteins;
( run in 1.259 second using v1.01-cache-2.11-cpan-39bf76dae61 )