view release on metacpan or search on metacpan
## Usage
```
Usage: roary [options] *.gff
Options: -p INT number of threads [1]
-o STR clusters output filename [clustered_proteins]
-f STR output directory [.]
-e create a multiFASTA alignment of core genes using PRANK
-n fast core gene alignment with MAFFT, use with -e
-i minimum percentage identity for blastp [95]
-cd FLOAT percentage of isolates a gene must be in to be core [99]
-qc generate QC report with Kraken
-k STR path to Kraken database for QC, use with -qc
-a check dependancies and print versions
-b STR blastp executable [blastp]
-c STR mcl executable [mcl]
-d STR mcxdeblast executable [mcxdeblast]
-g INT maximum number of clusters [50000]
-m STR makeblastdb executable [makeblastdb]
-r create R plots, requires R and ggplot2
-s dont split paralogs
bin/create_pan_genome_plots.R view on Meta::CPAN
mydata = read.table("number_of_genes_in_pan_genome.Rtab")
boxplot(mydata, data=mydata, main="No. of genes in the pan-genome",
xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)
mydata = read.table("number_of_unique_genes.Rtab")
boxplot(mydata, data=mydata, main="Number of unique genes",
xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)
mydata = read.table("blast_identity_frequency.Rtab")
plot(mydata,main="Number of blastp hits with different percentage identity", xlab="Blast percentage identity", ylab="No. blast results")
library(ggplot2)
conserved = colMeans(read.table("number_of_conserved_genes.Rtab"))
total = colMeans(read.table("number_of_genes_in_pan_genome.Rtab"))
genes = data.frame( genes_to_genomes = c(conserved,total),
genomes = c(c(1:length(conserved)),c(1:length(conserved))),
Key = c(rep("Conserved genes",length(conserved)), rep("Total genes",length(total))) )
bin/roary-create_pan_genome_plots.R view on Meta::CPAN
mydata = read.table("number_of_genes_in_pan_genome.Rtab")
boxplot(mydata, data=mydata, main="No. of genes in the pan-genome",
xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)
mydata = read.table("number_of_unique_genes.Rtab")
boxplot(mydata, data=mydata, main="Number of unique genes",
xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)
mydata = read.table("blast_identity_frequency.Rtab")
plot(mydata,main="Number of blastp hits with different percentage identity", xlab="Blast percentage identity", ylab="No. blast results")
library(ggplot2)
conserved = colMeans(read.table("number_of_conserved_genes.Rtab"))
total = colMeans(read.table("number_of_genes_in_pan_genome.Rtab"))
genes = data.frame( genes_to_genomes = c(conserved,total),
genomes = c(c(1:length(conserved)),c(1:length(conserved))),
Key = c(rep("Conserved genes",length(conserved)), rep("Total genes",length(total))) )
lib/Bio/Roary/AccessoryBinaryFasta.pm view on Meta::CPAN
use Bio::Roary::AnnotateGroups;
use Bio::Roary::AnalyseGroups;
use Bio::Roary::Exceptions;
use Bio::SeqIO;
use File::Basename;
has 'input_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'annotate_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 );
has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnalyseGroups', required => 1 );
has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'accessory_binary_genes.fa' );
has 'lower_bound_percentage' => ( is => 'ro', isa => 'Int', default => 5 );
has 'upper_bound_percentage' => ( is => 'ro', isa => 'Int', default => 5 );
has 'max_accessory_to_include' => ( is => 'ro', isa => 'Int', default => 4000 );
has 'groups_to_files' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_files' );
has '_lower_bound_value' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__lower_bound_value' );
has '_upper_bound_value' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__upper_bound_value' );
sub _build__groups_to_files {
my ($self) = @_;
my %groups_to_files;
for my $group ( @{ $self->annotate_groups_obj->_groups } ) {
my $genes = $self->annotate_groups_obj->_groups_to_id_names->{$group};
lib/Bio/Roary/AccessoryBinaryFasta.pm view on Meta::CPAN
}
$groups_to_files{$group} = \%filenames;
}
return \%groups_to_files;
}
sub _build__lower_bound_value {
my ($self) = @_;
my $num_files = @{ $self->input_files };
return ceil( $num_files * ( $self->lower_bound_percentage / 100 ) );
}
sub _build__upper_bound_value {
my ($self) = @_;
my $num_files = @{ $self->input_files };
return $num_files - ceil( $num_files * ( $self->upper_bound_percentage / 100 ) );
}
sub create_accessory_binary_fasta {
my ($self) = @_;
my $out_seq_io = Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' );
for my $full_filename ( @{ $self->input_files } ) {
my($filename, $dirs, $suffix) = fileparse($full_filename);
my $output_sequence = '';
lib/Bio/Roary/AssemblyStatistics.pm view on Meta::CPAN
use Moose;
use Bio::Roary::ExtractCoreGenesFromSpreadsheet;
use Log::Log4perl qw(:easy);
with 'Bio::Roary::SpreadsheetRole';
has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'assembly_statistics.csv' );
has 'job_runner' => ( is => 'ro', isa => 'Str', default => 'Local' );
has 'cpus' => ( is => 'ro', isa => 'Int', default => 1 );
has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 );
has '_cloud_percentage' => ( is => 'rw', isa => 'Num', default => 0.15 );
has '_shell_percentage' => ( is => 'rw', isa => 'Num', default => 0.95 );
has '_soft_core_percentage' => ( is => 'rw', isa => 'Num', default => 0.99 );
has 'verbose' => ( is => 'ro', isa => 'Bool', default => 0 );
has 'contiguous_window' => ( is => 'ro', isa => 'Int', default => 10 );
has 'ordered_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_genes' );
has '_genes_to_rows' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_rows' );
has 'all_sample_statistics' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_all_sample_statistics' );
has 'sample_names_to_column_index' => ( is => 'rw', isa => 'Maybe[HashRef]' );
has 'summary_output_filename'=> ( is => 'ro', isa => 'Str', default => 'summary_statistics.txt' );
has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger');
has 'gene_category_count' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_gene_category_count' );
lib/Bio/Roary/AssemblyStatistics.pm view on Meta::CPAN
Log::Log4perl->easy_init( $ERROR );
my $logger = get_logger();
return $logger;
}
sub create_summary_output
{
my ($self) = @_;
open(my $fh, '>', $self->summary_output_filename) or Bio::Roary::Exceptions::CouldntWriteToFile->throw(error => "Couldnt write to ".$self->summary_output_filename);
my $core_percentage = $self->core_definition()*100;
my $soft_core_percentage = $self->_soft_core_percentage*100;
my $shell_percentage = $self->_shell_percentage()*100;
my $cloud_percentage = $self->_cloud_percentage()*100;
my $core_genes = ($self->gene_category_count->{core} ? $self->gene_category_count->{core} : 0);
my $soft_core_genes = ($self->gene_category_count->{soft_core} ? $self->gene_category_count->{soft_core} : 0);
my $shell_genes =($self->gene_category_count->{shell} ? $self->gene_category_count->{shell} : 0);
my $cloud_genes = ($self->gene_category_count->{cloud} ? $self->gene_category_count->{cloud} : 0);
my $total_genes = $core_genes + $soft_core_genes + $shell_genes + $cloud_genes ;
$self->logger->warn("Very few core genes detected with the current settings. Try modifying the core definition ( -cd 90 ) and/or
the blast identity (-i 70) parameters. Also try checking for contamination (-qc) and ensure you only have one species.") if($core_genes < 100);
print {$fh} "Core genes\t($core_percentage".'% <= strains <= 100%)'."\t$core_genes\n";
print {$fh} "Soft core genes\t(".$shell_percentage."% <= strains < ".$soft_core_percentage."%)\t$soft_core_genes\n";
print {$fh} "Shell genes\t(".$cloud_percentage."% <= strains < ".$shell_percentage."%)\t$shell_genes\n";
print {$fh} "Cloud genes\t(0% <= strains < ".$cloud_percentage."%)\t$cloud_genes\n";
print {$fh} "Total genes\t(0% <= strains <= 100%)\t$total_genes\n";
close($fh);
return 1;
}
sub _build_gene_category_count {
my ($self) = @_;
my %gene_category_count;
$self->_soft_core_percentage($self->core_definition);
if ( $self->_soft_core_percentage <= $self->_shell_percentage ) {
$self->_shell_percentage( $self->_soft_core_percentage - 0.01 );
}
my $number_of_samples = keys %{ $self->sample_names_to_column_index };
for my $gene_name ( keys %{ $self->_genes_to_rows } ) {
my $isolates_with_gene = 0;
for ( my $i = $self->_num_fixed_headers ; $i < @{ $self->_genes_to_rows->{$gene_name} } ; $i++ ) {
$isolates_with_gene++
if ( defined( $self->_genes_to_rows->{$gene_name}->[$i] ) && $self->_genes_to_rows->{$gene_name}->[$i] ne "" );
}
if ( $isolates_with_gene < $self->_cloud_percentage() * $number_of_samples ) {
$gene_category_count{cloud}++;
}
elsif ( $isolates_with_gene < $self->_shell_percentage() * $number_of_samples ) {
$gene_category_count{shell}++;
}
elsif ( $isolates_with_gene < $self->_soft_core_percentage() * $number_of_samples ) {
$gene_category_count{soft_core}++;
}
else {
$gene_category_count{core}++;
}
}
return \%gene_category_count;
}
sub _build_ordered_genes {
lib/Bio/Roary/CombinedProteome.pm view on Meta::CPAN
version 3.13.0
=head1 SYNOPSIS
Take in multiple FASTA sequences containing proteomes and concat them together and output a FASTA file, filtering out more than 5% X's
use Bio::Roary::CombinedProteome;
my $obj = Bio::Roary::CombinedProteome->new(
proteome_files => ['abc.fa','efg.fa'],
output_filename => 'example_output.fa',
maximum_percentage_of_unknowns => 5.0,
);
$obj->create_combined_proteome_file;
=head1 AUTHOR
Andrew J. Page <ap13@sanger.ac.uk>
=head1 COPYRIGHT AND LICENSE
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
lib/Bio/Roary/CommandLine/AssemblyStatistics.pm view on Meta::CPAN
sub usage_text {
my ($self) = @_;
return <<USAGE;
Usage: pan_genome_assembly_statistics [options] gene_presence_absence.csv
Take in a gene presence and absence spreadsheet and output some statistics
Options: -p INT number of threads [1]
-o STR output filename [assembly_statistics.csv]
-cd FLOAT percentage of isolates a gene must be in to be core [99]
-v verbose output to STDOUT
-w print version and exit
-h this help message
Example: Run with defaults
pan_genome_assembly_statistics gene_presence_absence.csv
For further information see: http://sanger-pathogens.github.io/Roary/
USAGE
}
lib/Bio/Roary/CommandLine/CreatePanGenome.pm view on Meta::CPAN
return <<USAGE;
Usage: create_pan_genome [options] *.gff
Build a pan genome with WTSI defaults.
Options: -p INT number of threads [1]
-o STR clusters output filename [clustered_proteins]
-f STR output directory [.]
-e create a multiFASTA alignment of core genes
-n fast core gene alignement with MAFFT, use with -e
-i minimum percentage identity for blastp [95]
-cd FLOAT percentage of isolates a gene must be in to be core [99]
-z dont delete intermediate files
-t INT translation table [11]
-v verbose output to STDOUT
-y add gene inference information to spreadsheet, doesnt work with -e
-g INT maximum number of clusters [50000]
-qc generate QC report with Kraken
-k STR path to Kraken database for QC, use with -qc
-w print version and exit
-a check dependancies and print versions
-h this help message
lib/Bio/Roary/CommandLine/IterativeCdhit.pm view on Meta::CPAN
has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );
has '_error_message' => ( is => 'rw', isa => 'Str' );
has 'output_cd_hit_filename' => ( is => 'rw', isa => 'Str', default => '_clustered' );
has 'output_combined_filename' => ( is => 'rw', isa => 'Str', default => '_combined_files' );
has 'number_of_input_files' => ( is => 'rw', isa => 'Int', default => 1 );
has 'output_filtered_clustered_fasta' => ( is => 'rw', isa => 'Str', default => '_clustered_filtered.fa' );
has 'lower_bound_percentage' => ( is => 'rw', isa => 'Num', default => 0.98 );
has 'upper_bound_percentage' => ( is => 'rw', isa => 'Num', default => 0.99 );
has 'step_size_percentage' => ( is => 'rw', isa => 'Num', default => 0.005 );
has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 );
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
sub BUILD {
my ($self) = @_;
my ( $output_cd_hit_filename,$cpus,$lower_bound_percentage,$upper_bound_percentage,$step_size_percentage, $output_combined_filename, $number_of_input_files, $output_filtered_clustered_fasta,$verbose,
$help );
GetOptionsFromArray(
$self->args,
'c|output_cd_hit_filename=s' => \$output_cd_hit_filename,
'm|output_combined_filename=s' => \$output_combined_filename,
'n|number_of_input_files=i' => \$number_of_input_files,
'f|output_filtered_clustered_fasta=s' => \$output_filtered_clustered_fasta,
'l|lower_bound_percentage=s' => \$lower_bound_percentage,
'u|upper_bound_percentage=s' => \$upper_bound_percentage,
's|step_size_percentage=s' => \$step_size_percentage,
'p|cpus=i' => \$cpus,
'v|verbose' => \$verbose,
'h|help' => \$help,
);
if ( defined($verbose) ) {
$self->verbose($verbose);
$self->logger->level(10000);
}
$self->help($help) if(defined($help));
$self->lower_bound_percentage($lower_bound_percentage/100) if ( defined($lower_bound_percentage) );
$self->upper_bound_percentage($upper_bound_percentage/100) if ( defined($upper_bound_percentage) );
$self->step_size_percentage($step_size_percentage/100) if ( defined($step_size_percentage) );
$self->output_cd_hit_filename($output_cd_hit_filename) if ( defined($output_cd_hit_filename) );
$self->output_combined_filename($output_combined_filename) if ( defined($output_combined_filename) );
$self->number_of_input_files($number_of_input_files) if ( defined($number_of_input_files) );
$self->cpus($cpus) if ( defined($cpus) );
$self->output_filtered_clustered_fasta($output_filtered_clustered_fasta)
if ( defined($output_filtered_clustered_fasta) );
}
sub run {
lib/Bio/Roary/CommandLine/IterativeCdhit.pm view on Meta::CPAN
if ( defined( $self->_error_message ) ) {
print $self->_error_message . "\n";
die $self->usage_text;
}
my $obj = Bio::Roary::IterativeCdhit->new(
output_cd_hit_filename => $self->output_cd_hit_filename,
output_combined_filename => $self->output_combined_filename,
number_of_input_files => $self->number_of_input_files,
output_filtered_clustered_fasta => $self->output_filtered_clustered_fasta,
lower_bound_percentage => $self->lower_bound_percentage,
upper_bound_percentage => $self->upper_bound_percentage,
step_size_percentage => $self->step_size_percentage,
cpus => $self->cpus,
logger => $self->logger
);
$obj->run;
}
sub usage_text {
my ($self) = @_;
lib/Bio/Roary/CommandLine/IterativeCdhit.pm view on Meta::CPAN
Usage: iterative_cdhit [options]
Iteratively cluster a FASTA file of proteins with CD-hit, lower the threshold each time and extracting core genes (1 per isolate) to another file, and remove them from the input proteins file.
Required arguments:
-m STR input FASTA file of protein sequences [_combined_files]
Options: -p INT number of threads [1]
-n INT number of isolates [1]
-c STR cd-hit output filename [_clustered]
-f STR output filename for filtered sequences [_clustered_filtered.fa]
-l FLOAT lower bound percentage identity [98.0]
-u FLOAT upper bound percentage identity [99.0]
-s FLOAT step size for percentage identity [0.5]
-v verbose output to STDOUT
-h this help message
For further info see: http://sanger-pathogens.github.io/Roary/
USAGE
}
__PACKAGE__->meta->make_immutable;
no Moose;
1;
lib/Bio/Roary/CommandLine/ParallelAllAgainstAllBlastp.pm view on Meta::CPAN
);
my $output_combined_filename;
if(@{$self->fasta_files} > 1)
{
$output_combined_filename = 'combined_files.fa';
$self->logger->info("Combining protein files");
my $combine_fasta_files = Bio::Roary::CombinedProteome->new(
proteome_files => $prepare_input_files->fasta_files,
output_filename => $output_combined_filename,
maximum_percentage_of_unknowns => 5.0,
apply_unknowns_filter => 0
);
$combine_fasta_files->create_combined_proteome_file;
}
else
{
$output_combined_filename = $self->fasta_files->[0];
}
$self->logger->info("Beginning all against all blast");
lib/Bio/Roary/CommandLine/QueryRoary.pm view on Meta::CPAN
sub usage_text {
my ($self) = @_;
return <<USAGE;
Usage: query_pan_genome [options] *.gff
Perform set operations on the pan genome to see the gene differences between groups of isolates.
Options: -g STR groups filename [clustered_proteins]
-a STR action (union/intersection/complement/gene_multifasta/difference) [union]
-c FLOAT percentage of isolates a gene must be in to be core [99]
-o STR output filename [pan_genome_results]
-n STR comma separated list of gene names for use with gene_multifasta action
-i STR comma separated list of filenames, comparison set one
-t STR comma separated list of filenames, comparison set two
-v verbose output to STDOUT
-h this help message
Examples:
Union of genes found in isolates
query_pan_genome -a union *.gff
lib/Bio/Roary/CommandLine/Roary.pm view on Meta::CPAN
$self->blastp_exec($blastp_exec) if ( defined($blastp_exec) );
$self->mcxdeblast_exec($mcxdeblast_exec) if ( defined($mcxdeblast_exec) );
$self->mcl_exec($mcl_exec) if ( defined($mcl_exec) );
$self->cpus($cpus) if ( defined($cpus) );
$self->inflation_value($inflation_value) if ( defined($inflation_value));
if ( defined($perc_identity) ) {
$self->perc_identity($perc_identity);
if ( $perc_identity < 50 ) {
$self->logger->error(
"The percentage identity is too low. Either something is wrong with your data, like contamination, or your doing something that the software isnt designed to support."
);
}
}
$self->mafft($mafft) if ( defined($mafft) );
$self->apply_unknowns_filter($apply_unknowns_filter)
if ( defined($apply_unknowns_filter) );
if ( defined($output_multifasta_files) ) {
if ( which('prank') ) {
lib/Bio/Roary/CommandLine/Roary.pm view on Meta::CPAN
my ($self) = @_;
return <<USAGE;
Usage: roary [options] *.gff
Options: -p INT number of threads [1]
-o STR clusters output filename [clustered_proteins]
-f STR output directory [.]
-e create a multiFASTA alignment of core genes using PRANK
-n fast core gene alignment with MAFFT, use with -e
-i minimum percentage identity for blastp [95]
-cd FLOAT percentage of isolates a gene must be in to be core [99]
-qc generate QC report with Kraken
-k STR path to Kraken database for QC, use with -qc
-a check dependancies and print versions
-b STR blastp executable [blastp]
-c STR mcl executable [mcl]
-d STR mcxdeblast executable [mcxdeblast]
-g INT maximum number of clusters [50000]
-m STR makeblastdb executable [makeblastdb]
-r create R plots, requires R and ggplot2
-s dont split paralogs
lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm view on Meta::CPAN
}
sub usage_text {
my ($self) = @_;
return <<USAGE;
Usage: pan_genome_core_alignment [options]
Create an alignment of core genes from the spreadsheet and the directory of gene multi-FASTAs.
Options: -o STR output filename [core_gene_alignment.aln]
-cd FLOAT percentage of isolates a gene must be in to be core [99]
-m STR directory containing gene multi-FASTAs [pan_genome_sequences]
-s STR gene presence and absence spreadsheet [gene_presence_absence.csv]
-p allow paralogs
-z dont delete intermediate files
-v verbose output to STDOUT
-h this help message
For further info see: http://sanger-pathogens.github.io/Roary/
USAGE
}
lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm view on Meta::CPAN
sub usage_text {
my ($self) = @_;
return <<USAGE;
Usage: pan_genome_post_analysis [options]
Perform the post analysis on the pan genome. This script is usally only called by another script.
Options: -a dont delete intermediate files
-b dont create R plots
-c STR clusters filename [_clustered.clstr]
-cd FLOAT percentage of isolates a gene must be in to be core [0.99]
-d dont split groups
-e add inference values to gene presence and absence spreadsheet
-f STR file of protein filenames [_fasta_files]
-g INT maximum number of clusters [50000]
-i STR file of GFF filenames [_gff_files]
-m core gene alignement with PRANK
-n fast core gene alignement with MAFFT instead of PRANK
-o STR clusters output filename [clustered_proteins]
-p STR output pan genome filename [pan_genome.fa]
-q allow paralogs in core alignment
lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm view on Meta::CPAN
use Moose;
use Bio::Tools::GFF;
with 'Bio::Roary::ParseGFFAnnotationRole';
has 'contig_to_ids' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build_contig_to_ids');
has 'overlapping_hypothetical_protein_ids' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_overlapping_hypothetical_protein_ids');
has '_genes_annotation' => ( is => 'rw', isa => 'ArrayRef', default => sub{[]});
has '_min_nucleotide_overlap_percentage' => ( is => 'ro', isa => 'Int', default => 10);
# Manually parse the GFF file because the BioPerl module is too slow
sub _build_contig_to_ids
{
my ($self) = @_;
my %contigs_to_ids;
my @genes_annotation;
open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file";
while(<$fh>)
lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm view on Meta::CPAN
next if($current_feature->{database_annotation_exists} == 1);
next unless($current_feature->{product} =~ /hypothetical/i);
next unless($next_feature->{database_annotation_exists} == 1);
my $start_coord = $current_feature->{start} ;
my $end_coord = $current_feature->{end} ;
my $comparison_start_coord =$next_feature->{start} ;
my $comparison_end_coord =$next_feature->{end} ;
if($comparison_start_coord < $end_coord && $comparison_end_coord > $start_coord )
{
my $percent_overlap = $self->_percent_overlap($start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord);
if($percent_overlap >= $self->_min_nucleotide_overlap_percentage)
{
$overlapping_protein_ids{$current_feature->{id_name}}++;
}
}
}
return \%overlapping_protein_ids;
}
sub _percent_overlap
{
my ($self, $start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord) = @_;
my $size_of_hypothetical_gene = $end_coord - $start_coord;
my $lower_bound = $start_coord;
if($comparison_start_coord > $start_coord)
{
$lower_bound = $comparison_start_coord;
}
my $upper_bound = $end_coord;
lib/Bio/Roary/ExtractProteomeFromGFF.pm view on Meta::CPAN
use Bio::Roary::Exceptions;
use File::Basename;
use File::Temp;
use File::Copy;
use Bio::Tools::GFF;
with 'Bio::Roary::JobRunner::Role';
with 'Bio::Roary::BedFromGFFRole';
has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'maximum_percentage_of_unknowns' => ( is => 'ro', isa => 'Num', default => 5 );
has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );
has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' );
has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
has '_working_directory_name' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__working_directory_name' );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
sub _build_fasta_file {
my ($self) = @_;
$self->_extract_nucleotide_regions;
$self->_convert_nucleotide_to_protein;
lib/Bio/Roary/ExtractProteomeFromGFF.pm view on Meta::CPAN
}
sub _convert_nucleotide_to_protein {
my ($self) = @_;
$self->_fastatranslate( $self->_extracted_nucleotide_fasta_file_from_bed_filename, $self->_unfiltered_output_filename );
unlink( $self->_extracted_nucleotide_fasta_file_from_bed_filename );
}
sub _does_sequence_contain_too_many_unknowns {
my ( $self, $sequence_obj ) = @_;
my $maximum_number_of_Xs = int( ( $sequence_obj->length() * $self->maximum_percentage_of_unknowns ) / 100 );
my $number_of_Xs_found = () = $sequence_obj->seq() =~ /X/g;
if ( $number_of_Xs_found > $maximum_number_of_Xs ) {
return 1;
}
else {
return 0;
}
}
sub _filter_fasta_sequences {
lib/Bio/Roary/FilterUnknownsFromFasta.pm view on Meta::CPAN
use Moose;
use Bio::SeqIO;
use Cwd;
use Bio::Roary::Exceptions;
use File::Basename;
has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'maximum_percentage_of_unknowns' => ( is => 'ro', isa => 'Num', default => 5 );
has 'filtered_fasta_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_filtered_fasta_files' );
has 'input_fasta_to_output_fasta' => ( is => 'ro', isa => 'HashRef', default => sub {{}} );
sub _build_filtered_fasta_files
{
my ($self) = @_;
my @output_file_names;
lib/Bio/Roary/FilterUnknownsFromFasta.pm view on Meta::CPAN
{
my ( $filename, $directories, $suffix ) = fileparse($fasta_file);
push(@output_file_names, $self->_filter_fasta_sequences_and_return_new_file($filename,$fasta_file ));
}
return \@output_file_names;
}
sub _does_sequence_contain_too_many_unknowns
{
my ($self, $sequence_obj) = @_;
my $maximum_number_of_Xs = int(($sequence_obj->length()*$self->maximum_percentage_of_unknowns)/100);
my $number_of_Xs_found = () = $sequence_obj->seq() =~ /X/g;
if($number_of_Xs_found > $maximum_number_of_Xs)
{
return 1;
}
else
{
return 0;
}
}
lib/Bio/Roary/IterativeCdhit.pm view on Meta::CPAN
use Bio::Roary::FilterFullClusters;
use File::Copy;
use Log::Log4perl qw(:easy);
# CD hit is run locally
has 'output_cd_hit_filename' => ( is => 'ro', isa => 'Str', required => 1 );
has 'output_combined_filename' => ( is => 'ro', isa => 'Str', required => 1 );
has 'number_of_input_files' => ( is => 'ro', isa => 'Int', required => 1 );
has 'output_filtered_clustered_fasta' => ( is => 'ro', isa => 'Str', required => 1 );
has 'lower_bound_percentage' => ( is => 'ro', isa => 'Num', default => 0.98 );
has 'upper_bound_percentage' => ( is => 'ro', isa => 'Num', default => 0.99 );
has 'step_size_percentage' => ( is => 'ro', isa => 'Num', default => 0.005 );
has 'cpus' => ( is => 'ro', isa => 'Int', default => 1 );
has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger');
sub _build_logger
{
my ($self) = @_;
Log::Log4perl->easy_init(level => $ERROR);
my $logger = get_logger();
return $logger;
}
lib/Bio/Roary/IterativeCdhit.pm view on Meta::CPAN
my ($self) = @_;
$self->filter_complete_clusters(
$self->output_cd_hit_filename,
1,
$self->output_combined_filename,
$self->number_of_input_files,
$self->output_filtered_clustered_fasta, 1
);
for ( my $percent_match = $self->upper_bound_percentage ; $percent_match >= $self->lower_bound_percentage ; $percent_match -= $self->step_size_percentage ) {
$self->filter_complete_clusters(
$self->output_cd_hit_filename,
$percent_match,
$self->output_combined_filename,
$self->number_of_input_files,
$self->output_filtered_clustered_fasta, 0
);
}
my $cdhit_obj = Bio::Roary::External::Cdhit->new(
input_file => $self->output_combined_filename,
output_base => $self->output_cd_hit_filename,
_length_difference_cutoff => $self->lower_bound_percentage,
_sequence_identity_threshold => $self->lower_bound_percentage,
cpus => $self->cpus,
logger => $self->logger
);
$cdhit_obj->run();
return $cdhit_obj->clusters_filename;
}
sub filter_complete_clusters {
my ( $self, $output_cd_hit_filename, $percentage_match, $output_combined_filename, $number_of_input_files,
$output_filtered_clustered_fasta,
$greater_than_or_equal )
= @_;
my $cdhit_obj = Bio::Roary::External::Cdhit->new(
input_file => $output_combined_filename,
output_base => $output_cd_hit_filename,
_length_difference_cutoff => $percentage_match,
_sequence_identity_threshold => $percentage_match,
cpus => $self->cpus,
);
$cdhit_obj->run();
my $filter_clusters = Bio::Roary::FilterFullClusters->new(
clusters_filename => $cdhit_obj->clusters_filename,
fasta_file => $output_cd_hit_filename,
number_of_input_files => $number_of_input_files,
output_file => $output_filtered_clustered_fasta,
_greater_than_or_equal => $greater_than_or_equal,
lib/Bio/Roary/OrderGenes.pm view on Meta::CPAN
has 'sample_weights' => ( is => 'ro', isa => 'Maybe[HashRef]' );
has 'samples_to_clusters' => ( is => 'ro', isa => 'Maybe[HashRef]' );
has 'group_order' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_order' );
has 'groups_to_sample_names' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
has 'group_graphs' => ( is => 'ro', isa => 'Graph', lazy => 1, builder => '_build_group_graphs' );
has 'groups_to_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups_to_contigs' );
has '_groups_to_file_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_file_contigs' );
has '_groups' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups' );
has 'number_of_files' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build_number_of_files' );
has '_groups_qc' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
has '_percentage_of_largest_weak_threshold' => ( is => 'ro', isa => 'Num', default => 0.9 );
sub _build_number_of_files {
my ($self) = @_;
return @{ $self->gff_files };
}
sub _build_groups {
my ($self) = @_;
my %groups;
for my $group_name ( @{ $self->analyse_groups_obj->_groups } ) {
lib/Bio/Roary/Output/BlastIdentityFrequency.pm view on Meta::CPAN
package Bio::Roary::Output::BlastIdentityFrequency;
$Bio::Roary::Output::BlastIdentityFrequency::VERSION = '3.13.0';
# ABSTRACT: Take in blast results and find the percentage identity graph
use Moose;
use Bio::SeqIO;
use Bio::Roary::Exceptions;
has 'input_filename' => ( is => 'ro', isa => 'Str', default => '_blast_results' );
has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'blast_identity_frequency.Rtab' );
has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' );
lib/Bio/Roary/Output/BlastIdentityFrequency.pm view on Meta::CPAN
1;
__END__
=pod
=encoding UTF-8
=head1 NAME
Bio::Roary::Output::BlastIdentityFrequency - Take in blast results and find the percentage identity graph
=head1 VERSION
version 3.13.0
=head1 SYNOPSIS
Take in blast results and find the percentage identity graph
use Bio::Roary::Output::BlastIdentityFrequency;
my $obj = Bio::Roary::Output::BlastIdentityFrequency->new(
input_filename => '_blast_results',
output_filename => 'blast_identity_frequency.Rtab',
);
$obj->create_file();
=head1 AUTHOR
lib/Bio/Roary/SortFasta.pm view on Meta::CPAN
my $nnn_at_end_of_all_sequences = 1;
my $sequence;
my $variation_detected = 0;
while ( my $input_seq = $self->_input_seqio->next_seq() ) {
$sequence = $input_seq->seq if(!defined($sequence));
$self->_add_padding_to_make_sequence_length_multiple_of_three($input_seq) if ( $self->make_multiple_of_three );
$nnn_at_end_of_all_sequences = 0 if ( $nnn_at_end_of_all_sequences == 1 && !( $input_seq->seq() =~ /NNN$/i ) );
$input_sequences{ $input_seq->display_id } = $input_seq;
my $factor = $self->_percentage_similarity($sequence, $input_seq->seq);
if($factor < $self->similarity)
{
$self->similarity($factor);
}
}
$self->_remove_nnn_from_all_sequences( \%input_sequences ) if ( $self->remove_nnn_from_end && $nnn_at_end_of_all_sequences );
my $sequence_length = 0;
my $sequences_unaligned = 0;
lib/Bio/Roary/SortFasta.pm view on Meta::CPAN
}
return $self;
}
sub replace_input_with_output_file {
my ($self) = @_;
move( $self->output_filename, $self->input_filename );
return $self;
}
sub _percentage_similarity
{
my ($self, $string1, $string2) = @_;
my $num_differences = 0;
my $string1_length = length($string1);
for(my $i = 0; $i < $string1_length && $i< length($string2); $i++)
{
$num_differences++ if( substr($string1, $i, 1) ne substr($string2, $i, 1));
}
return 1 if($num_differences == 0);
return 0 if($string1_length == 0);
t/Bio/Roary/AccessoryClustering.t view on Meta::CPAN
use_ok('Bio::Roary::AccessoryClustering');
}
my $identity_to_num_clusters = {
'1' => [ 10, 10 ],
'0.99' => [ 4, 5 ],
'0.95' => [ 2, 4 ],
'0.90' => [ 1, 1 ],
};
for my $percentage_identity ( keys %{$identity_to_num_clusters} ) {
ok(
my $obj = Bio::Roary::AccessoryClustering->new(
input_file => 't/data/input_accessory_binary.fa',
identity => $percentage_identity
),
"initialise object with identity of $percentage_identity"
);
ok( my @clusters = keys %{ $obj->clusters_to_samples }, "build the clusters for $percentage_identity" );
ok( $obj->sample_weights, "build samples weights for $percentage_identity" );
ok( $obj->samples_to_clusters, "build samples to clusters for $percentage_identity" );
my $min_cluster_size = $identity_to_num_clusters->{$percentage_identity}->[0];
my $max_cluster_size = $identity_to_num_clusters->{$percentage_identity}->[1];
ok(
( @clusters >= $min_cluster_size && @clusters <= $max_cluster_size ? 1 : 0 ),
"check number of clusters as expected, allowing for some variation for $percentage_identity"
);
}
my $obj = Bio::Roary::AccessoryClustering->new(
input_file => 't/data/input_accessory_binary.fa',
identity => 0.9
);
is_deeply(
$obj->samples_to_clusters,
{
t/Bio/Roary/CommandLine/Roary.t view on Meta::CPAN
%scripts_and_expected_files = (
'-o some_different_output -i 90 -p 2 --translation_table 1 t/data/real_data_1.gff t/data/real_data_2.gff' => [ 'some_different_output', 't/data/expected_some_different_output' ],
);
mock_execute_script_and_check_output_sorted( $script_name, \%scripts_and_expected_files, [ 0 ] );
stderr_should_have($script_name,'--translation_table 1 -o some_different_output --core_definition 60 -p 2 -e --mafft --group_limit 10 t/data/real_data_1.gff t/data/real_data_2.gff', 'Exiting early because number of clusters is too high');
stderr_should_have($script_name,'--verbose_stats --group_limit 10 -e t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff', 'Exiting early because number of clusters is too high');
stderr_should_not_have($script_name,'-e --group_limit 10 t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ', 'Cant access the multifasta base directory');
stderr_should_have($script_name,'-i 90 --core_definition 60 -p 2 -v t/data/real_data_1.gff t/data/real_data_2.gff ','Cleaning up files');
stderr_should_have($script_name,'-i 30 t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff','The percentage identity is too low');
stderr_should_not_have($script_name,'--dont_delete_files -v t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ','Cleaning up files');
stderr_should_have($script_name,'-v --group_limit 100000 -e t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' ,'Running command: pan_genome_core_alignment');
stderr_should_have($script_name,'--translation_table 1 -v t/data/real_data_1.gff t/data/real_data_2.gff ' ,'Cleaning up files');
stderr_should_have($script_name,'-e -v t/data/real_data_1.gff t/data/real_data_2.gff ','Creating files with the nucleotide sequences for every cluster');
SKIP:
{
skip "kraken not installed", 2 unless ( which('kraken') );
skip "kraken-report not installed", 2 unless ( which('kraken-report') );
stderr_should_have($script_name,'-v --qc t/data/real_data_1.gff t/data/real_data_2.gff' ,'Running Kraken on each input assembly');
t/Bio/Roary/Output/NumberOfGroups.t view on Meta::CPAN
ok(-e 'number_of_unique_genes.Rtab', 'check raw output file created');
compare_ok('t/data/expected_number_of_unique_genes.tab', 'number_of_unique_genes.Rtab', 'Content of unique groups tab file as expected');
unlink('number_of_unique_genes.Rtab');
# Vary the core
ok($obj = Bio::Roary::Output::NumberOfGroups->new(
group_statistics_obj => $group_statistics,
annotate_groups_obj => $annotate_groups,
core_definition => 0.6
),"initialise object with 60 percent core definition");
ok($obj->create_output_files, 'create the raw output files for 60 percent core def');
compare_ok('t/data/expected_number_of_conserved_genes_0.6.tab','number_of_conserved_genes.Rtab', 'Content of conserved genes with 60 percent core def');
unlink('number_of_conserved_genes.Rtab');
unlink('number_of_new_genes.Rtab');
unlink('number_of_genes_in_pan_genome.Rtab');
unlink('number_of_unique_genes.Rtab');
unlink('group_statitics.csv');
done_testing();
t/Bio/Roary/SortFasta.t view on Meta::CPAN
ok( $obj = Bio::Roary::SortFasta->new(
input_filename => 't/data/uneven_sequences.fa',
make_multiple_of_three => 1,
remove_nnn_from_end => 1,
), 'initalise object with uneven sequences and remove nnn from end but nothing to remove');
ok($obj->sort_fasta, 'sort the fasta file');
compare_ok($obj->output_filename, 't/data/expected_uneven_sequences.fa', "output sequences are now divisible by three and no nnn removed");
is(0,$obj->_percentage_similarity("AAA","BBB"), 'totally different');
is(1,$obj->_percentage_similarity("AAA","AAA"), 'all the same');
is(0.5,$obj->_percentage_similarity("AAAA","AABB"), 'half different');
is(1,$obj->_percentage_similarity("AAAA","AAAABB"), 'first half the same');
done_testing();