Bio-Roary

 view release on metacpan or  search on metacpan

README.md  view on Meta::CPAN


## Usage
```
Usage:   roary [options] *.gff

Options: -p INT    number of threads [1]
         -o STR    clusters output filename [clustered_proteins]
         -f STR    output directory [.]
         -e        create a multiFASTA alignment of core genes using PRANK
         -n        fast core gene alignment with MAFFT, use with -e
         -i        minimum percentage identity for blastp [95]
         -cd FLOAT percentage of isolates a gene must be in to be core [99]
         -qc       generate QC report with Kraken
         -k STR    path to Kraken database for QC, use with -qc
         -a        check dependancies and print versions
         -b STR    blastp executable [blastp]
         -c STR    mcl executable [mcl]
         -d STR    mcxdeblast executable [mcxdeblast]
         -g INT    maximum number of clusters [50000]
         -m STR    makeblastdb executable [makeblastdb]
         -r        create R plots, requires R and ggplot2
         -s        dont split paralogs

bin/create_pan_genome_plots.R  view on Meta::CPAN

 
mydata = read.table("number_of_genes_in_pan_genome.Rtab")
boxplot(mydata, data=mydata, main="No. of genes in the pan-genome",
          xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)

mydata = read.table("number_of_unique_genes.Rtab")
boxplot(mydata, data=mydata, main="Number of unique genes",
         xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)

mydata = read.table("blast_identity_frequency.Rtab")
plot(mydata,main="Number of blastp hits with different percentage identity",  xlab="Blast percentage identity", ylab="No. blast results")


library(ggplot2)
conserved = colMeans(read.table("number_of_conserved_genes.Rtab"))
total = colMeans(read.table("number_of_genes_in_pan_genome.Rtab"))

genes = data.frame( genes_to_genomes = c(conserved,total),
                    genomes = c(c(1:length(conserved)),c(1:length(conserved))),
                    Key = c(rep("Conserved genes",length(conserved)), rep("Total genes",length(total))) )
                    

bin/roary-create_pan_genome_plots.R  view on Meta::CPAN

 
mydata = read.table("number_of_genes_in_pan_genome.Rtab")
boxplot(mydata, data=mydata, main="No. of genes in the pan-genome",
          xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)

mydata = read.table("number_of_unique_genes.Rtab")
boxplot(mydata, data=mydata, main="Number of unique genes",
         xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)

mydata = read.table("blast_identity_frequency.Rtab")
plot(mydata,main="Number of blastp hits with different percentage identity",  xlab="Blast percentage identity", ylab="No. blast results")


library(ggplot2)
conserved = colMeans(read.table("number_of_conserved_genes.Rtab"))
total = colMeans(read.table("number_of_genes_in_pan_genome.Rtab"))

genes = data.frame( genes_to_genomes = c(conserved,total),
                    genomes = c(c(1:length(conserved)),c(1:length(conserved))),
                    Key = c(rep("Conserved genes",length(conserved)), rep("Total genes",length(total))) )
                    

lib/Bio/Roary/AccessoryBinaryFasta.pm  view on Meta::CPAN

use Bio::Roary::AnnotateGroups;
use Bio::Roary::AnalyseGroups;
use Bio::Roary::Exceptions;
use Bio::SeqIO;
use File::Basename;

has 'input_files'            => ( is => 'ro', isa => 'ArrayRef',                   required => 1 );
has 'annotate_groups_obj'    => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 );
has 'analyse_groups_obj'     => ( is => 'ro', isa => 'Bio::Roary::AnalyseGroups',  required => 1 );
has 'output_filename'        => ( is => 'ro', isa => 'Str',                        default  => 'accessory_binary_genes.fa' );
has 'lower_bound_percentage' => ( is => 'ro', isa => 'Int',                        default  => 5 );
has 'upper_bound_percentage' => ( is => 'ro', isa => 'Int',                        default  => 5 );
has 'max_accessory_to_include' => ( is => 'ro', isa => 'Int',                      default  => 4000 );
has 'groups_to_files'        => ( is => 'ro', isa => 'HashRef',                    lazy     => 1, builder => '_build__groups_to_files' );
has '_lower_bound_value'     => ( is => 'ro', isa => 'Int',                        lazy     => 1, builder => '_build__lower_bound_value' );
has '_upper_bound_value'     => ( is => 'ro', isa => 'Int',                        lazy     => 1, builder => '_build__upper_bound_value' );

sub _build__groups_to_files {
    my ($self) = @_;
    my %groups_to_files;
    for my $group ( @{ $self->annotate_groups_obj->_groups } ) {
        my $genes = $self->annotate_groups_obj->_groups_to_id_names->{$group};

lib/Bio/Roary/AccessoryBinaryFasta.pm  view on Meta::CPAN

        }
        $groups_to_files{$group} = \%filenames;
    }

    return \%groups_to_files;
}

sub _build__lower_bound_value {
    my ($self) = @_;
    my $num_files = @{ $self->input_files };
    return ceil( $num_files * ( $self->lower_bound_percentage / 100 ) );
}

sub _build__upper_bound_value {
    my ($self) = @_;
    my $num_files = @{ $self->input_files };
    return $num_files - ceil( $num_files * ( $self->upper_bound_percentage / 100 ) );
}

sub create_accessory_binary_fasta {
    my ($self) = @_;
    my $out_seq_io = Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' );

    for my $full_filename ( @{ $self->input_files } ) {
        my($filename, $dirs, $suffix) = fileparse($full_filename);
        
        my $output_sequence = '';

lib/Bio/Roary/AssemblyStatistics.pm  view on Meta::CPAN


use Moose;
use Bio::Roary::ExtractCoreGenesFromSpreadsheet;
use Log::Log4perl qw(:easy);
with 'Bio::Roary::SpreadsheetRole';

has 'output_filename'       => ( is => 'ro', isa => 'Str',      default => 'assembly_statistics.csv' );
has 'job_runner'            => ( is => 'ro', isa => 'Str',      default => 'Local' );
has 'cpus'                  => ( is => 'ro', isa => 'Int',      default => 1 );
has 'core_definition'       => ( is => 'rw', isa => 'Num',      default => 0.99 );
has '_cloud_percentage'     => ( is => 'rw', isa => 'Num',      default => 0.15 );
has '_shell_percentage'     => ( is => 'rw', isa => 'Num',      default => 0.95 );
has '_soft_core_percentage' => ( is => 'rw', isa => 'Num',      default => 0.99 );
has 'verbose'               => ( is => 'ro', isa => 'Bool',     default => 0 );
has 'contiguous_window'     => ( is => 'ro', isa => 'Int',      default => 10 );
has 'ordered_genes'         => ( is => 'ro', isa => 'ArrayRef', lazy    => 1, builder => '_build_ordered_genes' );
has '_genes_to_rows'        => ( is => 'ro', isa => 'HashRef',  lazy    => 1, builder => '_build__genes_to_rows' );
has 'all_sample_statistics' => ( is => 'ro', isa => 'HashRef',  lazy    => 1, builder => '_build_all_sample_statistics' );
has 'sample_names_to_column_index' => ( is => 'rw', isa => 'Maybe[HashRef]' );
has 'summary_output_filename'=> ( is => 'ro', isa => 'Str',      default => 'summary_statistics.txt' );
has 'logger'                 => ( is => 'ro', lazy => 1, builder => '_build_logger');
has 'gene_category_count'   => ( is => 'ro', isa => 'HashRef',  lazy    => 1, builder => '_build_gene_category_count' );

lib/Bio/Roary/AssemblyStatistics.pm  view on Meta::CPAN

    Log::Log4perl->easy_init( $ERROR );
    my $logger = get_logger();
    return $logger;
}

sub create_summary_output
{
	my ($self) = @_;
	open(my $fh, '>', $self->summary_output_filename) or Bio::Roary::Exceptions::CouldntWriteToFile->throw(error => "Couldnt write to ".$self->summary_output_filename);

    my $core_percentage      = $self->core_definition()*100;
	my $soft_core_percentage = $self->_soft_core_percentage*100;
	my $shell_percentage     = $self->_shell_percentage()*100;
	my $cloud_percentage     = $self->_cloud_percentage()*100;
	
	my $core_genes      = ($self->gene_category_count->{core} ? $self->gene_category_count->{core} : 0);
	my $soft_core_genes = ($self->gene_category_count->{soft_core} ? $self->gene_category_count->{soft_core} : 0);
	my $shell_genes     =($self->gene_category_count->{shell} ? $self->gene_category_count->{shell} : 0);
	my $cloud_genes     = ($self->gene_category_count->{cloud} ? $self->gene_category_count->{cloud} : 0);
	my $total_genes = $core_genes  + $soft_core_genes  + $shell_genes + $cloud_genes  ;
	
	$self->logger->warn("Very few core genes detected with the current settings. Try modifying the core definition ( -cd 90 ) and/or 
	the blast identity (-i 70) parameters.  Also try checking for contamination (-qc) and ensure you only have one species.") if($core_genes < 100);
	
	print {$fh} "Core genes\t($core_percentage".'% <= strains <= 100%)'."\t$core_genes\n";
	print {$fh} "Soft core genes\t(".$shell_percentage."% <= strains < ".$soft_core_percentage."%)\t$soft_core_genes\n";
	print {$fh} "Shell genes\t(".$cloud_percentage."% <= strains < ".$shell_percentage."%)\t$shell_genes\n";
	print {$fh} "Cloud genes\t(0% <= strains < ".$cloud_percentage."%)\t$cloud_genes\n";
	print {$fh} "Total genes\t(0% <= strains <= 100%)\t$total_genes\n";
	
	close($fh);
	return 1;
}

sub _build_gene_category_count {
    my ($self) = @_;
    my %gene_category_count;
	$self->_soft_core_percentage($self->core_definition);
	
    if ( $self->_soft_core_percentage <= $self->_shell_percentage ) {
        $self->_shell_percentage( $self->_soft_core_percentage - 0.01 );
    }

    my $number_of_samples = keys %{ $self->sample_names_to_column_index };
    for my $gene_name ( keys %{ $self->_genes_to_rows } ) {
        my $isolates_with_gene = 0;

        for ( my $i = $self->_num_fixed_headers ; $i < @{ $self->_genes_to_rows->{$gene_name} } ; $i++ ) {
            $isolates_with_gene++
              if ( defined( $self->_genes_to_rows->{$gene_name}->[$i] ) && $self->_genes_to_rows->{$gene_name}->[$i] ne "" );
        }

        if ( $isolates_with_gene < $self->_cloud_percentage() * $number_of_samples ) {
            $gene_category_count{cloud}++;
        }
        elsif ( $isolates_with_gene < $self->_shell_percentage() * $number_of_samples ) {
            $gene_category_count{shell}++;
        }
        elsif ( $isolates_with_gene < $self->_soft_core_percentage() * $number_of_samples ) {
            $gene_category_count{soft_core}++;
        }
        else {
            $gene_category_count{core}++;
        }
    }
    return \%gene_category_count;
}

sub _build_ordered_genes {

lib/Bio/Roary/CombinedProteome.pm  view on Meta::CPAN

version 3.13.0

=head1 SYNOPSIS

Take in multiple FASTA sequences containing proteomes and concat them together and output a FASTA file, filtering out more than 5% X's
   use Bio::Roary::CombinedProteome;

   my $obj = Bio::Roary::CombinedProteome->new(
     proteome_files   => ['abc.fa','efg.fa'],
     output_filename   => 'example_output.fa',
     maximum_percentage_of_unknowns => 5.0,
   );
   $obj->create_combined_proteome_file;

=head1 AUTHOR

Andrew J. Page <ap13@sanger.ac.uk>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.

lib/Bio/Roary/CommandLine/AssemblyStatistics.pm  view on Meta::CPAN


sub usage_text {
    my ($self) = @_;

    return <<USAGE;
Usage: pan_genome_assembly_statistics [options] gene_presence_absence.csv
Take in a gene presence and absence spreadsheet and output some statistics
  
Options: -p INT    number of threads [1]	
         -o STR    output filename [assembly_statistics.csv]
         -cd FLOAT percentage of isolates a gene must be in to be core [99]
         -v        verbose output to STDOUT
         -w        print version and exit
         -h        this help message
		 
Example: Run with defaults
         pan_genome_assembly_statistics gene_presence_absence.csv

For further information see: http://sanger-pathogens.github.io/Roary/
USAGE
}

lib/Bio/Roary/CommandLine/CreatePanGenome.pm  view on Meta::CPAN


    return <<USAGE;
Usage:   create_pan_genome [options] *.gff
Build a pan genome with WTSI defaults.

Options: -p INT    number of threads [1]
         -o STR    clusters output filename [clustered_proteins]
		 -f STR    output directory [.]
         -e        create a multiFASTA alignment of core genes
         -n        fast core gene alignement with MAFFT, use with -e
         -i        minimum percentage identity for blastp [95]
         -cd FLOAT percentage of isolates a gene must be in to be core [99]
         -z        dont delete intermediate files
         -t INT    translation table [11]
         -v        verbose output to STDOUT
         -y        add gene inference information to spreadsheet, doesnt work with -e
         -g INT    maximum number of clusters [50000]
         -qc       generate QC report with Kraken
         -k STR    path to Kraken database for QC, use with -qc
         -w        print version and exit
		 -a        check dependancies and print versions
         -h        this help message

lib/Bio/Roary/CommandLine/IterativeCdhit.pm  view on Meta::CPAN

has 'args'        => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str',      required => 1 );
has 'help'        => ( is => 'rw', isa => 'Bool',     default  => 0 );
has '_error_message' => ( is => 'rw', isa => 'Str' );

has 'output_cd_hit_filename'          => ( is => 'rw', isa => 'Str', default => '_clustered' );
has 'output_combined_filename'        => ( is => 'rw', isa => 'Str', default => '_combined_files' );
has 'number_of_input_files'           => ( is => 'rw', isa => 'Int', default => 1 );
has 'output_filtered_clustered_fasta' => ( is => 'rw', isa => 'Str', default => '_clustered_filtered.fa' );

has 'lower_bound_percentage'          => ( is => 'rw', isa => 'Num', default => 0.98 );
has 'upper_bound_percentage'          => ( is => 'rw', isa => 'Num', default => 0.99 );
has 'step_size_percentage'            => ( is => 'rw', isa => 'Num', default => 0.005 );
has 'cpus'                            => ( is => 'rw', isa => 'Int', default => 1 );
has 'verbose'                         => ( is => 'rw', isa => 'Bool', default => 0 );


sub BUILD {
    my ($self) = @_;

    my ( $output_cd_hit_filename,$cpus,$lower_bound_percentage,$upper_bound_percentage,$step_size_percentage, $output_combined_filename, $number_of_input_files, $output_filtered_clustered_fasta,$verbose, 
        $help );

    GetOptionsFromArray(
        $self->args,
        'c|output_cd_hit_filename=s'          => \$output_cd_hit_filename,
        'm|output_combined_filename=s'        => \$output_combined_filename,
        'n|number_of_input_files=i'           => \$number_of_input_files,
        'f|output_filtered_clustered_fasta=s' => \$output_filtered_clustered_fasta,
        'l|lower_bound_percentage=s'          => \$lower_bound_percentage,
        'u|upper_bound_percentage=s'          => \$upper_bound_percentage,
        's|step_size_percentage=s'            => \$step_size_percentage,
        'p|cpus=i'                              => \$cpus,
		'v|verbose'                           => \$verbose,
        'h|help'                              => \$help,
    );

    if ( defined($verbose) ) {
        $self->verbose($verbose);
        $self->logger->level(10000);
    }
    $self->help($help) if(defined($help));
    $self->lower_bound_percentage($lower_bound_percentage/100) if ( defined($lower_bound_percentage) );
    $self->upper_bound_percentage($upper_bound_percentage/100) if ( defined($upper_bound_percentage) );
    $self->step_size_percentage($step_size_percentage/100)     if ( defined($step_size_percentage) );
    $self->output_cd_hit_filename($output_cd_hit_filename)     if ( defined($output_cd_hit_filename) );
    $self->output_combined_filename($output_combined_filename) if ( defined($output_combined_filename) );
    $self->number_of_input_files($number_of_input_files)       if ( defined($number_of_input_files) );
    $self->cpus($cpus)                                         if ( defined($cpus) );
    $self->output_filtered_clustered_fasta($output_filtered_clustered_fasta)
      if ( defined($output_filtered_clustered_fasta) );

}

sub run {

lib/Bio/Roary/CommandLine/IterativeCdhit.pm  view on Meta::CPAN

    if ( defined( $self->_error_message ) ) {
        print $self->_error_message . "\n";
        die $self->usage_text;
    }

    my $obj = Bio::Roary::IterativeCdhit->new(
        output_cd_hit_filename          => $self->output_cd_hit_filename,
        output_combined_filename        => $self->output_combined_filename,
        number_of_input_files           => $self->number_of_input_files,
        output_filtered_clustered_fasta => $self->output_filtered_clustered_fasta,
        lower_bound_percentage          => $self->lower_bound_percentage,
        upper_bound_percentage          => $self->upper_bound_percentage,
        step_size_percentage            => $self->step_size_percentage,
        cpus                            => $self->cpus,
		logger                          => $self->logger
        
    );
    $obj->run;
}

sub usage_text {
    my ($self) = @_;

lib/Bio/Roary/CommandLine/IterativeCdhit.pm  view on Meta::CPAN

Usage: iterative_cdhit [options]
Iteratively cluster a FASTA file of proteins with CD-hit, lower the threshold each time and extracting core genes (1 per isolate) to another file, and remove them from the input proteins file.

Required arguments:
         -m STR   input FASTA file of protein sequences [_combined_files]

Options: -p INT   number of threads [1]
         -n INT   number of isolates [1]
         -c STR   cd-hit output filename [_clustered]
         -f STR   output filename for filtered sequences [_clustered_filtered.fa]
         -l FLOAT lower bound percentage identity [98.0]
         -u FLOAT upper bound percentage identity [99.0]
         -s FLOAT step size for percentage identity [0.5]
         -v       verbose output to STDOUT
         -h       this help message

For further info see: http://sanger-pathogens.github.io/Roary/
USAGE
}

__PACKAGE__->meta->make_immutable;
no Moose;
1;

lib/Bio/Roary/CommandLine/ParallelAllAgainstAllBlastp.pm  view on Meta::CPAN

    );
    
    my $output_combined_filename;
    if(@{$self->fasta_files} > 1)
    {
      $output_combined_filename = 'combined_files.fa';
	  $self->logger->info("Combining protein files");
      my $combine_fasta_files = Bio::Roary::CombinedProteome->new(
        proteome_files                 => $prepare_input_files->fasta_files,
        output_filename                => $output_combined_filename,
        maximum_percentage_of_unknowns => 5.0,
        apply_unknowns_filter          => 0
      );
      $combine_fasta_files->create_combined_proteome_file;
    }
    else
    {
      $output_combined_filename = $self->fasta_files->[0];
    }

    $self->logger->info("Beginning all against all blast");

lib/Bio/Roary/CommandLine/QueryRoary.pm  view on Meta::CPAN


sub usage_text {
    my ($self) = @_;

    return <<USAGE;
Usage: query_pan_genome [options] *.gff
Perform set operations on the pan genome to see the gene differences between groups of isolates.

Options: -g STR    groups filename [clustered_proteins]
         -a STR    action (union/intersection/complement/gene_multifasta/difference) [union]
         -c FLOAT  percentage of isolates a gene must be in to be core [99]
         -o STR    output filename [pan_genome_results]
         -n STR    comma separated list of gene names for use with gene_multifasta action
         -i STR    comma separated list of filenames, comparison set one
         -t STR    comma separated list of filenames, comparison set two
         -v        verbose output to STDOUT
         -h        this help message
 
Examples: 
Union of genes found in isolates
         query_pan_genome -a union *.gff

lib/Bio/Roary/CommandLine/Roary.pm  view on Meta::CPAN

    $self->blastp_exec($blastp_exec)           if ( defined($blastp_exec) );
    $self->mcxdeblast_exec($mcxdeblast_exec)   if ( defined($mcxdeblast_exec) );
    $self->mcl_exec($mcl_exec)                 if ( defined($mcl_exec) );
    $self->cpus($cpus)                         if ( defined($cpus) );
    $self->inflation_value($inflation_value)   if ( defined($inflation_value));

    if ( defined($perc_identity) ) {
        $self->perc_identity($perc_identity);
        if ( $perc_identity < 50 ) {
            $self->logger->error(
"The percentage identity is too low. Either something is wrong with your data, like contamination, or your doing something that the software isnt designed to support."
            );
        }
    }

    $self->mafft($mafft) if ( defined($mafft) );
    $self->apply_unknowns_filter($apply_unknowns_filter)
      if ( defined($apply_unknowns_filter) );

    if ( defined($output_multifasta_files) ) {
        if ( which('prank') ) {

lib/Bio/Roary/CommandLine/Roary.pm  view on Meta::CPAN

    my ($self) = @_;

    return <<USAGE;
Usage:   roary [options] *.gff

Options: -p INT    number of threads [1]
         -o STR    clusters output filename [clustered_proteins]
         -f STR    output directory [.]
         -e        create a multiFASTA alignment of core genes using PRANK
         -n        fast core gene alignment with MAFFT, use with -e
         -i        minimum percentage identity for blastp [95]
         -cd FLOAT percentage of isolates a gene must be in to be core [99]
         -qc       generate QC report with Kraken
         -k STR    path to Kraken database for QC, use with -qc
         -a        check dependancies and print versions
         -b STR    blastp executable [blastp]
         -c STR    mcl executable [mcl]
         -d STR    mcxdeblast executable [mcxdeblast]
         -g INT    maximum number of clusters [50000]
         -m STR    makeblastdb executable [makeblastdb]
         -r        create R plots, requires R and ggplot2
         -s        dont split paralogs

lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm  view on Meta::CPAN

}

sub usage_text {
    my ($self) = @_;

    return <<USAGE;
Usage: pan_genome_core_alignment [options]
Create an alignment of core genes from the spreadsheet and the directory of gene multi-FASTAs.

Options: -o STR    output filename [core_gene_alignment.aln]
         -cd FLOAT percentage of isolates a gene must be in to be core [99]
         -m STR    directory containing gene multi-FASTAs [pan_genome_sequences]
         -s STR    gene presence and absence spreadsheet [gene_presence_absence.csv]
         -p        allow paralogs
         -z        dont delete intermediate files
         -v        verbose output to STDOUT
         -h        this help message

For further info see: http://sanger-pathogens.github.io/Roary/
USAGE
}

lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm  view on Meta::CPAN

sub usage_text {
    my ($self) = @_;

    return <<USAGE;
Usage: pan_genome_post_analysis [options]
Perform the post analysis on the pan genome. This script is usally only called by another script.

Options: -a        dont delete intermediate files
         -b        dont create R plots
         -c STR    clusters filename [_clustered.clstr]
         -cd FLOAT percentage of isolates a gene must be in to be core [0.99]
         -d        dont split groups
         -e        add inference values to gene presence and absence spreadsheet
         -f STR    file of protein filenames [_fasta_files]
         -g INT    maximum number of clusters [50000]
         -i STR    file of GFF filenames [_gff_files]
         -m        core gene alignement with PRANK
         -n        fast core gene alignement with MAFFT instead of PRANK
         -o STR    clusters output filename [clustered_proteins]
         -p STR    output pan genome filename [pan_genome.fa]
         -q        allow paralogs in core alignment

lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm  view on Meta::CPAN


use Moose;
use Bio::Tools::GFF;
with 'Bio::Roary::ParseGFFAnnotationRole';

has 'contig_to_ids' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build_contig_to_ids');

has 'overlapping_hypothetical_protein_ids' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_overlapping_hypothetical_protein_ids');
has '_genes_annotation' => ( is => 'rw', isa => 'ArrayRef', default => sub{[]});

has '_min_nucleotide_overlap_percentage' => ( is => 'ro', isa => 'Int', default => 10);

# Manually parse the GFF file because the BioPerl module is too slow
sub _build_contig_to_ids
{
  my ($self) = @_;
  my %contigs_to_ids;
  my @genes_annotation;
  
  open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file";
  while(<$fh>)

lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm  view on Meta::CPAN

	  next if($current_feature->{database_annotation_exists} == 1);
	  next unless($current_feature->{product} =~ /hypothetical/i);
	  next unless($next_feature->{database_annotation_exists} == 1);
	  
	  my $start_coord = $current_feature->{start} ;
      my $end_coord   = $current_feature->{end} ;
	  my $comparison_start_coord =$next_feature->{start} ;
	  my $comparison_end_coord   =$next_feature->{end} ;
      if($comparison_start_coord < $end_coord  && $comparison_end_coord > $start_coord )
      {
        my $percent_overlap = $self->_percent_overlap($start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord);
        if($percent_overlap >= $self->_min_nucleotide_overlap_percentage)
        {
          $overlapping_protein_ids{$current_feature->{id_name}}++;
        }
      }
  }
  
  return \%overlapping_protein_ids;
}

sub _percent_overlap
{
   my ($self, $start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord) = @_;
   my $size_of_hypothetical_gene =  $end_coord - $start_coord;
   
   my $lower_bound = $start_coord;
   if($comparison_start_coord > $start_coord)
   {
     $lower_bound = $comparison_start_coord;
   }
   my $upper_bound = $end_coord;

lib/Bio/Roary/ExtractProteomeFromGFF.pm  view on Meta::CPAN

use Bio::Roary::Exceptions;
use File::Basename;
use File::Temp;
use File::Copy;
use Bio::Tools::GFF;
with 'Bio::Roary::JobRunner::Role';
with 'Bio::Roary::BedFromGFFRole';

has 'gff_file'                       => ( is => 'ro', isa => 'Str',  required => 1 );
has 'apply_unknowns_filter'          => ( is => 'rw', isa => 'Bool', default  => 1 );
has 'maximum_percentage_of_unknowns' => ( is => 'ro', isa => 'Num',  default  => 5 );
has 'output_filename'                => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );
has 'fasta_file'                     => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' );
has '_working_directory'             => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
has '_working_directory_name'        => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__working_directory_name' );
has 'translation_table'              => ( is => 'rw', isa => 'Int', default => 11 );

sub _build_fasta_file {
    my ($self) = @_;
    $self->_extract_nucleotide_regions;
    $self->_convert_nucleotide_to_protein;

lib/Bio/Roary/ExtractProteomeFromGFF.pm  view on Meta::CPAN

}

sub _convert_nucleotide_to_protein {
    my ($self) = @_;
    $self->_fastatranslate( $self->_extracted_nucleotide_fasta_file_from_bed_filename, $self->_unfiltered_output_filename );
    unlink( $self->_extracted_nucleotide_fasta_file_from_bed_filename );
}

sub _does_sequence_contain_too_many_unknowns {
    my ( $self, $sequence_obj ) = @_;
    my $maximum_number_of_Xs = int( ( $sequence_obj->length() * $self->maximum_percentage_of_unknowns ) / 100 );
    my $number_of_Xs_found = () = $sequence_obj->seq() =~ /X/g;
    if ( $number_of_Xs_found > $maximum_number_of_Xs ) {
        return 1;
    }
    else {
        return 0;
    }
}

sub _filter_fasta_sequences {

lib/Bio/Roary/FilterUnknownsFromFasta.pm  view on Meta::CPAN



use Moose;
use Bio::SeqIO;
use Cwd;
use Bio::Roary::Exceptions;
use File::Basename;

has 'fasta_files'                    => ( is => 'ro', isa => 'ArrayRef',  required => 1 );
has 'apply_unknowns_filter'          => ( is => 'rw', isa => 'Bool', default => 1 );
has 'maximum_percentage_of_unknowns' => ( is => 'ro', isa => 'Num',  default  => 5 );

has 'filtered_fasta_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_filtered_fasta_files' );

has 'input_fasta_to_output_fasta' => ( is => 'ro', isa => 'HashRef', default => sub {{}} );

sub _build_filtered_fasta_files
{
  my ($self) = @_;
  
  my @output_file_names;

lib/Bio/Roary/FilterUnknownsFromFasta.pm  view on Meta::CPAN

  {
    my ( $filename, $directories, $suffix ) = fileparse($fasta_file);
    push(@output_file_names, $self->_filter_fasta_sequences_and_return_new_file($filename,$fasta_file ));
  }
  return \@output_file_names;
}

sub _does_sequence_contain_too_many_unknowns
{
  my ($self, $sequence_obj) = @_;
  my $maximum_number_of_Xs = int(($sequence_obj->length()*$self->maximum_percentage_of_unknowns)/100);
  my $number_of_Xs_found = () = $sequence_obj->seq() =~ /X/g;
  if($number_of_Xs_found  > $maximum_number_of_Xs)
  {
    return 1;
  }
  else
  {
    return 0;
  }
}

lib/Bio/Roary/IterativeCdhit.pm  view on Meta::CPAN

use Bio::Roary::FilterFullClusters;
use File::Copy;
use Log::Log4perl qw(:easy);
# CD hit is run locally

has 'output_cd_hit_filename'          => ( is => 'ro', isa => 'Str', required => 1 );
has 'output_combined_filename'        => ( is => 'ro', isa => 'Str', required => 1 );
has 'number_of_input_files'           => ( is => 'ro', isa => 'Int', required => 1 );
has 'output_filtered_clustered_fasta' => ( is => 'ro', isa => 'Str', required => 1 );

has 'lower_bound_percentage'          => ( is => 'ro', isa => 'Num', default => 0.98 );
has 'upper_bound_percentage'          => ( is => 'ro', isa => 'Num', default => 0.99 );
has 'step_size_percentage'            => ( is => 'ro', isa => 'Num', default => 0.005 );
has 'cpus'                            => ( is => 'ro', isa => 'Int', default => 1 );
has 'logger'                          => ( is => 'ro', lazy => 1, builder => '_build_logger');

sub _build_logger
{
    my ($self) = @_;
    Log::Log4perl->easy_init(level => $ERROR);
    my $logger = get_logger();
    return $logger;
}

lib/Bio/Roary/IterativeCdhit.pm  view on Meta::CPAN

    my ($self) = @_;

    $self->filter_complete_clusters(
        $self->output_cd_hit_filename,
        1,
        $self->output_combined_filename,
        $self->number_of_input_files,
        $self->output_filtered_clustered_fasta, 1
    );

    for ( my $percent_match = $self->upper_bound_percentage ; $percent_match >= $self->lower_bound_percentage ; $percent_match -= $self->step_size_percentage ) {
        $self->filter_complete_clusters(
            $self->output_cd_hit_filename,
            $percent_match,
            $self->output_combined_filename,
            $self->number_of_input_files,
            $self->output_filtered_clustered_fasta, 0
        );
    }

    my $cdhit_obj = Bio::Roary::External::Cdhit->new(
        input_file                   => $self->output_combined_filename,
        output_base                  => $self->output_cd_hit_filename,
        _length_difference_cutoff    => $self->lower_bound_percentage,
        _sequence_identity_threshold => $self->lower_bound_percentage,
        cpus                         => $self->cpus,
		logger                       => $self->logger
    );
    $cdhit_obj->run();
    return $cdhit_obj->clusters_filename;
}

sub filter_complete_clusters {
    my ( $self, $output_cd_hit_filename, $percentage_match, $output_combined_filename, $number_of_input_files,
        $output_filtered_clustered_fasta,
        $greater_than_or_equal )
      = @_;
    my $cdhit_obj = Bio::Roary::External::Cdhit->new(
        input_file                   => $output_combined_filename,
        output_base                  => $output_cd_hit_filename,
        _length_difference_cutoff    => $percentage_match,
        _sequence_identity_threshold => $percentage_match,
        cpus                         => $self->cpus,
    );
    $cdhit_obj->run();

    my $filter_clusters = Bio::Roary::FilterFullClusters->new(
        clusters_filename       => $cdhit_obj->clusters_filename,
        fasta_file              => $output_cd_hit_filename,
        number_of_input_files   => $number_of_input_files,
        output_file             => $output_filtered_clustered_fasta,
        _greater_than_or_equal  => $greater_than_or_equal,

lib/Bio/Roary/OrderGenes.pm  view on Meta::CPAN

has 'sample_weights'           => ( is => 'ro', isa => 'Maybe[HashRef]' );
has 'samples_to_clusters'      => ( is => 'ro', isa => 'Maybe[HashRef]' );
has 'group_order' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_order' );
has 'groups_to_sample_names' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
has 'group_graphs'            => ( is => 'ro', isa => 'Graph',   lazy => 1, builder => '_build_group_graphs' );
has 'groups_to_contigs'       => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups_to_contigs' );
has '_groups_to_file_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_file_contigs' );
has '_groups'                 => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups' );
has 'number_of_files'         => ( is => 'ro', isa => 'Int',     lazy => 1, builder => '_build_number_of_files' );
has '_groups_qc' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
has '_percentage_of_largest_weak_threshold' => ( is => 'ro', isa => 'Num', default => 0.9 );

sub _build_number_of_files {
    my ($self) = @_;
    return @{ $self->gff_files };
}

sub _build_groups {
    my ($self) = @_;
    my %groups;
    for my $group_name ( @{ $self->analyse_groups_obj->_groups } ) {

lib/Bio/Roary/Output/BlastIdentityFrequency.pm  view on Meta::CPAN

package Bio::Roary::Output::BlastIdentityFrequency;
$Bio::Roary::Output::BlastIdentityFrequency::VERSION = '3.13.0';
# ABSTRACT:  Take in blast results and find the percentage identity graph


use Moose;
use Bio::SeqIO;
use Bio::Roary::Exceptions;

has 'input_filename'        => ( is => 'ro', isa => 'Str', default => '_blast_results' );
has 'output_filename'       => ( is => 'ro', isa => 'Str', default => 'blast_identity_frequency.Rtab' );

has '_output_fh'            => ( is => 'ro', lazy => 1, builder => '_build__output_fh' );

lib/Bio/Roary/Output/BlastIdentityFrequency.pm  view on Meta::CPAN

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

Bio::Roary::Output::BlastIdentityFrequency - Take in blast results and find the percentage identity graph

=head1 VERSION

version 3.13.0

=head1 SYNOPSIS

Take in blast results and find the percentage identity graph
   use Bio::Roary::Output::BlastIdentityFrequency;

   my $obj = Bio::Roary::Output::BlastIdentityFrequency->new(
       input_filename      => '_blast_results',
       output_filename  => 'blast_identity_frequency.Rtab',
     );
   $obj->create_file();

=head1 AUTHOR

lib/Bio/Roary/SortFasta.pm  view on Meta::CPAN


    my $nnn_at_end_of_all_sequences = 1;
	my $sequence;
	my $variation_detected = 0;
    while ( my $input_seq = $self->_input_seqio->next_seq() ) {
		$sequence = $input_seq->seq if(!defined($sequence));
        $self->_add_padding_to_make_sequence_length_multiple_of_three($input_seq) if ( $self->make_multiple_of_three );
        $nnn_at_end_of_all_sequences = 0 if ( $nnn_at_end_of_all_sequences == 1 && !( $input_seq->seq() =~ /NNN$/i ) );
        $input_sequences{ $input_seq->display_id } = $input_seq;
        
		my $factor = $self->_percentage_similarity($sequence, $input_seq->seq);
        if($factor < $self->similarity)
        {
            $self->similarity($factor);
        }
    }

    $self->_remove_nnn_from_all_sequences( \%input_sequences ) if ( $self->remove_nnn_from_end && $nnn_at_end_of_all_sequences );

    my $sequence_length = 0;
    my $sequences_unaligned = 0;

lib/Bio/Roary/SortFasta.pm  view on Meta::CPAN

    }
    return $self;
}

sub replace_input_with_output_file {
    my ($self) = @_;
    move( $self->output_filename, $self->input_filename );
    return $self;
}

sub _percentage_similarity
{
    my ($self, $string1, $string2) = @_;
    my $num_differences = 0;
    my $string1_length = length($string1);
    for(my $i = 0; $i < $string1_length && $i< length($string2); $i++)
    {
        $num_differences++ if( substr($string1, $i, 1) ne substr($string2, $i, 1));
    }
    return 1 if($num_differences == 0);
    return 0 if($string1_length == 0);

t/Bio/Roary/AccessoryClustering.t  view on Meta::CPAN

    use_ok('Bio::Roary::AccessoryClustering');
}

my $identity_to_num_clusters = {
    '1'    => [ 10, 10 ],
    '0.99' => [ 4,  5 ],
    '0.95' => [ 2,  4 ],
    '0.90' => [ 1,  1 ],
};

for my $percentage_identity ( keys %{$identity_to_num_clusters} ) {
    ok(
        my $obj = Bio::Roary::AccessoryClustering->new(
            input_file => 't/data/input_accessory_binary.fa',
            identity   => $percentage_identity
        ),
        "initialise object with identity of $percentage_identity"
    );
    ok( my @clusters = keys %{ $obj->clusters_to_samples }, "build the clusters for $percentage_identity" );
    ok( $obj->sample_weights,      "build samples weights for $percentage_identity" );
    ok( $obj->samples_to_clusters, "build samples to clusters for $percentage_identity" );

    my $min_cluster_size = $identity_to_num_clusters->{$percentage_identity}->[0];
    my $max_cluster_size = $identity_to_num_clusters->{$percentage_identity}->[1];
    ok(
        ( @clusters >= $min_cluster_size && @clusters <= $max_cluster_size ? 1 : 0 ),
        "check number of clusters as expected, allowing for some variation for $percentage_identity"
    );
}

my $obj = Bio::Roary::AccessoryClustering->new(
    input_file => 't/data/input_accessory_binary.fa',
    identity   => 0.9
);
is_deeply(
    $obj->samples_to_clusters,
    {

t/Bio/Roary/CommandLine/Roary.t  view on Meta::CPAN


    %scripts_and_expected_files = (
        '-o some_different_output -i 90 -p 2 --translation_table 1 t/data/real_data_1.gff t/data/real_data_2.gff' => [ 'some_different_output', 't/data/expected_some_different_output' ],
    	);
    mock_execute_script_and_check_output_sorted( $script_name, \%scripts_and_expected_files, [ 0 ] );
    
    stderr_should_have($script_name,'--translation_table 1  -o some_different_output --core_definition 60 -p 2 -e --mafft  --group_limit 10 t/data/real_data_1.gff t/data/real_data_2.gff', 'Exiting early because number of clusters is too high');
    stderr_should_have($script_name,'--verbose_stats --group_limit 10 -e t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff', 'Exiting early because number of clusters is too high');
    stderr_should_not_have($script_name,'-e --group_limit 10 t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ', 'Cant access the multifasta base directory');
    stderr_should_have($script_name,'-i 90 --core_definition 60 -p 2 -v t/data/real_data_1.gff t/data/real_data_2.gff ','Cleaning up files'); 
    stderr_should_have($script_name,'-i 30 t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff','The percentage identity is too low');
    stderr_should_not_have($script_name,'--dont_delete_files -v t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ','Cleaning up files');
    stderr_should_have($script_name,'-v --group_limit 100000 -e t/data/query_1.gff t/data/query_2.gff t/data/query_5.gff ' ,'Running command: pan_genome_core_alignment');
    stderr_should_have($script_name,'--translation_table 1 -v t/data/real_data_1.gff t/data/real_data_2.gff ' ,'Cleaning up files');
    stderr_should_have($script_name,'-e -v t/data/real_data_1.gff t/data/real_data_2.gff ','Creating files with the nucleotide sequences for every cluster');
    
    SKIP:
    {
        skip "kraken not installed",        2 unless ( which('kraken') );
        skip "kraken-report not installed", 2 unless ( which('kraken-report') );
        stderr_should_have($script_name,'-v --qc t/data/real_data_1.gff t/data/real_data_2.gff' ,'Running Kraken on each input assembly');

t/Bio/Roary/Output/NumberOfGroups.t  view on Meta::CPAN

ok(-e 'number_of_unique_genes.Rtab', 'check raw output file created');
compare_ok('t/data/expected_number_of_unique_genes.tab', 'number_of_unique_genes.Rtab', 'Content of unique groups tab file as expected');
unlink('number_of_unique_genes.Rtab');


# Vary the core
ok($obj = Bio::Roary::Output::NumberOfGroups->new(
  group_statistics_obj => $group_statistics,
  annotate_groups_obj      => $annotate_groups,
  core_definition => 0.6
  ),"initialise object with 60 percent core definition");
ok($obj->create_output_files, 'create the raw output files for 60 percent core def');
compare_ok('t/data/expected_number_of_conserved_genes_0.6.tab','number_of_conserved_genes.Rtab', 'Content of conserved genes with 60 percent core def');

unlink('number_of_conserved_genes.Rtab');
unlink('number_of_new_genes.Rtab');
unlink('number_of_genes_in_pan_genome.Rtab');
unlink('number_of_unique_genes.Rtab');
unlink('group_statitics.csv');

done_testing();

t/Bio/Roary/SortFasta.t  view on Meta::CPAN


ok( $obj = Bio::Roary::SortFasta->new(
  input_filename   => 't/data/uneven_sequences.fa',
  make_multiple_of_three => 1,
  remove_nnn_from_end => 1,
), 'initalise object with uneven sequences and remove nnn from end but nothing to remove');
ok($obj->sort_fasta, 'sort the fasta file');
compare_ok($obj->output_filename, 't/data/expected_uneven_sequences.fa', "output sequences are now divisible by three and no nnn removed");


is(0,$obj->_percentage_similarity("AAA","BBB"), 'totally different');
is(1,$obj->_percentage_similarity("AAA","AAA"), 'all the same');
is(0.5,$obj->_percentage_similarity("AAAA","AABB"), 'half different');
is(1,$obj->_percentage_similarity("AAAA","AAAABB"), 'first half the same');

done_testing();



( run in 0.484 second using v1.01-cache-2.11-cpan-709fd43a63f )