Bio_AssemblyImprovement

 view release on metacpan or  search on metacpan

lib/Bio/AssemblyImprovement/Util/FastqTools.pm  view on Meta::CPAN

package Bio::AssemblyImprovement::Util::FastqTools;

# ABSTRACT: Take a fastq file and calculate some useful statistics and values


use Moose;
use Statistics::Lite qw(:all);
use Cwd;
use Cwd 'abs_path';
use File::Basename;
#use GD::Graph::histogram;
use Bio::SeqIO;

with 'Bio::AssemblyImprovement::Util::GetReadLengthsRole';
with 'Bio::AssemblyImprovement::Util::ZipFileRole';
with 'Bio::AssemblyImprovement::Util::UnzipFileIfNeededRole';

has 'input_filename'   => ( is => 'ro', isa => 'Str' , required => 1 );
has 'single_cell'      => ( is => 'ro', isa => 'Bool', default => 0);


# sub draw_histogram_of_read_lengths {
# 	my ($self) = @_;
#
# 	my $fastq_file = $self->_gunzip_file_if_needed( $self->input_filename );
#
# 	my $arrayref = $self->_get_read_lengths($fastq_file);
#
# 	# Set graph details
# 	my $graph = new GD::Graph::histogram(400,600);
# 	$graph->set(
#                 x_label         => 'Read length',
#                 y_label         => 'Number of reads',
#                 title           => 'Histogram of read lengths for '.$self->input_filename,
#                 x_labels_vertical => 1,
#                 bar_spacing     => 0,
#                 shadow_depth    => 1,
#                 shadowclr       => 'dred',
#                 transparent     => 0,
#     )
#     or warn $graph->error;
#     #Draw the graph
#     my $gd = $graph->plot($arrayref) or die $graph->error;
#     #Store the graph
#     open(IMG, '>histogram.png') or die "Could not open a file called histogram.png";
#     binmode IMG;
#     print IMG $gd->png;
#
#     $self->_cleaup_after_ourselves($fastq_file);
#
#
# }

sub calculate_kmer_sizes {

	my ($self) = @_;
    my %kmer_size;
	
    if ($self->single_cell) {
        my $read_length = $self->first_read_length();
        $kmer_size{min} = int($read_length * 0.5);
        $kmer_size{max} = int($read_length * 0.95);
    }
    else {
        my $fastq_file = $self->_gunzip_file_if_needed( $self->input_filename );

        my $arrayref = $self->_get_read_lengths($fastq_file);
        my $median = median(@$arrayref);

        # Set a minimum median so that the min kmer length stays at a reasonable value
        if($median < 30){
            $median = 30;
        }

        $kmer_size{min} = int($median*0.66); #66% of median read length will be minimum kmer length
        $kmer_size{max} = int($median*0.90); #90% of median read length will be maximum kmer length

        $self->_cleaup_after_ourselves($fastq_file);
    }

    if($kmer_size{min} % 2 == 0){
        $kmer_size{min}--;
    }

    if($kmer_size{max} % 2 == 0){
        $kmer_size{max}--;
    }
  	
  	return %kmer_size;
	
}

sub calculate_coverage {

	my ($self, $expected_genome_size) = @_;
	unless ($expected_genome_size) { return undef };
	
	my $fastq_file = $self->_gunzip_file_if_needed( $self->input_filename );
	



( run in 0.569 second using v1.01-cache-2.11-cpan-39bf76dae61 )