CLIPSeqTools

 view release on metacpan or  search on metacpan

lib/CLIPSeqTools/App/genome_coverage.pm  view on Meta::CPAN

=head1 NAME

CLIPSeqTools::App::genome_coverage - Measure percent of genome covered by reads.

=head1 SYNOPSIS

clipseqtools genome_coverage [options/parameters]

=head1 DESCRIPTION

Measure the percent of genome that is covered by the reads of a library.

=head1 OPTIONS

  Input options for library.
    --driver <Str>         driver for database connection (eg. mysql,
                           SQLite).
    --database <Str>       database name or path to database file for file
                           based databases (eg. SQLite).
    --table <Str>          database table.
    --host <Str>           hostname for database connection.
    --user <Str>           username for database connection.
    --password <Str>       password for database connection.
    --records_class <Str>  type of records stored in database.
    --filter <Filter>      filter library. May be used multiple times.
                           Syntax: column_name="pattern"
                           e.g. keep reads with deletions AND not repeat
                                masked AND longer than 31
                                --filter deletion="def" 
                                --filter rmsk="undef" .
                                --filter query_length=">31".
                           Operators: >, >=, <, <=, =, !=, def, undef

  Other input.
    --rname_sizes <Str>    file with sizes for reference alignment
                           sequences (rnames). Must be tab delimited
                           (chromosome\tsize) with one line per rname.

  Output
    --o_prefix <Str>       output path prefix. Script will create and add
                           extension to path. [Default: ./]

  Other options.
    -v --verbose           print progress lines and extra information.
    -h -? --usage --help   print help message

=cut


package CLIPSeqTools::App::genome_coverage;
$CLIPSeqTools::App::genome_coverage::VERSION = '1.0.0';

# Make it an app command
use MooseX::App::Command;
extends 'CLIPSeqTools::App';


#######################################################################
#######################   Load External modules   #####################
#######################################################################
use Modern::Perl;
use autodie;
use namespace::autoclean;
use PDL::Lite;


#######################################################################
#######################   Command line options   ######################
#######################################################################
option 'rname_sizes' => (
	is            => 'rw',
	isa           => 'Str',
	required      => 1,
	documentation => 'file with sizes for reference alignment sequences (rnames). Must be tab delimited (chromosome\tsize) with one line per rname.',
);


#######################################################################
##########################   Consume Roles   ##########################
#######################################################################
with 
	"CLIPSeqTools::Role::Option::Library" => {
		-alias    => { validate_args => '_validate_args_for_library' },
		-excludes => 'validate_args',
	},
	"CLIPSeqTools::Role::Option::OutputPrefix" => {
		-alias    => { validate_args => '_validate_args_for_output_prefix' },
		-excludes => 'validate_args',
	};

	
#######################################################################
########################   Interface Methods   ########################
#######################################################################
sub validate_args {
	my ($self) = @_;
	
	$self->_validate_args_for_library;
	$self->_validate_args_for_output_prefix;
	$self->usage_error('File with sizes for reference alignment sequences is required') if !$self->rname_sizes;
}

sub run {
	my ($self) = @_;
	
	warn "Starting analysis: genome_coverage\n";
	
	warn "Validating arguments\n" if $self->verbose;
	$self->validate_args();
	
	warn "Reading sizes for reference alignment sequences\n" if $self->verbose;
	my %rname_sizes = $self->read_rname_sizes;
		
	warn "Creating reads collection\n" if $self->verbose;
	my $reads_collection = $self->reads_collection;
	my @rnames = $reads_collection->rnames_for_all_strands;
		
	warn "Creating output path\n" if $self->verbose;
	$self->make_path_for_output_prefix();
		
	warn "Calculating genome coverage\n" if $self->verbose;
	my $total_genome_coverage = 0;
	my $total_genome_length = 0;
	open(my $OUT, '>', $self->o_prefix.'genome_coverage.tab');
	say $OUT join("\t", 'rname', 'covered_area', 'size', 'percent_covered');
	foreach my $rname (@rnames) {
		warn "Working for $rname\n" if $self->verbose;
		my $pdl = PDL->zeros(PDL::byte(), $rname_sizes{$rname});
		
		$reads_collection->foreach_record_on_rname_do($rname, sub {
			$pdl->slice([$_[0]->start, $_[0]->stop]) .= 1;
			return 0;
		});
		
		my $covered_area = $pdl->sum();
		say $OUT join("\t", $rname, $covered_area, $rname_sizes{$rname}, $covered_area/$rname_sizes{$rname}*100);
		
		$total_genome_coverage += $covered_area;
		$total_genome_length += $rname_sizes{$rname};
	}
	say $OUT join("\t", 'Total', $total_genome_coverage, $total_genome_length, $total_genome_coverage/$total_genome_length*100);
	close $OUT;
}

sub read_rname_sizes {
	my ($self) = @_;
	
	my %rname_size;
	open (my $CHRSIZE, '<', $self->rname_sizes);
	while (my $line = <$CHRSIZE>) {
		chomp $line;
		my ($chr, $size) = split(/\t/, $line);
		$rname_size{$chr} = $size;
	}
	close $CHRSIZE;
	return %rname_size;
}


1;



( run in 3.123 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )