App-Sandy

 view release on metacpan or  search on metacpan

lib/App/Sandy/DB/Handle/Variation.pm  view on Meta::CPAN

package App::Sandy::DB::Handle::Variation;
# ABSTRACT: Class to handle structural variation database schemas.

use App::Sandy::Base 'class';
use App::Sandy::DB;
use IO::Compress::Gzip 'gzip';
use IO::Uncompress::Gunzip 'gunzip';
use Storable qw/nfreeze thaw/;
use Scalar::Util qw/looks_like_number refaddr/;
use List::Util 'max';

with qw/App::Sandy::Role::IO App::Sandy::Role::SeqID/;

our $VERSION = '0.25'; # VERSION

sub insertdb {
	my ($self, $file, $name, $source, $is_user_provided, $type, @args) = @_;
	my $schema = App::Sandy::DB->schema;

	log_msg ":: Checking if there is already a structural variation '$name' ...";
	my $rs = $schema->resultset('StructuralVariation')->find({ name => $name });
	if ($rs) {
		die "There is already a structural variation '$name'\n";
	} else {
		log_msg ":: structural variation '$name' not found";
	}

	log_msg ":: Indexing '$file'";
	log_msg ":: It may take a while ...";
	my $indexed_file;

	if ($type eq 'raw') {
		$indexed_file = $self->_index_snv($file);
	} elsif ($type eq 'vcf') {
		$indexed_file = $self->_index_vcf($file, @args);
	} else {
		croak "No type '$type'";
	}

	log_msg ":: Removing overlapping entries in structural variation file '$file', if any ...";
	$self->_validate_indexed_snv($indexed_file, $file);

	log_msg ":: Converting data to bytes ...";
	my $bytes = nfreeze $indexed_file;
	log_msg ":: Compressing bytes ...";
	gzip \$bytes => \my $compressed;

	# Begin transation
	my $guard = $schema->txn_scope_guard;

	log_msg ":: Storing structural variation '$name'...";
	$rs = $schema->resultset('StructuralVariation')->create({
		name             => $name,
		source           => $source,
		is_user_provided => $is_user_provided,
		matrix           => $compressed
	});

	# End transation
	$guard->commit;
}

sub _index_snv {
	my ($self, $variation_file) = @_;

	my $fh = $self->with_open_r($variation_file);

	my %indexed_snv;
	my $line = 0;

	# chr pos ref @obs he
	while (<$fh>) {
		$line++;

		# Skip coments and blank lines
		next if /^#/;
		next if /^\s*$/;

		chomp;
		my @fields = split;

		die "Not found all fields (SEQID, POSITION, ID, REFERENCE, OBSERVED, GENOTYPE) into file '$variation_file' at line $line\n"
			unless scalar @fields >= 6;

		die "Second column, position, does not seem to be a number into file '$variation_file' at line $line\n"
			unless looks_like_number($fields[1]);

		die "Second column, position, has a value lesser or equal to zero into file '$variation_file' at line $line\n"
			if $fields[1] <= 0;

		die "Fourth column, reference, does not seem to be a valid entry: '$fields[3]' into file '$variation_file' at line $line\n"
			unless $fields[3] =~ /^(\w+|-)$/;

		die "Fifth column, alteration, does not seem to be a valid entry: '$fields[4]' into file '$variation_file' at line $line\n"
			unless $fields[4] =~ /^(\w+|-)$/;

		die "Sixth column, genotype, has an invalid entry: '$fields[5]' into file '$variation_file' at line $line. Valid ones are 'HE' or 'HO'\n"
			unless $fields[5] =~ /^(HE|HO)$/;

		if ($fields[3] eq $fields[4]) {
			warn "There is an alteration equal to the reference at '$variation_file' line $line. I will ignore it\n";
			next;
		}

		# Sequence inside perl begins at 0
		my $position = int($fields[1] - 1);

		# Compare the alterations and reference to guess the max variation on sequence
		my $size_of_variation = max map { length } $fields[3], $fields[4];
		my $high = $position + $size_of_variation - 1;

		my %variation = (
			seq_id => $fields[0],
			id     => $fields[2],
			ref    => $fields[3],
			alt    => $fields[4],
			plo    => $fields[5],
			pos    => $position,
			low    => $position,
			high   => $high,
			line   => $line
		);

		push @{ $indexed_snv{$self->with_std_seqid($fields[0])} } => \%variation;
	}

	close $fh
		or die "Cannot close structural variation file '$variation_file': $!\n";

	return \%indexed_snv;
}

sub _index_vcf {
	my ($self, $vcf_file, $sample_name) = @_;

	my $fh = $self->with_open_r($vcf_file);

	my $magic = <$fh>;
	chomp $magic;

	if ($magic !~ /^##fileformat=VCFv\d+(\.\d+)?$/) {
		log_msg ":: Putative VCF file '$vcf_file' does not have the required 'fileformat=VCFv{VERSION}' field\n";
		log_msg ":: Continue anyway ...\n";
	}

	my $sample_i = 9;
	my $has_header = 0;
	my $line = 1;
	my %indexed_snv;

	while (<$fh>) {
		$line ++;

		next if /^##/;
		next if /^\s*$/;

		chomp;
		my @fields = split /\t/;

		if (/^#/) {
			my $number_of_fields = scalar @fields;

			if ($number_of_fields < 8) {
				die "Invalid header: Less than 8 columns at line $line\n";
			} elsif ($number_of_fields == 8) {
				die "No genotype data found in the header at line $line\n";
			} elsif ($number_of_fields == 9) {
				die "Invalid header: Missing sample data at line $line\n";
			}

			my @samples = splice @fields, 9;

			if (defined $sample_name) {
				my %columns;

				for my $i (0..$#samples) {
					$columns{$samples[$i]} = $i;
				}

				if (not exists $columns{$sample_name}) {
					die "Not found sample named '$sample_name' at file '$vcf_file'\n";
				}

				$sample_i += $columns{$sample_name};
			} else {
				$sample_name = $samples[$sample_i];
			}

			$has_header = 1;
			next;
		}

		unless ($has_header) {
			die "No header found in file '$vcf_file'\n";
		}

		die "Not found all columns into file '$vcf_file' at line $line\n"
			unless scalar @fields > 9;

		die "Second column, position, does not seem to be a number into file '$vcf_file' at line $line\n"
			unless looks_like_number($fields[1]);

		die "Second column, position, has a value lesser or equal to zero into file '$vcf_file' at line $line\n"
			if $fields[1] <= 0;

		die "Fourth column, reference, does not seem to be a valid entry: '$fields[3]' into file '$vcf_file' at line $line\n"
			unless $fields[3] =~ /^(<\w+(:\w+)*>|\w+|-)$/;

		die "Fifth column, alternate, does not seem to be a valid entry: '$fields[4]' into file '$vcf_file' at line $line\n"
			unless $fields[4] =~ /^(\w+|<\w+(:\w+)*>|-|\*)(,(\w+|<\w+(:\w+)*>|-|\*))*$/;

		die "No genotype field found in format column at line $line\n"
			unless $fields[8] =~ /^GT:?/;

		die "Sample '$sample_name', column '$sample_i', does not seem to be a valid entry: '$fields[$sample_i]' into file '$vcf_file' at line $line\n"
			unless $fields[$sample_i] =~ /^[0-9.]+([\/\|][0-9.])?:*/;

		if ($fields[3] eq $fields[4]) {
			log_msg ":: There is an alternate equal to the reference at '$vcf_file' line $line. I will ignore it\n";
			next;
		}

		# No support for structural variation <VAR>
		# So ignore it
		if ($fields[3] =~ /<\w+>/ || $fields[4] =~ /<\w+>/) {
			next;
		}

		# Ignore missing alleles
		if ($fields[4] =~ /\*/) {
			next;
		}

		my @alternates = split /,/ => $fields[4];
		my $genotype = (split /:/ => $fields[$sample_i])[0];

		my ($plo, $alternate);
		my ($g1, $g2) = split /\|/ => $genotype;


		if ($g1 eq '.' || (defined $g2 && $g2 eq '.')) {
			next;
		}

		# Homozygosity or haploid chromossome
		if (! defined($g2) || ($g1 == $g2)) {
			# No variation
			next if $g1 == 0;
			$alternate = $alternates[$g1 - 1];
			$plo = 'HO';
		} else {
			# The alternate is the one with the max index, in the case
			# of a variation with the two alleles been alterations
			my $i = $g1 > $g2
				? $g1 - 1
				: $g2 - 1;
			$alternate = $alternates[$i];
			$plo = 'HE';
		}

		# Sequence inside perl begins at 0



( run in 2.098 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )