App-Sandy
view release on metacpan or search on metacpan
lib/App/Sandy/DB/Handle/Variation.pm view on Meta::CPAN
# Begin transation
my $guard = $schema->txn_scope_guard;
log_msg ":: Storing structural variation '$name'...";
$rs = $schema->resultset('StructuralVariation')->create({
name => $name,
source => $source,
is_user_provided => $is_user_provided,
matrix => $compressed
});
# End transation
$guard->commit;
}
sub _index_snv {
my ($self, $variation_file) = @_;
my $fh = $self->with_open_r($variation_file);
my %indexed_snv;
my $line = 0;
# chr pos ref @obs he
while (<$fh>) {
$line++;
# Skip coments and blank lines
next if /^#/;
next if /^\s*$/;
chomp;
my @fields = split;
die "Not found all fields (SEQID, POSITION, ID, REFERENCE, OBSERVED, GENOTYPE) into file '$variation_file' at line $line\n"
unless scalar @fields >= 6;
die "Second column, position, does not seem to be a number into file '$variation_file' at line $line\n"
unless looks_like_number($fields[1]);
die "Second column, position, has a value lesser or equal to zero into file '$variation_file' at line $line\n"
if $fields[1] <= 0;
die "Fourth column, reference, does not seem to be a valid entry: '$fields[3]' into file '$variation_file' at line $line\n"
unless $fields[3] =~ /^(\w+|-)$/;
die "Fifth column, alteration, does not seem to be a valid entry: '$fields[4]' into file '$variation_file' at line $line\n"
unless $fields[4] =~ /^(\w+|-)$/;
die "Sixth column, genotype, has an invalid entry: '$fields[5]' into file '$variation_file' at line $line. Valid ones are 'HE' or 'HO'\n"
unless $fields[5] =~ /^(HE|HO)$/;
if ($fields[3] eq $fields[4]) {
warn "There is an alteration equal to the reference at '$variation_file' line $line. I will ignore it\n";
next;
}
# Sequence inside perl begins at 0
my $position = int($fields[1] - 1);
# Compare the alterations and reference to guess the max variation on sequence
my $size_of_variation = max map { length } $fields[3], $fields[4];
my $high = $position + $size_of_variation - 1;
my %variation = (
seq_id => $fields[0],
id => $fields[2],
ref => $fields[3],
alt => $fields[4],
plo => $fields[5],
pos => $position,
low => $position,
high => $high,
line => $line
);
push @{ $indexed_snv{$self->with_std_seqid($fields[0])} } => \%variation;
}
close $fh
or die "Cannot close structural variation file '$variation_file': $!\n";
return \%indexed_snv;
}
sub _index_vcf {
my ($self, $vcf_file, $sample_name) = @_;
my $fh = $self->with_open_r($vcf_file);
my $magic = <$fh>;
chomp $magic;
if ($magic !~ /^##fileformat=VCFv\d+(\.\d+)?$/) {
log_msg ":: Putative VCF file '$vcf_file' does not have the required 'fileformat=VCFv{VERSION}' field\n";
log_msg ":: Continue anyway ...\n";
}
my $sample_i = 9;
my $has_header = 0;
my $line = 1;
my %indexed_snv;
while (<$fh>) {
$line ++;
next if /^##/;
next if /^\s*$/;
chomp;
my @fields = split /\t/;
if (/^#/) {
my $number_of_fields = scalar @fields;
if ($number_of_fields < 8) {
die "Invalid header: Less than 8 columns at line $line\n";
} elsif ($number_of_fields == 8) {
die "No genotype data found in the header at line $line\n";
} elsif ($number_of_fields == 9) {
die "Invalid header: Missing sample data at line $line\n";
lib/App/Sandy/DB/Handle/Variation.pm view on Meta::CPAN
if $fields[1] <= 0;
die "Fourth column, reference, does not seem to be a valid entry: '$fields[3]' into file '$vcf_file' at line $line\n"
unless $fields[3] =~ /^(<\w+(:\w+)*>|\w+|-)$/;
die "Fifth column, alternate, does not seem to be a valid entry: '$fields[4]' into file '$vcf_file' at line $line\n"
unless $fields[4] =~ /^(\w+|<\w+(:\w+)*>|-|\*)(,(\w+|<\w+(:\w+)*>|-|\*))*$/;
die "No genotype field found in format column at line $line\n"
unless $fields[8] =~ /^GT:?/;
die "Sample '$sample_name', column '$sample_i', does not seem to be a valid entry: '$fields[$sample_i]' into file '$vcf_file' at line $line\n"
unless $fields[$sample_i] =~ /^[0-9.]+([\/\|][0-9.])?:*/;
if ($fields[3] eq $fields[4]) {
log_msg ":: There is an alternate equal to the reference at '$vcf_file' line $line. I will ignore it\n";
next;
}
# No support for structural variation <VAR>
# So ignore it
if ($fields[3] =~ /<\w+>/ || $fields[4] =~ /<\w+>/) {
next;
}
# Ignore missing alleles
if ($fields[4] =~ /\*/) {
next;
}
my @alternates = split /,/ => $fields[4];
my $genotype = (split /:/ => $fields[$sample_i])[0];
my ($plo, $alternate);
my ($g1, $g2) = split /\|/ => $genotype;
if ($g1 eq '.' || (defined $g2 && $g2 eq '.')) {
next;
}
# Homozygosity or haploid chromossome
if (! defined($g2) || ($g1 == $g2)) {
# No variation
next if $g1 == 0;
$alternate = $alternates[$g1 - 1];
$plo = 'HO';
} else {
# The alternate is the one with the max index, in the case
# of a variation with the two alleles been alterations
my $i = $g1 > $g2
? $g1 - 1
: $g2 - 1;
$alternate = $alternates[$i];
$plo = 'HE';
}
# Sequence inside perl begins at 0
my $position = int($fields[1] - 1);
# Compare the alterations and reference to guess the max variation on sequence
my $size_of_variation = max map { length } $fields[3], $alternate;
my $high = $position + $size_of_variation - 1;
my %variation = (
seq_id => $fields[0],
id => $fields[2],
ref => $fields[3],
alt => $alternate,
plo => $plo,
pos => $position,
low => $position,
high => $high,
line => $line
);
push @{ $indexed_snv{$self->with_std_seqid($fields[0])} } => \%variation;
}
close $fh
or die "Cannot close vcf file '$vcf_file': $!\n";
return \%indexed_snv;
}
sub _validate_indexed_snv {
my ($self, $indexed_snv, $variation_file) = @_;
for my $seq_id (keys %$indexed_snv) {
my $snvs_a = delete $indexed_snv->{$seq_id};
my @sorted_snvs = sort { $a->{low} <=> $b->{low} } @$snvs_a;
my $prev_snv = $sorted_snvs[0];
my $high = $prev_snv->{high};
push my @snv_cluster => $prev_snv;
for (my $i = 1; $i < @sorted_snvs; $i++) {
my $next_snv = $sorted_snvs[$i];
# If not overlapping
if ($next_snv->{low} > $high) {
my $valid_snvs = $self->_validate_indexed_snv_cluster(\@snv_cluster, $variation_file);
push @{ $indexed_snv->{$seq_id} } => @$valid_snvs;
@snv_cluster = ();
}
push @snv_cluster => $next_snv;
$high = max $high, $next_snv->{high};
$prev_snv = $next_snv;
}
my $valid_snvs = $self->_validate_indexed_snv_cluster(\@snv_cluster, $variation_file);
push @{ $indexed_snv->{$seq_id} } => @$valid_snvs;
}
}
sub _validate_indexed_snv_cluster {
my ($self, $snvs, $variation_file) = @_;
# My rules:
# The biggest structural variation gains precedence.
( run in 0.863 second using v1.01-cache-2.11-cpan-39bf76dae61 )