BioPerl
view release on metacpan or search on metacpan
Bio/SearchIO/gmap_f9.pm view on Meta::CPAN
my $a = {}; # args w/ which we'll create hsp
my $hsp;
my $identical;
$a->{-algorithm} = 'GMAP';
for my $c (@{$info}) {
$a->{-query_seq} .= $c->{query_base};
$a->{-hit_seq} .= $c->{hit_base};
$a->{-homology_seq} .= $c->{query_base} eq $c->{hit_base} ? $c->{hit_base} : ' ';
$identical++ if ( $c->{query_base} eq $c->{hit_base} );
}
$a->{-query_seq} =~ s| |\-|g; # switch to bioperl gaps.
$a->{-hit_seq} =~ s| |\-|g;
$a->{-conserved} = $a->{-identical} = $identical;
# use the coordinates from from gmap's -f 9 output to
# determine whether gmap revcomped the query sequence
# to generate the alignment. Note that this is not
# the same as the cDNA's sense/anti-sense-ness.
$a->{-stranded} = 'both';
$a->{-query_start} = $info->[0]->{query_pos};
$a->{-query_end} = $info->[-1]->{query_pos};
$a->{-query_length} = $a->{-query_end} - $a->{-query_start} + 1;
# hit can be either strand, -f 9 output tells us which.
# we don't have to worry about it here, but telling the generichsp code
# that this hit is 'stranded', it compares the start and end positions
# sets it for us.
$a->{-hit_start} = $info->[0]->{hit_pos};
$a->{-hit_end} = $info->[-1]->{hit_pos};
$a->{-hit_length} = abs($a->{-hit_end} - $a->{-hit_start}) + 1;
$a->{-hsp_length} =
$a->{-query_length} > $a->{-hit_length} ?
$a->{-query_length} : $a->{-hit_length};
$hsp = Bio::Search::HSP::GenericHSP->new( %$a );
return $hsp;
}
# TODO (adjust regexp to swallow lines w/out md5 sig's.
sub _parse_path_header {
my $self = shift;
my $path_line = shift;
my $path = {};
(
$path->{query},
$path->{db},
$path->{path_num},
$path->{path_total_num},
$path->{query_length},
$path->{exon_count},
$path->{trimmed_coverage},
$path->{percent_identity},
$path->{query_start},
$path->{query_end},
$path->{whole_genome_start},
$path->{whole_genome_end},
$path->{chromosome},
$path->{chromo_start},
$path->{chromo_end},
$path->{strand},
$path->{sense},
$path->{md5},
) =
($_ =~ qr|
>
([^ ]*)[ ] # the query id}, followed by a space
([^ ]*)[ ] # the genome database, followed by a space
(\d+)/(\d+)[ ] # path_num/path_total_num (e.g. 3/12)
(\d+)[ ] # query length, followed by a space
(\d+)[ ] # hsp/exon count, followed by a space
(\d+\.\d*)[ ] # trimmed coverage
(\d+\.\d*)[ ] # percent identity
(\d+)\.\.(\d+)[ ] # query start .. query end, followed by space
(\d+)\.\.(\d+)[ ] # whole genome s..e, followed by space
(\d+): # chromosome number
(\d+)\.\.(\d+)[ ] # chromo s..e, followed by a space
([+-])[ ] # strand, followed by a space
dir:(.*) # dir:sense or dir:antisense
[ ]md5:([\dabcdefg]+) # md5 signature
|x
);
$path->{query} or $self->throw("query was not found in path line.");
$path->{db} or $self->throw("db was not found in path line.");
$path->{path_num} or $self->throw("path_num was not found in path line.");
$path->{path_total_num} or
$self->throw("path_total_num was not found in path line.");
$path->{query_length} or
$self->throw("query_length was not found in path line.");
$path->{exon_count} or
$self->throw("exon_count was not found in path line.");
$path->{trimmed_coverage} or
$self->throw("trimmed_coverage was not found in path line.");
$path->{percent_identity} or
$self->throw("percent_identity was not found in path line.");
$path->{query_start} or
$self->throw("query_start was not found in path line.");
$path->{query_end} or
$self->throw("query_end was not found in path line.");
$path->{whole_genome_start} or
$self->throw("whole_genome_start was not found in path line.");
$path->{whole_genome_end} or
$self->throw("whole_genome_end was not found in path line.");
$path->{chromosome} or
$self->throw("chromosome was not found in path line.");
$path->{chromo_start} or
$self->throw("chromo_start was not found in path line.");
$path->{chromo_end} or
$self->throw("chromo_end was not found in path line.");
$path->{strand} or $self->throw("strand was not found in path line.");
$path->{sense} or $self->throw("sense was not found in path line.");
return $path;
}
sub _parse_alignment_line {
my $self = shift;
my $a_line = shift;
my $align = {};
(
$align->{chromo_start},
$align->{chromo_end},
$align->{query_start},
$align->{query_end},
$align->{percent_identity},
$align->{align_length},
$align->{intron_length},
) =
($_ =~ qr|
[\t]
([\d]+)[ ] # start in chromosome coord.
([\d]+)[ ] # end in chromosome coord.
([\d]+)[ ] # start in query coord.
([\d]+)[ ] # end in query coord.
([\d]+) # percent identity (as integer)
[\t].*[\t] # skip the edit script
([\d]+) # length of alignment block.
[\t]*([\d]+)* # length of following intron.
|x
);
$align->{chromo_start}
or $self->throw("chromo_start missing in alignment line.");
$align->{chromo_end},
or $self->throw("chromo_end was missing in alignment line.");
$align->{query_start},
or $self->throw("query_start was missing in alignment line.");
$align->{query_end},
or $self->throw("query_end was missing in alignment line.");
$align->{percent_identity},
or $self->throw("percent_identity was missing in alignment line.");
$align->{align_length},
or $self->throw("align_length was missing in alignment line.");
return $align;
}
1;
( run in 0.804 second using v1.01-cache-2.11-cpan-39bf76dae61 )