BioPerl
view release on metacpan - search on metacpan
view release on metacpan or search on metacpan
Bio/Tools/Sim4/Results.pm view on Meta::CPAN
sub parse_next_alignment {
my ($self) = @_;
my @exons = ();
my %seq1props = ();
my %seq2props = ();
# we refer to the properties of each seq by reference
my ($estseq, $genomseq, $to_reverse);
my $started = 0;
my $hit_direction = 1;
my $output_fmt = 3; # same as 0 and 1 (we cannot deal with A=2 produced
# output yet)
while(defined($_ = $self->_readline())) {
#chomp();
#
# bascially, each sim4 'hit' starts with seq1...
#
/^seq1/ && do {
if($started) {
$self->_pushback($_);
last;
}
$started = 1;
# filename and length of seq 1
/^seq1\s+=\s+(\S+)\,\s+(\d+)/ ||
$self->throw("Sim4 parsing error on seq1 [$_] line. Sorry!");
$seq1props{'filename'} = $1;
$seq1props{'length'} = $2;
next;
};
/^seq2/ && do {
# the second hit has also the database name in the >name syntax
# (in brackets).
/^seq2\s+=\s+(\S+)\s+\(>?(\S+)\s*\)\,\s+(\d+)/||
$self->throw("Sim4 parsing error on seq2 [$_] line. Sorry!");
$seq2props{'filename'} = $1;
$seq2props{'seqname'} = $2;
$seq2props{'length'} = $3;
next;
};
if(/^>(\S+)\s*(.*)$/) {
# output option was A=4, which not only gives the complete
# description lines, but also causes the longer sequence to be
# reversed if the second file contained one (genomic) sequence
$seq1props{'seqname'} = $1;
$seq1props{'description'} = $2 if $2;
$output_fmt = 4;
# we handle seq1 and seq2 both here
if(defined($_ = $self->_readline()) && (/^>(\S+)\s*(.*)$/)) {
$seq2props{'seqname'} = $1; # redundant, since already set above
$seq2props{'description'} = $2 if $2;
}
next;
}
/^\(complement\)/ && do {
$hit_direction = -1;
next;
};
# this matches
# start-end (start-end) pctid%
if(/(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)%/) {
$seq1props{'start'} = $1;
$seq1props{'end'} = $2;
$seq2props{'start'} = $3;
$seq2props{'end'} = $4;
my $pctid = $5;
if(! defined($estseq)) {
# for the first time here: need to set the references referring
# to seq1 and seq2
if(! exists($self->{'_est_is_first'})) {
# detect which one is the EST by looking at the lengths,
# and assume that this holds throughout the entire result
# file (i.e., when this method is called for the next
# alignment, this will not be checked again)
if($seq1props{'length'} > $seq2props{'length'}) {
$self->{'_est_is_first'} = 0;
} else {
$self->{'_est_is_first'} = 1;
}
}
if($self->{'_est_is_first'}) {
$estseq = \%seq1props;
$genomseq = \%seq2props;
# if the EST is given first, A=4 selects the genomic
# seq for being reversed (reversing the EST is default)
$to_reverse = ($output_fmt == 4) ? $genomseq : $estseq;
} else {
$estseq = \%seq2props;
$genomseq = \%seq1props;
# if the EST is the second, A=4 does not change the
# seq being reversed (always the EST is reversed)
$to_reverse = $estseq;
}
}
if($hit_direction == -1) {
# we have to reverse the coordinates of one of both seqs
my $tmp = $to_reverse->{'start'};
$to_reverse->{'start'} =
$to_reverse->{'length'} - $to_reverse->{'end'} + 1;
$to_reverse->{'end'} = $to_reverse->{'length'} - $tmp + 1;
}
# create and initialize the exon object
my $exon = Bio::Tools::Sim4::Exon->new(
'-start' => $genomseq->{'start'},
'-end' => $genomseq->{'end'},
'-strand' => $hit_direction);
if(exists($genomseq->{'seqname'})) {
$exon->seq_id($genomseq->{'seqname'});
} else {
# take filename stripped of path as fall back
my ($basename) = &File::Basename::fileparse($genomseq->{'filename'}, '\..*');
$exon->seq_id($basename);
}
$exon->feature1()->add_tag_value('filename',
$genomseq->{'filename'});
# feature1 is supposed to be initialized to a Similarity object,
# but we provide a safety net
if($exon->feature1()->can('seqlength')) {
$exon->feature1()->seqlength($genomseq->{'length'});
view all matches for this distributionview release on metacpan - search on metacpan
( run in 3.138 seconds using v1.00-cache-2.02-grep-82fe00e-cpan-cec75d87357c )