BioPerl

 view release on metacpan or  search on metacpan

Bio/Tools/Sim4/Results.pm  view on Meta::CPAN

sub parse_next_alignment {
   my ($self) = @_;
   my @exons = ();
   my %seq1props = ();
   my %seq2props = ();
   # we refer to the properties of each seq by reference
   my ($estseq, $genomseq, $to_reverse);
   my $started = 0;
   my $hit_direction = 1;
   my $output_fmt = 3; # same as 0 and 1 (we cannot deal with A=2 produced
                       # output yet)
   
   while(defined($_ = $self->_readline())) {
       #chomp();
       #
       # bascially, each sim4 'hit' starts with seq1...
       #
       /^seq1/ && do {
	   if($started) {
	       $self->_pushback($_);
	       last;
	   }
	   $started = 1;

	   # filename and length of seq 1
	   /^seq1\s+=\s+(\S+)\,\s+(\d+)/ ||
	       $self->throw("Sim4 parsing error on seq1 [$_] line. Sorry!");
	   $seq1props{'filename'} = $1;
	   $seq1props{'length'} = $2;
	   next;
       };
       /^seq2/ && do {
	   # the second hit has also the database name in the >name syntax 
	   # (in brackets).
	   /^seq2\s+=\s+(\S+)\s+\(>?(\S+)\s*\)\,\s+(\d+)/||
	       $self->throw("Sim4 parsing error on seq2 [$_] line. Sorry!");
	   $seq2props{'filename'} = $1;
	   $seq2props{'seqname'} = $2;
	   $seq2props{'length'} = $3;
	   next;
       };
       if(/^>(\S+)\s*(.*)$/) {
	   # output option was A=4, which not only gives the complete
	   # description lines, but also causes the longer sequence to be
	   # reversed if the second file contained one (genomic) sequence
	   $seq1props{'seqname'} = $1;
	   $seq1props{'description'} = $2 if $2;
	   $output_fmt = 4;
	   # we handle seq1 and seq2 both here
	   if(defined($_ = $self->_readline()) && (/^>(\S+)\s*(.*)$/)) {
	       $seq2props{'seqname'} = $1; # redundant, since already set above
	       $seq2props{'description'} = $2 if $2;
	   }
	   next;
       }
       /^\(complement\)/ && do {
	   $hit_direction = -1;
	   next;
       };
       # this matches
       # start-end (start-end) pctid%
       if(/(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)%/) {
 	   $seq1props{'start'} = $1;
 	   $seq1props{'end'} = $2;
 	   $seq2props{'start'} = $3;
 	   $seq2props{'end'} = $4;
	   my $pctid   = $5;
	   
	   if(! defined($estseq)) {
	       # for the first time here: need to set the references referring
	       # to seq1 and seq2 
	       if(! exists($self->{'_est_is_first'})) {
		   # detect which one is the EST by looking at the lengths,
		   # and assume that this holds throughout the entire result
		   # file (i.e., when this method is called for the next
		   # alignment, this will not be checked again)
		   if($seq1props{'length'} > $seq2props{'length'}) {
		       $self->{'_est_is_first'} = 0;
		   } else {
		       $self->{'_est_is_first'} = 1;
		   }
	       }
	       if($self->{'_est_is_first'}) {
		   $estseq = \%seq1props;
		   $genomseq = \%seq2props;
		   # if the EST is given first, A=4 selects the genomic
		   # seq for being reversed (reversing the EST is default)
		   $to_reverse = ($output_fmt == 4) ? $genomseq : $estseq;
	       } else {
		   $estseq = \%seq2props;
		   $genomseq = \%seq1props;
		   # if the EST is the second, A=4 does not change the
		   # seq being reversed (always the EST is reversed)
		   $to_reverse = $estseq;
	       }
	   }
	   if($hit_direction == -1) {
	       # we have to reverse the coordinates of one of both seqs
	       my $tmp = $to_reverse->{'start'};
	       $to_reverse->{'start'} =
		   $to_reverse->{'length'} - $to_reverse->{'end'} + 1;
	       $to_reverse->{'end'} = $to_reverse->{'length'} - $tmp + 1;
	   }
	   # create and initialize the exon object
	   my $exon = Bio::Tools::Sim4::Exon->new(
					    '-start' => $genomseq->{'start'},
					    '-end'   => $genomseq->{'end'},
					    '-strand' => $hit_direction);
	   if(exists($genomseq->{'seqname'})) {
	       $exon->seq_id($genomseq->{'seqname'});
	   } else {
	       # take filename stripped of path as fall back
	       my ($basename) = &File::Basename::fileparse($genomseq->{'filename'}, '\..*');
	       $exon->seq_id($basename);
	   }
	   $exon->feature1()->add_tag_value('filename',
					    $genomseq->{'filename'});
	   # feature1 is supposed to be initialized to a Similarity object,
           # but we provide a safety net
	   if($exon->feature1()->can('seqlength')) {
	       $exon->feature1()->seqlength($genomseq->{'length'});

 view all matches for this distribution
 view release on metacpan -  search on metacpan

( run in 3.138 seconds using v1.00-cache-2.02-grep-82fe00e-cpan-cec75d87357c )