BioPerl

 view release on metacpan or  search on metacpan

Bio/SearchIO/gmap_f9.pm  view on Meta::CPAN

    my $a = {};			# args w/ which we'll create hsp
    my $hsp;
    my $identical;

    $a->{-algorithm} = 'GMAP';

    for my $c (@{$info}) {
	$a->{-query_seq} .= $c->{query_base};
	$a->{-hit_seq} .= $c->{hit_base};
    $a->{-homology_seq} .= $c->{query_base} eq $c->{hit_base} ? $c->{hit_base} : ' ';
	$identical++ if ( $c->{query_base} eq $c->{hit_base} );
    }

    $a->{-query_seq} =~ s| |\-|g;		# switch to bioperl gaps.
    $a->{-hit_seq} =~ s| |\-|g;

    $a->{-conserved} = $a->{-identical} = $identical;

    # use the coordinates from from gmap's -f 9 output to
    # determine whether gmap revcomped the query sequence
    # to generate the alignment.  Note that this is not
    # the same as the cDNA's sense/anti-sense-ness.
    $a->{-stranded} = 'both';

    $a->{-query_start} = $info->[0]->{query_pos};
    $a->{-query_end} = $info->[-1]->{query_pos};
    $a->{-query_length} = $a->{-query_end} - $a->{-query_start} + 1;

    # hit can be either strand, -f 9 output tells us which.
    # we don't have to worry about it here, but telling the generichsp code
    # that this hit is 'stranded', it compares the start and end positions
    # sets it for us.
    $a->{-hit_start} = $info->[0]->{hit_pos};
    $a->{-hit_end} = $info->[-1]->{hit_pos};

    $a->{-hit_length} = abs($a->{-hit_end} - $a->{-hit_start}) + 1;

    $a->{-hsp_length} =
	$a->{-query_length} > $a->{-hit_length} ?
	    $a->{-query_length} : $a->{-hit_length};

    $hsp = Bio::Search::HSP::GenericHSP->new( %$a );

    return $hsp;
}

# TODO (adjust regexp to swallow lines w/out md5 sig's.
sub _parse_path_header {
    my $self = shift;
    my $path_line = shift;
    my $path = {};

    (
     $path->{query},
     $path->{db},
     $path->{path_num},
     $path->{path_total_num},
     $path->{query_length},
     $path->{exon_count},
     $path->{trimmed_coverage},
     $path->{percent_identity},
     $path->{query_start},
     $path->{query_end},
     $path->{whole_genome_start},
     $path->{whole_genome_end},
     $path->{chromosome},
     $path->{chromo_start},
     $path->{chromo_end},
     $path->{strand},
     $path->{sense},
     $path->{md5},
    ) =
	($_ =~ qr|
                >
                ([^ ]*)[ ]	# the query id}, followed by a space
	        ([^ ]*)[ ]      # the genome database, followed by a space
                (\d+)/(\d+)[ ]	# path_num/path_total_num (e.g. 3/12)
	        (\d+)[ ]	# query length, followed by a space
	        (\d+)[ ]	# hsp/exon count, followed by a space
                (\d+\.\d*)[ ]	# trimmed coverage
                (\d+\.\d*)[ ]	# percent identity
                (\d+)\.\.(\d+)[ ] # query start .. query end, followed by space
                (\d+)\.\.(\d+)[ ] # whole genome s..e, followed by space
                (\d+):		# chromosome number
                (\d+)\.\.(\d+)[ ] # chromo s..e, followed by a space
                ([+-])[ ]	# strand, followed by a space
                dir:(.*) # dir:sense or dir:antisense
                [ ]md5:([\dabcdefg]+) # md5 signature
	 |x
	);

    $path->{query} or $self->throw("query was not found in path line.");
    $path->{db} or $self->throw("db was not found in path line.");
    $path->{path_num} or $self->throw("path_num was not found in path line.");
    $path->{path_total_num} or
	$self->throw("path_total_num was not found in path line.");
    $path->{query_length} or
	$self->throw("query_length was not found in path line.");
    $path->{exon_count} or
	$self->throw("exon_count was not found in path line.");
    $path->{trimmed_coverage} or
	$self->throw("trimmed_coverage was not found in path line.");
    $path->{percent_identity} or
	$self->throw("percent_identity was not found in path line.");
    $path->{query_start} or
	$self->throw("query_start was not found in path line.");
    $path->{query_end} or
	$self->throw("query_end was not found in path line.");
    $path->{whole_genome_start} or
	$self->throw("whole_genome_start was not found in path line.");
    $path->{whole_genome_end} or
	$self->throw("whole_genome_end was not found in path line.");
    $path->{chromosome} or
	$self->throw("chromosome was not found in path line.");
    $path->{chromo_start} or
	$self->throw("chromo_start was not found in path line.");
    $path->{chromo_end} or
	$self->throw("chromo_end was not found in path line.");
    $path->{strand} or $self->throw("strand was not found in path line.");
    $path->{sense} or $self->throw("sense was not found in path line.");

    return $path;
}

sub _parse_alignment_line {
    my $self = shift;
    my $a_line = shift;
    my $align = {};

    (
     $align->{chromo_start},
     $align->{chromo_end},
     $align->{query_start},
     $align->{query_end},
     $align->{percent_identity},
     $align->{align_length},
     $align->{intron_length},
    ) =
	($_ =~ qr|
                [\t]
                ([\d]+)[ ]	# start in chromosome coord.
                ([\d]+)[ ]	# end in chromosome coord.
                ([\d]+)[ ]	# start in query coord.
                ([\d]+)[ ]	# end in query coord.
                ([\d]+) 	# percent identity (as integer)
                [\t].*[\t]	# skip the edit script
                ([\d]+) 	# length of alignment block.
                [\t]*([\d]+)* 	# length of following intron.
	 |x
	);

     $align->{chromo_start}
 	or $self->throw("chromo_start missing in alignment line.");
     $align->{chromo_end},
 	or $self->throw("chromo_end was missing in alignment line.");
     $align->{query_start},
 	or $self->throw("query_start was missing in alignment line.");
     $align->{query_end},
 	or $self->throw("query_end was missing in alignment line.");
     $align->{percent_identity},
 	or $self->throw("percent_identity was missing in alignment line.");
     $align->{align_length},
 	or $self->throw("align_length was missing in alignment line.");

    return $align;
}

1;



( run in 0.804 second using v1.01-cache-2.11-cpan-39bf76dae61 )