Bio-WebService-LANL-SequenceLocator
view release on metacpan or search on metacpan
lib/Bio/WebService/LANL/SequenceLocator.pm view on Meta::CPAN
our $VERSION = 20170324;
=head1 METHODS
=head2 new
Returns a new instance of this class. An optional parameter C<agent_string>
should be provided to identify yourself to LANL out of politeness. See the
L</SYNOPSIS> for an example.
=cut
has agent_string => (
is => 'ro',
lazy => 1,
builder => sub { '' },
);
has agent => (
is => 'ro',
lazy => 1,
builder => sub {
require LWP::UserAgent;
my $self = shift;
my $agent = LWP::UserAgent->new(
agent => join(" ", __PACKAGE__ . "/$VERSION", $self->agent_string),
);
$agent->env_proxy;
return $agent;
},
);
has lanl_base => (
is => 'ro',
lazy => 1,
builder => sub { 'https://www.hiv.lanl.gov' },
);
has lanl_endpoint => (
is => 'ro',
lazy => 1,
builder => sub { shift->lanl_base . '/cgi-bin/LOCATE/locate.cgi' },
);
has _bogus_slug => (
is => 'ro',
default => sub { 'BOGUS_SEQ_SO_TABULAR_FILES_ARE_LINKED_IN_OUTPUT' },
);
sub _request {
my $self = shift;
my $req = shift;
my $response = $self->agent->request($req);
if (not $response->is_success) {
warn sprintf "Request failed: %s %s -> %s\n",
$req->method, $req->uri, $response->status_line;
return;
}
return $response->decoded_content;
}
=head2 find
Takes an array ref of sequence strings. Sequences may be in amino acids or
nucleotides and mixed freely. Sequences should not be in FASTA format.
If sequence bases are not clearly nucleotides or clearly amino acids, LANL
seems to default to nucleotides. This can be an issue for some sequences since
the full alphabet for nucleotides overlaps with the alphabet for amino acids.
To overcome this problem, you may specify C<< base => 'nucleotide' >>
or C<< base => 'amino acid' >> after the array ref of sequences. This forces
every sequence to be interpreted as nucleotides or amino acids, so you cannot
mix base types in your sequences if you use this option. C<n>, C<nuc>, and
C<nucleotides> are accepted aliases for C<nucleotide>. C<a>, C<aa>, C<amino>,
and C<amino acids> are accepted aliases for C<amino acid>.
Returns a list of hashrefs when called in list context, otherwise returns an
arrayref of hashrefs.
See L</EXAMPLE RESULTS> for the structure of the data returned.
=cut
sub find {
my ($self, $sequences, %args) = @_;
my $content = $self->submit_sequences($sequences, %args)
or return;
return $self->parse_html($content);
}
sub submit_sequences {
my ($self, $sequences, %args) = @_;
if (defined $args{base}) {
my $base = lc $args{base};
if ($base =~ /^n(uc(leotides?)?)?$/i) {
$args{base} = 1;
} elsif ($base =~ /^(a(mino( acids?)?)?|aa)$/i) {
$args{base} = 0;
} else {
warn "Unknown base type <$args{base}>, ignoring";
delete $args{base};
}
}
# Submit multiple sequences at once using FASTA
my $fasta = join "\n", map {
("> sequence_$_", $sequences->[$_ - 1])
} 1 .. @$sequences;
# LANL only presents the parseable table.txt we want if there's more
# than a single sequence. We always add it so we can reliably skip it.
$fasta .= "\n> " . $self->_bogus_slug . "\n";
return $self->_request(
POST $self->lanl_endpoint,
Content_Type => 'form-data',
( run in 0.966 second using v1.01-cache-2.11-cpan-39bf76dae61 )