endpoint results from the CPAN

Bio-WebService-LANL-SequenceLocator
                "aa_from_protein_start" : [ "1", "22" ],
                "na_from_cds_start" : [ "1", "66" ],
                "na_from_hxb2_start" : [ "1186", "1251" ],
                "na_from_query_start" : [ "28", "93" ],
                "protein_translation" : "PIVQNIQGQVVHQAISPRTLNA"
             }
          ]
       }
    ]

=cut

package Bio::WebService::LANL::SequenceLocator;

use Moo;
use Data::Dumper;
use HTML::LinkExtor;
use HTML::TableExtract;
use HTML::TokeParser;
use HTTP::Request::Common;
use List::AllUtils qw< pairwise part min max >;

our $VERSION = 20170324;

=head1 METHODS

=head2 new

Returns a new instance of this class.  An optional parameter C<agent_string>
should be provided to identify yourself to LANL out of politeness.  See the
L</SYNOPSIS> for an example.

=cut

has agent_string => (
    is      => 'ro',
    lazy    => 1,
    builder => sub { '' },
);

has agent => (
    is      => 'ro',
    lazy    => 1,
    builder => sub {
        require LWP::UserAgent;
        my $self  = shift;
        my $agent = LWP::UserAgent->new(
            agent => join(" ", __PACKAGE__ . "/$VERSION", $self->agent_string),
        );
        $agent->env_proxy;
        return $agent;
    },
);

has lanl_base => (
    is      => 'ro',
    lazy    => 1,
    builder => sub { 'https://www.hiv.lanl.gov' },
);

has lanl_endpoint => (
    is      => 'ro',
    lazy    => 1,
    builder => sub { shift->lanl_base . '/cgi-bin/LOCATE/locate.cgi' },
);

has _bogus_slug => (
    is      => 'ro',
    default => sub { 'BOGUS_SEQ_SO_TABULAR_FILES_ARE_LINKED_IN_OUTPUT' },
);

sub _request {
    my $self = shift;
    my $req  = shift;
    my $response = $self->agent->request($req);

    if (not $response->is_success) {
        warn sprintf "Request failed: %s %s -> %s\n",
            $req->method, $req->uri, $response->status_line;
        return;
    }

    return $response->decoded_content;
}

=head2 find

Takes an array ref of sequence strings.  Sequences may be in amino acids or
nucleotides and mixed freely.  Sequences should not be in FASTA format.

If sequence bases are not clearly nucleotides or clearly amino acids, LANL
seems to default to nucleotides.  This can be an issue for some sequences since
the full alphabet for nucleotides overlaps with the alphabet for amino acids.
To overcome this problem, you may specify C<< base => 'nucleotide' >>
or C<< base => 'amino acid' >> after the array ref of sequences.  This forces
every sequence to be interpreted as nucleotides or amino acids, so you cannot
mix base types in your sequences if you use this option.  C<n>, C<nuc>, and
C<nucleotides> are accepted aliases for C<nucleotide>.  C<a>, C<aa>, C<amino>,
and C<amino acids> are accepted aliases for C<amino acid>.

Returns a list of hashrefs when called in list context, otherwise returns an
arrayref of hashrefs.

See L</EXAMPLE RESULTS> for the structure of the data returned.

=cut

sub find {
    my ($self, $sequences, %args) = @_;

    my $content = $self->submit_sequences($sequences, %args)
        or return;

    return $self->parse_html($content);
}

sub submit_sequences {
    my ($self, $sequences, %args) = @_;

    if (defined $args{base}) {
        my $base = lc $args{base};
        if ($base =~ /^n(uc(leotides?)?)?$/i) {
            $args{base} = 1;
        } elsif ($base =~ /^(a(mino( acids?)?)?|aa)$/i) {
            $args{base} = 0;
        } else {
            warn "Unknown base type <$args{base}>, ignoring";
            delete $args{base};
        }
    }

    # Submit multiple sequences at once using FASTA
    my $fasta = join "\n", map {
        ("> sequence_$_", $sequences->[$_ - 1])
    } 1 .. @$sequences;

    # LANL only presents the parseable table.txt we want if there's more
    # than a single sequence.  We always add it so we can reliably skip it.
    $fasta .= "\n> " . $self->_bogus_slug . "\n";

    return $self->_request(
        POST $self->lanl_endpoint,
        Content_Type => 'form-data',
        Content      => [
            organism            => 'HIV',
            DoReverseComplement => 0,
            seq_input           => $fasta,
            (defined $args{base}
                ? ( base => $args{base} )
                : ()),
        ],
    );
}

sub parse_html {
    my ($self, $content) = @_;

    # Fetch and parse the two tables provided as links which removes the need
    # to parse all of the HTML.
    my @results = $self->parse_tsv($content);

    # Now parse the table data from the HTML
    my @tables = $self->parse_tables($content);

    # Extract the alignments, parsing the HTML a third time!
    my @alignments = $self->parse_alignments($content);

    unless (@results and @tables and @alignments) {
        warn "Didn't find all three of TSV, tables, and alignments!\n";
        warn "TSV:             ", scalar @results, "\n";
        warn "HTML tables:     ", scalar @tables, "\n";
        warn "HTML alignments: ", scalar @alignments, "\n";
        warn "Content:\n$content\n", "=" x 80, "\n";
        return;
    }

    unless (@results == @tables and @results == @alignments) {
        warn "Tab-delimited results count doesn't match parsed HTML result count.  Bug!\n";
        warn "TSV:             ", scalar @results, "\n";
        warn "HTML tables:     ", scalar @tables, "\n";
        warn "HTML alignments: ", scalar @alignments, "\n";
        warn "Content:\n$content\n", "=" x 80, "\n";
        return;
    }

    @results = pairwise {
        my $new = {
            %$a,
            base_type       => $b->{base_type},
            regions         => $b->{rows},
            region_names    => [ map { $_->{cds} } @{$b->{rows}} ],
        };
        delete $new->{$_} for qw(protein protein_start protein_end);
        $new;
    } @results, @tables;

    @results = pairwise { +{ %$b, %$a } } @results, @alignments;

    # Fill in genome start/end for amino acid sequences
    for my $r (@results) {
        next unless $r->{base_type} eq 'amino acid';
( run in 1.947 second using v1.01-cache-2.11-cpan-9581c071862 )