Bio-WebService-LANL-SequenceLocator

 view release on metacpan or  search on metacpan

lib/Bio/WebService/LANL/SequenceLocator.pm  view on Meta::CPAN

                if (    $row->[0]
                    and $row->[0] =~ /^[A-Za-z]+$/
                    and not grep { defined and length } @$row[1 .. scalar @$row - 1])
                {
                    $table{rows}->[-1]{protein_translation} = $row->[0];
                    next;
                }

                # Not all rows are data, some are informational sentences.
                next if grep { not defined } @$row;

                my %row;
                @row{@$our_cols} =
                    map { ($_ and $_ eq "NA")       ? undef     : $_ }
                    map { ($_ and /(\d+) → (\d+)/)  ? [$1, $2]  : $_ }
                        @$row;

                push @{$table{rows}}, \%row;
            }
            push @tables, \%table
                if @{$table{rows}};
        }
    }

    # Sort by depth, then within each depth by count
    @tables = sort {
        $a->{coords}[0] <=> $b->{coords}[0]
     or $a->{coords}[1] <=> $b->{coords}[1]
    } @tables;

    if (@tables > 1) {
        unless (    $tables[-1]->{rows}[0]{na_from_query_start} eq "1 →"
                and $tables[-1]->{rows}[0]{protein_translation} eq "X") {
            warn "Last table appears to be real!?  It should be the bogus table of the bogus sequence.";
            warn "Table is ", Dumper($tables[-1]), "\n";
            return;
        } else {
            pop @tables;
        }
    }

    return @tables;
}

sub parse_alignments {
    my ($self, $content) = @_;
    my @alignments;

    my $doc = HTML::TokeParser->new(
        \$content,
        unbroken_text => 1,
    );

    my $expect_alignment = 0;

    while (my $tag = $doc->get_tag("b", "pre")) {
        my $name = lc $tag->[0];
        my $text = $doc->get_text;
        next unless defined $text;

        # <pre>s are preceeded by a bold header, which we use as an indicator
        if ($name eq 'b') {
            $expect_alignment = $text =~ /Alignment\s+of\s+the\s+query\s+sequence\s+to\s+HXB2/i;
        } elsif ($name eq 'pre') {
            if ($text =~ /^\s*Query\b/m and $text =~ /^\s*HXB2\b/m) {
                push @alignments, $text;
                warn "Not expecting alignment, but found one‽"
                    unless $expect_alignment;
            }
            elsif ($text =~ /^\s+$/ and $expect_alignment) {
                push @alignments, undef;    # We appear to have found an unaligned sequence.
            }
            $expect_alignment = 0;
        }
    }

    if (defined $alignments[-1]) {
        warn "Last alignment is non-null!  It should be the empty alignment of the bogus sequence.";
        warn "Alignment is <$alignments[-1]>\n";
        return;
    } else {
        pop @alignments;
    }

    my @results;
    for (@alignments) {
        my @hxb2;
        if (defined) {
            push @hxb2, $1 =~ s/\s+//gr
                while /^\s*HXB2\b\s+(.+?)(?:\s+\d+|\s*)$/gm;
        }
        push @results, {
            alignment       => $_,
            hxb2_sequence   => @hxb2 ? join("", @hxb2) : undef,
        };
    }

    return @results;
}

=head1 AUTHOR

Thomas Sibley E<lt>trsibley@uw.eduE<gt>

=head1 COPYRIGHT

Copyright 2014 by the Mullins Lab, Department of Microbiology, University of
Washington.

=head1 LICENSE

Licensed under the same terms as Perl 5 itself.

=cut

42;



( run in 2.532 seconds using v1.01-cache-2.11-cpan-97f6503c9c8 )