Bio-MUST-Drivers

 view release on metacpan or  search on metacpan

bin/annotate-ali.pl  view on Meta::CPAN


    ### Processing: $infile
    my $queries = Bio::MUST::Core::Ali::Temporary->new( seqs => $infile );

    # let driver decide BLAST flavor except if nucl:nucl and --tblastx option
    my $method = $queries->type eq 'nucl' && $blastdb->type eq 'nucl'
        && $ARGV_tblastx ? 'tblastx' : 'blast';

    #### Performing BLAST...
    my $parser = $blastdb->$method($queries, {
        -outfmt => 6,
        -evalue => $ARGV_evalue,
        -max_target_seqs => $ARGV_max_hits
    } );

    #### Parsing BLAST report...
    tie my %ann_for,    'Tie::IxHash';
    tie my %hit_id_for, 'Tie::IxHash';
    my $curr_id = q{};

    HIT:
    while ( my $hit = $parser->next_hit ) {
        my ($qid, $hid, $identity)
            = map { $hit->$_ } qw(query_id hit_id percent_identity);

        next HIT if $identity < $ARGV_identity;     # skip weak-identity hits

        unless ($ARGV_hit_list) {                   # optionally
            next HIT if $qid eq $curr_id;           # skip non-first hits
            $curr_id = $qid;
        }

        # capture annotation bit in ref seq_id using regex
        my ($annotation) = $blastdb->long_id_for($hid) =~ $ARGV_ref_regex;
        my $full_qid = $queries->long_id_for($qid);
        $ann_for{$full_qid} //= $annotation;        # use only first hit

        # extract additional (optionally) wanted fields
        my @wanted_fields = map { $hit->$_ } @ARGV_fields;

        # collect all hits for at specified (E-value and) identity thresholds
        push @{ $hit_id_for{$full_qid} }, {
            annotation => $annotation,
            fields     => \@wanted_fields,
        };
    }
    ##### Annotations: %ann_for

    # output annotation report
    my @header = ('tag', 'id', @ARGV_fields);
    say '# ' . join "\t", @header;
    while (my ($id, $hits) = each %hit_id_for) {
        for my $hit ( @{$hits} ) {
            say join "\t", $hit->{annotation}, $id, @{ $hit->{fields} };
        }
    }

    # optionally output annotated file
    if ($ARGV_ann_file) {
        my $ali = Bio::MUST::Core::Ali->load($infile);
        $ali->dont_guess if $ARGV_noguessing;
        my $outfile = secure_outfile($infile, $ARGV_out_suffix);
        #### Writing annotated file: $outfile->stringify
        prefix_ids($ali, \%ann_for)->store_fasta($outfile);
    }
}

# TODO: replace by or add some --store-id-mapper option?
# TODO: move into BMC::Ali
sub prefix_ids {
    my $ali     = shift;
    my $ann_for = shift;

    for my $seq ($ali->all_seqs) {
        my $prefix = $ann_for->{$seq->full_id};
        $seq->set_seq_id( $prefix . q{-} . $seq->full_id ) if $prefix;
    }

    return $ali;
}

# TODO: check coherence of option layout with cdhit-tax-filter.pl
# e.g., replace --ann-file by --store-id-mapper

__END__

=pod

=head1 NAME

annotate-ali.pl - Annotate sequences by homology search using BLAST

=head1 VERSION

version 0.252830

=head1 USAGE

    annotate-ali.pl <infiles> --database=<file> --regex=<regex> [options]

=head1 REQUIRED ARGUMENTS

=over

=item <infiles>

Path to input ALI files [repeatable argument].

=for Euclid: infiles.type: readable
    repeatable

=item --ref-file [=] <file> | --database [=] <file>

Path to the FASTA file containing the sequence database.

=for Euclid: file.type: readable

=item --[ref-]regex [=] <regex>

Regular expression for capturing the reference seq id fragment that has to be
used for annotating infile seq ids.

bin/annotate-ali.pl  view on Meta::CPAN


=item --evalue [=] <float>

E-value threshold for annotating a sequence [default: 1e-10].

=for Euclid: float.type: number
    float.default: 1e-10

=item --identity [=] <number>

Identity threshold for annotating a sequence [default: 0]. When specified as a
fraction between 0 and 1 (included), it is first multiplied by 100 to be
interpreted in percentage.

=for Euclid: number.type: number
    number.default: 0

=item --max-hits [=] <number>

Number of hits to return for each query (BLAST -max_target_seqs option)
[default: 10]. Mostly useful in conjunction with the C<--hit-list> option.

=for Euclid: number.type: number
    number.default: 10

=item --tblastx

Enforce using TBLASTX (instead of the auto-selected BLASTN) when both the
infile and the database contain nucleotide sequences [default: no].

=item --ann-file

Write an annotated version (with prefixed ids) of the infile [default: no].

=item --hit-list

Print a list of id/hit pairs (at the specified E-value and identity thresholds)
instead of the standard annotation report [default: no].

=item --fields [=] <str>...

List of whitespace-separated BLAST fields to be displayed in final report
[default: no].

Valid fields are: percent_identity, hsp_length, mismatches, gaps, query_from,
query_to, hit_from, hit_to, evalue, bit_score, query_start, query_end,
hit_start, hit_end.

=for Euclid: str.type: string, str eq "percent_identity" || str eq "hsp_length" || str eq "mismatches" || str eq "gaps" || str eq "query_from" || str eq "query_to" || str eq "hit_from" || str eq "hit_to" || str eq "evalue" || str eq "bit_score" || st...
    str.default: []

=item --out[-suffix] [=] <suffix>

Suffix to append to infile basenames for deriving outfile names [default:
-ann]. When not specified, outfile names are taken from infiles but original
infiles are preserved by being appended a .bak suffix.

=for Euclid: suffix.type: string
    suffix.default: '-ann'

=item --[no]guessing

[Don't] guess whether sequences are aligned or not [default: yes].

=item --version

=item --usage

=item --help

=item --man

Print the usual program information

=back

=head1 AUTHOR

Denis BAURAIN <denis.baurain@uliege.be>

=head1 CONTRIBUTOR

=for stopwords Valerian LUPO

Valerian LUPO <valerian.lupo@uliege.be>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut



( run in 0.750 second using v1.01-cache-2.11-cpan-39bf76dae61 )