Bio-FastParsers

 view release on metacpan or  search on metacpan

lib/Bio/FastParsers/Uclust.pm  view on Meta::CPAN

package Bio::FastParsers::Uclust;
# ABSTRACT: Front-end class for UCLUST parser
# CONTRIBUTOR: Amandine BERTRAND <amandine.bertrand@doct.uliege.be>
$Bio::FastParsers::Uclust::VERSION = '0.221230';
use Moose;
use namespace::autoclean;

use autodie;

use Tie::IxHash;

# Inherit from the distribution's common parser base class.
# NOTE(review): BUILD below calls $self->filename, which is presumably
# provided by this base class -- confirm.
extends 'Bio::FastParsers::Base';


# public attributes (inherited)


# NOTE(review): BUILD below calls $self->_set_members_for, and the SYNOPSIS
# uses members_for / all_representatives; presumably all of these come from
# this role -- confirm.
with 'Bio::FastParsers::Roles::Clusterable';


# Moose construction hook: parse the UCLUST/USEARCH .uc report named by
# $self->filename and hand the resulting clustering to
# $self->_set_members_for.
#
# The structure passed on is a reference to an insertion-ordered hash
# (Tie::IxHash) mapping each representative id to an array ref of its member
# ids. A cluster for which no H record was seen maps to an empty array ref.
#
# Only C and H records are used; every other record type, blank lines and
# comment lines are ignored.
#
# Dies (through autodie) if the report cannot be opened or closed.
sub BUILD {
    my $self = shift;

    my $infile = $self->filename;
    open my $in, '<', $infile;

    tie my %members_for, 'Tie::IxHash';

    LINE:
    while (my $line = <$in>) {
        chomp $line;

        # Drop the CR left behind by chomp on CRLF files; it would otherwise
        # stick to the last field (the target label of H records) and make
        # the same representative appear under two different keys.
        $line =~ s/\r\z//xms;

        # A blank (or tabs-only) line yields no fields at all, which would
        # leave $type undefined and trigger warnings in the tests below.
        next LINE unless $line =~ m/\S/xms;

        my ($type, @fields) = split /\t/xms, $line;

        # https://www.drive5.com/usearch/manual/opt_uc.html
        # Field Description (indices are 0-based positions in @fields, i.e.
        # counted after the leading record type has been split off)
        # - Record type S, H, C or N (see table below).
        # 0 Cluster number (0-based).
        # 1 Sequence length (S, N and H) or cluster size (C).
        # 2 For H records, percent identity with target.
        # 3 For H records, the strand: + or - for nucleotides, . for proteins.
        # 4 Not used, parsers should ignore this field. Included for backwards compatibility.
        # 5 Not used, parsers should ignore this field. Included for backwards compatibility.
        # 6 Compressed alignment or the symbol '=' (equals sign). The = indicates that the query is 100% identical to the target sequence (field 8).
        # 7 Label of query sequence (always present).
        # 8 Label of target sequence (H records only).

        if    ($type eq 'C') {
            # Pushing an empty list adds no member but autovivifies the
            # entry, so that a cluster without any H record is still
            # registered (with an empty member list).
            push @{ $members_for{ $fields[7] } }, ();
        }
        elsif ($type eq 'H') {
            # hit record: the query (7) is a member of the target's (8) cluster
            push @{ $members_for{ $fields[8] } }, $fields[7];
        }
    }

    # explicit close so that autodie can report a failure
    close $in;

    # store representative and member sequence ids
    $self->_set_members_for( \%members_for );

    return;
}

# Make the Moose metaclass immutable now that the class definition is complete.
__PACKAGE__->meta->make_immutable;
# A module file must end with a true value.
1;

__END__

=pod

=head1 NAME

Bio::FastParsers::Uclust - Front-end class for UCLUST parser

=head1 VERSION

version 0.221230

=head1 SYNOPSIS

    use aliased 'Bio::FastParsers::Uclust';

    # open and parse UCLUST report
    my $infile = 'test/uclust.uc';
    my $report = Uclust->new( file => $infile );

    # loop through representatives to get members
    for my $repr ( $report->all_representatives ) {
        my $members = $report->members_for($repr);
        # ...
    }

    # get representatives ordered by descending cluster size
    my @reprs = $report->all_representatives_by_cluster_size;

    # create IdMapper
    # Note: this requires Bio::MUST::Core
    my $mapper = $report->clust_mapper(':');
    my @long_ids = $mapper->all_long_ids;

    # ...



( run in 0.621 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )