BioPerl

 view release on metacpan or  search on metacpan

Bio/SeqIO/genbank.pm  view on Meta::CPAN


sub _read_GenBank_Species {
    my ($self, $buffer) = @_;

    my @unkn_names = ('other', 'unknown organism', 'not specified', 'not shown',
                      'Unspecified', 'Unknown', 'None', 'unclassified',
                      'unidentified organism', 'not supplied');
    # dictionary of synonyms for taxid 32644
    my @unkn_genus = ('unknown', 'unclassified', 'uncultured', 'unidentified');
    # all above can be part of valid species name

    my $line = $$buffer;

    my( $sub_species, $species, $genus, $sci_name, $common,
        $class_lines, $source_flag, $abbr_name, $organelle, $sl );
    my %source = map { $_ => 1 } qw(SOURCE ORGANISM CLASSIFICATION);

    # upon first entering the loop, we must not read a new line -- the SOURCE
    # line is already in the buffer (HL 05/10/2000)
    my ($ann, $tag, $data);
    while (defined($line) or defined($line = $self->_readline)) {
        # de-HTMLify (links that may be encountered here don't contain
        # escaped '>', so a simple-minded approach suffices)
        $line =~ s{<[^>]+>}{}g;
        if ($line =~ m{^(?:\s{0,2})(\w+)\s+(.+)?$}ox) {
            ($tag, $data) = ($1, $2 || '');
            last if ($tag and not exists $source{$tag});
        }
        else {
            return unless $tag;
            ($data = $line) =~ s{^\s+}{};
            chomp $data;
            $tag = 'CLASSIFICATION' if (    $tag ne 'CLASSIFICATION'
                                        and $tag eq 'ORGANISM'
                                        # Don't match "str." or "var." (fix NC_021815),
                                        # and don't match ".1" (fix NC_021902)
                                        and $line =~ m{(?<!\bstr|\bvar)[;\.]+(?!\d)});
        }
        (exists $ann->{$tag}) ? ($ann->{$tag} .= ' '.$data) : ($ann->{$tag} .= $data);
        $line = undef;
    }

    ($sl, $class_lines, $sci_name) = ($ann->{SOURCE}, $ann->{CLASSIFICATION}, $ann->{ORGANISM});

    $$buffer = $line;

    $sci_name or return;

    # parse out organelle, common name, abbreviated name if present;
    # this should catch everything, but falls back to
    # entire SOURCE line just in case
    if ($sl =~ m{^(mitochondrion|chloroplast|plastid)?
                  \s*(.*?)
                  \s*(?: \( (.*?) \) )?\.?
                  $
                 }xms
        ) {
        ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
    }
    else {
        $abbr_name = $sl; # nothing caught; this is a backup!
    }

    # Convert data in classification lines into classification array.
    # only split on ';' or '.' so that classification that is 2 or more words will
    # still get matched, use map() to remove trailing/leading/intervening spaces
    my @class = map { $_ =~ s/^\s+//;
                      $_ =~ s/\s+$//;
                      $_ =~ s/\s{2,}/ /g;
                      $_; }
                split /(?<!subgen)[;\.]+/, $class_lines;

    # do we have a genus?
    my $possible_genus =  quotemeta($class[-1])
                       . ($class[-2] ? "|" . quotemeta($class[-2]) : '');
    if ($sci_name =~ /^($possible_genus)/) {
        $genus = $1;
        ($species) = $sci_name =~ /^$genus\s+(.+)/;
    }
    else {
        $species = $sci_name;
    }

    # is this organism of rank species or is it lower?
    # (we don't catch everything lower than species, but it doesn't matter -
    # this is just so we abide by previous behaviour whilst not calling a
    # species a subspecies)
    if ($species and $species =~ /(.+)\s+((?:subsp\.|var\.).+)/) {
        ($species, $sub_species) = ($1, $2);
    }

    # Don't make a species object if it's empty or "Unknown" or "None"
    # return unless $genus and  $genus !~ /^(Unknown|None)$/oi;
    # Don't make a species object if it belongs to taxid 32644
#    my $unkn = grep { $_ =~ /^\Q$sl\E$/; } @unkn_names;
    my $unkn = grep { $_ eq $sl } @unkn_names;
    return unless (defined $species or defined $genus) and $unkn == 0;

    # Bio::Species array needs array in Species -> Kingdom direction
    push @class, $sci_name;
    @class = reverse @class;

    my $make = Bio::Species->new;
    $make->scientific_name($sci_name);
    $make->classification(@class)          if @class > 0;
    $make->common_name( $common )          if $common;
    $make->name('abbreviated', $abbr_name) if $abbr_name;
    $make->organelle($organelle)           if $organelle;
    #$make->sub_species( $sub_species )     if $sub_species;
    return $make;
}

=head2 _read_FTHelper_GenBank

 Title   : _read_FTHelper_GenBank
 Usage   : _read_FTHelper_GenBank($buffer)
 Function: reads the next FT key line
 Example :
 Returns : Bio::SeqIO::FTHelper object
 Args    : filehandle and reference to a scalar



( run in 2.197 seconds using v1.01-cache-2.11-cpan-5837b0d9d2c )