BioPerl
view release on metacpan or search on metacpan
Bio/SeqIO/genbank.pm view on Meta::CPAN
sub _read_GenBank_Species {
my ($self, $buffer) = @_;
my @unkn_names = ('other', 'unknown organism', 'not specified', 'not shown',
'Unspecified', 'Unknown', 'None', 'unclassified',
'unidentified organism', 'not supplied');
# dictionary of synonyms for taxid 32644
my @unkn_genus = ('unknown', 'unclassified', 'uncultured', 'unidentified');
# all above can be part of valid species name
my $line = $$buffer;
my( $sub_species, $species, $genus, $sci_name, $common,
$class_lines, $source_flag, $abbr_name, $organelle, $sl );
my %source = map { $_ => 1 } qw(SOURCE ORGANISM CLASSIFICATION);
# upon first entering the loop, we must not read a new line -- the SOURCE
# line is already in the buffer (HL 05/10/2000)
my ($ann, $tag, $data);
while (defined($line) or defined($line = $self->_readline)) {
# de-HTMLify (links that may be encountered here don't contain
# escaped '>', so a simple-minded approach suffices)
$line =~ s{<[^>]+>}{}g;
if ($line =~ m{^(?:\s{0,2})(\w+)\s+(.+)?$}ox) {
($tag, $data) = ($1, $2 || '');
last if ($tag and not exists $source{$tag});
}
else {
return unless $tag;
($data = $line) =~ s{^\s+}{};
chomp $data;
$tag = 'CLASSIFICATION' if ( $tag ne 'CLASSIFICATION'
and $tag eq 'ORGANISM'
# Don't match "str." or "var." (fix NC_021815),
# and don't match ".1" (fix NC_021902)
and $line =~ m{(?<!\bstr|\bvar)[;\.]+(?!\d)});
}
(exists $ann->{$tag}) ? ($ann->{$tag} .= ' '.$data) : ($ann->{$tag} .= $data);
$line = undef;
}
($sl, $class_lines, $sci_name) = ($ann->{SOURCE}, $ann->{CLASSIFICATION}, $ann->{ORGANISM});
$$buffer = $line;
$sci_name or return;
# parse out organelle, common name, abbreviated name if present;
# this should catch everything, but falls back to
# entire SOURCE line just in case
if ($sl =~ m{^(mitochondrion|chloroplast|plastid)?
\s*(.*?)
\s*(?: \( (.*?) \) )?\.?
$
}xms
) {
($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
}
else {
$abbr_name = $sl; # nothing caught; this is a backup!
}
# Convert data in classification lines into classification array.
# only split on ';' or '.' so that classification that is 2 or more words will
# still get matched, use map() to remove trailing/leading/intervening spaces
my @class = map { $_ =~ s/^\s+//;
$_ =~ s/\s+$//;
$_ =~ s/\s{2,}/ /g;
$_; }
split /(?<!subgen)[;\.]+/, $class_lines;
# do we have a genus?
my $possible_genus = quotemeta($class[-1])
. ($class[-2] ? "|" . quotemeta($class[-2]) : '');
if ($sci_name =~ /^($possible_genus)/) {
$genus = $1;
($species) = $sci_name =~ /^$genus\s+(.+)/;
}
else {
$species = $sci_name;
}
# is this organism of rank species or is it lower?
# (we don't catch everything lower than species, but it doesn't matter -
# this is just so we abide by previous behaviour whilst not calling a
# species a subspecies)
if ($species and $species =~ /(.+)\s+((?:subsp\.|var\.).+)/) {
($species, $sub_species) = ($1, $2);
}
# Don't make a species object if it's empty or "Unknown" or "None"
# return unless $genus and $genus !~ /^(Unknown|None)$/oi;
# Don't make a species object if it belongs to taxid 32644
# my $unkn = grep { $_ =~ /^\Q$sl\E$/; } @unkn_names;
my $unkn = grep { $_ eq $sl } @unkn_names;
return unless (defined $species or defined $genus) and $unkn == 0;
# Bio::Species array needs array in Species -> Kingdom direction
push @class, $sci_name;
@class = reverse @class;
my $make = Bio::Species->new;
$make->scientific_name($sci_name);
$make->classification(@class) if @class > 0;
$make->common_name( $common ) if $common;
$make->name('abbreviated', $abbr_name) if $abbr_name;
$make->organelle($organelle) if $organelle;
#$make->sub_species( $sub_species ) if $sub_species;
return $make;
}
=head2 _read_FTHelper_GenBank
Title : _read_FTHelper_GenBank
Usage : _read_FTHelper_GenBank($buffer)
Function: reads the next FT key line
Example :
Returns : Bio::SeqIO::FTHelper object
Args : filehandle and reference to a scalar
( run in 2.197 seconds using v1.01-cache-2.11-cpan-5837b0d9d2c )