Bio-Community
view release on metacpan or search on metacpan
lib/Bio/Community/IO/FormatGuesser.pm view on Meta::CPAN
# BioPerl module for Bio::Community::IO::FormatGuesser
#
# Please direct questions and support issues to <bioperl-l@bioperl.org>
#
# Copyright 2011-2014 Florent Angly <florent.angly@gmail.com>
#
# You may distribute this module under the same terms as perl itself
=head1 NAME
Bio::Community::IO::FormatGuesser - Determine the format used by a community file
=head1 SYNOPSIS
use Bio::Community::IO::FormatGuesser;
my $guesser = Bio::Community::IO::FormatGuesser->new(
-file => 'file.txt',
);
my $format = $guesser->guess;
=head1 DESCRIPTION
Given a file containing one or several communities, try to guess the file format
used by examining the file content (not by looking at the file name).
The guess() method will examine the data, line by line, until it finds a line
that is specific to a format. If no conclusive guess can be made, undef is returned.
If the Bio::Community::IO::FormatGuesser object is given a filehandle which is
seekable, it will be restored to its original position on return from the
guess() method.
=head2 Formats
The following formats are currently supported:
=over
=item *
generic (tab-delimited matrix, site-by-species table, QIIME summarized OTU tables, ...)
=item *
gaas
=item *
qiime
=item *
unifrac
=item *
biom
=back
See the documentation for the corresponding IO drivers to read and write these
formats in the Bio::Community::IO::* namespace.
=head1 AUTHOR
Florent Angly L<florent.angly@gmail.com>
This module was inspired and based on the Bio::IO::GuessSeqFormat module written
by Andreas Kähäri <andreas.kahari@ebi.ac.uk> and contributors. Thanks to them!
=head1 SUPPORT AND BUGS
User feedback is an integral part of the evolution of this and other Bioperl
modules. Please direct usage questions or support issues to the mailing list,
L<bioperl-l@bioperl.org>, rather than to the module maintainer directly. Many
experienced and reponsive experts will be able look at the problem and quickly
address it. Please include a thorough description of the problem with code and
data examples if at all possible.
If you have found a bug, please report it on the BioPerl bug tracking system
to help us keep track the bugs and their resolution:
L<https://redmine.open-bio.org/projects/bioperl/>
=head1 COPYRIGHT
Copyright 2011-2014 by Florent Angly <florent.angly@gmail.com>
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.10.1 or,
at your option, any later version of Perl 5 you may have available.
=head1 APPENDIX
The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _
=head2 new
Function: Create a new Bio::Community::IO::FormatGuesser object
Usage : my $guesser = Bio::Community::IO::FormatGuesser->new( );
Args : -text, -file or -fh. If more than one of these arguments was
provided, only one is used: -text has precendence over -file, which
has precedence over -fh.
Returns : a new Bio::Community::IO::FormatGuesser object
=cut
package Bio::Community::IO::FormatGuesser;
use Moose;
use MooseX::NonMoose;
use MooseX::StrictConstructor;
use Method::Signatures;
use namespace::autoclean;
extends 'Bio::Root::Root';
my %formats = (
biom => \&_possibly_biom ,
gaas => \&_possibly_gaas ,
unifrac => \&_possibly_unifrac ,
generic => \&_possibly_generic ,
qiime => \&_possibly_qiime ,
);
my $real_re = qr/^(?:(?i)(?:[+-]?)(?:(?=[.]?[0123456789])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|))$/;
# regular expression to match a real number, taken from Regexp::Common
=head2 file
Usage : my $file = $guesser->file;
Function: Get or set the file from which to guess the format
Args : file path (string)
Returns : file path (string)
=cut
has 'file' => (
is => 'rw',
isa => 'Str',
required => 0,
lazy => 1,
default => undef,
init_arg => '-file',
predicate => '_has_file',
);
=head2 fh
Usage : my $fh = $guesser->fh;
Function: Get or set the file handle from which to guess the format.
Args : file handle
Returns : file handle
=cut
has 'fh' => (
is => 'rw',
isa => 'FileHandle',
required => 0,
lazy => 1,
default => undef,
init_arg => '-fh',
predicate => '_has_fh',
);
=head2 text
Usage : my $text = $guesser->text;
Function: Get or set the text from which to guess the format. In most, if not
all cases, the first few lines of a text string should be enough to
determine the format.
Args : text string
Returns : text string
=cut
has 'text' => (
is => 'rw',
isa => 'Str',
required => 0,
lazy => 1,
default => undef,
init_arg => '-text',
predicate => '_has_text',
);
=head2 guess
Function: Guess the file format
Usage : my $format = $guesser->guess;
Args : format string (e.g. generic, qiime, etc)
Returns : format string (e.g. generic, qiime, etc)
=cut
method guess () {
my $format;
# Prepare input
my ($in, $original_pos);
{
####local $Bio::Root::IO::HAS_EOL = 1; # Need Bioperl-dev (>1.6.922) for this to work
if ($self->_has_text) {
$in = Bio::Root::IO->new(-string => $self->text);
} elsif ($self->_has_file) {
$in = Bio::Root::IO->new(-file => $self->file);
} elsif ($self->_has_fh) {
$original_pos = tell($self->fh);
$in = Bio::Root::IO->new(-fh => $self->fh, -noclose => 1);
} else {
$self->throw('Need to provide -file, -fh or -text');
}
}
# Read lines and try to attribute format
my %test_formats = %formats;
my %ok_formats;
my $line_num = 0;
while ( defined(my $line = $in->_readline) ) {
# Read next line (and convert line endings). Exit if no lines left.
$line_num++;
chomp $line;
# Skip white and empty lines.
next if $line =~ /^\s*$/;
# Split fields
my @fields = split /\t/, $line;
# Try all formats remaining
%ok_formats = ();
my ($test_format, $test_function);
while ( ($test_format, $test_function) = each (%test_formats) ) {
my $score = &$test_function(\@fields, $line, $line_num);
if ( $score == 2 ) {
# This line is specific of this format
%ok_formats = ( $test_format => undef );
last;
} elsif ($score == 1) {
# Line is possibly in this format
$ok_formats{$test_format} = undef;
} else {
# Do not try to match this format with upcoming lines
delete $test_formats{$test_format};
}
}
# Exit if there was a match to only one format
if (scalar keys %ok_formats == 1) {
last;
}
# Exit if no formats left to try
if (scalar keys %test_formats == 0) {
last;
( run in 0.517 second using v1.01-cache-2.11-cpan-39bf76dae61 )