Bio-Community

 view release on metacpan or  search on metacpan

lib/Bio/Community/IO.pm  view on Meta::CPAN


# Overriding new... Is there a better alternative?

func new ($class, @args) {
   my $real_class = Scalar::Util::blessed($class) || $class;

   # These all come from the same base, Moose::Object, so this is fine
   my $params = $real_class->BUILDARGS(@args);
   my $format = delete $params->{'-format'};
   if (not defined $format) {
      # Try to guess format
      my $guesser = Bio::Community::IO::FormatGuesser->new();
      if ($params->{'-file'}) {
         $guesser->file( $params->{'-file'} );
      } elsif ($params->{'-fh'}) {
         $guesser->fh( $params->{'-fh'} );
      }
      $format = $guesser->guess;
   }
   if (not defined $format) {
      $real_class->throw("Could not automatically detect input format.");
   }

   # Use the real driver class here
   $real_class = __PACKAGE__.'::Driver::'.$format;
   Module::Runtime::use_module($real_class);
   $class->throw("Module $real_class does not implement a community IO stream")
       unless $real_class->does('Bio::Community::Role::IO');

lib/Bio/Community/IO/FormatGuesser.pm  view on Meta::CPAN



=head1 NAME

Bio::Community::IO::FormatGuesser - Determine the format used by a community file

=head1 SYNOPSIS

  use Bio::Community::IO::FormatGuesser;

  my $guesser = Bio::Community::IO::FormatGuesser->new(
     -file => 'file.txt',
  );
  my $format = $guesser->guess;

=head1 DESCRIPTION

Given a file containing one or several communities, try to guess the file format
used by examining the file content (not by looking at the file name).

The guess() method will examine the data, line by line, until it finds a line
that is specific to a format. If no conclusive guess can be made, undef is returned.

If the Bio::Community::IO::FormatGuesser object is given a filehandle which is
seekable, it will be restored to its original position on return from the
guess() method.

=head2 Formats

The following formats are currently supported:

=over

=item *

generic (tab-delimited matrix, site-by-species table, QIIME summarized OTU tables, ...)

lib/Bio/Community/IO/FormatGuesser.pm  view on Meta::CPAN

at your option, any later version of Perl 5 you may have available.

=head1 APPENDIX

The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _

=head2 new

 Function: Create a new Bio::Community::IO::FormatGuesser object
 Usage   : my $guesser = Bio::Community::IO::FormatGuesser->new( );
 Args    : -text, -file or -fh. If more than one of these arguments was
           provided, only one is used: -text has precedence over -file, which
           has precedence over -fh.
 Returns : a new Bio::Community::IO::FormatGuesser object

=cut


package Bio::Community::IO::FormatGuesser;

lib/Bio/Community/IO/FormatGuesser.pm  view on Meta::CPAN

   unifrac => \&_possibly_unifrac ,
   generic => \&_possibly_generic ,
   qiime   => \&_possibly_qiime   ,
);

# Regular expression to match a real number, taken from Regexp::Common.
# The pattern is anchored with ^ and $, so the whole string has to be a number:
# an optional sign, then digits with an optional fractional part (a leading
# digit is not needed, e.g. ".5"), then an optional exponent made of "E", an
# optional sign and digits. The inline (?i) modifier makes the match
# case-insensitive, so a lowercase "e" exponent marker is accepted too.
my $real_re = qr/^(?:(?i)(?:[+-]?)(?:(?=[.]?[0123456789])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|))$/;

=head2 file

 Usage   : my $file = $guesser->file;
 Function: Get or set the file from which to guess the format
 Args    : file path (string)
 Returns : file path (string)

=cut

# Read-write attribute, set at construction through the '-file' key. It is
# optional: use the _has_file predicate to find out whether a value was given.
# NOTE(review): the lazy default is undef, which presumably does not satisfy
# the 'Str' constraint if the accessor is read before a value is set - confirm.
has file => (
   init_arg  => '-file',
   isa       => 'Str',
   is        => 'rw',
   predicate => '_has_file',
   required  => 0,
   default   => undef,
   lazy      => 1,
);


=head2 fh

 Usage   : my $fh = $guesser->fh;
 Function: Get or set the file handle from which to guess the format. 
 Args    : file handle
 Returns : file handle

=cut

# Read-write attribute, set at construction through the '-fh' key. It is
# optional: use the _has_fh predicate to find out whether a value was given.
# NOTE(review): the lazy default is undef, which presumably does not satisfy
# the 'FileHandle' constraint if the accessor is read before a value is set -
# confirm.
has fh => (
   init_arg  => '-fh',
   isa       => 'FileHandle',
   is        => 'rw',
   predicate => '_has_fh',
   required  => 0,
   default   => undef,
   lazy      => 1,
);


=head2 text

 Usage   : my $text = $guesser->text;
 Function: Get or set the text from which to guess the format. In most, if not
           all cases, the first few lines of a text string should be enough to
           determine the format.
 Args    : text string
 Returns : text string

=cut

# Read-write attribute, set at construction through the '-text' key. It is
# optional: use the _has_text predicate to find out whether a value was given.
# NOTE(review): the lazy default is undef, which presumably does not satisfy
# the 'Str' constraint if the accessor is read before a value is set - confirm.
has text => (
   init_arg  => '-text',
   isa       => 'Str',
   is        => 'rw',
   predicate => '_has_text',
   required  => 0,
   default   => undef,
   lazy      => 1,
);


=head2 guess

 Function: Guess the file format
 Usage   : my $format = $guesser->guess;
 Args    : None
 Returns : format string (e.g. generic, qiime, etc)

=cut

method guess () {
   my $format;

   # Prepare input
   my ($in, $original_pos);
   {
      ####local $Bio::Root::IO::HAS_EOL = 1; # Need Bioperl-dev (>1.6.922) for this to work
      if ($self->_has_text) {
         $in = Bio::Root::IO->new(-string => $self->text);
      } elsif ($self->_has_file) {
         $in = Bio::Root::IO->new(-file => $self->file);

t/IO/FormatGuesser.t  view on Meta::CPAN

use strict;
use warnings;
use Bio::Root::Test;

use_ok($_) for qw(
    Bio::Community::IO::FormatGuesser
);


# Variables shared by all the test sections of this script
my $guesser;
my ($text, $fh, $file, $line);


# Bare object: construct a guesser without giving it any input

$guesser = Bio::Community::IO::FormatGuesser->new();
ok $guesser, 'bare object';
isa_ok $guesser, 'Bio::Community::IO::FormatGuesser';


# Test mixed input: -file, -text and -fh are all given, each one holding data
# in a different format. The guess is expected to be the format of the text.

$text = <<EOF;
{
    "id":null,
    "format": "Biological Observation Matrix 0.9.1-dev",
    "format_url": "http://biom-format.org",
    "type": "OTU table",
    "generated_by": "QIIME revision 1.4.0-dev",
    "date": "2011-12-19T19:00:00",
EOF

# Stop right away if the fixture cannot be opened, rather than handing an
# unopened filehandle to the constructor
$file = test_input_file('qiime_w_greengenes_taxo.txt');
open $fh, '<', $file or die "Could not open file '$file': $!";

$file = test_input_file('gaas_compo.txt');
ok $guesser = Bio::Community::IO::FormatGuesser->new(
   -file => $file, # gaas
   -text => $text, # biom
   -fh   => $fh,   # qiime
), 'mixed input';
is $guesser->file, $file, 'mixed input: file is stored';
is $guesser->text, $text, 'mixed input: text is stored';
is $guesser->fh, $fh, 'mixed input: filehandle is stored';

is $guesser->guess, 'biom', 'mixed input: format guessed from the text';

close $fh;


# Test input text: the text is given through the accessor after construction

$guesser = Bio::Community::IO::FormatGuesser->new();
ok $guesser, 'text input';
ok $guesser->text($text);
is $guesser->guess, 'biom';


# Test input filehandle

# Stop right away if the fixture cannot be opened, rather than handing an
# unopened filehandle to the constructor
$file = test_input_file('biom_minimal_dense.txt');
open $fh, '<', $file or die "Could not open file '$file': $!";
ok $guesser = Bio::Community::IO::FormatGuesser->new( -fh => $fh ), 'filehandle input';
is $guesser->fh, $fh, 'filehandle input: filehandle is stored';
is $guesser->guess, 'biom', 'filehandle input: format guessed';
# After guessing, the next read from the handle is expected to return the
# first line of the file again
$line = <$fh>;
chomp $line;
is $line, '{', 'filehandle was rewinded';
close $fh;


# Test biom input file

# Each entry is a fixture name, optionally followed by the description used
# for the constructor test (an undefined description means an unnamed test)
for my $case (
   [ 'biom_minimal_dense.txt', 'biom files' ],
   [ 'biom_rich_sparse.txt'                 ],
   [ 'biom_float.txt'                       ],
   [ 'biom_dups.txt'                        ],
   [ 'biom_invalid.txt'                     ],
) {
   my ($name, $label) = @$case;
   $file = test_input_file($name);
   ok $guesser = Bio::Community::IO::FormatGuesser->new( -file => $file ), $label;
   is $guesser->file, $file;
   is $guesser->guess, 'biom';
}

# Test generic input file

# Each entry is a fixture name, optionally followed by the description used
# for the constructor test (an undefined description means an unnamed test)
for my $case (
   [ 'generic_table_win.txt', 'generic files (Windows)' ],
   [ 'generic_table_mac.txt', 'generic files (Mac)'     ],
   [ 'generic_table.txt',     'generic files (Linux)'   ],
   [ 'generic_table_tricky.txt'                         ],
   [ 'qiime_w_silva_taxo_L2.txt'                        ],
) {
   my ($name, $label) = @$case;
   $file = test_input_file($name);
   ok $guesser = Bio::Community::IO::FormatGuesser->new( -file => $file ), $label;
   is $guesser->file, $file;
   is $guesser->guess, 'generic';
}


# Test gaas input file

for my $name (qw( gaas_compo.txt gaas_seq_compo.txt gaas_other.txt )) {
   # Only the first fixture of the group carries a test description
   my $label = $name eq 'gaas_compo.txt' ? 'gaas files' : undef;
   $file = test_input_file($name);
   ok $guesser = Bio::Community::IO::FormatGuesser->new( -file => $file ), $label;
   is $guesser->file, $file;
   is $guesser->guess, 'gaas';
}


# Test unifrac input file

for my $name (qw(
   unifrac_qualitative.txt
   unifrac_quantitative.txt
   unifrac_quantitative_tricky.txt
)) {
   # Only the first fixture of the group carries a test description
   my $label = $name eq 'unifrac_qualitative.txt' ? 'unifrac files' : undef;
   $file = test_input_file($name);
   ok $guesser = Bio::Community::IO::FormatGuesser->new( -file => $file ), $label;
   is $guesser->file, $file;
   is $guesser->guess, 'unifrac';
}


# Test qiime input file

# Every fixture below is expected to be recognized as the 'qiime' format.
# Each fixture is listed once: the alternative-header file used to be tested
# twice with identical assertions.
for my $name (qw(
   qiime_w_no_taxo.txt
   qiime_w_greengenes_taxo.txt
   qiime_alt_header.txt
   qiime_single_community.txt
   qiime_w_silva_taxo_and_dups.txt
   qiime_w_two_communities.txt
)) {
   $file = test_input_file($name);
   ok $guesser = Bio::Community::IO::FormatGuesser->new( -file => $file ), "qiime files: $name";
   is $guesser->file, $file, "qiime files: $name is stored";
   is $guesser->guess, 'qiime', "qiime files: $name is guessed as qiime";
}


# Test unknown format: no format is expected to be guessed for this file

$file = test_input_file('lorem_ipsum.txt');
$guesser = Bio::Community::IO::FormatGuesser->new( -file => $file );
ok $guesser, 'unknown file';
is $guesser->file, $file;
is $guesser->guess, undef;


# Test empty string: no format is expected to be guessed from empty text

$text = '';
$guesser = Bio::Community::IO::FormatGuesser->new( -text => $text );
ok $guesser, 'empty string';
is $guesser->text, $text;
is $guesser->guess, undef;


# All tests have run: declare the plan and leave
done_testing();

exit;



( run in 0.719 second using v1.01-cache-2.11-cpan-748bfb374f4 )