Bio-Community

 view release on metacpan or  search on metacpan

lib/Bio/Community/IO/FormatGuesser.pm  view on Meta::CPAN

      # Read next line (and convert line endings). Exit if no lines left.
      $line_num++;
      chomp $line;

      # Skip white and empty lines.
      next if $line =~ /^\s*$/;

      # Split fields
      my @fields = split /\t/, $line;

      # Try all formats remaining
      %ok_formats = ();
      my ($test_format, $test_function);
      while ( ($test_format, $test_function) = each (%test_formats) ) {
         my $score = &$test_function(\@fields, $line, $line_num);
         if ( $score == 2 ) {
            # This line is specific of this format
            %ok_formats = ( $test_format => undef );
            last;
         } elsif ($score == 1) {
            # Line is possibly in this format
            $ok_formats{$test_format} = undef;
         } else {
            # Do not try to match this format with upcoming lines
            delete $test_formats{$test_format};
         }
      }

      # Exit if there was a match to only one format
      if (scalar keys %ok_formats == 1) {
         last;
      }

      # Exit if no formats left to try
      if (scalar keys %test_formats == 0) {
         last;
      }

      # Give up after having tested 100 lines
      if ($line_num >= 100) {
         last;
      }

   }

   # If several formats matched. Assume 'generic' if possible, undef otherwise
   if (scalar keys %ok_formats > 1) {
      for my $ok_format (keys %ok_formats) {
         if (not $ok_format eq 'generic') {
            delete $ok_formats{$ok_format};
         }
      }
   }

   if (scalar keys %ok_formats == 1) {
      $format = (keys %ok_formats)[0];
   }

   # Cleanup
   if ($in->noclose) {
      # Reset filehandle cursor to original location
      seek($self->fh, $original_pos, 0)
         or $self->throw("Could not reset the cursor to its original position: $!");
   }
   $in->close;

   return $format;
}


#-----  Format-specific methods -----#
# These methods return:
#    1 is the given line is possibly in this format
#    2 if they are sure


func _possibly_biom ($fields, $line, $line_num) {
   # Example:
   # {
   #  "id":null,
   #  "format": "Biological Observation Matrix 0.9.1-dev",
   #  "format_url": "http://biom-format.org",
   #  ...
   my $ok = 0;
   if ($line_num == 1) {
      if ($line =~ m/^{/) {
         $ok = 1;
      }
   } else {
      if ( ($line =~ m/"\S+":/) || 
           ($line =~ m/Biological Observation Matrix/) ) {
         $ok = 2; # biom for sure
      }
   }
   return $ok;
}


func _possibly_generic ($fields, $line, $line_num) {
   # Example:
   #   Species	gut	soda lake
   #   Streptococcus	241	334
   #   ...
   # Columns from the second to the last must contain numbers.
   my $ok = 0;
   my $num_fields = scalar @$fields;
   if ($num_fields >= 2) {
      if ($line_num == 1) {
        $ok = 1;
      } else {
         for my $i (1 .. $num_fields - 1) {
            if ($fields->[$i] =~ $real_re) {
               $ok = 1;
            } else {
               $ok = 0;
               last;
            }
         }
      }
   }
   return $ok;
}



( run in 1.453 second using v1.01-cache-2.11-cpan-39bf76dae61 )