Bio-Kmer

 view release on metacpan or  search on metacpan

lib/Bio/Kmer.pm  view on Meta::CPAN


The BioPerl way

  use strict;
  use warnings;
  use Bio::SeqIO;
  use Bio::Kmer;

  # Load up any Bio::SeqIO object. Quality values will be
  # faked internally to help with compatibility even if
  # a fastq file is given.
  my $seqin = Bio::SeqIO->new(-file=>"input.fasta");
  my $kmer=Bio::Kmer->new($seqin);
  my $kmerHash=$kmer->kmers();
  my $countOfCounts=$kmer->histogram();

=head1 DESCRIPTION

A module for helping with kmer analysis. The basic methods help count kmers and can produce a count of counts.  Currently this module only supports fastq format.  Although this module can count kmers with pure perl, it is recommended to give the opti...

=head1 DEPENDENCIES

  * BioPerl
  * Jellyfish >=2
  * Perl threads
  * Perl >=5.10

=head1 VARIABLES

=over

=item $Bio::Kmer::iThreads

Boolean describing whether the module instance is using threads

=back

=head1 METHODS

=over

=item Bio::Kmer->new($filename, \%options)

Create a new instance of the kmer counter.  One object per file. 

  Filename can be either a file path or a Bio::SeqIO object.

  Applicable arguments for \%options:
  Argument     Default    Description
  kmercounter  perl       What kmer counter software to use.
                          Choices: Perl, Jellyfish.
  kmerlength|k 21         Kmer length
  numcpus      1          This module uses perl 
                          multithreading with pure perl or 
                          can supply this option to other 
                          software like jellyfish.
  gt           1          If the count of kmers is fewer 
                          than this, ignore the kmer. This 
                          might help speed analysis if you 
                          do not care about low-count kmers.
  sample       1          Retain only a percentage of kmers.
                          1 is 100%; 0 is 0%
                          Only works with the perl kmer counter.
  verbose      0          Print more messages.

  Examples:
  my $kmer=Bio::Kmer->new("file.fastq.gz",{kmercounter=>"jellyfish",numcpus=>4});

=back

=cut

sub new{
  my($class,$seqfile,$settings)=@_;

  croak "ERROR: need a sequence file or a Bio::SeqIO object" if(!$seqfile);

  # Set optional parameter defaults
  $$settings{kmerlength}    = $$settings{kmerlength} || $$settings{k} || 21;
  $$settings{numcpus}     ||= 1;
  $$settings{gt}          ||= 1;
  $$settings{kmercounter} ||= "perl";
  $$settings{tempdir}     ||= tempdir("Kmer.pm.XXXXXX",TMPDIR=>1,CLEANUP=>1);
  $$settings{sample}        = 1 if(!defined($$settings{sample}));
  $$settings{verbose}     ||= 0;

  # If the first parameter $seqfile is a Bio::SeqIO object,
  # then send it to a file to dovetail with the rest of
  # this module.
  if(ref($seqfile) && $seqfile->isa("Bio::SeqIO")){
    # BioPerl isn't required at compile tile but is
    # required if the user starts with a BioPerl object.
    eval {
      require Bio::SeqIO;
      require Bio::Seq::Quality;
    };
    if($@){
      croak "ERROR: cannot load Bio::SeqIO and Bio::Seq::Quality, but I need to because you supplied a Bio::SeqIO object";
    }
    my $tempfile="$$settings{tempdir}/bioperl_input.fastq";
    my $out=Bio::SeqIO->new(-file=>">".$tempfile);
    while(my $seq=$seqfile->next_seq){
      my $qual = $seq->qual || "I" x $seq->length;
      my $seqWithQual = Bio::Seq::Quality->new(
        # TODO preserve qual values if they exist, but
        # for now, it doesn't really matter.
        -qual=> $qual,
        -seq => $seq->seq,
        -id  => $seq->id,
        -verbose => -1,
      );
      $out->write_seq($seqWithQual);
    }
    $out->close; # make sure output is flushed

    # now redefine the seqfile as the new file on disk.
    $seqfile=$tempfile;
  }

  # Check if we have a valid sequence file path or at
  # the very least, double check that the file path we just



( run in 0.713 second using v1.01-cache-2.11-cpan-5735350b133 )