CracTools

 view release on metacpan or  search on metacpan

lib/CracTools/Utils.pm  view on Meta::CPAN


my %conversion_hash = ( '+' => 1, '-' => '-1', '1' => '+', '-1' => '-');
sub convertStrand($) {
  my $strand = shift;
  return defined $strand? $conversion_hash{$strand} : undef;
}

sub removeChrPrefix($) {
  my $string = shift;
  $string =~ s/^chr//i;
  return $string;
}

sub addChrPrefix($) {
  my $string = shift;
  return "chr".removeChrPrefix($string);
}


our $Base64_BITNESS = 6;
our @Base64_ENCODING = qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z 0 1 2 3 4 5 6 7 8 9 + /);

# Encode error list into base64
sub encodePosListToBase64 {
  my @pos_list = @_;
  my @bitList;
  my $encoded_str;

  return "" if @pos_list == 0;

  # Compute the appropriate length for the bitList
  my $length = $pos_list[-1] + 1;
  my $bitList_length = int($length / $Base64_BITNESS);
  if ($length % $Base64_BITNESS > 0) {
      $bitList_length++;
  }

  # Put 0 at all position
  for (my $i = 0; $i < $bitList_length; $i++) {
    $bitList[$i] = 0;
  }

  # Create bit vector in Base64_ENCODING
  foreach my $j (@pos_list) {
    $bitList[int($j / $Base64_BITNESS)] |= (1 << ($j % $Base64_BITNESS));
  }

  # Convert bitList into string_Base64_ENCODING
  for(my $k=0; $k < $bitList_length; $k++) {
    $encoded_str .= scalar($Base64_ENCODING[$bitList[$k]]);
  }
  return $encoded_str;
}


# Decode base 64 error list
sub decodePosListInBase64 {
  my $encoded_str = shift;

  my @encoded_list = split "", $encoded_str;
  my (@index,@decoded_list);

  #Looking for index of each char
  foreach my $j (@encoded_list) {
    my @ind = grep { $Base64_ENCODING[$_] eq $j } 0 .. $#Base64_ENCODING;
    push(@index,$ind[0]);
  }

  # Convert into list position
  for (my $e = 0; $e < (0+@encoded_list);$e++) {
    for(my $i=0; $i<$Base64_BITNESS; $i++) {
      if ($index[$e] & (1 << $i)) {
        my $pos = (int($i+$Base64_BITNESS*$e));
        push(@decoded_list,$pos);
      }
    }
  }

  return @decoded_list;
}




sub seqFileIterator {
  my ($file,$format) = @_;

  croak "Missing file in argument of seqFileIterator" if !defined $file;

  # Get file handle for $file
  my $fh = getReadingFileHandle($file);

  # Automatic file extension detection
  if(!defined $format) {
    if($file =~ /\.(fasta|fa)(\.|$)/) {
      $format = 'fasta';
    } elsif($file =~ /\.(fastq|fq)(\.|$)/) {
      $format = 'fastq';
    } else {
      croak "Undefined file extension";
    }
  } else {
    $format = lc $format;
  }

  # FASTA ITERATOR
  if ($format eq 'fasta') {
    # Read prev line for FASTA because we dont know the number
    # of line used for the sequence
    my $prev_line = <$fh>;
    chomp $prev_line;
    return sub {
      my ($name,$seq,$qual);
      if(defined $prev_line) {
        ($name) = $prev_line =~ />(.*)$/;
        $prev_line = <$fh>;
        # Until we find a new sequence identifier ">", we
        # concatenate the lines corresponding to the sequence
        while(defined $prev_line && $prev_line !~ /^>/) {
          chomp $prev_line;
          $seq .= $prev_line;
          $prev_line = <$fh>;
        }
        return {name => $name, seq => $seq, qual => $qual};
      } else {
        return undef;
      }
    };
  # FASTQ ITERATOR
  } elsif ($format eq 'fastq') {
    return sub {
      my ($name,$seq,$qual);
      my $line = <$fh>;
      ($name) = $line =~ /@(.*)$/ if defined $line;
      if(defined $name) {
        $seq = <$fh>;
        chomp $seq;
        <$fh>; # skip second seq name (useless line)
        $qual = <$fh>;

lib/CracTools/Utils.pm  view on Meta::CPAN


  my $seq_revcomp = reverseComplement($seq);

reverseComplement is more than B<100x faster than Bio-Perl> revcom_as_string()

=head2 reverse_tab

  Arg [1] : String - a string with values separated with coma.
  Example : $reverse = reverse_tab('2,1,1,1,0,0,1');
  Description : Reverse the values of the string in argument.
                For example : reverse_tab('1,2,0,1') returns : '1,0,2,1'.
  ReturnType  : String
  Exceptions  : none

=head2 isVersionGreaterOrEqual($v1,$v2)

Return true is version number v1 is greater than v2

=head2 convertStrand

Convert strand from '+/-' standard to '1/-1' standard and the opposite.

Example:

  say "Forward a: ",convertStrand('+');
  say "Forward b: ",convertStrand(1);
  say "Reverse a: ",convertStrand('-');
  say "Reverss b: ",convertStrand(-1);

will print

  Forward a: 1
  Forward b: +
  Reverse a: -1
  Reverse b: -

=head2 removeChrPrefix

Remove the "chr" prefix from a given string

Example:

  say "reference name: ",removeChrPrefix("chr1");

will print

  reference name: 1

=head2 addChrPrefix

Add the "chr" prefix to the given string

=head1 ENCODING

=head2 encodePosListToBase64

Encode a (0-based) list of increasing position to a string using Base64
encoding scheme : ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/

  my $encoded_list = CracTools::Utils::encodePosListToBase64(1,3,5,8,12,32);
  my @decoded_list = CracTools::Utils::decodePosListInBase64($encoded_list);

=head2 decodePosListInBase64

Decode position list encoded by encodePosListToBase64.

=head1 PARSING

This are some tools that aim to read (bio) files like

=over

=item Sequence files : FASTA, FASTQ

=item Annotation files : GFF3, GTF2, BED6, BED12, ...

=item Alignement files : SAM, BAM

=back

=head2 seqFileIterator

Open Fasta, or Fastq files (can be gziped).
seqFileIterator has an automatic file extension detection but you can force it
using a second parameter with the format : 'fasta' or 'fastq'.

Example:

  my $it = seqFileIterator('file.fastq','fastq');
  while(my $entry = $it->()) {
    print "Sequence name   : $entry->{name}
           Sequence        : $entry->{seq}
           Sequence quality: $entry->{qual}","\n";
  }

Return: HashRef

  { name => 'sequence_identifier',
    seq  => 'sequence_value',
    qual => 'sequence_quality', # only defined for FASTQ files
  }

seqFileIterator is more than B<50x faster than Bio-Perl> Bio::SeqIO for FASTQ files
seqFileIterator is 4x faster than Bio-Perl Bio::SeqIO for FASTA files

=head2 pairedEndSeqFileIterator

Open Paired-End Sequence files using seqFileIterator()

Paird-End files are generated by Next Generation Sequencing technologies (like Illumina) where two
reads are sequenced from the same DNA fragment and saved in separated files.

Example:

  my $it = pairedEndSeqFileIterator($reads1,$reads2,$format);
  while (my $entry = $it->()) {
    print "Read_1 : $entry->{read1}->{seq}
           Read_2 : $entry->{read2}->{seq}";
  }

Return: HashRef



( run in 1.535 second using v1.01-cache-2.11-cpan-2398b32b56e )