CracTools
view release on metacpan or search on metacpan
lib/CracTools/Utils.pm view on Meta::CPAN
my %conversion_hash = ( '+' => 1, '-' => '-1', '1' => '+', '-1' => '-');
sub convertStrand($) {
my $strand = shift;
return defined $strand? $conversion_hash{$strand} : undef;
}
sub removeChrPrefix($) {
my $string = shift;
$string =~ s/^chr//i;
return $string;
}
sub addChrPrefix($) {
my $string = shift;
return "chr".removeChrPrefix($string);
}
our $Base64_BITNESS = 6;
our @Base64_ENCODING = qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z 0 1 2 3 4 5 6 7 8 9 + /);
# Encode error list into base64
sub encodePosListToBase64 {
my @pos_list = @_;
my @bitList;
my $encoded_str;
return "" if @pos_list == 0;
# Compute the appropriate length for the bitList
my $length = $pos_list[-1] + 1;
my $bitList_length = int($length / $Base64_BITNESS);
if ($length % $Base64_BITNESS > 0) {
$bitList_length++;
}
# Put 0 at all position
for (my $i = 0; $i < $bitList_length; $i++) {
$bitList[$i] = 0;
}
# Create bit vector in Base64_ENCODING
foreach my $j (@pos_list) {
$bitList[int($j / $Base64_BITNESS)] |= (1 << ($j % $Base64_BITNESS));
}
# Convert bitList into string_Base64_ENCODING
for(my $k=0; $k < $bitList_length; $k++) {
$encoded_str .= scalar($Base64_ENCODING[$bitList[$k]]);
}
return $encoded_str;
}
# Decode base 64 error list
sub decodePosListInBase64 {
my $encoded_str = shift;
my @encoded_list = split "", $encoded_str;
my (@index,@decoded_list);
#Looking for index of each char
foreach my $j (@encoded_list) {
my @ind = grep { $Base64_ENCODING[$_] eq $j } 0 .. $#Base64_ENCODING;
push(@index,$ind[0]);
}
# Convert into list position
for (my $e = 0; $e < (0+@encoded_list);$e++) {
for(my $i=0; $i<$Base64_BITNESS; $i++) {
if ($index[$e] & (1 << $i)) {
my $pos = (int($i+$Base64_BITNESS*$e));
push(@decoded_list,$pos);
}
}
}
return @decoded_list;
}
sub seqFileIterator {
my ($file,$format) = @_;
croak "Missing file in argument of seqFileIterator" if !defined $file;
# Get file handle for $file
my $fh = getReadingFileHandle($file);
# Automatic file extension detection
if(!defined $format) {
if($file =~ /\.(fasta|fa)(\.|$)/) {
$format = 'fasta';
} elsif($file =~ /\.(fastq|fq)(\.|$)/) {
$format = 'fastq';
} else {
croak "Undefined file extension";
}
} else {
$format = lc $format;
}
# FASTA ITERATOR
if ($format eq 'fasta') {
# Read prev line for FASTA because we dont know the number
# of line used for the sequence
my $prev_line = <$fh>;
chomp $prev_line;
return sub {
my ($name,$seq,$qual);
if(defined $prev_line) {
($name) = $prev_line =~ />(.*)$/;
$prev_line = <$fh>;
# Until we find a new sequence identifier ">", we
# concatenate the lines corresponding to the sequence
while(defined $prev_line && $prev_line !~ /^>/) {
chomp $prev_line;
$seq .= $prev_line;
$prev_line = <$fh>;
}
return {name => $name, seq => $seq, qual => $qual};
} else {
return undef;
}
};
# FASTQ ITERATOR
} elsif ($format eq 'fastq') {
return sub {
my ($name,$seq,$qual);
my $line = <$fh>;
($name) = $line =~ /@(.*)$/ if defined $line;
if(defined $name) {
$seq = <$fh>;
chomp $seq;
<$fh>; # skip second seq name (useless line)
$qual = <$fh>;
lib/CracTools/Utils.pm view on Meta::CPAN
my $seq_revcomp = reverseComplement($seq);
reverseComplement is more than B<100x faster than Bio-Perl> revcom_as_string()
=head2 reverse_tab
Arg [1] : String - a string with values separated with coma.
Example : $reverse = reverse_tab('2,1,1,1,0,0,1');
Description : Reverse the values of the string in argument.
For example : reverse_tab('1,2,0,1') returns : '1,0,2,1'.
ReturnType : String
Exceptions : none
=head2 isVersionGreaterOrEqual($v1,$v2)
Return true is version number v1 is greater than v2
=head2 convertStrand
Convert strand from '+/-' standard to '1/-1' standard and the opposite.
Example:
say "Forward a: ",convertStrand('+');
say "Forward b: ",convertStrand(1);
say "Reverse a: ",convertStrand('-');
say "Reverss b: ",convertStrand(-1);
will print
Forward a: 1
Forward b: +
Reverse a: -1
Reverse b: -
=head2 removeChrPrefix
Remove the "chr" prefix from a given string
Example:
say "reference name: ",removeChrPrefix("chr1");
will print
reference name: 1
=head2 addChrPrefix
Add the "chr" prefix to the given string
=head1 ENCODING
=head2 encodePosListToBase64
Encode a (0-based) list of increasing position to a string using Base64
encoding scheme : ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/
my $encoded_list = CracTools::Utils::encodePosListToBase64(1,3,5,8,12,32);
my @decoded_list = CracTools::Utils::decodePosListInBase64($encoded_list);
=head2 decodePosListInBase64
Decode position list encoded by encodePosListToBase64.
=head1 PARSING
This are some tools that aim to read (bio) files like
=over
=item Sequence files : FASTA, FASTQ
=item Annotation files : GFF3, GTF2, BED6, BED12, ...
=item Alignement files : SAM, BAM
=back
=head2 seqFileIterator
Open Fasta, or Fastq files (can be gziped).
seqFileIterator has an automatic file extension detection but you can force it
using a second parameter with the format : 'fasta' or 'fastq'.
Example:
my $it = seqFileIterator('file.fastq','fastq');
while(my $entry = $it->()) {
print "Sequence name : $entry->{name}
Sequence : $entry->{seq}
Sequence quality: $entry->{qual}","\n";
}
Return: HashRef
{ name => 'sequence_identifier',
seq => 'sequence_value',
qual => 'sequence_quality', # only defined for FASTQ files
}
seqFileIterator is more than B<50x faster than Bio-Perl> Bio::SeqIO for FASTQ files
seqFileIterator is 4x faster than Bio-Perl Bio::SeqIO for FASTA files
=head2 pairedEndSeqFileIterator
Open Paired-End Sequence files using seqFileIterator()
Paird-End files are generated by Next Generation Sequencing technologies (like Illumina) where two
reads are sequenced from the same DNA fragment and saved in separated files.
Example:
my $it = pairedEndSeqFileIterator($reads1,$reads2,$format);
while (my $entry = $it->()) {
print "Read_1 : $entry->{read1}->{seq}
Read_2 : $entry->{read2}->{seq}";
}
Return: HashRef
( run in 1.535 second using v1.01-cache-2.11-cpan-2398b32b56e )