Bio-Minimizer
view release on metacpan or search on metacpan
t/21_compressionBenchmark.t view on Meta::CPAN
#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;
use Test::More tests=>2;
use FindBin qw/$RealBin/;
use lib "$RealBin/../lib";
# Test 1 of 2: make sure the module under test loads from ../lib.
use_ok 'Bio::Minimizer';
# Benchmark: sorting a fastq file with scripts/sortFastq.pl should make it
# compress better. Each round
#   1. simulates a random reference genome,
#   2. samples fixed-length reads from it (about half reverse-complemented),
#   3. sorts the reads with scripts/sortFastq.pl (stdin -> stdout),
#   4. gzips the unsorted and the sorted file the same way, and
#   5. asserts that the sorted archive is the smaller of the two.
# Leaves simulated.fastq.gz and sorted.fastq.gz next to this test file.
# Dies (aborting the subtest) if a file cannot be written or an external
# command fails.
subtest 'sorting fastqs to make smaller filesize' => sub{
  my $numTests = 3;
  plan tests => $numTests;

  # Simulation parameters; none of these change between rounds.
  my @nt           = qw(A T C G);
  my $alphabetSize = scalar(@nt);
  my $genomeSize   = 1000000;          # 1Mbp genome
  my $numReads     = 100000;           # 100k reads
  my $readLength   = 250;
  my $qual         = 'I' x $readLength;

  # Anchor every path on $RealBin so the test does not depend on the
  # directory it is started from.
  my $simulatedFastq = "$RealBin/simulated.fastq";
  my $sortedFastq    = "$RealBin/sorted.fastq";
  my $sortScript     = "$RealBin/../scripts/sortFastq.pl";
  my $perl           = $^X;            # the interpreter running this test

  for my $round (1..$numTests){
    # Create a reference genome
    my $sequence = "";
    for my $pos (1..$genomeSize){
      $sequence .= $nt[int(rand($alphabetSize))];
    }
    note "Simulating reference genome ".substr($sequence,0,10)."...${genomeSize}bp...".substr($sequence,-10,10);

    # Simulate the reference genome into reads
    open(my $fh, '>', $simulatedFastq) or die "ERROR: could not write to $simulatedFastq: $!";
    for my $i (0..$numReads-1){
      # +1 so that the last full-length window of the genome can be drawn too
      my $start = int(rand($genomeSize - $readLength + 1));
      my $seq   = substr($sequence, $start, $readLength);
      # Revcom half of the reads
      if(rand(1) < 0.5){
        $seq = reverse($seq);
        $seq =~ tr/ATCG/TAGC/;
      }
      print $fh "\@read$i pos$start\n$seq\n+\n$qual\n";
    }
    # Buffered write errors only surface at close, so check it.
    close($fh) or die "ERROR: could not close $simulatedFastq: $!";

    # Sort the uncompressed reads. The sort script is the only command on
    # this line, so its exit status is what system() reports; in a
    # "... | sort | gzip" pipeline a crashing sort would go unnoticed and
    # leave a tiny, trivially "smaller" archive behind.
    # NOTE: the command goes through the shell, so paths are assumed to be
    # free of whitespace and shell metacharacters.
    system("$perl -I$RealBin/../lib $sortScript < $simulatedFastq > $sortedFastq") == 0
      or die "ERROR: sorting $simulatedFastq with $sortScript failed (\$? = $?)";

    # Compress both files the same way so their sizes are comparable.
    # List form: no shell involved. gzip replaces each file with file.gz.
    system('gzip', '-f', $simulatedFastq, $sortedFastq) == 0
      or die "ERROR: gzip failed on $simulatedFastq and $sortedFastq (\$? = $?)";

    my $simulatedSize = -s "$simulatedFastq.gz";
    my $sortedSize    = -s "$sortedFastq.gz";
    # -s yields undef/0 for a missing/empty file; also guards the division.
    die "ERROR: $simulatedFastq.gz is missing or empty" if !$simulatedSize;
    die "ERROR: $sortedFastq.gz is missing or empty"    if !$sortedSize;

    # Sorted size as a percentage of the unsorted size (lower is better).
    my $percent = sprintf("%0.2f",$sortedSize/$simulatedSize * 100);
    diag "Sorted filesize relative to unsorted: $percent%";

    cmp_ok($simulatedSize, '>', $sortedSize, "File sizes ($simulatedSize > $sortedSize, $percent%)");
  }
};
( run in 0.598 second using v1.01-cache-2.11-cpan-39bf76dae61 )