Compression-Util
view release on metacpan or search on metacpan
lib/Compression/Util.pm view on Meta::CPAN
}
}
return \@dec;
}
#####################
# Generic run-length
#####################
sub run_length ($arr, $max_run = undef) {
@$arr || return [];
my @result = [$arr->[0], 1];
my $prev_value = $arr->[0];
foreach my $i (1 .. $#$arr) {
my $curr_value = $arr->[$i];
if ($curr_value == $prev_value and (defined($max_run) ? $result[-1][1] < $max_run : 1)) {
++$result[-1][1];
}
else {
push(@result, [$curr_value, 1]);
}
$prev_value = $curr_value;
}
return \@result;
}
######################################
# Binary variable run-length encoding
######################################
sub binary_vrl_encode ($bitstring) {
my @bits = split(//, $bitstring);
my $encoded = $bits[0];
foreach my $rle (@{run_length(\@bits)}) {
my ($c, $v) = @$rle;
if ($v == 1) {
$encoded .= '0';
}
else {
my $t = sprintf('%b', $v - 1);
$encoded .= join('', '1' x length($t), '0', substr($t, 1));
}
}
return $encoded;
}
sub binary_vrl_decode ($bitstring) {
my $decoded = '';
my $bit = substr($bitstring, 0, 1, '');
while ($bitstring ne '') {
$decoded .= $bit;
my $bl = 0;
while (substr($bitstring, 0, 1, '') eq '1') {
++$bl;
}
if ($bl > 0) {
$decoded .= $bit x oct('0b1' . join('', map { substr($bitstring, 0, 1, '') } 1 .. $bl - 1));
}
$bit = ($bit eq '1' ? '0' : '1');
}
return $decoded;
}
############################
# Burrows-Wheeler transform
############################
sub bwt_sort ($s, $LOOKAHEAD_LEN = 128) { # O(n * LOOKAHEAD_LEN) space (fast)
my $len = length($s);
my $double_s = $s . $s; # Pre-compute doubled string
# Schwartzian transform with optimized sorting
return [
map { $_->[1] }
sort {
($a->[0] cmp $b->[0])
|| do {
my ($cmp, $s_len) = (0, $LOOKAHEAD_LEN << 2);
while (1) {
($cmp = substr($double_s, $a->[1], $s_len) cmp substr($double_s, $b->[1], $s_len)) && last;
$s_len <<= 1;
}
$cmp;
}
}
map {
my $pos = $_;
my $end = $pos + $LOOKAHEAD_LEN;
# Handle wraparound efficiently
my $t =
($end <= $len)
? substr($s, $pos, $LOOKAHEAD_LEN)
: substr($double_s, $pos, $LOOKAHEAD_LEN);
[$t, $pos]
} 0 .. $len - 1
];
}
sub bwt_encode ($s, $LOOKAHEAD_LEN = 128) {
if (ref($s) ne '') {
return bwt_encode_symbolic($s);
}
my $bwt = bwt_sort($s, $LOOKAHEAD_LEN);
my $ret = '';
my $idx = 0;
my $i = 0;
foreach my $pos (@$bwt) {
$ret .= substr($s, $pos - 1, 1);
$idx = $i if !$pos;
++$i;
}
return ($ret, $idx);
}
lib/Compression/Util.pm view on Meta::CPAN
if (ref($fh) eq '') {
open(my $fh2, '<:raw', \$fh) or confess "error: $!";
return __SUB__->($fh2);
}
my $buffer = '';
my $double = read_bit($fh, \$buffer);
my $with_rle = read_bit($fh, \$buffer);
my $has_negative = read_bit($fh, \$buffer);
my $with_deltas = read_bit($fh, \$buffer);
my $with_rle4 = read_bit($fh, \$buffer);
my @deltas;
my $len = 0;
for (my $k = 0 ; $k <= $len ; ++$k) {
my $bit = read_bit($fh, \$buffer);
if ($bit eq '0') {
push @deltas, 0;
}
elsif ($double) {
my $bit = $has_negative ? read_bit($fh, \$buffer) : 0;
my $bl = $has_negative ? 0 : 1;
++$bl while (read_bit($fh, \$buffer) eq '1');
my $bl2 = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl));
my $int = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. ($bl2 - 1)));
push @deltas, ($has_negative ? ($bit eq '1' ? 1 : -1) : 1) * ($int - 1);
}
else {
my $bit = $has_negative ? read_bit($fh, \$buffer) : 0;
my $n = $has_negative ? 0 : 1;
++$n while (read_bit($fh, \$buffer) eq '1');
my $d = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $n));
push @deltas, $has_negative ? ($bit eq '1' ? $d : -$d) : ($d - 1);
}
if ($with_rle) {
my $bl = 0;
while (read_bit($fh, \$buffer) == 1) {
++$bl;
}
if ($bl > 0) {
my $run = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl)) - 1;
$k += $run;
push @deltas, ($deltas[-1]) x $run;
}
}
if ($k == 0) {
$len = pop(@deltas);
}
}
my $decoded = \@deltas;
$decoded = rle4_decode($decoded) if $with_rle4;
$decoded = accumulate($decoded) if $with_deltas;
return $decoded;
}
################################
# Alphabet encoding (from Bzip2)
################################
sub encode_alphabet_256 ($alphabet) {
my %table;
@table{@$alphabet} = ();
my $populated = 0;
my @marked;
for (my $i = 0 ; $i <= 255 ; $i += 16) {
my $enc = 0;
foreach my $j (0 .. 15) {
if (exists($table{$i + $j})) {
$enc |= 1 << $j;
}
}
$populated <<= 1;
if ($enc > 0) {
$populated |= 1;
push @marked, $enc;
}
}
my $bitstring = join('', map { int2bits_lsb($_, 16) } @marked);
$VERBOSE && say STDERR "Populated : ", sprintf('%016b', $populated);
$VERBOSE && say STDERR "Marked : @marked";
$VERBOSE && say STDERR "Bits len : ", length($bitstring);
my $encoded = '';
$encoded .= int2bytes($populated, 2);
$encoded .= pack('B*', $bitstring);
return $encoded;
}
sub decode_alphabet_256 ($fh) {
if (ref($fh) eq '') {
open(my $fh2, '<:raw', \$fh) or confess "error: $!";
return __SUB__->($fh2);
}
my @alphabet;
my $l1 = bytes2int($fh, 2);
for my $i (0 .. 15) {
if ($l1 & (0x8000 >> $i)) {
my $l2 = bytes2int($fh, 2);
for my $j (0 .. 15) {
if ($l2 & (0x8000 >> $j)) {
push @alphabet, 16 * $i + $j;
}
lib/Compression/Util.pm view on Meta::CPAN
$VERBOSE && say STDERR "Number or trees: $num_trees";
my $num_sels = bits2int($fh, 15, \$buffer);
$VERBOSE && say STDERR "Number of selectors: $num_sels";
my @idxs;
for (1 .. $num_sels) {
my $i = 0;
while (read_bit($fh, \$buffer)) {
$i += 1;
($i < $num_trees) or confess "error";
}
push @idxs, $i;
}
my $sels = mtf_decode(\@idxs, [0 .. $num_trees - 1]);
$VERBOSE && say STDERR "Selectors: (@$sels)";
my $num_syms = scalar(@alphabet) + 2;
my @trees;
for (1 .. $num_trees) {
my @clens;
my $clen = bits2int($fh, 5, \$buffer);
for (1 .. $num_syms) {
while (1) {
($clen > 0 and $clen <= $MaxHuffmanBits) or confess "invalid code length: $clen";
if (not read_bit($fh, \$buffer)) {
last;
}
$clen -= read_bit($fh, \$buffer) ? 1 : -1;
}
push @clens, $clen;
}
push @trees, \@clens;
$VERBOSE && say STDERR "Code lengths: (@clens)";
}
foreach my $tree (@trees) {
my $maxLen = max(@$tree);
my $sum = 1 << $maxLen;
for my $clen (@$tree) {
$sum -= (1 << $maxLen) >> $clen;
}
$sum == 0 or confess "incomplete tree not supported: (@$tree)";
}
my @huffman_trees = map { (huffman_from_code_lengths($_))[1] } @trees;
my $eob = @alphabet + 1;
my @zrle;
my $code = '';
my $sel_idx = 0;
my $tree = $huffman_trees[$sels->[$sel_idx]];
my $decoded = 50;
while (!eof($fh)) {
$code .= read_bit($fh, \$buffer);
if (length($code) > $MaxHuffmanBits) {
confess "[!] Something went wrong: length of code `$code` is > $MaxHuffmanBits.";
}
if (exists($tree->{$code})) {
my $sym = $tree->{$code};
if ($sym == $eob) { # end of block marker
$VERBOSE && say STDERR "EOB detected: $sym";
last;
}
push @zrle, $sym;
$code = '';
if (--$decoded <= 0) {
if (++$sel_idx <= $#$sels) {
$tree = $huffman_trees[$sels->[$sel_idx]];
}
else {
confess "No more selectors"; # should not happen
}
$decoded = 50;
}
}
}
my @mtf = reverse @{zrle_decode([reverse @zrle])};
my $bwt = symbols2string mtf_decode(\@mtf, \@alphabet);
my $rle4 = string2symbols bwt_decode($bwt, $bwt_idx);
my $data = rle4_decode($rle4);
my $dec = symbols2string($data);
my $new_crc32 = oct('0b' . int2bits_lsb(crc32(pack('b*', unpack('B*', $dec))), 32));
$VERBOSE && say STDERR "Computed CRC32: $new_crc32";
if ($crc32 != $new_crc32) {
confess "CRC32 error: $crc32 (stored) != $new_crc32 (actual)";
}
$decompressed .= $dec;
}
elsif ($block_magic eq "\27rE8P\x90") { # BlockFooter
$VERBOSE && say STDERR "Block footer detected";
my $stored_stream_crc32 = bits2int($fh, 32, \$buffer);
$VERBOSE && say STDERR "Stream CRC: $stored_stream_crc32";
if ($stored_stream_crc32 != $stream_crc32) {
confess "Stream CRC32 error: $stored_stream_crc32 (stored) != $stream_crc32 (actual)";
}
$buffer = '';
last;
}
else {
confess "Unknown block magic: $block_magic";
}
}
$VERBOSE && say STDERR "End of container";
}
return $decompressed;
}
########################################
# GZIP compressor
########################################
sub _code_length_encoding ($dict) {
my @lengths;
foreach my $symbol (0 .. max(keys %$dict) // 0) {
if (exists($dict->{$symbol})) {
push @lengths, length($dict->{$symbol});
}
else {
push @lengths, 0;
}
}
lib/Compression/Util.pm view on Meta::CPAN
if ($block ne '') {
$compressed .= int2bytes_lsb(length($block), 4);
$compressed .= $block;
}
}
$compressed .= int2bytes_lsb(0x00000000, 4); # EndMark
return $compressed;
}
###############################
# LZ4 decompressor
###############################
sub lz4_decompress($fh) {
if (ref($fh) eq '') {
open(my $fh2, '<:raw', \$fh) or confess "error: $!";
return __SUB__->($fh2);
}
my $decompressed = '';
while (!eof($fh)) {
bytes2int_lsb($fh, 4) == 0x184D2204 or confess "Incorrect LZ4 Frame magic number";
my $FLG = ord(getc($fh));
my $BD = ord(getc($fh));
my $version = $FLG & 0b11_00_00_00;
my $B_indep = $FLG & 0b00_10_00_00;
my $B_checksum = $FLG & 0b00_01_00_00;
my $C_size = $FLG & 0b00_00_10_00;
my $C_checksum = $FLG & 0b00_00_01_00;
my $DictID = $FLG & 0b00_00_00_01;
my $Block_MaxSize = $BD & 0b0_111_0000;
$VERBOSE && say STDERR "Maximum block size: $Block_MaxSize";
if ($version != 0b01_00_00_00) {
confess "Error: Invalid version number";
}
if ($C_size) {
my $content_size = bytes2int_lsb($fh, 8);
$VERBOSE && say STDERR "Content size: ", $content_size;
}
if ($DictID) {
my $dict_id = bytes2int_lsb($fh, 4);
$VERBOSE && say STDERR "Dictionary ID: ", $dict_id;
}
my $header_checksum = ord(getc($fh));
# TODO: compute and verify the header checksum
$VERBOSE && say STDERR "Header checksum: ", $header_checksum;
my $decoded = '';
while (!eof($fh)) {
my $block_size = bytes2int_lsb($fh, 4);
if ($block_size == 0x00000000) { # signifies an EndMark
$VERBOSE && say STDERR "Block size == 0";
last;
}
$VERBOSE && say STDERR "Block size: $block_size";
if ($block_size >> 31) {
$VERBOSE && say STDERR "Highest bit set: ", $block_size;
$block_size &= ((1 << 31) - 1);
$VERBOSE && say STDERR "Block size: ", $block_size;
my $uncompressed = '';
read($fh, $uncompressed, $block_size);
$decoded .= $uncompressed;
}
else {
my $compressed = '';
read($fh, $compressed, $block_size);
while ($compressed ne '') {
my $len_byte = ord(substr($compressed, 0, 1, ''));
my $literals_length = $len_byte >> 4;
my $match_len = $len_byte & 0b1111;
## say STDERR "Literal: ", $literals_length;
## say STDERR "Match len: ", $match_len;
if ($literals_length == 15) {
while (1) {
my $byte_len = ord(substr($compressed, 0, 1, ''));
$literals_length += $byte_len;
last if $byte_len != 255;
}
}
## say STDERR "Total literals length: ", $literals_length;
my $literals = '';
if ($literals_length > 0) {
$literals = substr($compressed, 0, $literals_length, '');
}
if ($compressed eq '') { # end of block
$decoded .= $literals;
last;
}
my $offset = oct('0b' . reverse unpack('b16', substr($compressed, 0, 2, '')));
if ($offset == 0) {
confess "Corrupted block";
}
## say STDERR "Offset: $offset";
if ($match_len == 15) {
while (1) {
my $byte_len = ord(substr($compressed, 0, 1, ''));
$match_len += $byte_len;
last if $byte_len != 255;
}
}
$decoded .= $literals;
$match_len += 4;
## say STDERR "Total match len: $match_len\n";
if ($offset >= $match_len) { # non-overlapping matches
$decoded .= substr($decoded, length($decoded) - $offset, $match_len);
}
elsif ($offset == 1) {
$decoded .= substr($decoded, -1) x $match_len;
}
else { # overlapping matches
foreach my $i (1 .. $match_len) {
$decoded .= substr($decoded, length($decoded) - $offset, 1);
}
}
}
}
if ($B_checksum) {
my $content_checksum = bytes2int_lsb($fh, 4);
$VERBOSE && say STDERR "Block checksum: $content_checksum";
}
if ($B_indep) { # blocks are independent of each other
$decompressed .= $decoded;
$decoded = '';
}
elsif (length($decoded) > 2**16) { # blocks are dependent
$decompressed .= substr($decoded, 0, -(2**16), '');
}
}
# TODO: compute and verify checksum
if ($C_checksum) {
my $content_checksum = bytes2int_lsb($fh, 4);
$VERBOSE && say STDERR "Content checksum: $content_checksum";
}
$decompressed .= $decoded;
}
return $decompressed;
}
1;
__END__
=encoding utf-8
=head1 NAME
Compression::Util - Implementation of various techniques used in data compression.
=head1 SYNOPSIS
use 5.036;
use Getopt::Std qw(getopts);
use Compression::Util qw(:all);
use constant {CHUNK_SIZE => 1 << 17};
local $Compression::Util::VERBOSE = 0;
getopts('d', \my %opts);
sub compress ($fh, $out_fh) {
while (read($fh, (my $chunk), CHUNK_SIZE)) {
print $out_fh bwt_compress($chunk);
}
}
sub decompress ($fh, $out_fh) {
while (!eof($fh)) {
print $out_fh bwt_decompress($fh);
}
}
$opts{d} ? decompress(\*STDIN, \*STDOUT) : compress(\*STDIN, \*STDOUT);
=head1 DESCRIPTION
B<Compression::Util> is a function-based module, implementing various techniques used in data compression, such as:
* Burrows-Wheeler transform
* Move-to-front transform
* Huffman Coding
* Arithmetic Coding (in fixed bits)
* Run-length encoding
* Fibonacci coding
* Elias gamma/omega coding
* Delta coding
* BWT-based (de)compression
* LZ77/LZSS (de)compression
* LZW (de)compression
* Bzip2 (de)compression
* GZIP (de)compression
* ZLIB (de)compression
* LZ4 (de)compression
lib/Compression/Util.pm view on Meta::CPAN
The function returns a 2D-array, with pairs: C<[symbol, run_length]>, such that the following code reconstructs the C<\@symbols> array:
my @symbols = map { ($_->[0]) x $_->[1] } @$rl;
By default, the maximum run-length is unlimited.
=head2 rle4_encode
my $rle4 = rle4_encode($string);
my $rle4 = rle4_encode(\@symbols);
my $rle4 = rle4_encode(\@symbols, $max_run);
Performs Run-Length Encoding (RLE) on a sequence of symbolic elements, specifically designed for runs of four or more consecutive symbols.
It takes two parameters: C<\@symbols>, representing an array of symbols, and C<$max_run>, indicating the maximum run length allowed during encoding.
The function returns the encoded RLE sequence as an array-ref of symbols.
By default, the maximum run-length is limited to C<255>.
=head2 rle4_decode
my $symbols = rle4_decode(\@rle4);
my $symbols = rle4_decode($rle4_string);
Inverse of C<rle4_encode()>.
=head2 zrle_encode
my $zrle = zrle_encode(\@symbols);
Performs Zero-Run-Length Encoding (ZRLE) on a sequence of symbolic elements, returning the encoded ZRLE sequence as an array-ref of symbols.
This function efficiently encodes runs of zeros, but also increments each symbol by C<1>.
=head2 zrle_decode
my $symbols = zrle_decode($zrle);
Inverse of C<zrle_encode()>.
=head2 ac_encode
my ($bitstring, $freq) = ac_encode(\@symbols);
Performs Arithmetic Coding on the provided symbols.
It takes a single parameter, C<\@symbols>, representing the symbols to be encoded.
The function returns two values: C<$bitstring>, which is a string of 1s and 0s, and C<$freq>, representing the frequency table used for encoding.
=head2 ac_decode
my $symbols = ac_decode($bits_fh, \%freq);
my $symbols = ac_decode($bitstring, \%freq);
Performs Arithmetic Coding decoding using the provided frequency table and a string of 1s and 0s. Inverse of C<ac_encode()>.
It takes two parameters: C<$bitstring>, representing a string of 1s and 0s containing the arithmetic coded data, and C<\%freq>, representing the frequency table used for encoding.
The function returns the decoded sequence of symbols.
=head2 adaptive_ac_encode
my ($bitstring, $alphabet) = adaptive_ac_encode(\@symbols);
Performs Adaptive Arithmetic Coding on the provided symbols.
It takes a single parameter, C<\@symbols>, representing the symbols to be encoded.
The function returns two values: C<$bitstring>, which is a string of 1s and 0s, and C<$alphabet>, which is an array-ref of distinct sorted symbols.
=head2 adaptive_ac_decode
my $symbols = adaptive_ac_decode($bits_fh, \@alphabet);
my $symbols = adaptive_ac_decode($bitstring, \@alphabet);
Performs Adaptive Arithmetic Coding decoding using the provided frequency table and a string of 1s and 0s.
It takes two parameters: C<$bitstring>, representing a string of 1s and 0s containing the adaptive arithmetic coded data, and C<\@alphabet>, representing the array of distinct sorted symbols that appear in the encoded data.
The function returns the decoded sequence of symbols.
=head2 lzw_encode
my $symbols = lzw_encode($string);
Performs Lempel-Ziv-Welch (LZW) encoding on the provided string.
It takes a single parameter, C<$string>, representing the data to be encoded.
The function returns an array-ref of symbols.
=head2 lzw_decode
my $string = lzw_decode(\@symbols);
Performs Lempel-Ziv-Welch (LZW) decoding on the provided symbols. Inverse of C<lzw_encode()>.
The function returns the decoded string.
=head1 INTERFACE FOR LOW-LEVEL FUNCTIONS
=head2 crc32
my $int32 = crc32($data);
my $int32 = crc32($data, $prev_crc32);
Compute the CRC32 checksum of a given string.
=head2 adler32
my $int32 = adler32($data);
my $int32 = adler32($data, $prev_adler32);
Compute the Adler32 checksum of a given string.
=head2 read_bit
my $bit = read_bit($fh, \$buffer);
Reads a single bit from a file-handle C<$fh> (MSB order).
The function stores the extra bits inside the C<$buffer>, reading one character at a time from the file-handle.
=head2 read_bit_lsb
my $bit = read_bit_lsb($fh, \$buffer);
Reads a single bit from a file-handle C<$fh> (LSB order).
The function stores the extra bits inside the C<$buffer>, reading one character at a time from the file-handle.
=head2 read_bits
my $bitstring = read_bits($fh, $bits_len);
Reads a specified number of bits (C<$bits_len>) from a file-handle (C<$fh>) and returns them as a string, in MSB order.
=head2 read_bits_lsb
my $bitstring = read_bits_lsb($fh, $bits_len);
Reads a specified number of bits (C<$bits_len>) from a file-handle (C<$fh>) and returns them as a string, in LSB order.
=head2 int2bits
my $bitstring = int2bits($symbol, $size)
Convert a non-negative integer to a bitstring of width C<$size>, in MSB order.
=head2 int2bits_lsb
my $bitstring = int2bits_lsb($symbol, $size)
Convert a non-negative integer to a bitstring of width C<$size>, in LSB order.
=head2 int2bytes
my $string = int2bytes($symbol, $size);
lib/Compression/Util.pm view on Meta::CPAN
Convert a non-negative integer to a byte-string of width C<$size>, in LSB order.
=head2 bits2int
my $integer = bits2int($fh, $size, \$buffer);
Read C<$size> bits from a file-handle C<$fh> and convert them to an integer, in MSB order. Inverse of C<int2bits()>.
The function stores the extra bits inside the C<$buffer>, reading one character at a time from the file-handle.
=head2 bits2int_lsb
my $integer = bits2int_lsb($fh, $size, \$buffer);
Read C<$size> bits from a file-handle C<$fh> and convert them to an integer, in LSB order. Inverse of C<int2bits_lsb()>.
The function stores the extra bits inside the C<$buffer>, reading one character at a time from the file-handle.
=head2 bytes2int
my $integer = bytes2int($fh, $n);
my $integer = bytes2int($str, $n);
Read C<$n> bytes from a file-handle C<$fh> or from a string C<$str> and convert them to an integer, in MSB order.
=head2 bytes2int_lsb
my $integer = bytes2int_lsb($fh, $n);
my $integer = bytes2int_lsb($str, $n);
Read C<$n> bytes from a file-handle C<$fh> or from a string C<$str> and convert them to an integer, in LSB order.
=head2 string2symbols
my $symbols = string2symbols($string)
Returns an array-ref of code points, given a string.
=head2 symbols2string
my $string = symbols2string(\@symbols)
Returns a string, given an array-ref of code points.
=head2 read_null_terminated
my $string = read_null_terminated($fh)
Read a string from file-handle C<$fh> that ends with a NULL character ("\0").
=head2 binary_vrl_encode
my $bitstring_enc = binary_vrl_encode($bitstring);
Given a string of 1s and 0s, returns back a bitstring of 1s and 0s encoded using variable run-length encoding.
=head2 binary_vrl_decode
my $bitstring = binary_vrl_decode($bitstring_enc);
Given an encoded bitstring, returned by C<binary_vrl_encode()>, gives back the decoded string of 1s and 0s.
=head2 bwt_sort
my $indices = bwt_sort($string);
my $indices = bwt_sort($string, $lookahead_len);
Low-level function that sorts the rotations of a given string using the Burrows-Wheeler Transform (BWT) algorithm.
It takes two parameters: C<$string>, which is the input string to be transformed, and C<$LOOKAHEAD_LEN> (optional), representing the length of look-ahead during sorting.
The function returns an array-ref of indices.
There is probably no need to call this function explicitly. Use C<bwt_encode()> instead!
=head2 bwt_sort_symbolic
my $indices = bwt_sort_symbolic(\@symbols);
Low-level function that sorts the rotations of a sequence of symbolic elements using the Burrows-Wheeler Transform (BWT) algorithm.
It takes a single parameter C<\@symbols>, which represents the input sequence of symbolic elements. The function returns an array of indices.
There is probably no need to call this function explicitly. Use C<bwt_encode_symbolic()> instead!
=head2 huffman_from_freq
my $dict = huffman_from_freq(\%freq);
my ($dict, $rev_dict) = huffman_from_freq(\%freq);
Low-level function that constructs Huffman prefix codes, based on the frequency of symbols provided in a hash table.
It takes a single parameter, C<\%freq>, representing the hash table where keys are symbols, and values are their corresponding frequencies.
The function returns two values: C<$dict>, which is the mapping of symbols to Huffman codes, and C<$rev_dict>, which holds the reverse mapping of Huffman codes to symbols.
The prefix codes are in canonical form, as defined in RFC 1951 (Section 3.2.2).
=head2 huffman_from_symbols
my $dict = huffman_from_symbols(\@symbols);
my ($dict, $rev_dict) = huffman_from_symbols(\@symbols);
Low-level function that constructs Huffman prefix codes, given an array-ref of symbols.
It takes a single parameter, C<\@symbols>, from which it computes the frequency of each symbol and generates the corresponding Huffman prefix codes.
The function returns two values: C<$dict>, which is the mapping of symbols to Huffman codes, and C<$rev_dict>, which holds the reverse mapping of Huffman codes to symbols.
The prefix codes are in canonical form, as defined in RFC 1951 (Section 3.2.2).
=head2 huffman_from_code_lengths
my $dict = huffman_from_code_lengths(\@code_lengths);
my ($dict, $rev_dict) = huffman_from_code_lengths(\@code_lengths);
my $dict = huffman_from_code_lengths(\%code_lengths);
my ($dict, $rev_dict) = huffman_from_code_lengths(\%code_lengths);
Low-level function that constructs a dictionary of canonical prefix codes as defined in RFC 1951 (Section 3.2.2), given an array-ref of code lengths or a hash-ref of (symbol => length) values,
It takes a single parameter, C<\@code_lengths>, where entry C<$i> in the array corresponds to the code length for symbol C<$i>.
Similarily, when a hash-ref table is given, C<\%code_lengths>, keys are the symbols and values are the code lengths. This variant is useful for large symbols.
In list context, the function returns two values: C<$dict>, which is the mapping of symbols to Huffman codes, and C<$rev_dict>, which holds the reverse mapping of Huffman codes to symbols.
In scalar context, it returns only the C<$dict> table.
=head2 huffman_encode
my $bitstring = huffman_encode(\@symbols, $dict);
Low-level function that performs Huffman encoding on a sequence of symbols using a provided dictionary, returned by C<huffman_from_freq()>.
It takes two parameters: C<\@symbols>, representing the sequence of symbols to be encoded, and C<$dict>, representing the Huffman dictionary mapping symbols to their corresponding Huffman codes.
The function returns a concatenated string of 1s and 0s, representing the Huffman-encoded sequence of symbols.
=head2 huffman_decode
my $symbols = huffman_decode($bitstring, $rev_dict);
Low-level function that decodes a Huffman-encoded binary string into a sequence of symbols using a provided reverse dictionary.
It takes two parameters: C<$bitstring>, representing the Huffman-encoded string of 1s and 0s, as returned by C<huffman_encode()>, and C<$rev_dict>, representing the reverse dictionary mapping Huffman codes to their corresponding symbols.
The function returns the decoded sequence of symbols as an array-ref.
=head2 lz77_encode / lz77_encode_symbolic
my ($literals, $distances, $lengths, $matches) = lz77_encode($string);
my ($literals, $distances, $lengths, $matches) = lz77_encode(\@symbols);
Low-level function that combines LZSS with ideas from the LZ4 method.
The function returns four values:
$literals # array-ref of uncompressed symbols
$distances # array-ref of back-reference distances
$lengths # array-ref of literal lengths
$matches # array-ref of match lengths
The output can be decoded with C<lz77_decode()> and C<lz77_decode_symbolic()>, respectively.
=head2 lz77_decode / lz77_decode_symbolic
my $string = lz77_decode(\@literals, \@distances, \@lengths, \@matches);
my $symbols = lz77_decode_symbolic(\@literals, \@distances, \@lengths, \@matches);
Low-level function that performs decoding using the provided literals, distances, lengths and matches, returned by LZ77 encoding.
Inverse of C<lz77_encode()> and C<lz77_encode_symbolic()>, respectively.
=head2 lzss_encode / lzss_encode_fast / lzss_encode_symbolic / lzss_encode_fast_symbolic
# Standard version
my ($literals, $distances, $lengths) = lzss_encode($data, %params);
my ($literals, $distances, $lengths) = lzss_encode(\@symbols, %params);
# Faster version
my ($literals, $distances, $lengths) = lzss_encode_fast($data, %params);
my ($literals, $distances, $lengths) = lzss_encode_fast(\@symbols, %params);
Low-level function that applies the LZSS (Lempel-Ziv-Storer-Szymanski) algorithm on the provided data.
The accepted C<%params> are:
min_len => $LZ_MIN_LEN,
max_len => $LZ_MAX_LEN,
max_dist => $LZ_MAX_DIST,
max_chain_len => $LZ_MAX_CHAIN_LEN,
The function returns three values:
$literals # array-ref of uncompressed symbols
$distances # array-ref of back-reference distances
$lengths # array-ref of match lengths
The output can be decoded with C<lzss_decode()> and C<lzss_decode_symbolic()>, respectively.
=head2 lzss_decode / lzss_decode_symbolic
my $string = lzss_decode(\@literals, \@distances, \@lengths);
my $symbols = lzss_decode_symbolic(\@literals, \@distances, \@lengths);
Low-level function that decodes the LZSS encoding, using the provided literals, distances, and lengths of matched sub-strings.
Inverse of C<lzss_encode()> and C<lzss_encode_fast()>.
=head2 deflate_encode
# Returns a binary string
my $string = deflate_encode(\@literals, \@distances, \@lengths);
my $string = deflate_encode(\@literals, \@distances, \@lengths, \&create_ac_entry);
Low-level function that encodes the results returned by C<lzss_encode()> and C<lzss_encode_fast()>, using a DEFLATE-like approach, combined with Huffman coding.
=head2 deflate_decode
# Huffman decoding
my ($literals, $distances, $lengths) = deflate_decode($fh);
my ($literals, $distances, $lengths) = deflate_decode($string);
# Arithmetic decoding
my ($literals, $distances, $lengths) = deflate_decode($fh, \&decode_ac_entry);
my ($literals, $distances, $lengths) = deflate_decode($string, \&decode_ac_entry);
Inverse of C<deflate_encode()>.
=head2 make_deflate_tables
my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = make_deflate_tables($max_dist, $max_len);
Low-level function that returns a list of tables used in encoding the relative back-reference distances and lengths returned by C<lzss_encode()> and C<lzss_encode_fast()>.
When no arguments are provided:
$max_dist = $Compression::Util::LZ_MAX_DIST
$max_len = $Compression::Util::LZ_MAX_LEN
There is no need to call this function explicitly. Use C<deflate_encode()> instead!
=head2 find_deflate_index
my $index = find_deflate_index($value, $DISTANCE_SYMBOLS);
Low-level function that returns the index inside the DEFLATE tables for a given value.
=head2 deflate_create_block_type_0_header
my $bt0_header = deflate_create_block_type_0_header($chunk);
Creates the header for a DEFLATE block of type 0 (uncompressed), as a bitstring, without including the block code number C<00>.
The length of the C<$chunk> must not exceed C<2^16 - 1>.
To create a DEFLATE block of type 0, including the content, use:
my $block_type_0 = pack('b*', '00') . pack('b*', $bt0_header) . $chunk;
which can be recovered as:
open my $fh, '<:raw', \$block_type_0;
my ($buffer, $search_window) = ('', '');
my $chunk = deflate_extract_next_block($fh, \$buffer, \$search_window);
=head2 deflate_create_block_type_1
my $bitstring = deflate_create_block_type_1($literals, $distances, $lengths);
Creates a DEFLATE block of type 1 (fixed prefix-codes), as a bitstring, given the ARRAY-refs of literals, distances and lengths, returned by C<lzss_encode()>.
This type of block uses fixed prefix-codes and is pretty fast.
=head2 deflate_create_block_type_2
my $bitstring = deflate_create_block_type_1($literals, $distances, $lengths);
Creates a DEFLATE block of type 2 (dynamic prefix-codes), as a bitstring, given the ARRAY-refs of literals, distances and lengths, returned by C<lzss_encode()>.
This type of block uses dynamic prefix-codes (Huffman codes) and produces good compression ratio on most inputs.
=head2 deflate_extract_block_type_0
my $data = deflate_extract_block_type_0($fh, \$buffer, \$search_window);
Given an input filehandle, it extracts a DEFLATE block of type 0 (uncompressed).
my ($buffer, $search_window) = ('', '');
my $block_type = bits2int_lsb($fh, 2, \$buffer);
$block_type == 0 or die "Not a block of type 0";
my $decoded_chunk = deflate_extract_block_type_0($fh, \$buffer, \$search_window);
=head2 deflate_extract_block_type_1
my $data = deflate_extract_block_type_1($fh, \$buffer, \$search_window);
Given an input filehandle, a bitstring buffer and a search window, it extracts a DEFLATE block of type 1 (fixed prefix-codes).
my ($buffer, $search_window) = ('', '');
my $block_type = bits2int_lsb($fh, 2, \$buffer);
$block_type == 1 or die "Not a block of type 1";
my $decoded_chunk = deflate_extract_block_type_1($fh, \$buffer, \$search_window);
=head2 deflate_extract_block_type_2
my $data = deflate_extract_block_type_2($fh, \$buffer, \$search_window);
Given an input filehandle, a bitstring buffer and a search window, it extracts a DEFLATE block of type 2 (dynamic prefix-codes).
my ($buffer, $search_window) = ('', '');
my $block_type = bits2int_lsb($fh, 2, \$buffer);
$block_type == 2 or die "Not a block of type 2";
my $decoded_chunk = deflate_extract_block_type_2($fh, \$buffer, \$search_window);
=head2 deflate_extract_next_block
my $data = deflate_extract_next_block($fh, \$buffer, \$search_window);
Given an input filehandle, a bitstring buffer and a search window, it extracts the next DEFLATE block. The next two bits in the input file-handle (or in the bitstring buffer) must contain the block-type number.
=head1 EXPORT
Each function can be exported individually, as:
use Compression::Util qw(bwt_compress);
By specifying the B<:all> keyword, will export all the exportable functions:
use Compression::Util qw(:all);
Nothing is exported by default.
=head1 EXAMPLES
The functions can be combined in various ways, easily creating novel compression methods, as illustrated in the following examples.
=head2 Combining LZSS + MRL compression:
my $enc = lzss_compress($str, \&mrl_compress_symbolic);
my $dec = lzss_decompress($enc, \&mrl_decompress_symbolic);
=head2 Combining LZ77 + OBH encoding:
my $enc = lz77_compress($str, \&obh_encode);
my $dec = lz77_decompress($enc, \&obh_decode);
=head2 Combining LZSS + symbolic BWT compression:
my $enc = lzss_compress($str, \&bwt_compress_symbolic);
my $dec = lzss_decompress($enc, \&bwt_decompress_symbolic);
=head2 Combining BWT + symbolic LZSS:
my $enc = bwt_compress($str, \&lzss_compress_symbolic);
my $dec = bwt_decompress($enc, \&lzss_decompress_symbolic);
=head2 Combining LZW + Fibonacci encoding:
my $enc = lzw_compress($str, \&fibonacci_encode);
my $dec = lzw_decompress($enc, \&fibonacci_decode);
=head2 Combining BWT + symbolic LZ77 + symbolic MRL:
my $enc = bwt_compress($str, sub ($s) { lz77_compress_symbolic($s, \&mrl_compress_symbolic) });
my $dec = bwt_decompress($enc, sub ($s) { lz77_decompress_symbolic($s, \&mrl_decompress_symbolic) });
=head2 Combining LZ77 + BWT compression + Fibonacci encoding + Huffman coding + OBH encoding + MRL compression:
# Compression
my $enc = do {
my ($literals, $distances, $lengths, $matches) = lz77_encode($str);
bwt_compress(symbols2string($literals))
. fibonacci_encode($lengths)
( run in 1.111 second using v1.01-cache-2.11-cpan-df04353d9ac )