Compression-Util

 view release on metacpan or  search on metacpan

lib/Compression/Util.pm  view on Meta::CPAN

        }
    }

    return \@dec;
}

#####################
# Generic run-length
#####################

sub run_length ($arr, $max_run = undef) {

    @$arr || return [];

    my @result     = [$arr->[0], 1];
    my $prev_value = $arr->[0];

    foreach my $i (1 .. $#$arr) {

        my $curr_value = $arr->[$i];

        if ($curr_value == $prev_value and (defined($max_run) ? $result[-1][1] < $max_run : 1)) {
            ++$result[-1][1];
        }
        else {
            push(@result, [$curr_value, 1]);
        }

        $prev_value = $curr_value;
    }

    return \@result;
}

######################################
# Binary variable run-length encoding
######################################

sub binary_vrl_encode ($bitstring) {

    my @bits    = split(//, $bitstring);
    my $encoded = $bits[0];

    foreach my $rle (@{run_length(\@bits)}) {
        my ($c, $v) = @$rle;

        if ($v == 1) {
            $encoded .= '0';
        }
        else {
            my $t = sprintf('%b', $v - 1);
            $encoded .= join('', '1' x length($t), '0', substr($t, 1));
        }
    }

    return $encoded;
}

sub binary_vrl_decode ($bitstring) {

    my $decoded = '';
    my $bit     = substr($bitstring, 0, 1, '');

    while ($bitstring ne '') {

        $decoded .= $bit;

        my $bl = 0;
        while (substr($bitstring, 0, 1, '') eq '1') {
            ++$bl;
        }

        if ($bl > 0) {
            $decoded .= $bit x oct('0b1' . join('', map { substr($bitstring, 0, 1, '') } 1 .. $bl - 1));
        }

        $bit = ($bit eq '1' ? '0' : '1');
    }

    return $decoded;
}

############################
# Burrows-Wheeler transform
############################

sub bwt_sort ($s, $LOOKAHEAD_LEN = 128) {    # O(n * LOOKAHEAD_LEN) space (fast)
    my $len      = length($s);
    my $double_s = $s . $s;                  # Pre-compute doubled string

    # Schwartzian transform with optimized sorting
    return [
        map { $_->[1] }
        sort {
            ($a->[0] cmp $b->[0])
              || do {
                my ($cmp, $s_len) = (0, $LOOKAHEAD_LEN << 2);
                while (1) {
                    ($cmp = substr($double_s, $a->[1], $s_len) cmp substr($double_s, $b->[1], $s_len)) && last;
                    $s_len <<= 1;
                }
                $cmp;
            }
        }
        map {
            my $pos = $_;
            my $end = $pos + $LOOKAHEAD_LEN;

            # Handle wraparound efficiently
            my $t =
              ($end <= $len)
              ? substr($s,        $pos, $LOOKAHEAD_LEN)
              : substr($double_s, $pos, $LOOKAHEAD_LEN);

            [$t, $pos]
          } 0 .. $len - 1
    ];
}

sub bwt_encode ($s, $LOOKAHEAD_LEN = 128) {

    if (ref($s) ne '') {
        return bwt_encode_symbolic($s);
    }

    my $bwt = bwt_sort($s, $LOOKAHEAD_LEN);

    my $ret = '';
    my $idx = 0;

    my $i = 0;
    foreach my $pos (@$bwt) {
        $ret .= substr($s, $pos - 1, 1);
        $idx = $i if !$pos;
        ++$i;
    }

    return ($ret, $idx);
}

lib/Compression/Util.pm  view on Meta::CPAN

    if (ref($fh) eq '') {
        open(my $fh2, '<:raw', \$fh) or confess "error: $!";
        return __SUB__->($fh2);
    }

    my $buffer       = '';
    my $double       = read_bit($fh, \$buffer);
    my $with_rle     = read_bit($fh, \$buffer);
    my $has_negative = read_bit($fh, \$buffer);
    my $with_deltas  = read_bit($fh, \$buffer);
    my $with_rle4    = read_bit($fh, \$buffer);

    my @deltas;
    my $len = 0;

    for (my $k = 0 ; $k <= $len ; ++$k) {

        my $bit = read_bit($fh, \$buffer);

        if ($bit eq '0') {
            push @deltas, 0;
        }
        elsif ($double) {
            my $bit = $has_negative ? read_bit($fh, \$buffer) : 0;

            my $bl = $has_negative ? 0 : 1;
            ++$bl while (read_bit($fh, \$buffer) eq '1');

            my $bl2 = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl));
            my $int = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. ($bl2 - 1)));

            push @deltas, ($has_negative ? ($bit eq '1' ? 1 : -1) : 1) * ($int - 1);
        }
        else {
            my $bit = $has_negative ? read_bit($fh, \$buffer) : 0;
            my $n   = $has_negative ? 0                       : 1;
            ++$n while (read_bit($fh, \$buffer) eq '1');
            my $d = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $n));
            push @deltas, $has_negative ? ($bit eq '1' ? $d : -$d) : ($d - 1);
        }

        if ($with_rle) {

            my $bl = 0;
            while (read_bit($fh, \$buffer) == 1) {
                ++$bl;
            }

            if ($bl > 0) {
                my $run = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl)) - 1;
                $k += $run;
                push @deltas, ($deltas[-1]) x $run;
            }
        }

        if ($k == 0) {
            $len = pop(@deltas);
        }
    }

    my $decoded = \@deltas;
    $decoded = rle4_decode($decoded) if $with_rle4;
    $decoded = accumulate($decoded)  if $with_deltas;
    return $decoded;
}

################################
# Alphabet encoding (from Bzip2)
################################

sub encode_alphabet_256 ($alphabet) {

    my %table;
    @table{@$alphabet} = ();

    my $populated = 0;
    my @marked;

    for (my $i = 0 ; $i <= 255 ; $i += 16) {

        my $enc = 0;
        foreach my $j (0 .. 15) {
            if (exists($table{$i + $j})) {
                $enc |= 1 << $j;
            }
        }

        $populated <<= 1;

        if ($enc > 0) {
            $populated |= 1;
            push @marked, $enc;
        }
    }

    my $bitstring = join('', map { int2bits_lsb($_, 16) } @marked);

    $VERBOSE && say STDERR "Populated : ", sprintf('%016b', $populated);
    $VERBOSE && say STDERR "Marked    : @marked";
    $VERBOSE && say STDERR "Bits len  : ", length($bitstring);

    my $encoded = '';
    $encoded .= int2bytes($populated, 2);
    $encoded .= pack('B*', $bitstring);
    return $encoded;
}

sub decode_alphabet_256 ($fh) {

    if (ref($fh) eq '') {
        open(my $fh2, '<:raw', \$fh) or confess "error: $!";
        return __SUB__->($fh2);
    }

    my @alphabet;
    my $l1 = bytes2int($fh, 2);

    for my $i (0 .. 15) {
        if ($l1 & (0x8000 >> $i)) {
            my $l2 = bytes2int($fh, 2);
            for my $j (0 .. 15) {
                if ($l2 & (0x8000 >> $j)) {
                    push @alphabet, 16 * $i + $j;
                }

lib/Compression/Util.pm  view on Meta::CPAN

                $VERBOSE && say STDERR "Number or trees: $num_trees";

                my $num_sels = bits2int($fh, 15, \$buffer);
                $VERBOSE && say STDERR "Number of selectors: $num_sels";

                my @idxs;
                for (1 .. $num_sels) {
                    my $i = 0;
                    while (read_bit($fh, \$buffer)) {
                        $i += 1;
                        ($i < $num_trees) or confess "error";
                    }
                    push @idxs, $i;
                }

                my $sels = mtf_decode(\@idxs, [0 .. $num_trees - 1]);
                $VERBOSE && say STDERR "Selectors: (@$sels)";

                my $num_syms = scalar(@alphabet) + 2;

                my @trees;
                for (1 .. $num_trees) {
                    my @clens;
                    my $clen = bits2int($fh, 5, \$buffer);
                    for (1 .. $num_syms) {
                        while (1) {

                            ($clen > 0 and $clen <= $MaxHuffmanBits) or confess "invalid code length: $clen";

                            if (not read_bit($fh, \$buffer)) {
                                last;
                            }

                            $clen -= read_bit($fh, \$buffer) ? 1 : -1;
                        }

                        push @clens, $clen;
                    }
                    push @trees, \@clens;
                    $VERBOSE && say STDERR "Code lengths: (@clens)";
                }

                foreach my $tree (@trees) {
                    my $maxLen = max(@$tree);
                    my $sum    = 1 << $maxLen;
                    for my $clen (@$tree) {
                        $sum -= (1 << $maxLen) >> $clen;
                    }
                    $sum == 0 or confess "incomplete tree not supported: (@$tree)";
                }

                my @huffman_trees = map { (huffman_from_code_lengths($_))[1] } @trees;

                my $eob = @alphabet + 1;

                my @zrle;
                my $code = '';

                my $sel_idx = 0;
                my $tree    = $huffman_trees[$sels->[$sel_idx]];
                my $decoded = 50;

                while (!eof($fh)) {
                    $code .= read_bit($fh, \$buffer);

                    if (length($code) > $MaxHuffmanBits) {
                        confess "[!] Something went wrong: length of code `$code` is > $MaxHuffmanBits.";
                    }

                    if (exists($tree->{$code})) {

                        my $sym = $tree->{$code};

                        if ($sym == $eob) {    # end of block marker
                            $VERBOSE && say STDERR "EOB detected: $sym";
                            last;
                        }

                        push @zrle, $sym;
                        $code = '';

                        if (--$decoded <= 0) {
                            if (++$sel_idx <= $#$sels) {
                                $tree = $huffman_trees[$sels->[$sel_idx]];
                            }
                            else {
                                confess "No more selectors";    # should not happen
                            }
                            $decoded = 50;
                        }
                    }
                }

                my @mtf = reverse @{zrle_decode([reverse @zrle])};
                my $bwt = symbols2string mtf_decode(\@mtf, \@alphabet);

                my $rle4 = string2symbols bwt_decode($bwt, $bwt_idx);
                my $data = rle4_decode($rle4);
                my $dec  = symbols2string($data);

                my $new_crc32 = oct('0b' . int2bits_lsb(crc32(pack('b*', unpack('B*', $dec))), 32));

                $VERBOSE && say STDERR "Computed CRC32: $new_crc32";

                if ($crc32 != $new_crc32) {
                    confess "CRC32 error: $crc32 (stored) != $new_crc32 (actual)";
                }

                $decompressed .= $dec;
            }
            elsif ($block_magic eq "\27rE8P\x90") {    # BlockFooter
                $VERBOSE && say STDERR "Block footer detected";
                my $stored_stream_crc32 = bits2int($fh, 32, \$buffer);
                $VERBOSE && say STDERR "Stream CRC: $stored_stream_crc32";

                if ($stored_stream_crc32 != $stream_crc32) {
                    confess "Stream CRC32 error: $stored_stream_crc32 (stored) != $stream_crc32 (actual)";
                }

                $buffer = '';
                last;
            }
            else {
                confess "Unknown block magic: $block_magic";
            }
        }

        $VERBOSE && say STDERR "End of container";
    }

    return $decompressed;
}

########################################
# GZIP compressor
########################################

sub _code_length_encoding ($dict) {

    my @lengths;

    foreach my $symbol (0 .. max(keys %$dict) // 0) {
        if (exists($dict->{$symbol})) {
            push @lengths, length($dict->{$symbol});
        }
        else {
            push @lengths, 0;
        }
    }

lib/Compression/Util.pm  view on Meta::CPAN

        if ($block ne '') {
            $compressed .= int2bytes_lsb(length($block), 4);
            $compressed .= $block;
        }
    }

    $compressed .= int2bytes_lsb(0x00000000, 4);    # EndMark
    return $compressed;
}

###############################
# LZ4 decompressor
###############################

sub lz4_decompress($fh) {

    if (ref($fh) eq '') {
        open(my $fh2, '<:raw', \$fh) or confess "error: $!";
        return __SUB__->($fh2);
    }

    my $decompressed = '';

    while (!eof($fh)) {

        bytes2int_lsb($fh, 4) == 0x184D2204 or confess "Incorrect LZ4 Frame magic number";

        my $FLG = ord(getc($fh));
        my $BD  = ord(getc($fh));

        my $version    = $FLG & 0b11_00_00_00;
        my $B_indep    = $FLG & 0b00_10_00_00;
        my $B_checksum = $FLG & 0b00_01_00_00;
        my $C_size     = $FLG & 0b00_00_10_00;
        my $C_checksum = $FLG & 0b00_00_01_00;
        my $DictID     = $FLG & 0b00_00_00_01;

        my $Block_MaxSize = $BD & 0b0_111_0000;

        $VERBOSE && say STDERR "Maximum block size: $Block_MaxSize";

        if ($version != 0b01_00_00_00) {
            confess "Error: Invalid version number";
        }

        if ($C_size) {
            my $content_size = bytes2int_lsb($fh, 8);
            $VERBOSE && say STDERR "Content size: ", $content_size;
        }

        if ($DictID) {
            my $dict_id = bytes2int_lsb($fh, 4);
            $VERBOSE && say STDERR "Dictionary ID: ", $dict_id;
        }

        my $header_checksum = ord(getc($fh));

        # TODO: compute and verify the header checksum
        $VERBOSE && say STDERR "Header checksum: ", $header_checksum;

        my $decoded = '';

        while (!eof($fh)) {

            my $block_size = bytes2int_lsb($fh, 4);

            if ($block_size == 0x00000000) {    # signifies an EndMark
                $VERBOSE && say STDERR "Block size == 0";
                last;
            }

            $VERBOSE && say STDERR "Block size: $block_size";

            if ($block_size >> 31) {
                $VERBOSE && say STDERR "Highest bit set: ", $block_size;
                $block_size &= ((1 << 31) - 1);
                $VERBOSE && say STDERR "Block size: ", $block_size;
                my $uncompressed = '';
                read($fh, $uncompressed, $block_size);
                $decoded .= $uncompressed;
            }
            else {

                my $compressed = '';
                read($fh, $compressed, $block_size);

                while ($compressed ne '') {
                    my $len_byte = ord(substr($compressed, 0, 1, ''));

                    my $literals_length = $len_byte >> 4;
                    my $match_len       = $len_byte & 0b1111;

                    ## say STDERR "Literal: ",   $literals_length;
                    ## say STDERR "Match len: ", $match_len;

                    if ($literals_length == 15) {
                        while (1) {
                            my $byte_len = ord(substr($compressed, 0, 1, ''));
                            $literals_length += $byte_len;
                            last if $byte_len != 255;
                        }
                    }

                    ## say STDERR "Total literals length: ", $literals_length;

                    my $literals = '';

                    if ($literals_length > 0) {
                        $literals = substr($compressed, 0, $literals_length, '');
                    }

                    if ($compressed eq '') {    # end of block
                        $decoded .= $literals;
                        last;
                    }

                    my $offset = oct('0b' . reverse unpack('b16', substr($compressed, 0, 2, '')));

                    if ($offset == 0) {
                        confess "Corrupted block";
                    }

                    ## say STDERR "Offset: $offset";

                    if ($match_len == 15) {
                        while (1) {
                            my $byte_len = ord(substr($compressed, 0, 1, ''));
                            $match_len += $byte_len;
                            last if $byte_len != 255;
                        }
                    }

                    $decoded .= $literals;
                    $match_len += 4;

                    ## say STDERR "Total match len: $match_len\n";

                    if ($offset >= $match_len) {    # non-overlapping matches
                        $decoded .= substr($decoded, length($decoded) - $offset, $match_len);
                    }
                    elsif ($offset == 1) {
                        $decoded .= substr($decoded, -1) x $match_len;
                    }
                    else {                          # overlapping matches
                        foreach my $i (1 .. $match_len) {
                            $decoded .= substr($decoded, length($decoded) - $offset, 1);
                        }
                    }
                }
            }

            if ($B_checksum) {
                my $content_checksum = bytes2int_lsb($fh, 4);
                $VERBOSE && say STDERR "Block checksum: $content_checksum";
            }

            if ($B_indep) {    # blocks are independent of each other
                $decompressed .= $decoded;
                $decoded = '';
            }
            elsif (length($decoded) > 2**16) {    # blocks are dependent
                $decompressed .= substr($decoded, 0, -(2**16), '');
            }
        }

        # TODO: compute and verify checksum
        if ($C_checksum) {
            my $content_checksum = bytes2int_lsb($fh, 4);
            $VERBOSE && say STDERR "Content checksum: $content_checksum";
        }

        $decompressed .= $decoded;
    }

    return $decompressed;
}

1;

__END__

=encoding utf-8

=head1 NAME

Compression::Util - Implementation of various techniques used in data compression.

=head1 SYNOPSIS

    use 5.036;
    use Getopt::Std       qw(getopts);
    use Compression::Util qw(:all);

    use constant {CHUNK_SIZE => 1 << 17};

    local $Compression::Util::VERBOSE = 0;

    getopts('d', \my %opts);

    sub compress ($fh, $out_fh) {
        while (read($fh, (my $chunk), CHUNK_SIZE)) {
            print $out_fh bwt_compress($chunk);
        }
    }

    sub decompress ($fh, $out_fh) {
        while (!eof($fh)) {
            print $out_fh bwt_decompress($fh);
        }
    }

    $opts{d} ? decompress(\*STDIN, \*STDOUT) : compress(\*STDIN, \*STDOUT);

=head1 DESCRIPTION

B<Compression::Util> is a function-based module, implementing various techniques used in data compression, such as:

    * Burrows-Wheeler transform
    * Move-to-front transform
    * Huffman Coding
    * Arithmetic Coding (in fixed bits)
    * Run-length encoding
    * Fibonacci coding
    * Elias gamma/omega coding
    * Delta coding
    * BWT-based (de)compression
    * LZ77/LZSS (de)compression
    * LZW (de)compression
    * Bzip2 (de)compression
    * GZIP (de)compression
    * ZLIB (de)compression
    * LZ4 (de)compression

lib/Compression/Util.pm  view on Meta::CPAN

The function returns a 2D-array, with pairs: C<[symbol, run_length]>, such that the following code reconstructs the C<\@symbols> array:

    my @symbols = map { ($_->[0]) x $_->[1] } @$rl;

By default, the maximum run-length is unlimited.

=head2 rle4_encode

    my $rle4 = rle4_encode($string);
    my $rle4 = rle4_encode(\@symbols);
    my $rle4 = rle4_encode(\@symbols, $max_run);

Performs Run-Length Encoding (RLE) on a sequence of symbolic elements, specifically designed for runs of four or more consecutive symbols.

It takes two parameters: C<\@symbols>, representing an array of symbols, and C<$max_run>, indicating the maximum run length allowed during encoding.

The function returns the encoded RLE sequence as an array-ref of symbols.

By default, the maximum run-length is limited to C<255>.

=head2 rle4_decode

    my $symbols = rle4_decode(\@rle4);
    my $symbols = rle4_decode($rle4_string);

Inverse of C<rle4_encode()>.

=head2 zrle_encode

    my $zrle = zrle_encode(\@symbols);

Performs Zero-Run-Length Encoding (ZRLE) on a sequence of symbolic elements, returning the encoded ZRLE sequence as an array-ref of symbols.

This function efficiently encodes runs of zeros, but also increments each symbol by C<1>.

=head2 zrle_decode

    my $symbols = zrle_decode($zrle);

Inverse of C<zrle_encode()>.

=head2 ac_encode

    my ($bitstring, $freq) = ac_encode(\@symbols);

Performs Arithmetic Coding on the provided symbols.

It takes a single parameter, C<\@symbols>, representing the symbols to be encoded.

The function returns two values: C<$bitstring>, which is a string of 1s and 0s, and C<$freq>, representing the frequency table used for encoding.

=head2 ac_decode

    my $symbols = ac_decode($bits_fh, \%freq);
    my $symbols = ac_decode($bitstring, \%freq);

Performs Arithmetic Coding decoding using the provided frequency table and a string of 1s and 0s. Inverse of C<ac_encode()>.

It takes two parameters: C<$bitstring>, representing a string of 1s and 0s containing the arithmetic coded data, and C<\%freq>, representing the frequency table used for encoding.

The function returns the decoded sequence of symbols.

=head2 adaptive_ac_encode

    my ($bitstring, $alphabet) = adaptive_ac_encode(\@symbols);

Performs Adaptive Arithmetic Coding on the provided symbols.

It takes a single parameter, C<\@symbols>, representing the symbols to be encoded.

The function returns two values: C<$bitstring>, which is a string of 1s and 0s, and C<$alphabet>, which is an array-ref of distinct sorted symbols.

=head2 adaptive_ac_decode

    my $symbols = adaptive_ac_decode($bits_fh, \@alphabet);
    my $symbols = adaptive_ac_decode($bitstring, \@alphabet);

Performs Adaptive Arithmetic Coding decoding using the provided frequency table and a string of 1s and 0s.

It takes two parameters: C<$bitstring>, representing a string of 1s and 0s containing the adaptive arithmetic coded data, and C<\@alphabet>, representing the array of distinct sorted symbols that appear in the encoded data.

The function returns the decoded sequence of symbols.

=head2 lzw_encode

    my $symbols = lzw_encode($string);

Performs Lempel-Ziv-Welch (LZW) encoding on the provided string.

It takes a single parameter, C<$string>, representing the data to be encoded.

The function returns an array-ref of symbols.

=head2 lzw_decode

    my $string = lzw_decode(\@symbols);

Performs Lempel-Ziv-Welch (LZW) decoding on the provided symbols. Inverse of C<lzw_encode()>.

The function returns the decoded string.

=head1 INTERFACE FOR LOW-LEVEL FUNCTIONS

=head2 crc32

    my $int32 = crc32($data);
    my $int32 = crc32($data, $prev_crc32);

Compute the CRC32 checksum of a given string.

=head2 adler32

    my $int32 = adler32($data);
    my $int32 = adler32($data, $prev_adler32);

Compute the Adler32 checksum of a given string.

=head2 read_bit

    my $bit = read_bit($fh, \$buffer);

Reads a single bit from a file-handle C<$fh> (MSB order).

The function stores the extra bits inside the C<$buffer>, reading one character at a time from the file-handle.

=head2 read_bit_lsb

    my $bit = read_bit_lsb($fh, \$buffer);

Reads a single bit from a file-handle C<$fh> (LSB order).

The function stores the extra bits inside the C<$buffer>, reading one character at a time from the file-handle.

=head2 read_bits

    my $bitstring = read_bits($fh, $bits_len);

Reads a specified number of bits (C<$bits_len>) from a file-handle (C<$fh>) and returns them as a string, in MSB order.

=head2 read_bits_lsb

    my $bitstring = read_bits_lsb($fh, $bits_len);

Reads a specified number of bits (C<$bits_len>) from a file-handle (C<$fh>) and returns them as a string, in LSB order.

=head2 int2bits

    my $bitstring = int2bits($symbol, $size)

Convert a non-negative integer to a bitstring of width C<$size>, in MSB order.

=head2 int2bits_lsb

    my $bitstring = int2bits_lsb($symbol, $size)

Convert a non-negative integer to a bitstring of width C<$size>, in LSB order.

=head2 int2bytes

    my $string = int2bytes($symbol, $size);

lib/Compression/Util.pm  view on Meta::CPAN

Convert a non-negative integer to a byte-string of width C<$size>, in LSB order.

=head2 bits2int

    my $integer = bits2int($fh, $size, \$buffer);

Read C<$size> bits from a file-handle C<$fh> and convert them to an integer, in MSB order. Inverse of C<int2bits()>.

The function stores the extra bits inside the C<$buffer>, reading one character at a time from the file-handle.

=head2 bits2int_lsb

    my $integer = bits2int_lsb($fh, $size, \$buffer);

Read C<$size> bits from a file-handle C<$fh> and convert them to an integer, in LSB order. Inverse of C<int2bits_lsb()>.

The function stores the extra bits inside the C<$buffer>, reading one character at a time from the file-handle.

=head2 bytes2int

    my $integer = bytes2int($fh, $n);
    my $integer = bytes2int($str, $n);

Read C<$n> bytes from a file-handle C<$fh> or from a string C<$str> and convert them to an integer, in MSB order.

=head2 bytes2int_lsb

    my $integer = bytes2int_lsb($fh, $n);
    my $integer = bytes2int_lsb($str, $n);

Read C<$n> bytes from a file-handle C<$fh> or from a string C<$str> and convert them to an integer, in LSB order.

=head2 string2symbols

    my $symbols = string2symbols($string)

Returns an array-ref of code points, given a string.

=head2 symbols2string

    my $string = symbols2string(\@symbols)

Returns a string, given an array-ref of code points.

=head2 read_null_terminated

    my $string = read_null_terminated($fh)

Read a string from file-handle C<$fh> that ends with a NULL character ("\0").

=head2 binary_vrl_encode

    my $bitstring_enc = binary_vrl_encode($bitstring);

Given a string of 1s and 0s, returns back a bitstring of 1s and 0s encoded using variable run-length encoding.

=head2 binary_vrl_decode

    my $bitstring = binary_vrl_decode($bitstring_enc);

Given an encoded bitstring, returned by C<binary_vrl_encode()>, gives back the decoded string of 1s and 0s.

=head2 bwt_sort

    my $indices = bwt_sort($string);
    my $indices = bwt_sort($string, $lookahead_len);

Low-level function that sorts the rotations of a given string using the Burrows-Wheeler Transform (BWT) algorithm.

It takes two parameters: C<$string>, which is the input string to be transformed, and C<$LOOKAHEAD_LEN> (optional), representing the length of look-ahead during sorting.

The function returns an array-ref of indices.

There is probably no need to call this function explicitly. Use C<bwt_encode()> instead!

=head2 bwt_sort_symbolic

    my $indices = bwt_sort_symbolic(\@symbols);

Low-level function that sorts the rotations of a sequence of symbolic elements using the Burrows-Wheeler Transform (BWT) algorithm.

It takes a single parameter C<\@symbols>, which represents the input sequence of symbolic elements. The function returns an array of indices.

There is probably no need to call this function explicitly. Use C<bwt_encode_symbolic()> instead!

=head2 huffman_from_freq

    my $dict = huffman_from_freq(\%freq);
    my ($dict, $rev_dict) = huffman_from_freq(\%freq);

Low-level function that constructs Huffman prefix codes, based on the frequency of symbols provided in a hash table.

It takes a single parameter, C<\%freq>, representing the hash table where keys are symbols, and values are their corresponding frequencies.

The function returns two values: C<$dict>, which is the mapping of symbols to Huffman codes, and C<$rev_dict>, which holds the reverse mapping of Huffman codes to symbols.

The prefix codes are in canonical form, as defined in RFC 1951 (Section 3.2.2).

=head2 huffman_from_symbols

    my $dict = huffman_from_symbols(\@symbols);
    my ($dict, $rev_dict) = huffman_from_symbols(\@symbols);

Low-level function that constructs Huffman prefix codes, given an array-ref of symbols.

It takes a single parameter, C<\@symbols>, from which it computes the frequency of each symbol and generates the corresponding Huffman prefix codes.

The function returns two values: C<$dict>, which is the mapping of symbols to Huffman codes, and C<$rev_dict>, which holds the reverse mapping of Huffman codes to symbols.

The prefix codes are in canonical form, as defined in RFC 1951 (Section 3.2.2).

=head2 huffman_from_code_lengths

    my $dict = huffman_from_code_lengths(\@code_lengths);
    my ($dict, $rev_dict) = huffman_from_code_lengths(\@code_lengths);

    my $dict = huffman_from_code_lengths(\%code_lengths);
    my ($dict, $rev_dict) = huffman_from_code_lengths(\%code_lengths);

Low-level function that constructs a dictionary of canonical prefix codes as defined in RFC 1951 (Section 3.2.2), given an array-ref of code lengths or a hash-ref of (symbol => length) values,

It takes a single parameter, C<\@code_lengths>, where entry C<$i> in the array corresponds to the code length for symbol C<$i>.

Similarily, when a hash-ref table is given, C<\%code_lengths>, keys are the symbols and values are the code lengths. This variant is useful for large symbols.

In list context, the function returns two values: C<$dict>, which is the mapping of symbols to Huffman codes, and C<$rev_dict>, which holds the reverse mapping of Huffman codes to symbols.

In scalar context, it returns only the C<$dict> table.

=head2 huffman_encode

    my $bitstring = huffman_encode(\@symbols, $dict);

Low-level function that performs Huffman encoding on a sequence of symbols using a provided dictionary, returned by C<huffman_from_freq()>.

It takes two parameters: C<\@symbols>, representing the sequence of symbols to be encoded, and C<$dict>, representing the Huffman dictionary mapping symbols to their corresponding Huffman codes.

The function returns a concatenated string of 1s and 0s, representing the Huffman-encoded sequence of symbols.

=head2 huffman_decode

    my $symbols = huffman_decode($bitstring, $rev_dict);

Low-level function that decodes a Huffman-encoded binary string into a sequence of symbols using a provided reverse dictionary.

It takes two parameters: C<$bitstring>, representing the Huffman-encoded string of 1s and 0s, as returned by C<huffman_encode()>, and C<$rev_dict>, representing the reverse dictionary mapping Huffman codes to their corresponding symbols.

The function returns the decoded sequence of symbols as an array-ref.

=head2 lz77_encode / lz77_encode_symbolic

    my ($literals, $distances, $lengths, $matches) = lz77_encode($string);
    my ($literals, $distances, $lengths, $matches) = lz77_encode(\@symbols);

Low-level function that combines LZSS with ideas from the LZ4 method.

The function returns four values:

    $literals   # array-ref of uncompressed symbols
    $distances  # array-ref of back-reference distances
    $lengths    # array-ref of literal lengths
    $matches    # array-ref of match lengths

The output can be decoded with C<lz77_decode()> and C<lz77_decode_symbolic()>, respectively.

=head2 lz77_decode / lz77_decode_symbolic

    my $string  = lz77_decode(\@literals, \@distances, \@lengths, \@matches);
    my $symbols = lz77_decode_symbolic(\@literals, \@distances, \@lengths, \@matches);

Low-level function that performs decoding using the provided literals, distances, lengths and matches, returned by LZ77 encoding.

Inverse of C<lz77_encode()> and C<lz77_encode_symbolic()>, respectively.

=head2 lzss_encode / lzss_encode_fast / lzss_encode_symbolic / lzss_encode_fast_symbolic

    # Standard version
    my ($literals, $distances, $lengths) = lzss_encode($data, %params);
    my ($literals, $distances, $lengths) = lzss_encode(\@symbols, %params);

    # Faster version
    my ($literals, $distances, $lengths) = lzss_encode_fast($data, %params);
    my ($literals, $distances, $lengths) = lzss_encode_fast(\@symbols, %params);

Low-level function that applies the LZSS (Lempel-Ziv-Storer-Szymanski) algorithm on the provided data.

The accepted C<%params> are:

    min_len         => $LZ_MIN_LEN,
    max_len         => $LZ_MAX_LEN,
    max_dist        => $LZ_MAX_DIST,
    max_chain_len   => $LZ_MAX_CHAIN_LEN,

The function returns three values:

    $literals   # array-ref of uncompressed symbols
    $distances  # array-ref of back-reference distances
    $lengths    # array-ref of match lengths

The output can be decoded with C<lzss_decode()> and C<lzss_decode_symbolic()>, respectively.

=head2 lzss_decode / lzss_decode_symbolic

    my $string  = lzss_decode(\@literals, \@distances, \@lengths);
    my $symbols = lzss_decode_symbolic(\@literals, \@distances, \@lengths);

Low-level function that decodes the LZSS encoding, using the provided literals, distances, and lengths of matched sub-strings.

Inverse of C<lzss_encode()> and C<lzss_encode_fast()>.

=head2 deflate_encode

    # Returns a binary string
    my $string = deflate_encode(\@literals, \@distances, \@lengths);
    my $string = deflate_encode(\@literals, \@distances, \@lengths, \&create_ac_entry);

Low-level function that encodes the results returned by C<lzss_encode()> and C<lzss_encode_fast()>, using a DEFLATE-like approach, combined with Huffman coding.

=head2 deflate_decode

    # Huffman decoding
    my ($literals, $distances, $lengths) = deflate_decode($fh);
    my ($literals, $distances, $lengths) = deflate_decode($string);

    # Arithmetic decoding
    my ($literals, $distances, $lengths) = deflate_decode($fh, \&decode_ac_entry);
    my ($literals, $distances, $lengths) = deflate_decode($string, \&decode_ac_entry);

Inverse of C<deflate_encode()>.

=head2 make_deflate_tables

    my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = make_deflate_tables($max_dist, $max_len);

Low-level function that returns a list of tables used in encoding the relative back-reference distances and lengths returned by C<lzss_encode()> and C<lzss_encode_fast()>.

When no arguments are provided:

    $max_dist = $Compression::Util::LZ_MAX_DIST
    $max_len  = $Compression::Util::LZ_MAX_LEN

There is no need to call this function explicitly. Use C<deflate_encode()> instead!

=head2 find_deflate_index

    my $index = find_deflate_index($value, $DISTANCE_SYMBOLS);

Low-level function that returns the index inside the DEFLATE tables for a given value.

=head2 deflate_create_block_type_0_header

    my $bt0_header = deflate_create_block_type_0_header($chunk);

Creates the header for a DEFLATE block of type 0 (uncompressed), as a bitstring, without including the block code number C<00>.

The length of the C<$chunk> must not exceed C<2^16 - 1>.

To create a DEFLATE block of type 0, including the content, use:

    my $block_type_0 = pack('b*', '00') . pack('b*', $bt0_header) . $chunk;

which can be recovered as:

    open my $fh, '<:raw', \$block_type_0;
    my ($buffer, $search_window) = ('', '');
    my $chunk = deflate_extract_next_block($fh, \$buffer, \$search_window);

=head2 deflate_create_block_type_1

    my $bitstring = deflate_create_block_type_1($literals, $distances, $lengths);

Creates a DEFLATE block of type 1 (fixed prefix-codes), as a bitstring, given the ARRAY-refs of literals, distances and lengths, returned by C<lzss_encode()>.

This type of block uses fixed prefix-codes and is pretty fast.

=head2 deflate_create_block_type_2

    my $bitstring = deflate_create_block_type_1($literals, $distances, $lengths);

Creates a DEFLATE block of type 2 (dynamic prefix-codes), as a bitstring, given the ARRAY-refs of literals, distances and lengths, returned by C<lzss_encode()>.

This type of block uses dynamic prefix-codes (Huffman codes) and produces good compression ratio on most inputs.

=head2 deflate_extract_block_type_0

    my $data = deflate_extract_block_type_0($fh, \$buffer, \$search_window);

Given an input filehandle, it extracts a DEFLATE block of type 0 (uncompressed).

    my ($buffer, $search_window) = ('', '');
    my $block_type = bits2int_lsb($fh, 2, \$buffer);
    $block_type == 0 or die "Not a block of type 0";
    my $decoded_chunk = deflate_extract_block_type_0($fh, \$buffer, \$search_window);

=head2 deflate_extract_block_type_1

    my $data = deflate_extract_block_type_1($fh, \$buffer, \$search_window);

Given an input filehandle, a bitstring buffer and a search window, it extracts a DEFLATE block of type 1 (fixed prefix-codes).

    my ($buffer, $search_window) = ('', '');
    my $block_type = bits2int_lsb($fh, 2, \$buffer);
    $block_type == 1 or die "Not a block of type 1";
    my $decoded_chunk = deflate_extract_block_type_1($fh, \$buffer, \$search_window);

=head2 deflate_extract_block_type_2

    my $data = deflate_extract_block_type_2($fh, \$buffer, \$search_window);

Given an input filehandle, a bitstring buffer and a search window, it extracts a DEFLATE block of type 2 (dynamic prefix-codes).

    my ($buffer, $search_window) = ('', '');
    my $block_type = bits2int_lsb($fh, 2, \$buffer);
    $block_type == 2 or die "Not a block of type 2";
    my $decoded_chunk = deflate_extract_block_type_2($fh, \$buffer, \$search_window);

=head2 deflate_extract_next_block

    my $data = deflate_extract_next_block($fh, \$buffer, \$search_window);

Given an input filehandle, a bitstring buffer and a search window, it extracts the next DEFLATE block. The next two bits in the input file-handle (or in the bitstring buffer) must contain the block-type number.

=head1 EXPORT

Each function can be exported individually, as:

    use Compression::Util qw(bwt_compress);

By specifying the B<:all> keyword, will export all the exportable functions:

    use Compression::Util qw(:all);

Nothing is exported by default.

=head1 EXAMPLES

The functions can be combined in various ways, easily creating novel compression methods, as illustrated in the following examples.

=head2 Combining LZSS + MRL compression:

    my $enc = lzss_compress($str, \&mrl_compress_symbolic);
    my $dec = lzss_decompress($enc, \&mrl_decompress_symbolic);

=head2 Combining LZ77 + OBH encoding:

    my $enc = lz77_compress($str, \&obh_encode);
    my $dec = lz77_decompress($enc, \&obh_decode);

=head2 Combining LZSS + symbolic BWT compression:

    my $enc = lzss_compress($str, \&bwt_compress_symbolic);
    my $dec = lzss_decompress($enc, \&bwt_decompress_symbolic);

=head2 Combining BWT + symbolic LZSS:

    my $enc = bwt_compress($str, \&lzss_compress_symbolic);
    my $dec = bwt_decompress($enc, \&lzss_decompress_symbolic);

=head2 Combining LZW + Fibonacci encoding:

    my $enc = lzw_compress($str, \&fibonacci_encode);
    my $dec = lzw_decompress($enc, \&fibonacci_decode);

=head2 Combining BWT + symbolic LZ77 + symbolic MRL:

    my $enc = bwt_compress($str, sub ($s) { lz77_compress_symbolic($s, \&mrl_compress_symbolic) });
    my $dec = bwt_decompress($enc, sub ($s) { lz77_decompress_symbolic($s, \&mrl_decompress_symbolic) });

=head2 Combining LZ77 + BWT compression + Fibonacci encoding + Huffman coding + OBH encoding + MRL compression:

    # Compression
    my $enc = do {
        my ($literals, $distances, $lengths, $matches) = lz77_encode($str);
        bwt_compress(symbols2string($literals))
          . fibonacci_encode($lengths)



( run in 1.111 second using v1.01-cache-2.11-cpan-df04353d9ac )