ClickHouse-Encoder

 view release on metacpan or  search on metacpan

xt/fuzz-decoder.t  view on Meta::CPAN

#!/usr/bin/env perl
# Decoder fuzz: feed random bytes / structurally-corrupted Native
# buffers to decode_block and assert the process never segfaults.
# The only acceptable outcomes are: clean decode or a Perl-level
# croak. ASAN-clean too (pair with xt/asan.t for tighter coverage).
#
# Skipped unless RELEASE_TESTING=1. Tweak FUZZ_DECODER_ITERS to scale.

use strict;
use warnings;
use Test::More;
use lib 'blib/lib', 'blib/arch';
use ClickHouse::Encoder;

plan skip_all => 'set RELEASE_TESTING=1 to run decoder fuzz'
    unless $ENV{RELEASE_TESTING};

# Bound address space so a pathological decode allocation gets killed
# rather than driving the host into swap. RLIMIT_AS counts ALL virtual
# memory including the perl interpreter, shared libraries, and Perl's
# own arena slabs - a freshly-loaded perl already uses 100-300 MiB of
# VA on Linux. Set a generous cap (2 GiB); any legitimate decoder
# allocation stays well under it. The cap is what catches a pathological
# `Newx(N, ...)` for a corrupted N before it drives the host into swap.
my $cap_ok = eval {
    require BSD::Resource;
    BSD::Resource::setrlimit(
        BSD::Resource::RLIMIT_AS(),
        2 * 1024 * 1024 * 1024,
        2 * 1024 * 1024 * 1024);
    1;
};
plan skip_all => 'BSD::Resource not available; skipping unbounded fuzz'
    unless $cap_ok;

srand($ENV{FUZZ_DECODER_SEED} // 19937);

# 150 is the empirical safe ceiling under RLIMIT_AS=2 GiB with the
# default seed: enough iterations to exercise each seed shape several
# times without the cumulative Perl arena footprint plus a worst-case
# bounded allocation tripping the cap. Bump FUZZ_DECODER_ITERS in CI
# when adding more aggressive fuzz patterns.
my $iters = $ENV{FUZZ_DECODER_ITERS} // 150;

# Build a known-good seed block for each interesting type, then
# mutate. Both fully-random and surgically-perturbed inputs exercise
# different decoder paths.
my @seeds;

# Generate seed buffers we can corrupt later.
for my $type (
    'Int32', 'String', 'Array(Int64)', 'Nullable(String)',
    'Tuple(Int32, String)', 'Map(String, Int32)',
    'LowCardinality(String)', 'Variant(Int32, String)',
    'JSON', 'Dynamic',
) {
    my $enc = eval { ClickHouse::Encoder->new(columns => [['c', $type]]) };
    next unless $enc;
    my @rows;
    if ($type eq 'JSON') {
        @rows = ([{a => 1, b => "x"}], [{tags => [1,2,3]}], [undef]);
    } elsif ($type eq 'Dynamic') {
        @rows = ([1], ["x"], [[1,2]], [undef]);
    } elsif ($type =~ /^Variant/) {
        @rows = ([[0, 42]], [[1, "hi"]], [undef]);
    } elsif ($type =~ /Map/) {
        @rows = ([{a=>1,b=>2}], [{}]);
    } elsif ($type =~ /Tuple/) {
        @rows = ([[1,"x"]], [[2,"y"]]);
    } elsif ($type =~ /Array/) {
        @rows = ([[1,2,3]], [[]]);
    } elsif ($type eq 'Nullable(String)') {
        @rows = (['x'], [undef], ['y']);
    } else {
        @rows = ([1], [2], [3]);
    }
    my $bytes = eval { $enc->encode(\@rows) };
    push @seeds, $bytes if defined $bytes && length $bytes;
}

ok(@seeds, 'have ' . scalar(@seeds) . ' seed buffers');

my $crashed = 0;
my $survived = 0;

# Per-iteration size cap: skip fuzz buffers whose first varints would
# claim more memory than makes sense. Cheap pre-screen so we never
# even invoke decode_block on obvious OOM bait. Decoder hardening
# catches the rest; this is belt-and-suspenders.
sub _looks_safe {
    my $b = shift;
    return 0 if length($b) < 2;
    my @bytes = unpack 'C*', substr($b, 0, 20);
    my ($v, $shift) = (0, 0);
    for my $byte (@bytes) {
        $v |= ($byte & 0x7f) << $shift;
        return $v < 1024 * 1024 if !($byte & 0x80);
        $shift += 7;
        return 0 if $shift >= 64;
    }
    return 0;
}

for my $i (1 .. $iters) {
    my $bytes;
    if ($i % 3 == 0) {
        # Fully random bytes
        my $n = int(rand 200) + 1;
        $bytes = join '', map chr(int rand 256), 1..$n;
    } else {
        # Pick a seed, flip a few bytes / chop the end / inject zeros.
        my $seed = $seeds[int rand @seeds];
        $bytes = $seed;
        my $mode = int rand 4;
        if    ($mode == 0) {
            # Chop the buffer to a random prefix.
            $bytes = substr($bytes, 0, int rand length($bytes));
        }
        elsif ($mode == 1) {
            # Flip ~5% of bytes.
            my $n = int(length($bytes) * 0.05) + 1;
            for (1..$n) {
                my $pos = int rand length($bytes);
                substr($bytes, $pos, 1, chr(int rand 256));
            }
        }
        elsif ($mode == 2) {
            # Insert junk bytes mid-buffer.
            my $pos = int rand length($bytes);
            my $junk = join '', map chr(int rand 256), 1..int(rand 16)+1;
            substr($bytes, $pos, 0, $junk);
        }
        else {
            # Append trailing garbage.
            $bytes .= join '', map chr(int rand 256), 1..int(rand 32);
        }
    }
    next unless _looks_safe($bytes);
    # Wrap in a fresh sub call so the decoder's return value and any
    # mortal SVs created during decode are reclaimed before the next
    # iteration. Without this, mortals can pile up across hundreds
    # of iterations until the test process's heap is exhausted.
    eval { _try_decode($bytes); 1 };
    # "Out of memory" / "Killed" from RLIMIT_AS aborts the test process
    # and never returns to this branch - but catching the symbolic text
    # in $@ surfaces a real allocation gap if the eval somehow recovered
    # without the process dying.
    if ($@ && $@ =~ /Segmentation|stack overflow|Out of memory|Killed/i) {
        $crashed++;
        diag "CRASH at iter $i: $@";
    } else {
        $survived++;
    }
}

sub _try_decode {
    ClickHouse::Encoder->decode_block($_[0]);
    return;
}

is($crashed, 0, "$iters fuzz iterations: 0 crashes ($survived survived)");

done_testing();



( run in 0.958 second using v1.01-cache-2.11-cpan-71847e10f99 )