ClickHouse-Encoder

 view release on metacpan or  search on metacpan

bench/wide_table_benchmark.pl  view on Meta::CPAN

        rand(100),                             # score
        'meta_' . $i . '_' . int(rand(1000)),     # metadata
    ];
}
# Elapsed time of the data-generation loop above.
# NOTE(review): $t0 is assigned before that loop, outside this view; whether
# time() is the core built-in or Time::HiRes::time depends on the imports
# at the top of the file -- confirm.
my $gen_time = time() - $t0;
printf "Data generation: %.2f sec\n\n", $gen_time;

# Setup encoder
# The encoder is built once from the column list and reused for the encode call below.
my $encoder = ClickHouse::Encoder->new(columns => \@columns);

# Encode Native format
# Only the encode() call sits between the two time() readings, so
# $native_encode_time covers encoding alone (not encoder construction).
# $native_data is later piped to clickhouse-client as "format Native".
print "Encoding Native format...\n";
$t0 = time();
my $native_data = $encoder->encode(\@data);
my $native_encode_time = time() - $t0;

# Encode TabSeparated (CSV-like)
# Hand-rolled baseline that the Native encoder is compared against: every row
# becomes one tab-joined, newline-terminated line appended to $csv_data.
# Despite the "csv" variable names, the payload is inserted as TabSeparated.
print "Encoding TabSeparated format...\n";
$t0 = time();
my $csv_data = '';
for my $row (@data) {
    my @escaped = map {
        # Work on a copy: $_ aliases the element of @$row, and the
        # substitutions below must not modify the source data in place.
        my $v = $_;
        # Backslashes are escaped first so that the backslashes introduced
        # by the \t and \n replacements are not themselves doubled.
        $v =~ s/\\/\\\\/g;
        $v =~ s/\t/\\t/g;
        $v =~ s/\n/\\n/g;
        $v;
    } @$row;
    $csv_data .= join("\t", @escaped) . "\n";
}
# Covers escaping plus string assembly for all rows.
my $csv_encode_time = time() - $t0;

# Report encoding time, throughput and payload size for both formats, then
# the relative speed and size of Native versus TabSeparated.
#
# Every quotient is guarded: an elapsed time of 0 (coarse timer resolution or
# a very small data set) or an empty TabSeparated payload would otherwise
# abort the script with "Illegal division by zero". Such values are printed
# as "n/a"; for non-zero denominators the output is unchanged.
{
    # Format $num / $den with the sprintf format $fmt, or "n/a" if $den is 0.
    my $ratio = sub {
        my ($fmt, $num, $den) = @_;
        return $den ? sprintf($fmt, $num / $den) : 'n/a';
    };

    print "\n", "-" x 70, "\n";
    print "Encoding Results\n";
    print "-" x 70, "\n\n";

    printf "Native format:\n";
    printf "  Time:       %.3f sec\n", $native_encode_time;
    printf "  Speed:      %s rows/sec\n", $ratio->('%.0f', $ROWS, $native_encode_time);
    printf "  Size:       %.2f MB\n", length($native_data) / 1024 / 1024;

    printf "\nTabSeparated format:\n";
    printf "  Time:       %.3f sec\n", $csv_encode_time;
    printf "  Speed:      %s rows/sec\n", $ratio->('%.0f', $ROWS, $csv_encode_time);
    printf "  Size:       %.2f MB\n", length($csv_data) / 1024 / 1024;

    printf "\nNative encoding is %sx faster\n",
        $ratio->('%.1f', $csv_encode_time, $native_encode_time);

    # Size saving is relative to the TabSeparated payload, which is empty
    # when there are no rows.
    my $csv_bytes = length($csv_data);
    printf "Native data is %s%% smaller\n",
        $csv_bytes
            ? sprintf('%.0f', (1 - length($native_data) / $csv_bytes) * 100)
            : 'n/a';
}

# Create test table
print "\n", "=" x 70, "\n";
print "INSERT Benchmark\n";
print "=" x 70, "\n\n";

print "Setting up test table...\n";
# (Re)create bench_wide with the Null engine.
#
# The list form of system() bypasses the shell, so the query text reaches
# clickhouse-client verbatim even when $col_defs contains single quotes
# (which would terminate the quoted argument in a shell command line).
# A failure here is fatal: without the table every timed insert below would
# fail, and the reported numbers would be meaningless.
for my $query (
    'drop table if exists bench_wide',
    "create table bench_wide (\n    $col_defs\n) engine = Null",
) {
    system('clickhouse-client', '--port', $PORT, '--query', $query) == 0
        or die(
            $? == -1
                ? "Cannot run clickhouse-client: $!\n"
                : "clickhouse-client failed (wait status $?) while running: $query\n"
        );
}

# Benchmark function
#
# Pipes $data into "clickhouse-client ... insert into bench_wide format $format"
# $iterations times and measures each run from spawning the client until the
# pipe has been closed.
#
#   $format     - ClickHouse input format name used in the INSERT statement
#   $data       - payload written to the client's stdin (handle is binmode'd)
#   $iterations - number of runs
#
# Prints one "Run N" line per iteration and returns the list of elapsed times
# in run order. Dies if the client cannot be started, if the payload cannot
# be written, or if the client exits unsuccessfully -- a failed insert must
# not be recorded as a valid timing.
sub bench_insert {
    my ($format, $data, $iterations) = @_;
    my @times;

    # If the client exits before reading all of its input, the default
    # SIGPIPE action would terminate this script without any message.
    # Ignoring the signal lets print/close report the failure instead.
    local $SIG{PIPE} = 'IGNORE';

    for my $i (1 .. $iterations) {
        my $t0 = time();
        # The command runs through the shell (needed for the stderr
        # redirect), so open() succeeding only means the shell started;
        # client errors surface at close().
        open my $fh, '|-', "clickhouse-client --port $PORT --query 'insert into bench_wide format $format' 2>/dev/null"
            or die "Cannot run clickhouse-client: $!";
        binmode $fh;
        print {$fh} $data
            or die "Cannot write $format data to clickhouse-client: $!";
        # Closing a piped handle waits for the child and returns false when
        # it exited with a non-zero status ($! is then 0 and $? holds the
        # wait status).
        close $fh
            or die(
                $!
                    ? "Cannot close pipe to clickhouse-client: $!\n"
                    : "clickhouse-client failed during $format insert (wait status $?); "
                      . "its stderr is redirected to /dev/null\n"
            );
        my $elapsed = time() - $t0;
        push @times, $elapsed;
        printf "  Run %d: %.3f sec\n", $i, $elapsed;
    }

    return @times;
}

# Warm-up: one insert per format; the timings returned here are discarded.
print "Warming up...\n";
bench_insert(Native       => $native_data, 1);
bench_insert(TabSeparated => $csv_data,    1);

# Timed runs: Native first, then TabSeparated, $ITERATIONS inserts each.
my $benchmark_banner = "\nBenchmarking %s format (%s iterations)...\n";

printf $benchmark_banner, 'Native', $ITERATIONS;
my @native_times = bench_insert(Native => $native_data, $ITERATIONS);

printf $benchmark_banner, 'TabSeparated', $ITERATIONS;
my @csv_times = bench_insert(TabSeparated => $csv_data, $ITERATIONS);

# Calculate statistics
#
# Takes a list of numeric timings and returns the two-element list
# (average, minimum).
#
# Dies with an explicit message when called without any timings (for
# example when the iteration count is 0), rather than failing with
# "Illegal division by zero".
sub stats {
    my @times = @_;
    die "stats(): no timings to summarise (iteration count 0?)\n"
        unless @times;

    # Single pass for both the sum and the minimum; no sort is needed to
    # find the smallest element.
    my $sum = 0;
    my $min = $times[0];
    for my $t (@times) {
        $sum += $t;
        $min = $t if $t < $min;
    }

    return ($sum / @times, $min);
}

# Summarise the timed inserts: average and best time per format, derived
# throughput and bandwidth, and the overall Native-vs-TabSeparated speedup
# (based on the average times).
my ($native_avg, $native_min) = stats(@native_times);
my ($csv_avg,    $csv_min)    = stats(@csv_times);

my $results_rule = "=" x 70;

print "\n$results_rule\n";
print "RESULTS: $ROWS rows x 20 columns\n";
print "$results_rule\n\n";

# One entry per format: heading, average time, best time, payload size in bytes.
my @format_reports = (
    [ "Native format:\n",         $native_avg, $native_min, length($native_data) ],
    [ "\nTabSeparated format:\n", $csv_avg,    $csv_min,    length($csv_data)    ],
);

for my $report (@format_reports) {
    my ($heading, $avg, $best, $bytes) = @$report;

    print $heading;
    printf "  Avg time:   %.3f sec\n", $avg;
    printf "  Best time:  %.3f sec\n", $best;
    printf "  Throughput: %.0f rows/sec\n", $ROWS / $avg;
    printf "  Bandwidth:  %.1f MB/sec\n", $bytes / $avg / 1024 / 1024;
}

my $speedup = $csv_avg / $native_avg;

print "\n$results_rule\n";
printf "Native INSERT is %.2fx faster than TabSeparated\n", $speedup;
print "$results_rule\n";

# Cleanup
# Drop the benchmark table. The list form of system() passes $PORT and the
# query as separate arguments without going through the shell. A failure is
# only reported, not fatal: all results have already been printed.
system('clickhouse-client', '--port', $PORT, '--query', 'drop table if exists bench_wide') == 0
    or warn(
        $? == -1
            ? "Cleanup failed: cannot run clickhouse-client: $!\n"
            : "Cleanup failed: could not drop bench_wide (wait status $?)\n"
    );

print "\nDone.\n";



( run in 1.056 second using v1.01-cache-2.11-cpan-71847e10f99 )