ClickHouse-Encoder
view release on metacpan or search on metacpan
bench/wide_table_benchmark.pl view on Meta::CPAN
rand(100), # score
'meta_' . $i . '_' . int(rand(1000)), # metadata
];
}
# Report how long the data-generation step took.
# NOTE(review): $t0 is presumably set just before the generation loop; that
# assignment is outside this excerpt -- confirm.
my $gen_time = time() - $t0;
printf "Data generation: %.2f sec\n\n", $gen_time;
# Setup encoder: a single encoder is built from the column list in @columns
# and reused for the Native encoding below.
my $encoder = ClickHouse::Encoder->new(columns => \@columns);
# Encode Native format.
# Only the encode() call sits between the two time() reads, so
# $native_encode_time covers the encoding itself and not the progress message.
print "Encoding Native format...\n";
$t0 = time();
my $native_data = $encoder->encode(\@data);
my $native_encode_time = time() - $t0;
# Encode TabSeparated (the "csv" in the variable names refers to this
# tab-separated text, not comma-separated values).
# This pure-Perl loop is the baseline that the Native encoding time and size
# are compared against further down; everything between the two time() reads
# is included in $csv_encode_time.
print "Encoding TabSeparated format...\n";
$t0 = time();
my $csv_data = '';
for my $row (@data) {
my @escaped = map {
my $v = $_;
# Work on the copy $v so the values stored in @data are not modified.
# Backslashes are escaped first; doing it after the tab/newline escapes
# would double the backslashes those escapes introduce.
# NOTE(review): an undef field would trigger "uninitialized" warnings here
# and be written as an empty string -- confirm @data never contains undef.
$v =~ s/\\/\\\\/g;
$v =~ s/\t/\\t/g;
$v =~ s/\n/\\n/g;
$v;
} @$row;
# One output line per row: fields joined by tabs, terminated by a newline.
$csv_data .= join("\t", @escaped) . "\n";
}
my $csv_encode_time = time() - $t0;
# Encoding results: the same Time / Speed / Size report for each format,
# followed by the Native-vs-TabSeparated comparison. The bare block keeps the
# helper lexicals out of file scope.
{
    print("\n", "-" x 70, "\n");
    print("Encoding Results\n");
    print("-" x 70, "\n\n");

    # Payloads are passed by reference so the encoded strings are not copied.
    for my $report (
        [ "Native format:\n",         $native_encode_time, \$native_data ],
        [ "\nTabSeparated format:\n", $csv_encode_time,    \$csv_data    ],
    ) {
        my ($heading, $elapsed, $payload_ref) = @{$report};
        printf($heading);
        printf(" Time: %.3f sec\n", $elapsed);
        printf(" Speed: %.0f rows/sec\n", $ROWS / $elapsed);
        printf(" Size: %.2f MB\n", length(${$payload_ref}) / 1024 / 1024);
    }

    my $time_ratio = $csv_encode_time / $native_encode_time;
    printf("\nNative encoding is %.1fx faster\n", $time_ratio);

    my $size_saving = (1 - length($native_data) / length($csv_data)) * 100;
    printf("Native data is %.0f%% smaller\n", $size_saving);
}
# Create test table.
# The DDL is passed to clickhouse-client with list-form system(), so no shell
# is involved and quotes inside $col_defs cannot break the command line.
# A failed drop/create aborts the script: every insert timed afterwards would
# otherwise run against a missing or stale table.
# NOTE(review): assumes $PORT holds a single port value (no extra options).
print "\n", "=" x 70, "\n";
print "INSERT Benchmark\n";
print "=" x 70, "\n\n";
print "Setting up test table...\n";
for my $query (
    'drop table if exists bench_wide',
    "create table bench_wide (\n $col_defs\n) engine = Null",
) {
    my $rc = system('clickhouse-client', '--port', $PORT, '--query', $query);
    next if $rc == 0;

    # system() returns -1 when the program could not be started at all;
    # otherwise $? carries the wait status of the finished client.
    die $rc == -1
        ? "Cannot run clickhouse-client: $!"
        : sprintf("clickhouse-client failed (wait status %d) while running: %s", $?, $query);
}
# Benchmark function.
#
# bench_insert($format, $data, $iterations)
#
# Pipes $data into
#   clickhouse-client --port $PORT --query 'insert into bench_wide format $format'
# $iterations times. Each run is timed from just before the client is started
# until the pipe has been closed; the per-run time is printed and the list of
# elapsed times is returned.
#
# The client's stderr is discarded, so a failed insert is only visible as a
# failed write or a failing close() on the pipe. Both are checked and reported
# with warn, so a failed run is not silently timed as if it had succeeded.
# Dies only if the client process cannot be started.
sub bench_insert {
    my ($format, $data, $iterations) = @_;

    # A client that exits before reading all input would otherwise terminate
    # this script with SIGPIPE before the failure could be reported.
    local $SIG{PIPE} = 'IGNORE';

    my @times;
    for my $i (1 .. $iterations) {
        my $t0 = time();
        open my $fh, '|-', "clickhouse-client --port $PORT --query 'insert into bench_wide format $format' 2>/dev/null"
            or die "Cannot run clickhouse-client: $!";
        binmode $fh;

        my $write_ok    = print {$fh} $data;
        my $write_error = "$!";

        # For a piped open, close() waits for the child and returns false if
        # it exited non-zero; $! is 0 when that is the only problem.
        my $close_ok = close $fh;
        my ($close_errno, $close_error, $wait_status) = ($! + 0, "$!", $?);

        my $elapsed = time() - $t0;

        if (!$write_ok) {
            warn "Run $i ($format): writing to clickhouse-client failed: $write_error\n";
        }
        if (!$close_ok) {
            warn $close_errno
                ? "Run $i ($format): closing pipe to clickhouse-client failed: $close_error\n"
                : "Run $i ($format): clickhouse-client failed (wait status $wait_status)\n";
        }

        push @times, $elapsed;
        printf " Run %d: %.3f sec\n", $i, $elapsed;
    }
    return @times;
}
# Warmup: one untimed-for-reporting insert per format before measuring.
print("Warming up...\n");
bench_insert('Native',       $native_data, 1);
bench_insert('TabSeparated', $csv_data,    1);

# Benchmark: Native first, then TabSeparated, $ITERATIONS runs each.
printf("\nBenchmarking Native format (%s iterations)...\n", $ITERATIONS);
my @native_times = bench_insert('Native', $native_data, $ITERATIONS);

printf("\nBenchmarking TabSeparated format (%s iterations)...\n", $ITERATIONS);
my @csv_times = bench_insert('TabSeparated', $csv_data, $ITERATIONS);
# Calculate statistics.
#
# stats(@times)
#
# Returns the two-element list ($avg, $min): the arithmetic mean and the
# smallest value of the numeric arguments.
# Dies with an explicit message when called without any values, instead of
# failing with "Illegal division by zero".
sub stats {
    my @times = @_;
    die "stats: no timing samples to summarise" unless @times;

    # One pass collects both the sum and the minimum; no sort is needed.
    my $sum = 0;
    my $min = $times[0];
    for my $t (@times) {
        $sum += $t;
        $min = $t if $t < $min;
    }

    my $avg = $sum / @times;
    return ($avg, $min);
}
# Summarise the INSERT timings: average and best time per format, derived
# throughput and bandwidth, then the overall Native-vs-TabSeparated speedup.
my ($native_avg, $native_min) = stats(@native_times);
my ($csv_avg,    $csv_min)    = stats(@csv_times);

print("\n", "=" x 70, "\n");
print("RESULTS: $ROWS rows x 20 columns\n");
print("=" x 70, "\n\n");

# Payloads are passed by reference so the encoded strings are not copied.
for my $report (
    [ "Native format:\n",         $native_avg, $native_min, \$native_data ],
    [ "\nTabSeparated format:\n", $csv_avg,    $csv_min,    \$csv_data    ],
) {
    my ($heading, $avg, $best, $payload_ref) = @{$report};
    printf($heading);
    printf(" Avg time: %.3f sec\n", $avg);
    printf(" Best time: %.3f sec\n", $best);
    printf(" Throughput: %.0f rows/sec\n", $ROWS / $avg);
    printf(" Bandwidth: %.1f MB/sec\n", length(${$payload_ref}) / $avg / 1024 / 1024);
}

my $speedup = $csv_avg / $native_avg;
print("\n", "=" x 70, "\n");
printf("Native INSERT is %.2fx faster than TabSeparated\n", $speedup);
print("=" x 70, "\n");
# Cleanup: drop the benchmark table. List-form system() avoids the shell.
# A failure here does not invalidate the results already printed, so it is
# reported with a warning rather than aborting.
# NOTE(review): assumes $PORT holds a single port value (no extra options).
system('clickhouse-client', '--port', $PORT, '--query', 'drop table if exists bench_wide') == 0
    or warn "Could not drop table bench_wide (clickhouse-client wait status $?)\n";
print "\nDone.\n";
( run in 1.056 second using v1.01-cache-2.11-cpan-71847e10f99 )