ClickHouse-Encoder

 view release on metacpan or  search on metacpan

eg/clickhouse_replication.pl  view on Meta::CPAN

#!/usr/bin/env perl
# Replicate one ClickHouse table to another (potentially on a different
# server) by streaming Native bytes end-to-end -- no Perl-side row
# decode at all. The encoder isn't strictly needed here since both
# sides speak Native, but for_table() validates the destination schema
# and the script demonstrates the streaming pipe pattern users build
# their own variants on (filtering, transforms, etc.).
#
# This is the "free" case: source select format native, destination
# insert format native, identical schemas. With matching schemas the
# server-side codepath is a copy through the column ColumnPtr layer,
# which is the fastest cluster-reshard primitive ClickHouse offers.
#
# Usage:
#     CH_SRC=src.example:8123 CH_DST=dst.example:8123 \
#     perl eg/clickhouse_replication.pl src.events dst.events
#
#     # With a where clause:
#     SQL_FILTER="dt >= today() - 7" perl eg/clickhouse_replication.pl ...

use strict;
use warnings;
use lib 'blib/lib', 'blib/arch';
use ClickHouse::Encoder;
use HTTP::Tiny;
use File::Temp qw(tempfile);

my ($src_table, $dst_table) = @ARGV;
die "Usage: $0 <src_table> <dst_table>\n" unless $src_table && $dst_table;

my $src_endpoint = $ENV{CH_SRC} // 'localhost:8123';
my $dst_endpoint = $ENV{CH_DST} // 'localhost:8123';
my $src_url = "http://$src_endpoint/";
my $dst_url = "http://$dst_endpoint/";
my $where   = $ENV{SQL_FILTER};

my ($dst_host, $dst_port) = split /:/, $dst_endpoint, 2;
$dst_port //= 8123;

# Validate the destination schema against the actual destination server
# (not localhost). We don't use the encoder for the data path -- rows go
# server-to-server as Native bytes.
my $enc = ClickHouse::Encoder->for_table($dst_table,
    via  => 'http',
    host => $dst_host,
    port => $dst_port,
);
print STDERR "destination schema validates: ", scalar(@{ $enc->columns }),
    " columns\n";

# Pull from source as Native, spool to a temp file, post to destination.
# HTTP::Tiny is synchronous so we can't pipe one connection straight into
# the other; the temp file keeps process memory bounded by HTTP::Tiny's
# chunk buffer (Perl-side), trading bandwidth-disk-bandwidth for RAM.
# For tables that don't fit on local disk, partition the source query
# (where on a key range) and run this script per partition.
my $select = "select * from $src_table"
           . ($where ? " where $where" : '')
           . " format native";
my $insert = "insert into $dst_table format native";

my $http = HTTP::Tiny->new(timeout => 600);

my ($spool, $spool_path) = tempfile('ch-repl-XXXXXX', UNLINK => 1, TMPDIR => 1);
binmode $spool;



( run in 2.137 seconds using v1.01-cache-2.11-cpan-140bd7fdf52 )