Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
lower_level[channel],
((upper_level[channel] - lower_level[channel]) / bins));
}
Initialize<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
max_level, entropy_reduction, h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
// Allocate and initialize device data
SampleT* d_samples = NULL;
CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1)));
CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1)));
}
// Allocate CDP device arrays
size_t *d_temp_storage_bytes = NULL;
cudaError_t *d_cdp_error = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
DispatchEven(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Allocate temporary storage with "canary" zones
int canary_bytes = 256;
char canary_token = 8;
char* canary_zone = new char[canary_bytes];
memset(canary_zone, canary_token, canary_bytes);
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
// Run warmup/correctness iteration
DispatchEven(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Check canary zones
int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
AssertEquals(0, error);
error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
AssertEquals(0, error);
// Flush any stdout/stderr
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
fflush(stdout);
fflush(stderr);
// Check for correctness (and display results, if specified)
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
error |= channel_error;
}
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
DispatchEven(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
0, false);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * sizeof(SampleT);
printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
avg_millis,
giga_rate,
giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
giga_rate / NUM_CHANNELS,
giga_bandwidth);
}
printf("\n\n");
// Cleanup
if (h_samples) delete[] h_samples;
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
if (h_histogram[channel])
delete[] h_histogram[channel];
if (d_histogram[channel])
CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
}
if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Correctness asserts
AssertEquals(0, error);
}
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
// Allocate and initialize device data
SampleT* d_samples = NULL;
LevelT* d_levels[NUM_ACTIVE_CHANNELS];
CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel]));
CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel], sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice));
int bins = num_levels[channel] - 1;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * bins));
CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * bins));
}
// Allocate CDP device arrays
size_t *d_temp_storage_bytes = NULL;
cudaError_t *d_cdp_error = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
DispatchRange(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, d_levels,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Allocate temporary storage with "canary" zones
int canary_bytes = 256;
char canary_token = 9;
char* canary_zone = new char[canary_bytes];
memset(canary_zone, canary_token, canary_bytes);
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
// Run warmup/correctness iteration
DispatchRange(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
d_samples, d_histogram, num_levels, d_levels,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Check canary zones
int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
AssertEquals(0, error);
error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
AssertEquals(0, error);
// Flush any stdout/stderr
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
fflush(stdout);
fflush(stderr);
// Check for correctness (and display results, if specified)
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
error |= channel_error;
}
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
DispatchRange(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, d_levels,
num_row_pixels, num_rows, row_stride_bytes,
0, false);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * sizeof(SampleT);
printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
avg_millis,
giga_rate,
giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
giga_rate / NUM_CHANNELS,
giga_bandwidth);
}
printf("\n\n");
// Cleanup
if (h_samples) delete[] h_samples;
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
if (h_histogram[channel])
delete[] h_histogram[channel];
if (d_histogram[channel])
CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
if (d_levels[channel])
CubDebugExit(g_allocator.DeviceFree(d_levels[channel]));
}
if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Correctness asserts
( run in 0.710 second using v1.01-cache-2.11-cpan-e1769b4cff6 )