Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/test/test_allocator.cu view on Meta::CPAN
unsigned int power;
size_t rounded_bytes;
allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768);
// Check that we have 4096 free bytes cached on the initial gpu
AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
// Check that we have 1 cached block across all GPUs
AssertEquals(allocator.cached_blocks.size(), 1);
// Check that we still have 0 live blocks across all GPUs
AssertEquals(allocator.live_blocks.size(), 0);
#ifndef CUB_CDP
// BUG: find out why these tests fail when one GPU is CDP compliant and the other is not
if (num_gpus > 1)
{
printf("\nRunning multi-gpu tests...\n"); fflush(stdout);
//
// Test9
//
// Allocate 768 bytes on the next gpu
int next_gpu = (initial_gpu + 1) % num_gpus;
char *d_768B_2;
CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
// DeviceFree d_768B_2 on the next gpu
CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
// Re-allocate 768 bytes on the next gpu
CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
// Re-free d_768B_2 on the next gpu
CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
// Check that we have 4096 free bytes cached on the initial gpu
AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
// Check that we have 4096 free bytes cached on the second gpu
AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes);
// Check that we have 2 cached blocks across all GPUs
AssertEquals(allocator.cached_blocks.size(), 2);
// Check that we still have 0 live blocks across all GPUs
AssertEquals(allocator.live_blocks.size(), 0);
}
#endif // CUB_CDP
//
// Performance
//
printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
fflush(stdout); fflush(stderr);
// CPU performance comparison of cudaMalloc/cudaFree vs. the caching allocator: allocate and free a timing_bytes-sized block timing_iterations times
CpuTimer cpu_timer;
char *d_1024MB = NULL;
allocator.debug = false;
// Prime the caching allocator and the kernel
CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
CubDebugExit(allocator.DeviceFree(d_1024MB));
cub::EmptyKernel<void><<<1, 32>>>();
// CUDA
cpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
CubDebugExit(cudaFree(d_1024MB));
}
cpu_timer.Stop();
float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();
// CUB
cpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
CubDebugExit(allocator.DeviceFree(d_1024MB));
}
cpu_timer.Stop();
float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();
printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
cuda_malloc_elapsed_millis / timing_iterations,
cub_calloc_elapsed_millis / timing_iterations);
// GPU performance comparisons. Allocate and free a timing_bytes-sized block timing_iterations times
GpuTimer gpu_timer;
printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
fflush(stdout); fflush(stderr);
// Kernel-only
gpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
cub::EmptyKernel<void><<<1, 32>>>();
}
gpu_timer.Stop();
float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();
// CUDA
gpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
cub::EmptyKernel<void><<<1, 32>>>();
CubDebugExit(cudaFree(d_1024MB));
}
gpu_timer.Stop();
cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
// CUB
gpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
cub::EmptyKernel<void><<<1, 32>>>();
CubDebugExit(allocator.DeviceFree(d_1024MB));
}
gpu_timer.Stop();
cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
cuda_malloc_elapsed_millis / timing_iterations,
cub_calloc_elapsed_millis / timing_iterations);
#endif
printf("Success\n");
return 0;
}
( run in 1.529 second using v1.01-cache-2.11-cpan-2ed5026b665 )