Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/test/test_allocator.cu  view on Meta::CPAN

    // NOTE(review): this is the tail of a larger test function — the opening
    // (declarations of allocator, initial_gpu, num_gpus, timing_iterations,
    // timing_bytes, and the #if matched by the #endif near the bottom) lies
    // outside this excerpt.

    // Ask the allocator which bin a 768-byte request rounds up to:
    // `power` receives the bin exponent, `rounded_bytes` the rounded size
    // (4096 per the original comments below — presumably bin_growth^power;
    // depends on the allocator's configured bin_growth, not visible here).
    unsigned int power;
    size_t rounded_bytes;
    allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768);

    // Check that we have rounded_bytes (4096) free bytes cached on the initial gpu
    AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);

    // Check that we have 1 cached block across all GPUs
    AssertEquals(allocator.cached_blocks.size(), 1);

    // Check that we still have 0 live blocks across all GPUs
    AssertEquals(allocator.live_blocks.size(), 0);

#ifndef CUB_CDP
    // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not

    if (num_gpus > 1)
    {
        printf("\nRunning multi-gpu tests...\n"); fflush(stdout);

        //
        // Test9
        //

        // Allocate 768 bytes on the next gpu
        int next_gpu = (initial_gpu + 1) % num_gpus;
        char *d_768B_2;
        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));

        // DeviceFree d_768B on the next gpu (returns the block to the cache
        // rather than calling cudaFree)
        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));

        // Re-allocate 768 bytes on the next gpu (should be serviced from the
        // cached block just freed)
        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));

        // Re-free d_768B on the next gpu
        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));

        // Check that we have rounded_bytes (4096) free bytes cached on the initial gpu
        AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);

        // Check that we have rounded_bytes (4096) free bytes cached on the second gpu
        AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes);

        // Check that we have 2 cached blocks across all GPUs (one per device)
        AssertEquals(allocator.cached_blocks.size(), 2);

        // Check that we still have 0 live blocks across all GPUs
        AssertEquals(allocator.live_blocks.size(), 0);
    }
#endif  // CUB_CDP

    //
    // Performance
    //

    printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
    fflush(stdout); fflush(stderr);

    // CPU performance comparisons vs cached: allocate and free a block of
    // timing_bytes, timing_iterations times (original comment said 1MB x 2000;
    // actual values are set outside this excerpt — unverified here).
    // NOTE(review): the variable name d_1024MB presumably reflects the
    // default timing_bytes; the buffer is whatever size timing_bytes holds.
    CpuTimer    cpu_timer;
    char        *d_1024MB                       = NULL;
    allocator.debug                             = false;

    // Prime the caching allocator and the kernel so first-use setup costs
    // (context init, JIT, initial cudaMalloc to fill the cache) are excluded
    // from the timed loops below
    CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
    CubDebugExit(allocator.DeviceFree(d_1024MB));
    cub::EmptyKernel<void><<<1, 32>>>();

    // CUDA baseline: raw cudaMalloc/cudaFree per iteration
    cpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
        CubDebugExit(cudaFree(d_1024MB));
    }
    cpu_timer.Stop();
    float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();

    // CUB: cached DeviceAllocate/DeviceFree per iteration
    cpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
        CubDebugExit(allocator.DeviceFree(d_1024MB));
    }
    cpu_timer.Stop();
    float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();

    printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
        cuda_malloc_elapsed_millis / timing_iterations,
        cub_calloc_elapsed_millis / timing_iterations);

    // GPU performance comparisons: same alloc/free loops, but with a kernel
    // launch interleaved and timed with GPU events
    GpuTimer gpu_timer;

    printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
    fflush(stdout); fflush(stderr);

    // Kernel-only: measure bare launch overhead so it can be subtracted from
    // the alloc+kernel+free timings below
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        cub::EmptyKernel<void><<<1, 32>>>();
    }
    gpu_timer.Stop();
    float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();

    // CUDA baseline: cudaMalloc + kernel + cudaFree, minus kernel-only time
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
        cub::EmptyKernel<void><<<1, 32>>>();
        CubDebugExit(cudaFree(d_1024MB));
    }
    gpu_timer.Stop();
    cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;

    // CUB: DeviceAllocate + kernel + DeviceFree, minus kernel-only time
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
        cub::EmptyKernel<void><<<1, 32>>>();
        CubDebugExit(allocator.DeviceFree(d_1024MB));
    }
    gpu_timer.Stop();
    cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;

    printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
        cuda_malloc_elapsed_millis / timing_iterations,
        cub_calloc_elapsed_millis / timing_iterations);


#endif

    printf("Success\n");

    return 0;
}



( run in 1.529 second using v1.01-cache-2.11-cpan-2ed5026b665 )