Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/test/test_device_scan.cu view on Meta::CPAN
// Allocate device output array
OutputT *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_items));
// Allocate CDP device arrays
size_t *d_temp_storage_bytes = NULL;
cudaError_t *d_cdp_error = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(Dispatch(
Int2Type<BACKEND>(),
Int2Type<Traits<OutputT>::PRIMITIVE>(),
1,
d_temp_storage_bytes,
d_cdp_error,
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
initial_value,
num_items,
0,
true));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Clear device output array
CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_items));
// Run warmup/correctness iteration
CubDebugExit(Dispatch(
Int2Type<BACKEND>(),
Int2Type<Traits<OutputT>::PRIMITIVE>(),
1,
d_temp_storage_bytes,
d_cdp_error,
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
initial_value,
num_items,
0,
true));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
printf("\t%s", compare ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
CubDebugExit(Dispatch(Int2Type<BACKEND>(),
Int2Type<Traits<OutputT>::PRIMITIVE>(),
g_timing_iterations,
d_temp_storage_bytes,
d_cdp_error,
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
initial_value,
num_items,
0,
false));
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT));
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
}
printf("\n\n");
// Cleanup
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Correctness asserts
AssertEquals(0, compare);
}
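/**
* Test DeviceScan with the input supplied as a raw device pointer (InputT*).
* Generates the host input and reference solution, then allocates the
* device input array for the problem.
*/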
template <
Backend BACKEND,
typename InputT,
typename OutputT,
typename ScanOpT,
typename InitialValueT>
void TestPointer(
int num_items,
GenMode gen_mode,
ScanOpT scan_op,
InitialValueT initial_value)
{
printf("\nPointer %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes) , gen-mode %s\n",
(BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
(Equals<InitialValueT, NullType>::VALUE) ? "Inclusive" : "Exclusive",
(Equals<ScanOpT, Sum>::VALUE) ? "Sum" : "Scan",
num_items,
typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT),
(gen_mode == RANDOM) ? "RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS");
fflush(stdout);
// Allocate host arrays
InputT* h_in = new InputT[num_items];
OutputT* h_reference = new OutputT[num_items];
// Initialize problem and solution
Initialize(gen_mode, h_in, num_items);
Solve(h_in, h_reference, num_items, scan_op, initial_value);
// Allocate problem device arrays
InputT *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
// Initialize device input
( run in 0.880 second using v1.01-cache-2.11-cpan-39bf76dae61 )