Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN
Backend BACKEND,
typename DeviceKeyInputIteratorT,
typename DeviceValueInputIteratorT,
typename KeyT,
typename ValueT,
typename EqualityOpT,
typename ReductionOpT>
void Test(
    DeviceKeyInputIteratorT     d_keys_in,
    DeviceValueInputIteratorT   d_values_in,
    KeyT*                       h_keys_reference,
    ValueT*                     h_values_reference,
    EqualityOpT                 equality_op,
    ReductionOpT                reduction_op,
    int                         num_segments,
    int                         num_items)
{
    // Runs one reduce-by-key problem through Dispatch() for the selected
    // BACKEND, checks the device results against host references, prints
    // PASS/FAIL plus timing figures, and finally asserts that all checks passed.
    //
    //   d_keys_in / d_values_in   input iterators forwarded to Dispatch()
    //   h_keys_reference          host reference keys; the first num_segments
    //                             entries are compared against d_keys_out
    //   h_values_reference        host reference aggregates; the first
    //                             num_segments entries are compared against
    //                             d_values_out
    //   equality_op, reduction_op functors forwarded to Dispatch()
    //   num_segments              expected run count (compared to d_num_runs)
    //   num_items                 number of input items; also the capacity of
    //                             both output arrays

    // Allocate device output arrays and the device-side run counter
    KeyT*   d_keys_out      = NULL;
    ValueT* d_values_out    = NULL;
    int*    d_num_runs      = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));

    // Allocate CDP device arrays (single-element slots handed to Dispatch();
    // presumably only consumed by the CDP backend -- confirm against Dispatch)
    size_t      *d_temp_storage_bytes   = NULL;
    cudaError_t *d_cdp_error            = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));

    // Allocate temporary storage. d_temp_storage is still NULL for this first
    // Dispatch() call; the allocation that follows relies on the call having
    // written the required size into temp_storage_bytes.
    void    *d_temp_storage     = NULL;
    size_t  temp_storage_bytes  = 0;
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, true));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Clear device output arrays so stale data cannot satisfy the comparison
    CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items));
    CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items));
    CubDebugExit(cudaMemset(d_num_runs, 0, sizeof(int)));

    // Run warmup/correctness iteration
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, true));

    // Check for correctness (and display results, if specified).
    // A non-zero compare result is reported as FAIL.
    int compare1 = CompareDeviceResults(h_keys_reference, d_keys_out, num_segments, true, g_verbose);
    printf("\t Keys %s ", compare1 ? "FAIL" : "PASS");

    int compare2 = CompareDeviceResults(h_values_reference, d_values_out, num_segments, true, g_verbose);
    printf("\t Values %s ", compare2 ? "FAIL" : "PASS");

    int compare3 = CompareDeviceResults(&num_segments, d_num_runs, 1, true, g_verbose);
    printf("\t Count %s ", compare3 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance: a single Dispatch() call is timed, with g_timing_iterations
    // passed in the position where the earlier calls passed 1
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, false));
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis    = elapsed_millis / g_timing_iterations;
        float giga_rate     = float(num_items) / avg_millis / 1000.0f / 1000.0f;

        // Logical traffic: every input item is read and every segment result
        // written, for both keys and values. Accumulate in size_t (widening
        // before the add/multiply) so large problems do not wrap a 32-bit int
        // and produce a bogus bandwidth figure.
        size_t bytes_moved  = (size_t(num_items) + size_t(num_segments)) * (sizeof(KeyT) + sizeof(ValueT));
        float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;

        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }
    printf("\n\n");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Cleanup (done before the assert below so device memory is released
    // ahead of the final pass/fail check)
    if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out));
    if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out));
    if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts: all three comparisons must have returned zero
    AssertEquals(0, compare1 | compare2 | compare3);
}
/**
* Test cub::DeviceReduce::ReduceByKey on pointer-type inputs
*/
template <
Backend BACKEND,
typename KeyT,
typename ValueT,
typename ReductionOpT>
void TestPointer(
int num_items,
int entropy_reduction,
int max_segment,
ReductionOpT reduction_op)
{
// Allocate host arrays
KeyT* h_keys_in = new KeyT[num_items];
KeyT* h_keys_reference = new KeyT[num_items];
ValueT* h_values_in = new ValueT[num_items];
ValueT* h_values_reference = new ValueT[num_items];
for (int i = 0; i < num_items; ++i)
InitValue(INTEGER_SEED, h_values_in[i], 1);
// Initialize problem and solution
Equality equality_op;
Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
printf("\nPointer %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
(BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
( run in 0.625 second using v1.01-cache-2.11-cpan-524268b4103 )