Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/tune/tune_device_reduce.cu view on Meta::CPAN
//---------------------------------------------------------------------
// Test methods
//---------------------------------------------------------------------
/**
* Test a configuration
*/
void TestConfiguration(
MultiDispatchTuple &multi_dispatch,
SingleDispatchTuple &single_dispatch,
T* d_in,
T* d_out,
T* h_reference,
OffsetT num_items,
ReductionOp reduction_op)
{
// Clear output
if (g_verify) CubDebugExit(cudaMemset(d_out, 0, sizeof(T)));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceReduce::Dispatch(
d_temp_storage,
temp_storage_bytes,
multi_dispatch.kernel_ptr,
single_dispatch.kernel_ptr,
FillAndResetDrainKernel<OffsetT>,
multi_dispatch.params,
single_dispatch.params,
d_in,
d_out,
num_items,
reduction_op));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Warmup/correctness iteration
CubDebugExit(DeviceReduce::Dispatch(
d_temp_storage,
temp_storage_bytes,
multi_dispatch.kernel_ptr,
single_dispatch.kernel_ptr,
FillAndResetDrainKernel<OffsetT>,
multi_dispatch.params,
single_dispatch.params,
d_in,
d_out,
num_items,
reduction_op));
if (g_verify) CubDebugExit(cudaDeviceSynchronize());
// Copy out and display results
int compare = (g_verify) ?
CompareDeviceResults(h_reference, d_out, 1, true, false) :
0;
// Performance
GpuTimer gpu_timer;
float elapsed_millis = 0.0;
for (int i = 0; i < g_timing_iterations; i++)
{
gpu_timer.Start();
CubDebugExit(DeviceReduce::Dispatch(
d_temp_storage,
temp_storage_bytes,
multi_dispatch.kernel_ptr,
single_dispatch.kernel_ptr,
FillAndResetDrainKernel<OffsetT>,
multi_dispatch.params,
single_dispatch.params,
d_in,
d_out,
num_items,
reduction_op));
gpu_timer.Stop();
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Mooch
CubDebugExit(cudaDeviceSynchronize());
float avg_elapsed = elapsed_millis / g_timing_iterations;
float avg_throughput = float(num_items) / avg_elapsed / 1000.0 / 1000.0;
float avg_bandwidth = avg_throughput * sizeof(T);
multi_dispatch.avg_throughput = CUB_MAX(avg_throughput, multi_dispatch.avg_throughput);
if (avg_throughput > multi_dispatch.best_avg_throughput)
{
multi_dispatch.best_avg_throughput = avg_throughput;
multi_dispatch.best_size = num_items;
}
single_dispatch.avg_throughput = CUB_MAX(avg_throughput, single_dispatch.avg_throughput);
if (avg_throughput > single_dispatch.best_avg_throughput)
{
single_dispatch.best_avg_throughput = avg_throughput;
single_dispatch.best_size = num_items;
}
if (g_verbose)
{
printf("\t%.2f GB/s, multi_dispatch( ", avg_bandwidth);
multi_dispatch.params.Print();
printf(" ), single_dispatch( ");
single_dispatch.params.Print();
printf(" )\n");
fflush(stdout);
}
AssertEquals(0, compare);
// Cleanup temporaries
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
}
/**
* Evaluate multi-block configurations
*/
void TestMulti(
T* h_in,
T* d_in,
T* d_out,
ReductionOp reduction_op)
{
// Simple single kernel tuple for use with multi kernel sweep
typedef typename DeviceReduce::TunedPolicies<T, OffsetT, TUNE_ARCH>::SinglePolicy SimpleSinglePolicy;
SingleDispatchTuple simple_single_tuple;
simple_single_tuple.params.template Init<SimpleSinglePolicy>();
simple_single_tuple.kernel_ptr = ReduceSingleKernel<SimpleSinglePolicy, T*, T*, OffsetT, ReductionOp>;
double max_exponent = log2(double(g_max_items));
double min_exponent = log2(double(simple_single_tuple.params.tile_size));
unsigned int max_int = (unsigned int) -1;
for (int sample = 0; sample < g_samples; ++sample)
( run in 0.576 second using v1.01-cache-2.11-cpan-39bf76dae61 )