Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/experimental/defunct/test_device_seg_reduce.cu view on Meta::CPAN
h_values = new Value[num_values];
vector<OffsetT> segment_offsets;
Initialize(UNIFORM, h_values, segment_offsets, num_values, avg_segment_size);
// Allocate simple offsets array and copy STL vector into it
h_segment_offsets = new OffsetT[segment_offsets.size()];
for (int i = 0; i < segment_offsets.size(); ++i)
h_segment_offsets[i] = segment_offsets[i];
OffsetT num_segments = segment_offsets.size() - 1;
if (g_verbose)
{
printf("%d segment offsets: ", num_segments);
for (int i = 0; i < num_segments; ++i)
std::cout << h_segment_offsets[i] << "(" << h_segment_offsets[i + 1] - h_segment_offsets[i] << "), ";
if (g_verbose) std::cout << std::endl << std::endl;
}
// Solve problem on host
h_reference = new Value[num_segments];
ComputeReference(h_values, h_segment_offsets, h_reference, num_segments, identity);
printf("\n\n%s cub::DeviceSegReduce::%s %d items (%d-byte %s), %d segments (%d-byte offset indices)\n",
(CDP) ? "CDP device invoked" : "Host-invoked",
(Equals<ReductionOp, Sum>::VALUE) ? "Sum" : "Reduce",
num_values, (int) sizeof(Value), type_string,
num_segments, (int) sizeof(OffsetT));
fflush(stdout);
// Allocate and initialize problem on device
Value *d_values = NULL;
OffsetT *d_segment_offsets = NULL;
Value *d_output = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * num_values));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_output, sizeof(Value) * num_segments));
CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_values, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
// Request and allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Clear device output
CubDebugExit(cudaMemset(d_output, 0, sizeof(Value) * num_segments));
// Run warmup/correctness iteration
CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, true));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_output, num_segments, true, g_verbose);
printf("\t%s", compare ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
for (int i = 0; i < g_timing_iterations; ++i)
{
CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
}
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_values) / avg_millis / 1000.0 / 1000.0;
float giga_bandwidth = giga_rate *
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
}
// Device cleanup
if (d_values) CubDebugExit(g_allocator.DeviceFree(d_values));
if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
if (d_output) CubDebugExit(g_allocator.DeviceFree(d_output));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Host cleanup
if (h_values) delete[] h_values;
if (h_segment_offsets) delete[] h_segment_offsets;
if (h_reference) delete[] h_reference;
}
/**
 * Main
 *
 * Entry point of the segmented-reduction test driver.
 *
 * Recognized command-line options (all optional):
 *   --device=<device-id>         forwarded to CommandLineArgs::DeviceInit()
 *   --v                          sets g_verbose
 *   --i=<timing iterations>      overrides g_timing_iterations
 *   --n=<input samples>          overrides num_values (default 32 * 1024 * 1024)
 *   --ss=<average segment size>  overrides avg_segment_size (default 500)
 *   --help                       prints the usage line and exits with status 0
 *
 * Exits with status 1 (before initializing the device) when --n is negative
 * or --ss is not positive; otherwise returns 0 after the test has run.
 */
int main(int argc, char** argv)
{
    int num_values          = 32 * 1024 * 1024;
    int avg_segment_size    = 500;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_values);
    args.GetCmdLineArgument("ss", avg_segment_size);
    args.GetCmdLineArgument("i", g_timing_iterations);

    // Print usage (kept on a single line so every option is part of the synopsis)
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>] "
            "[--v] "
            "[--i=<timing iterations>] "
            "[--n=<input samples>] "
            "[--ss=<average segment size>]\n"
            "\n", argv[0]);
        exit(0);
    }

    // Reject sizes that cannot describe a valid problem before any host or
    // device buffers are sized from them.
    if (num_values < 0)
    {
        fprintf(stderr, "Invalid --n=%d: number of input samples must be non-negative\n", num_values);
        exit(1);
    }
    if (avg_segment_size < 1)
    {
        // NOTE(review): the segment generator is not visible here; a
        // non-positive average is assumed to be unsupported -- confirm.
        fprintf(stderr, "Invalid --ss=%d: average segment size must be positive\n", avg_segment_size);
        exit(1);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Sum-reduce long long values. The <false> template argument presumably
    // selects the host-invoked (non-CDP) path -- TODO confirm against Test.
    Test<false>(num_values, avg_segment_size, Sum(), (long long) 0, CUB_TYPE_STRING(long long));

    return 0;
}
( run in 0.512 second using v1.01-cache-2.11-cpan-39bf76dae61 )