Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(ValueT) * num_items));
}
// Allocate temporary storage (and make it un-aligned)
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
CubDebugExit(Dispatch(
Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes, d_keys, d_values,
num_items, num_segments, d_segment_offsets,
begin_bit, end_bit, 0, true));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + 1));
void* mis_aligned_temp = static_cast<char*>(d_temp_storage) + 1;
// Initialize/clear device arrays
d_keys.selector = 0;
CubDebugExit(cudaMemcpy(d_keys.d_buffers[0], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_keys.d_buffers[1], 0, sizeof(KeyT) * num_items));
if (!KEYS_ONLY)
{
d_values.selector = 0;
CubDebugExit(cudaMemcpy(d_values.d_buffers[0], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_values.d_buffers[1], 0, sizeof(ValueT) * num_items));
}
CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(int) * (num_segments + 1), cudaMemcpyHostToDevice));
// Run warmup/correctness iteration
CubDebugExit(Dispatch(
Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
num_items, num_segments, d_segment_offsets,
begin_bit, end_bit, 0, true));
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Check for correctness (and display results, if specified)
printf("Warmup done. Checking results:\n"); fflush(stdout);
int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose);
printf("\t Compare keys (selector %d): %s ", d_keys.selector, compare ? "FAIL" : "PASS"); fflush(stdout);
if (!KEYS_ONLY)
{
int values_compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
compare |= values_compare;
printf("\t Compare values (selector %d): %s ", d_values.selector, values_compare ? "FAIL" : "PASS"); fflush(stdout);
}
if (BACKEND == CUB_NO_OVERWRITE)
{
// Check that input isn't overwritten
int input_compare = CompareDeviceResults(h_keys, d_keys.d_buffers[0], num_items, true, g_verbose);
compare |= input_compare;
printf("\t Compare input keys: %s ", input_compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Performance
if (g_timing_iterations)
printf("\nPerforming timing iterations:\n"); fflush(stdout);
GpuTimer gpu_timer;
float elapsed_millis = 0.0f;
for (int i = 0; i < g_timing_iterations; ++i)
{
// Initialize/clear device arrays
CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
if (!KEYS_ONLY)
{
CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
}
gpu_timer.Start();
CubDebugExit(Dispatch(
Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
num_items, num_segments, d_segment_offsets,
begin_bit, end_bit, 0, false));
gpu_timer.Stop();
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = (KEYS_ONLY) ?
giga_rate * sizeof(KeyT) * 2 :
giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth);
}
printf("\n\n");
// Cleanup
if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
if (d_selector) CubDebugExit(g_allocator.DeviceFree(d_selector));
if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
// Correctness asserts
AssertEquals(0, compare);
}
/**
* Test backend
*/
template <bool IS_DESCENDING, typename KeyT, typename ValueT>
void TestBackend(
KeyT *h_keys,
int num_items,
int num_segments,
int *h_segment_offsets,
int begin_bit,
int end_bit,
KeyT *h_reference_keys,
int *h_reference_ranks)
{
const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
ValueT *h_values = NULL;
ValueT *h_reference_values = NULL;
if (!KEYS_ONLY)
{
h_values = new ValueT[num_items];
h_reference_values = new ValueT[num_items];
for (int i = 0; i < num_items; ++i)
{
InitValue(INTEGER_SEED, h_values[i], i);
InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
}
( run in 0.483 second using v1.01-cache-2.11-cpan-39bf76dae61 )