Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/test/test_block_scan.cu view on Meta::CPAN
(SCAN_MODE == INCLUSIVE) ? "Inclusive" : "Exclusive", typeid(ScanOpT).name(),
BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
ITEMS_PER_THREAD, TILE_SIZE,
typeid(T).name(), (int) sizeof(T));
fflush(stdout);
// Initialize/clear device arrays
T *d_in = NULL;
T *d_out = NULL;
T *d_aggregate = NULL;
clock_t *d_elapsed = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long)));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TILE_SIZE));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * (TILE_SIZE + 2)));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_aggregate, sizeof(T) * BLOCK_THREADS));
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * TILE_SIZE, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * (TILE_SIZE + 1)));
CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * BLOCK_THREADS));
// Display input problem data
if (g_verbose)
{
printf("Input data: ");
for (int i = 0; i < TILE_SIZE; i++)
{
std::cout << CoutCast(h_in[i]) << ", ";
}
printf("\n\n");
}
// Run block_aggregate/prefix kernel
dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
BlockScanKernel<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM><<<1, block_dims>>>(
d_in,
d_out,
d_aggregate,
scan_op,
initial_value,
d_elapsed);
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
// Copy out and display results
printf("\tScan results: ");
int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
if (TEST_MODE == AGGREGATE)
{
// Copy out and display block_aggregate
printf("\tScan block aggregate: ");
compare = CompareDeviceResults(h_aggregate, d_aggregate, BLOCK_THREADS, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
}
if (TEST_MODE == PREFIX)
{
// Copy out and display updated prefix
printf("\tScan running total: ");
T running_total = scan_op(initial_value, block_aggregate);
compare = CompareDeviceResults(&running_total, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
}
printf("\tElapsed clocks: ");
DisplayDeviceResults(d_elapsed, 1);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (h_aggregate) delete[] h_aggregate;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_aggregate) CubDebugExit(g_allocator.DeviceFree(d_aggregate));
if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
}
/**
* Test threadblock scan. (Specialized for insufficient resources)
*
* Deliberate no-op: the body is empty, so calling this overload generates no
* input, launches no kernel and performs no comparison. It is selected by
* overload resolution when the fourth argument is an Int2Type<false> tag.
*
* The template parameter list and the first three function parameters match
* the three-argument Test(gen_mode, scan_op, initial_value) overload that
* follows in this file, so a call can be routed here by adding only the tag.
*
* NOTE(review): presumably the caller passes Int2Type<false> when the
* requested block configuration cannot be run on the target device; the
* dispatch site is not visible in this excerpt -- confirm against the caller.
*
* @param gen_mode              Ignored.
* @param scan_op               Ignored.
* @param initial_value         Ignored.
* @param sufficient_resources  Tag used only to select this overload; its
*                              value is never read.
*/
template <
int BLOCK_DIM_X,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
int ITEMS_PER_THREAD,
ScanMode SCAN_MODE,
TestMode TEST_MODE,
BlockScanAlgorithm ALGORITHM,
typename ScanOpT,
typename T>
void Test(
GenMode gen_mode,
ScanOpT scan_op,
T initial_value,
Int2Type<false> sufficient_resources)
{}
/**
* Test threadblock scan.
*/
template <
int BLOCK_DIM_X,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
int ITEMS_PER_THREAD,
ScanMode SCAN_MODE,
TestMode TEST_MODE,
BlockScanAlgorithm ALGORITHM,
typename ScanOpT,
typename T>
void Test(
GenMode gen_mode,
ScanOpT scan_op,
T initial_value)
{
( run in 0.669 second using v1.01-cache-2.11-cpan-39bf76dae61 )