Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/test/test_block_scan.cu view on Meta::CPAN
/// Inclusive sum (PREFIX, 1)
template <typename BlockScanT, typename T, typename PrefixCallbackOp>
__device__ __forceinline__ void DeviceTest(
BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
{
block_scan.InclusiveSum(data[0], data[0], prefix_op);
}
/// Inclusive sum (PREFIX, ITEMS_PER_THREAD)
template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
__device__ __forceinline__ void DeviceTest(
BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
{
block_scan.InclusiveSum(data, data, prefix_op);
}
//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------
/**
* BlockScan test kernel.
*/
template <
int BLOCK_DIM_X,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
int ITEMS_PER_THREAD,
ScanMode SCAN_MODE,
TestMode TEST_MODE,
BlockScanAlgorithm ALGORITHM,
typename T,
typename ScanOpT>
__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
__global__ void BlockScanKernel(
T *d_in,
T *d_out,
T *d_aggregate,
ScanOpT scan_op,
T initial_value,
clock_t *d_elapsed)
{
const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
// Parameterize BlockScan type for our thread block
typedef BlockScan<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockScanT;
// Allocate temp storage in shared memory
__shared__ typename BlockScanT::TempStorage temp_storage;
int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
// Per-thread tile data
T data[ITEMS_PER_THREAD];
LoadDirectBlocked(linear_tid, d_in, data);
__threadfence_block(); // workaround to prevent clock hoisting
clock_t start = clock();
__threadfence_block(); // workaround to prevent clock hoisting
// Test scan
T block_aggregate;
BlockScanT block_scan(temp_storage);
BlockPrefixCallbackOp<T, ScanOpT> prefix_op(linear_tid, initial_value, scan_op);
DeviceTest(block_scan, data, initial_value, scan_op, block_aggregate, prefix_op,
Int2Type<SCAN_MODE>(), Int2Type<TEST_MODE>(), Int2Type<Traits<T>::PRIMITIVE>());
// Stop cycle timer
__threadfence_block(); // workaround to prevent clock hoisting
clock_t stop = clock();
__threadfence_block(); // workaround to prevent clock hoisting
// Store output
StoreDirectBlocked(linear_tid, d_out, data);
// Store block_aggregate
if (TEST_MODE != BASIC)
d_aggregate[linear_tid] = block_aggregate;
// Store prefix
if (TEST_MODE == PREFIX)
{
if (linear_tid == 0)
d_out[TILE_SIZE] = prefix_op.prefix;
}
// Store time
if (linear_tid == 0)
*d_elapsed = (start > stop) ? start - stop : stop - start;
}
//---------------------------------------------------------------------
// Host utility subroutines
//---------------------------------------------------------------------
/**
* Initialize exclusive-scan problem (and solution)
*/
template <typename T, typename ScanOpT>
T Initialize(
GenMode gen_mode,
T *h_in,
T *h_reference,
int num_items,
ScanOpT scan_op,
T initial_value,
Int2Type<EXCLUSIVE>)
{
InitValue(gen_mode, h_in[0], 0);
T block_aggregate = h_in[0];
h_reference[0] = initial_value;
T inclusive = scan_op(initial_value, h_in[0]);
for (int i = 1; i < num_items; ++i)
{
InitValue(gen_mode, h_in[i], i);
h_reference[i] = inclusive;
inclusive = scan_op(inclusive, h_in[i]);
block_aggregate = scan_op(block_aggregate, h_in[i]);
}
return block_aggregate;
}
/**
* Initialize inclusive-scan problem (and solution)
*/
template <typename T, typename ScanOpT>
T Initialize(
GenMode gen_mode,
( run in 1.262 second using v1.01-cache-2.11-cpan-39bf76dae61 )