Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/test/test_block_reduce.cu view on Meta::CPAN
{
return block_reduce.Sum(data[0]);
}
/// Sum reduction (full, ITEMS_PER_THREAD)
template <typename BlockReduceT, typename T, int ITEMS_PER_THREAD>
__device__ __forceinline__ T DeviceTest(
BlockReduceT &block_reduce, T (&data)[ITEMS_PER_THREAD], Sum &reduction_op)
{
return block_reduce.Sum(data);
}
/// Sum reduction (partial, 1)
template <typename BlockReduceT, typename T>
__device__ __forceinline__ T DeviceTest(
BlockReduceT &block_reduce, T &data, Sum &reduction_op, int valid_threads)
{
return block_reduce.Sum(data, valid_threads);
}
/**
* Test full-tile reduction kernel (where num_items is an even
* multiple of BLOCK_THREADS)
*/
template <
BlockReduceAlgorithm ALGORITHM,
int BLOCK_DIM_X,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
int ITEMS_PER_THREAD,
typename T,
typename ReductionOp>
__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
__global__ void FullTileReduceKernel(
T *d_in,
T *d_out,
ReductionOp reduction_op,
int tiles,
clock_t *d_elapsed)
{
const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
// Cooperative threadblock reduction utility type (returns aggregate in thread 0)
typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT;
// Allocate temp storage in shared memory
__shared__ typename BlockReduceT::TempStorage temp_storage;
int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
// Per-thread tile data
T data[ITEMS_PER_THREAD];
// Load first tile of data
int block_offset = 0;
if (block_offset < TILE_SIZE * tiles)
{
LoadDirectBlocked(linear_tid, d_in + block_offset, data);
block_offset += TILE_SIZE;
// Start cycle timer
clock_t start = clock();
// Cooperative reduce first tile
BlockReduceT block_reduce(temp_storage) ;
T block_aggregate = DeviceTest(block_reduce, data, reduction_op);
// Stop cycle timer
#if CUB_PTX_ARCH == 100
// Bug: recording stop clock causes mis-write of running prefix value
clock_t stop = 0;
#else
clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100
clock_t elapsed = (start > stop) ? start - stop : stop - start;
// Loop over input tiles
while (block_offset < TILE_SIZE * tiles)
{
// TestBarrier between threadblock reductions
__syncthreads();
// Load tile of data
LoadDirectBlocked(linear_tid, d_in + block_offset, data);
block_offset += TILE_SIZE;
// Start cycle timer
clock_t start = clock();
// Cooperatively reduce the tile's aggregate
BlockReduceT block_reduce(temp_storage) ;
T tile_aggregate = DeviceTest(block_reduce, data, reduction_op);
// Stop cycle timer
#if CUB_PTX_ARCH == 100
// Bug: recording stop clock causes mis-write of running prefix value
clock_t stop = 0;
#else
clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100
elapsed += (start > stop) ? start - stop : stop - start;
// Reduce threadblock aggregate
block_aggregate = reduction_op(block_aggregate, tile_aggregate);
}
// Store data
if (linear_tid == 0)
{
d_out[0] = block_aggregate;
*d_elapsed = elapsed;
}
}
}
/**
* Test partial-tile reduction kernel (where num_items < BLOCK_THREADS)
*/
template <
BlockReduceAlgorithm ALGORITHM,
int BLOCK_DIM_X,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
typename T,
typename ReductionOp>
__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
__global__ void PartialTileReduceKernel(
T *d_in,
T *d_out,
int num_items,
ReductionOp reduction_op,
clock_t *d_elapsed)
{
// Cooperative threadblock reduction utility type (returns aggregate only in thread-0)
typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT;
// Allocate temp storage in shared memory
__shared__ typename BlockReduceT::TempStorage temp_storage;
int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
// Per-thread tile data
( run in 0.777 second using v1.01-cache-2.11-cpan-39bf76dae61 )