Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/block/block_reduce.cuh view on Meta::CPAN
template <
typename T,
int BLOCK_DIM_X,
BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
class BlockReduce
{
private:
/******************************************************************************
* Constants and type definitions
******************************************************************************/
/// Constants
enum
{
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
};
typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpReductions;
typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> RakingCommutativeOnly;
typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> Raking;
/// Internal specialization type
typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
WarpReductions,
typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
RakingCommutativeOnly,
Raking>::Type>::Type InternalBlockReduce; // BlockReduceRaking
/// Shared memory storage layout type for BlockReduce
typedef typename InternalBlockReduce::TempStorage _TempStorage;
/******************************************************************************
* Utility methods
******************************************************************************/
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
unsigned int linear_tid;
public:
/// \smemstorage{BlockReduce}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage.
*/
__device__ __forceinline__ BlockReduce()
:
temp_storage(PrivateStorage()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage.
*/
__device__ __forceinline__ BlockReduce(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
//@} end member group
/******************************************************************//**
* \name Generic reductions
*********************************************************************/
//@{
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor. Each thread contributes one input element.
*
* \par
* - The return value is undefined in threads other than thread<sub>0</sub>.
* - \rowmajor
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates a max reduction of 128 integer items that
* are partitioned across 128 threads.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockReduce for a 1D block of 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
( run in 1.434 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )