Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/warp/warp_reduce.cuh view on Meta::CPAN
* __shared__ typename WarpReduce::TempStorage temp_storage[4];
*
* // Obtain one input item per thread
* int thread_data = ...
*
* // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
* int warp_id = threadIdx.x / 32;
* int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
* The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520,
* \p 2544, and \p 3568, respectively (and is undefined in other threads).
*
* \par
* The code snippet below illustrates a single warp sum reduction within a block of
* 128 threads.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize WarpReduce for type int
* typedef cub::WarpReduce<int> WarpReduce;
*
* // Allocate WarpReduce shared memory for one warp
* __shared__ typename WarpReduce::TempStorage temp_storage;
* ...
*
* // Only the first warp performs a reduction
* if (threadIdx.x < 32)
* {
* // Obtain one input item per thread
* int thread_data = ...
*
* // Return the warp-wide sum to lane0
* int aggregate = WarpReduce(temp_storage).Sum(thread_data);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the warp of threads is <tt>{0, 1, 2, 3, ..., 31}</tt>.
* The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
*
*/
template <
typename T,
int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
int PTX_ARCH = CUB_PTX_ARCH>
class WarpReduce
{
private:
/******************************************************************************
* Constants and type definitions
******************************************************************************/
enum
{
/// Whether the logical warp size and the PTX warp size coincide
IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
/// Whether the logical warp size is a power-of-two
IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
};
public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpReduce;
#endif // DOXYGEN_SHOULD_SKIP_THIS
private:
/// Shared memory storage layout type for WarpReduce
typedef typename InternalWarpReduce::TempStorage _TempStorage;
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/******************************************************************************
* Utility methods
******************************************************************************/
public:
/// \smemstorage{WarpReduce}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ WarpReduce(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias())
{}
//@} end member group
/******************************************************************//**
( run in 2.145 seconds using v1.01-cache-2.11-cpan-5837b0d9d2c )