Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/block/block_reduce.cuh view on Meta::CPAN
* when the GPU is under-occupied.
*/
BLOCK_REDUCE_RAKING,
/**
* \par Overview
* A quick "tiled warp-reductions" reduction algorithm that supports commutative
* (e.g., addition) and non-commutative (e.g., string concatenation) reduction
* operators.
*
* \par
* Execution is comprised of four phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Each thread then places the partial reduction
* of its item(s) into shared memory.
* -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
* reduction within each warp.
* -# A propagation phase where the warp reduction outputs in each warp are
* updated with the aggregate from each preceding warp.
*
* \par
* \image html block_scan_warpscans.png
* <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
*
* \par Performance Considerations
* - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
* or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall
* throughput across the GPU. However turn-around latency may be lower and
* thus useful when the GPU is under-occupied.
*/
BLOCK_REDUCE_WARP_REDUCTIONS,
};
/******************************************************************************
* Block reduce
******************************************************************************/
/**
* \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. 
* \ingroup BlockModule
*
* \tparam T Data type being reduced
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam ALGORITHM <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
*
* \par Overview
* - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
* uses a binary combining operator to compute a single aggregate from a list of input elements.
* - \rowmajor
* - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
* -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
* -# <b>cub::BLOCK_REDUCE_RAKING</b>. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
* -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
*
* \par Performance Considerations
* - \granularity
* - Very efficient (only one synchronization barrier).
* - Incurs zero bank conflicts for most types
* - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
* - Summation (<b><em>vs.</em></b> generic reduction)
* - \p BLOCK_THREADS is a multiple of the architecture's warp size
* - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
* - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
*
* \par A Simple Example
* \blockcollective{BlockReduce}
* \par
* The code snippet below illustrates a sum reduction of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockReduce for a 1D block of 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Compute the block-wide sum for thread0
* int aggregate = BlockReduce(temp_storage).Sum(thread_data);
*
* \endcode
*
*/
template <
typename T,
int BLOCK_DIM_X,
BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
class BlockReduce
{
private:
/******************************************************************************
* Constants and type definitions
******************************************************************************/
/// Constants
enum
{
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
};
typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpReductions;
typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> RakingCommutativeOnly;
xgboost/cub/cub/block/block_reduce.cuh view on Meta::CPAN
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
//@} end member group
/******************************************************************//**
* \name Generic reductions
*********************************************************************/
//@{
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor. Each thread contributes one input element.
*
* \par
* - The return value is undefined in threads other than thread<sub>0</sub>.
* - \rowmajor
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates a max reduction of 128 integer items that
* are partitioned across 128 threads.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockReduce for a 1D block of 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
*
* // Each thread obtains an input item
* int thread_data;
* ...
*
* // Compute the block-wide max for thread0
* int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
*
* \endcode
*
* \tparam ReductionOp <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <typename ReductionOp>
__device__ __forceinline__ T Reduce(
T input, ///< [in] Calling thread's input
ReductionOp reduction_op) ///< [in] Binary reduction functor
{
return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
}
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor. Each thread contributes an array of consecutive input elements.
*
* \par
* - The return value is undefined in threads other than thread<sub>0</sub>.
* - \granularity
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates a max reduction of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockReduce for a 1D block of 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Compute the block-wide max for thread0
* int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
*
* \endcode
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam ReductionOp <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int ITEMS_PER_THREAD,
typename ReductionOp>
__device__ __forceinline__ T Reduce(
T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment
ReductionOp reduction_op) ///< [in] Binary reduction functor
{
// Reduce partials
T partial = ThreadReduce(inputs, reduction_op);
return Reduce(partial, reduction_op);
}
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor. The first \p num_valid threads each contribute one input element.
*
* \par
* - The return value is undefined in threads other than thread<sub>0</sub>.
* - \rowmajor
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates a max reduction of a partially-full tile of integer items that
* are partitioned across 128 threads.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
*
* __global__ void ExampleKernel(int num_valid, ...)
* {
xgboost/cub/cub/block/block_reduce.cuh view on Meta::CPAN
return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
}
else
{
return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
}
}
//@} end member group
/******************************************************************//**
* \name Summation reductions
*********************************************************************/
//@{
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes one input element.
*
* \par
* - The return value is undefined in threads other than thread<sub>0</sub>.
* - \rowmajor
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates a sum reduction of 128 integer items that
* are partitioned across 128 threads.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockReduce for a 1D block of 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
*
* // Each thread obtains an input item
* int thread_data;
* ...
*
* // Compute the block-wide sum for thread0
* int aggregate = BlockReduce(temp_storage).Sum(thread_data);
*
* \endcode
*
*/
__device__ __forceinline__ T Sum(
T input) ///< [in] Calling thread's input
{
return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
}
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
*
* \par
* - The return value is undefined in threads other than thread<sub>0</sub>.
* - \granularity
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates a sum reduction of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockReduce for a 1D block of 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Compute the block-wide sum for thread0
* int aggregate = BlockReduce(temp_storage).Sum(thread_data);
*
* \endcode
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
*/
template <int ITEMS_PER_THREAD>
__device__ __forceinline__ T Sum(
T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment
{
// Reduce partials
T partial = ThreadReduce(inputs, cub::Sum());
return Sum(partial);
}
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element.
*
* \par
* - The return value is undefined in threads other than thread<sub>0</sub>.
* - \rowmajor
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
* are partitioned across 128 threads.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
*
* __global__ void ExampleKernel(int num_valid, ...)
* {
* // Specialize BlockReduce for a 1D block of 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
( run in 1.334 second using v1.01-cache-2.11-cpan-75ffa21a3d4 )