Alien-XGBoost
xgboost/cub/cub/block/block_reduce.cuh
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
*/
#pragma once
#include "specializations/block_reduce_raking.cuh"
#include "specializations/block_reduce_raking_commutative_only.cuh"
#include "specializations/block_reduce_warp_reductions.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../thread/thread_operators.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Algorithmic variants
******************************************************************************/
/**
* BlockReduceAlgorithm enumerates alternative algorithms for parallel
* reduction across a CUDA threadblock.
*/
enum BlockReduceAlgorithm
{
/**
* \par Overview
* An efficient "raking" reduction algorithm that only supports commutative
* reduction operators (true for most operations, e.g., addition).
*
* \par
* Execution consists of three phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Threads in warps other than the first warp place
* their partial reductions into shared memory.
* -# Upsweep sequential reduction in shared memory. Threads within the first
* warp continue to accumulate by raking across segments of shared partial reductions.
* -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
*
* \par
* \image html block_reduce.png
* <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
*
* \par Performance Considerations
* - This variant performs less communication than BLOCK_REDUCE_RAKING
* and is preferable when the reduction operator is commutative. This variant
* applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
* throughput across the GPU when suitably occupied. However, turn-around latency may be
* higher than that of BLOCK_REDUCE_WARP_REDUCTIONS, making this variant less desirable
* when the GPU is under-occupied.
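*
* \par
* The snippet below is an illustrative sketch (not from the original CUB
* documentation) of explicitly requesting this variant for a commutative sum,
* with each of 128 threads contributing one integer:
* \code
* #include <cub/cub.cuh>
*
* __global__ void CommutativeSumKernel(...)
* {
*     // Explicitly select the commutative-only raking variant
*     typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduce;
*
*     // Shared memory required by the collective
*     __shared__ typename BlockReduce::TempStorage temp_storage;
*
*     // Each thread contributes a single item
*     int thread_data = ...;
*
*     // Block-wide sum (the result is only valid in thread0)
*     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
* }
* \endcode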
*/
BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
/**
* \par Overview
* An efficient "raking" reduction algorithm that supports commutative
* (e.g., addition) and non-commutative (e.g., string concatenation) reduction
* operators. \blocked.
*
* \par
* Execution consists of three phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Each thread then places the partial reduction
* of its item(s) into shared memory.
* -# Upsweep sequential reduction in shared memory. Threads within a
* single warp rake across segments of shared partial reductions.
* -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
*
* \par
* \image html block_reduce.png
* <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
*
* \par Performance Considerations
* - This variant performs more communication than BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
* and is only preferable when the reduction operator is non-commutative. This variant
* applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
* throughput across the GPU when suitably occupied. However, turn-around latency may be
* higher than that of BLOCK_REDUCE_WARP_REDUCTIONS, making this variant less desirable
* when the GPU is under-occupied.
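*
* \par
* As an illustrative sketch (not from the original CUB documentation), the
* generic Reduce() entry point can pair this variant with a user-defined
* combining functor. Because this variant tolerates non-commutative operators,
* the hypothetical functor below, which keeps the leftmost non-negative value
* in thread-rank order, is safe here:
* \code
* #include <cub/cub.cuh>
*
* // Hypothetical associative but non-commutative combiner:
* // keep the left operand unless it is negative
* struct KeepLeftValid
* {
*     __device__ int operator()(const int &a, const int &b) const
*     {
*         return (a >= 0) ? a : b;
*     }
* };
*
* __global__ void NonCommutativeKernel(...)
* {
*     typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING> BlockReduce;
*     __shared__ typename BlockReduce::TempStorage temp_storage;
*
*     int thread_data = ...;
*
*     // First non-negative value across the block (result valid in thread0)
*     int first_valid = BlockReduce(temp_storage).Reduce(thread_data, KeepLeftValid());
* }
* \endcode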
*/
BLOCK_REDUCE_RAKING,
/**
* \par Overview
* A quick "tiled warp-reductions" reduction algorithm that supports commutative
* (e.g., addition) and non-commutative (e.g., string concatenation) reduction
* operators.
*
* \par
* Execution consists of three phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Each thread then places the partial reduction
* of its item(s) into shared memory.
* -# A shallow, but inefficient, warp-synchronous Kogge-Stone style
* reduction within each warp.
* -# A propagation phase where the warp reduction outputs in each warp are
* updated with the aggregate from each preceding warp.
*
* \par
* \image html block_scan_warpscans.png
* <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock.</div>
*
* \par Performance Considerations
* - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
* or BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, which may result in lower overall
* throughput across the GPU. However, turn-around latency may be lower,
* making this variant useful when the GPU is under-occupied.
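*
* \par
* As an illustrative sketch (not from the original CUB documentation), this
* variant is the default, so a plain two-parameter specialization uses it; the
* num_valid overload of Sum() shown below reduces a partially full tile in
* which only the first num_valid threads carry live inputs:
* \code
* #include <cub/cub.cuh>
*
* __global__ void PartialTileKernel(int num_valid, ...)
* {
*     // The default ALGORITHM is cub::BLOCK_REDUCE_WARP_REDUCTIONS
*     typedef cub::BlockReduce<int, 128> BlockReduce;
*     __shared__ typename BlockReduce::TempStorage temp_storage;
*
*     int thread_data = ...;
*
*     // Sum over the first num_valid threads only (result valid in thread0)
*     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
* }
* \endcode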
*/
BLOCK_REDUCE_WARP_REDUCTIONS,
};
/******************************************************************************
* Block reduce
******************************************************************************/
/**
* \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. 
* \ingroup BlockModule
*
* \tparam T Data type being reduced
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam ALGORITHM <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
*
* \par Overview
* - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
* uses a binary combining operator to compute a single aggregate from a list of input elements.
* - \rowmajor
* - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
* -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
* -# <b>cub::BLOCK_REDUCE_RAKING</b>. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
* -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
*
* \par Performance Considerations
* - \granularity
* - Very efficient (only one synchronization barrier).
* - Incurs zero bank conflicts for most types
* - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
* - Summation (<b><em>vs.</em></b> generic reduction)
* - \p BLOCK_THREADS is a multiple of the architecture's warp size
* - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
* - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
*
* \par A Simple Example
* \blockcollective{BlockReduce}
* \par
* The code snippet below illustrates a sum reduction of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_reduce.cuh>
*
* __global__ void ExampleKernel(...)
* {
*     // Specialize BlockReduce for a 1D block of 128 threads on type int
*     typedef cub::BlockReduce<int, 128> BlockReduce;
*
*     // Allocate shared memory for BlockReduce
*     __shared__ typename BlockReduce::TempStorage temp_storage;
*
*     // Obtain a segment of consecutive items that are blocked across threads
*     int thread_data[4];
*     ...
*
*     // Compute the block-wide sum for thread0
*     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
*
* }
* \endcode
*
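* \par
* As an illustrative sketch (not from the original CUB documentation), the
* optional template parameters can also specialize BlockReduce for a
* multidimensional thread block and a non-default algorithm, e.g., a
* 32x4 (128-thread) 2D block using the raking variant:
* \par
* \code
* typedef cub::BlockReduce<float, 32, cub::BLOCK_REDUCE_RAKING, 4> BlockReduce2D;
* \endcode
*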
*/
template <
    typename             T,
    int                  BLOCK_DIM_X,
    BlockReduceAlgorithm ALGORITHM   = BLOCK_REDUCE_WARP_REDUCTIONS,
    int                  BLOCK_DIM_Y = 1,
    int                  BLOCK_DIM_Z = 1,
    int                  PTX_ARCH    = CUB_PTX_ARCH>
class BlockReduce
{
private:
/******************************************************************************
* Constants and type definitions
******************************************************************************/
/// Constants
enum
{
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
};
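/// Internal specializations that implement the three BlockReduceAlgorithm
/// variants; the ALGORITHM template parameter selects among them at compile time.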
typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpReductions;
typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> RakingCommutativeOnly;
typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> Raking;