\\g results from the CPAN

Alien-XGBoost

     *   when the GPU is under-occupied.
     */
    BLOCK_REDUCE_RAKING,


    /**
     * \par Overview
     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
     * operators.
     *
     * \par
     * Execution is comprised of three phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more
     *    than one input each).  Each thread then places the partial reduction
     *    of its item(s) into shared memory.
     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
     *    reduction within each warp.
     * -# A propagation phase where the warp reduction outputs in each warp are
     *    updated with the aggregate from each preceding warp.
     *
     * \par
     * \image html block_scan_warpscans.png
     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
     *
     * \par Performance Considerations
     * - This variant applies more reduction operators than
     *   BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY or BLOCK_REDUCE_RAKING, which may
     *   result in lower overall throughput across the GPU.  However, turn-around
     *   latency may be lower, making it useful when the GPU is under-occupied.
     */
    BLOCK_REDUCE_WARP_REDUCTIONS,
};


/******************************************************************************
 * Block reduce
 ******************************************************************************/

/**
 * \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
 * \ingroup BlockModule
 *
 * \tparam T                Data type being reduced
 * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
 * \tparam ALGORITHM        <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
 * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
 * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
 * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
 *
 * \par Overview
 * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
 *   uses a binary combining operator to compute a single aggregate from a list of input elements.
 * - \rowmajor
 * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
 *   -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>.  An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
 *   -# <b>cub::BLOCK_REDUCE_RAKING</b>.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
 *   -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
 *
 * \par Performance Considerations
 * - \granularity
 * - Very efficient (only one synchronization barrier).
 * - Incurs zero bank conflicts for most types
 * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
 *   - Summation (<b><em>vs.</em></b> generic reduction)
 *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
 *   - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
 * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
 *
 * \par A Simple Example
 * \blockcollective{BlockReduce}
 * \par
 * The code snippet below illustrates a sum reduction of 512 integer items that
 * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
 * where each thread owns 4 consecutive items.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize BlockReduce for a 1D block of 128 threads on type int
 *     typedef cub::BlockReduce<int, 128> BlockReduce;
 *
 *     // Allocate shared memory for BlockReduce
 *     __shared__ typename BlockReduce::TempStorage temp_storage;
 *
 *     // Obtain a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     ...
 *
 *     // Compute the block-wide sum for thread0
 *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
 *
 * \endcode
 *
 */
template <
    typename                T,
    int                     BLOCK_DIM_X,
    BlockReduceAlgorithm    ALGORITHM       = BLOCK_REDUCE_WARP_REDUCTIONS,
    int                     BLOCK_DIM_Y     = 1,
    int                     BLOCK_DIM_Z     = 1,
    int                     PTX_ARCH        = CUB_PTX_ARCH>
class BlockReduce
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };

    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>           WarpReductions;
    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>    RakingCommutativeOnly;

xgboost/cub/cub/block/block_reduce.cuh view on Meta::CPAN

        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    //@}  end member group
    /******************************************************************//**
     * \name Generic reductions
     *********************************************************************/
    //@{


    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes one input element.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \rowmajor
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a max reduction of 128 integer items that
     * are partitioned across 128 threads.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Each thread obtains an input item
     *     int thread_data;
     *     ...
     *
     *     // Compute the block-wide max for thread0
     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
     *
     * \endcode
     *
     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
     */
    template <typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,                      ///< [in] The item contributed by the calling thread
        ReductionOp     reduction_op)               ///< [in] Binary functor used to combine items
    {
        // One item per thread: the valid-item count forwarded to the
        // underlying implementation is the full block size (BLOCK_THREADS).
        // NOTE(review): the <true> template argument presumably selects the
        // full-tile code path -- confirm against InternalBlockReduce.
        InternalBlockReduce block_reducer(temp_storage);
        return block_reducer.template Reduce<true>(input, BLOCK_THREADS, reduction_op);
    }


    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a max reduction of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Compute the block-wide max for thread0
     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
     *
     * \endcode
     *
     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
     */
    template <
        int ITEMS_PER_THREAD,
        typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] The segment of items owned by the calling thread
        ReductionOp     reduction_op)                   ///< [in] Binary functor used to combine items
    {
        // Step 1: fold this thread's own items into a single value.
        T thread_aggregate = ThreadReduce(inputs, reduction_op);

        // Step 2: hand that value to the one-item-per-thread overload,
        // which performs the block-wide reduction.
        return Reduce(thread_aggregate, reduction_op);
    }


    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \rowmajor
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
     * are partitioned across 128 threads.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(int num_valid, ...)
     * {

xgboost/cub/cub/block/block_reduce.cuh view on Meta::CPAN

            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
        }
        else
        {
            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
        }
    }


    //@}  end member group
    /******************************************************************//**
     * \name Summation reductions
     *********************************************************************/
    //@{


    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \rowmajor
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a sum reduction of 128 integer items that
     * are partitioned across 128 threads.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Each thread obtains an input item
     *     int thread_data;
     *     ...
     *
     *     // Compute the block-wide sum for thread0
     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
     *
     * \endcode
     *
     */
    __device__ __forceinline__ T Sum(
        T   input)                      ///< [in] The item contributed by the calling thread
    {
        // One item per thread: the valid-item count forwarded to the
        // underlying implementation is the full block size (BLOCK_THREADS).
        // NOTE(review): the <true> template argument presumably selects the
        // full-tile code path -- confirm against InternalBlockReduce.
        InternalBlockReduce block_reducer(temp_storage);
        return block_reducer.template Sum<true>(input, BLOCK_THREADS);
    }

    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a sum reduction of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce
     *     __shared__ typename BlockReduce::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Compute the block-wide sum for thread0
     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
     *
     * \endcode
     *
     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     */
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ T Sum(
        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] The segment of items owned by the calling thread
    {
        // Step 1: add up this thread's own items into a single value.
        T thread_total = ThreadReduce(inputs, cub::Sum());

        // Step 2: hand that value to the one-item-per-thread overload,
        // which performs the block-wide summation.
        return Sum(thread_total);
    }


    /**
     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
     *
     * \par
     * - The return value is undefined in threads other than thread<sub>0</sub>.
     * - \rowmajor
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
     * are partitioned across 128 threads.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
     *
     * __global__ void ExampleKernel(int num_valid, ...)
     * {
     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
     *     typedef cub::BlockReduce<int, 128> BlockReduce;
     *
     *     // Allocate shared memory for BlockReduce

( run in 0.764 second using v1.01-cache-2.11-cpan-acf6aa7dc9e )