synchro results from the CPAN

Alien-XGBoost
 *
 ******************************************************************************/

/**
 * \file
 * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock.  Supports non-commutative reduction operators.
 */

#pragma once

#include "../../warp/warp_reduce.cuh"
#include "../../util_ptx.cuh"
#include "../../util_arch.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock.  Supports non-commutative reduction operators.
 */
template <
    typename    T,              ///< Data type being reduced
    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
    int         PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
struct BlockReduceWarpReductions
{
    /// Compile-time constants derived from the template parameters
    enum
    {
        /// Total number of threads in the block (product of the three block dimensions)
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

        /// Threads per warp, as given by CUB_WARP_THREADS for PTX_ARCH
        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),

        /// Number of warps covering the block (BLOCK_THREADS / WARP_THREADS, rounded up)
        WARPS = (BLOCK_THREADS + (WARP_THREADS - 1)) / WARP_THREADS,

        /// Logical warp size for warp reductions: the smaller of BLOCK_THREADS and WARP_THREADS
        LOGICAL_WARP_SIZE = (BLOCK_THREADS < WARP_THREADS) ? BLOCK_THREADS : WARP_THREADS,

        /// Non-zero when BLOCK_THREADS is an exact multiple of LOGICAL_WARP_SIZE
        EVEN_WARP_MULTIPLE = !(BLOCK_THREADS % LOGICAL_WARP_SIZE)
    };


    /// WarpReduce utility type: the InternalWarpReduce member of the WarpReduce
    /// template specialized for T, LOGICAL_WARP_SIZE and PTX_ARCH.
    /// Note: inside this struct the name WarpReduce refers to this typedef.
    typedef typename WarpReduce<T, LOGICAL_WARP_SIZE, PTX_ARCH>::InternalWarpReduce WarpReduce;


    /// Shared memory storage layout type
    struct _TempStorage
    {
        typename WarpReduce::TempStorage    warp_reduce[WARPS];                ///< One WarpReduce::TempStorage entry per warp
        T                                   warp_aggregates[WARPS];     ///< Per-warp aggregates; read back by ApplyWarpAggregates when combining warps
        T                                   block_prefix;               ///< Shared prefix for the entire threadblock
    };

    /// Alias wrapper allowing storage to be unioned.
    /// Derives from Uninitialized<_TempStorage>; the constructor of this
    /// collective obtains the underlying _TempStorage reference via Alias().
    struct TempStorage : Uninitialized<_TempStorage> {};


    // Thread fields.
    // Declaration order matters: members are initialized in this order, and
    // the constructor's initializer for warp_id reads linear_tid.
    _TempStorage &temp_storage;     ///< Reference to the aliased temporary storage
    unsigned int linear_tid;        ///< Linear thread id, from RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)
    unsigned int warp_id;           ///< Warp index: 0 when WARPS == 1, otherwise linear_tid / WARP_THREADS
    unsigned int lane_id;           ///< Lane id, as returned by LaneId()


    /**
     * Constructor.  Binds this collective to the caller-provided temporary
     * storage and caches the calling thread's linear id, warp id and lane id.
     */
    __device__ __forceinline__ BlockReduceWarpReductions(TempStorage &temp_storage)
        : temp_storage(temp_storage.Alias()),
          linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
          // With a single warp the id is always 0, so no division is needed
          warp_id((WARPS != 1) ? (linear_tid / WARP_THREADS) : 0),
          lane_id(LaneId())
    {
    }


    /**
     * Recursive step of the warp-aggregate fold.
     *
     * Combines the stored aggregate of warp SUCCESSOR_WARP into
     * \p warp_aggregate (unless the tile is partial and that warp starts at or
     * beyond \p num_valid), then continues with warp SUCCESSOR_WARP + 1.
     * The Int2Type<WARPS> overload ends the recursion.
     */
    template <bool FULL_TILE, typename ReductionOp, int SUCCESSOR_WARP>
    __device__ __forceinline__ T ApplyWarpAggregates(
        ReductionOp                 reduction_op,       ///< [in] Binary reduction operator
        T                           warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Running aggregate accumulated so far
        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
        Int2Type<SUCCESSOR_WARP>    /*successor_warp*/) ///< [in] Tag selecting which warp's aggregate to fold in
    {
        // Tag type for the warp handled by the next recursion level
        typedef Int2Type<SUCCESSOR_WARP + 1> NextWarp;

        // In a partial tile, a warp whose first item lies at or past num_valid contributes nothing
        const bool successor_is_empty =
            !FULL_TILE && (num_valid <= SUCCESSOR_WARP * LOGICAL_WARP_SIZE);

        if (!successor_is_empty)
        {
            // Keep operand order (running aggregate first) for non-commutative operators
            T successor_aggregate = temp_storage.warp_aggregates[SUCCESSOR_WARP];
            warp_aggregate = reduction_op(warp_aggregate, successor_aggregate);
        }

        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, NextWarp());
    }

    /// Recursion terminator for the warp-aggregate fold: selected when the
    /// successor tag is Int2Type<WARPS>, i.e. there is no further warp to
    /// combine.  Returns the accumulated aggregate unchanged.
    template <bool FULL_TILE, typename ReductionOp>
    __device__ __forceinline__ T ApplyWarpAggregates(
        ReductionOp         /*reduction_op*/,   ///< [in] Binary reduction operator (unused)
        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
        int                 /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS) (unused)
        Int2Type<WARPS>     /*successor_warp*/) ///< [in] Tag marking the end of the warp sequence
    {
        return warp_aggregate;
    }


    /// Returns block-wide aggregate in <em>thread</em><sub>0</sub>.
    template <
        bool                FULL_TILE,
        typename            ReductionOp>
    __device__ __forceinline__ T ApplyWarpAggregates(
        ReductionOp         reduction_op,       ///< [in] Binary scan operator
        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
    {
( run in 0.459 second using v1.01-cache-2.11-cpan-84de2e75c66 )