Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/cub/warp/warp_reduce.cuh  view on Meta::CPAN

 *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
 *
 *     // Obtain one input item per thread
 *     int thread_data = ...
 *
 *     // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
 *     int warp_id = threadIdx.x / 32;
 *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
 *
 * \endcode
 * \par
 * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
 * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
 * \p 2544, and \p 3568, respectively (and is undefined in other threads).
 *
 * \par
 * The code snippet below illustrates a single warp sum reduction within a block of
 * 128 threads.
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize WarpReduce for type int
 *     typedef cub::WarpReduce<int> WarpReduce;
 *
 *     // Allocate WarpReduce shared memory for one warp
 *     __shared__ typename WarpReduce::TempStorage temp_storage;
 *     ...
 *
 *     // Only the first warp performs a reduction
 *     if (threadIdx.x < 32)
 *     {
 *         // Obtain one input item per thread
 *         int thread_data = ...
 *
 *         // Return the warp-wide sum to lane0
 *         int aggregate = WarpReduce(temp_storage).Sum(thread_data);
 *
 * \endcode
 * \par
 * Suppose the set of input \p thread_data across the warp of threads is <tt>{0, 1, 2, 3, ..., 31}</tt>.
 * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
 *
 */
template <
    typename    T,
    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
    int         PTX_ARCH                = CUB_PTX_ARCH>
class WarpReduce
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    /// Compile-time properties of the logical warp, derived from the template parameters
    enum
    {
        /// Whether the logical warp size (LOGICAL_WARP_THREADS) equals the warp size
        /// given by CUB_WARP_THREADS for the targeted PTX_ARCH
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        /// Whether the logical warp size is a power-of-two.  Combined with the PTX_ARCH
        /// check, this determines which InternalWarpReduce specialization is selected.
        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
    };

public:

    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    /// Internal specialization, selected at compile time through the If<...>::Type conditional.
    /// WarpReduceShfl (SHFL-based reduction) is used if (architecture is >= SM30, i.e. PTX_ARCH >= 300)
    /// and (LOGICAL_WARP_THREADS is a power-of-two); otherwise WarpReduceSmem is used
    /// (presumably the shared-memory-based variant -- confirm against its definition).
    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
        WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
        WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpReduce;

    #endif // DOXYGEN_SHOULD_SKIP_THIS


private:

    /// Shared memory storage layout type for WarpReduce.  This is the TempStorage type of
    /// whichever InternalWarpReduce specialization was selected, so the layout can differ
    /// between specializations.
    typedef typename InternalWarpReduce::TempStorage _TempStorage;


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Shared storage reference.  Bound by the constructor to the result of calling
    /// Alias() on the caller-provided TempStorage; this class does not own the storage.
    _TempStorage &temp_storage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

public:

    /// \smemstorage{WarpReduce}
    ///
    /// Public storage type that callers allocate (the usage examples declare it
    /// <tt>__shared__</tt>) and pass to the constructor.  It wraps the internal
    /// _TempStorage layout via Uninitialized<>; the constructor reaches the wrapped
    /// layout through Alias().
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{


    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
     *
     * \par
     * The constructor only binds the internal storage reference to \p temp_storage; it
     * does not copy or initialize the storage.  The caller's allocation must therefore
     * remain valid for as long as this object is used.
     *
     * \par
     * NOTE(review): no warp or lane identifier is computed in this constructor body itself.
     * The <tt>threadIdx.x</tt>-based derivation mentioned above presumably happens inside
     * the selected InternalWarpReduce specialization -- confirm against its definition.
     */
    __device__ __forceinline__ WarpReduce(
        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
    :
        // The parameter shadows the member of the same name: the member (outside the
        // parentheses) is bound to the _TempStorage obtained from the parameter's Alias().
        temp_storage(temp_storage.Alias())
    {}


    //@}  end member group
    /******************************************************************//**



( run in 2.145 seconds using v1.01-cache-2.11-cpan-5837b0d9d2c )