Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/cub/block/block_load.cuh  view on Meta::CPAN

 * \tparam InputIteratorT        <b>[inferred]</b> The random-access iterator type for input \iterator.
 */
template <
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
    int             valid_items,                ///< [in] Number of valid items to load
    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
{
    // Load directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        items[ITEM] = oob_default;

    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
}



//@}  end member group

/** @} */       // end group UtilIo



//-----------------------------------------------------------------------------
// Generic BlockLoad abstraction
//-----------------------------------------------------------------------------

/**
 * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
 */

/**
 * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
 */
enum BlockLoadAlgorithm
{
    /**
     * \par Overview
     *
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
     * directly from memory.
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) decreases as the
     *   access stride between threads increases (i.e., the number items per thread).
     */
    BLOCK_LOAD_DIRECT,

    /**
     * \par Overview
     *
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
     * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
     * For example, <tt>ld.global.v4.s32</tt> instructions will be generated
     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high until the the
     *   access stride between threads (i.e., the number items per thread) exceeds the
     *   maximum vector load width (typically 4 items or 64B, whichever is lower).
     * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
     *   - \p ITEMS_PER_THREAD is odd
     *   - The \p InputIteratorTis not a simple pointer type
     *   - The block input offset is not quadword-aligned
     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
     */
    BLOCK_LOAD_VECTORIZE,

    /**
     * \par Overview
     *
     * A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
     * efficiently from memory and then locally transposed into a
     * [<em>blocked arrangement</em>](index.html#sec5sec3).
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high regardless
     *   of items loaded per thread.
     * - The local reordering incurs slightly longer latencies and throughput than the
     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
     */
    BLOCK_LOAD_TRANSPOSE,


    /**
     * \par Overview
     *
     * A [<em>warp-striped arrangement</em>](index.html#sec5sec3) of data is
     * read efficiently from memory and then locally transposed into a
     * [<em>blocked arrangement</em>](index.html#sec5sec3).
     *
     * \par Usage Considerations
     * - BLOCK_THREADS must be a multiple of WARP_THREADS
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high regardless
     *   of items loaded per thread.
     * - The local reordering incurs slightly larger latencies than the
     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
     * - Provisions more shared storage, but incurs smaller latencies than the
     *   BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
     */
    BLOCK_LOAD_WARP_TRANSPOSE,


    /**
     * \par Overview
     *
     * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [<em>warp-striped arrangement</em>](index.html#sec5sec3)
     * of data is read directly from memory and then is locally transposed into a
     * [<em>blocked arrangement</em>](index.html#sec5sec3). To reduce the shared memory
     * requirement, only one warp's worth of shared memory is provisioned and is
     * subsequently time-sliced among warps.
     *
     * \par Usage Considerations
     * - BLOCK_THREADS must be a multiple of WARP_THREADS
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high regardless
     *   of items loaded per thread.
     * - Provisions less shared memory temporary storage, but incurs larger
     *   latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative.
     */
    BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
};


/**
 * \brief The BlockLoad class provides [<em>collective</em>](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [<em>blocked arrangement</em>](index.html#sec5sec3) across a CUDA thread block.  ![](block_lo...
 * \ingroup BlockModule
 * \ingroup UtilIo
 *
 * \tparam InputT               The data type to read into (which must be convertible from the input iterator's value type).
 * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
 * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
 * \tparam ALGORITHM            <b>[optional]</b> cub::BlockLoadAlgorithm tuning policy.  default: cub::BLOCK_LOAD_DIRECT.
 * \tparam WARP_TIME_SLICING    <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (defa...
 * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
 * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
 * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
 *
 * \par Overview
 * - The BlockLoad class provides a single data movement abstraction that can be specialized
 *   to implement different cub::BlockLoadAlgorithm strategies.  This facilitates different
 *   performance policies for different architectures, data types, granularity sizes, etc.
 * - BlockLoad can be optionally specialized by different data movement strategies:
 *   -# <b>cub::BLOCK_LOAD_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory.  [More...](\ref cub::BlockLoadAlgorithm)
 *   -# <b>cub::BLOCK_LOAD_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory using CUDA's built-in vectorized loads as a
 *      coalescing optimization.    [More...](\ref cub::BlockLoadAlgorithm)
 *   -# <b>cub::BLOCK_LOAD_TRANSPOSE</b>.  A [<em>striped arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory and is then locally transposed into a
 *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
 *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory and is then locally transposed into a
 *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
 *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory and is then locally transposed into a
 *      [<em>blocked arrangement</em>](index.html#sec5sec3) one warp at a time.  [More...](\ref cub::BlockLoadAlgorithm)
 * - \rowmajor
 *
 * \par A Simple Example
 * \blockcollective{BlockLoad}
 * \par
 * The code snippet below illustrates the loading of a linear
 * segment of 512 integers into a "blocked" arrangement across 128 threads where each
 * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
 * meaning memory references are efficiently coalesced using a warp-striped access
 * pattern (after which items are locally reordered among threads).
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, ...)
 * {
 *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
 *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
 *
 *     // Allocate shared memory for BlockLoad
 *     __shared__ typename BlockLoad::TempStorage temp_storage;
 *
 *     // Load a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     BlockLoad(temp_storage).Load(d_data, thread_data);
 *
 * \endcode
 * \par
 * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
 * The set of \p thread_data across the block of threads in those threads will be
 * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
 *
 */
template <
    typename            InputT,
    int                 BLOCK_DIM_X,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  ALGORITHM           = BLOCK_LOAD_DIRECT,
    int                 BLOCK_DIM_Y         = 1,
    int                 BLOCK_DIM_Z         = 1,
    int                 PTX_ARCH            = CUB_PTX_ARCH>
class BlockLoad
{
private:

    /******************************************************************************
     * Constants and typed definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };


    /******************************************************************************
     * Algorithmic variants
     ******************************************************************************/

    /// Load helper
    template <BlockLoadAlgorithm _POLICY, int DUMMY>
    struct LoadInternal;


    /**
     * BLOCK_LOAD_DIRECT specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

xgboost/cub/cub/block/block_load.cuh  view on Meta::CPAN

    int linear_tid;

public:

    /// \smemstorage{BlockLoad}
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
     */
    __device__ __forceinline__ BlockLoad()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.
     */
    __device__ __forceinline__ BlockLoad(
        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}




    //@}  end member group
    /******************************************************************//**
     * \name Data movement
     *********************************************************************/
    //@{


    /**
     * \brief Load a linear segment of items from memory.
     *
     * \par
     * - \blocked
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the loading of a linear
     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
     * meaning memory references are efficiently coalesced using a warp-striped access
     * pattern (after which items are locally reordered among threads).
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
     *
     *     // Allocate shared memory for BlockLoad
     *     __shared__ typename BlockLoad::TempStorage temp_storage;
     *
     *     // Load a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     BlockLoad(temp_storage).Load(d_data, thread_data);
     *
     * \endcode
     * \par
     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
     * The set of \p thread_data across the block of threads in those threads will be
     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
     *
     */
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
        InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
    }


    /**
     * \brief Load a linear segment of items from memory, guarded by range.
     *
     * \par
     * - \blocked
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the guarded loading of a linear
     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
     * meaning memory references are efficiently coalesced using a warp-striped access
     * pattern (after which items are locally reordered among threads).
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
     * {
     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
     *
     *     // Allocate shared memory for BlockLoad
     *     __shared__ typename BlockLoad::TempStorage temp_storage;
     *
     *     // Load a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);
     *
     * \endcode
     * \par
     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt> and \p valid_items is \p 5.
     * The set of \p thread_data across the block of threads in those threads will be
     * <tt>{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }</tt>, with only the first two threads
     * being unmasked to load portions of valid data (and other items remaining unassigned).
     *
     */
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
        int             valid_items)                ///< [in] Number of valid items to load
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
    }


    /**
     * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
     *
     * \par
     * - \blocked
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the guarded loading of a linear
     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
     * meaning memory references are efficiently coalesced using a warp-striped access
     * pattern (after which items are locally reordered among threads).
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
     * {
     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
     *
     *     // Allocate shared memory for BlockLoad
     *     __shared__ typename BlockLoad::TempStorage temp_storage;
     *
     *     // Load a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
     *
     * \endcode
     * \par
     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt>,
     * \p valid_items is \p 5, and the out-of-bounds default is \p -1.
     * The set of \p thread_data across the block of threads in those threads will be
     * <tt>{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }</tt>, with only the first two threads
     * being unmasked to load portions of valid data (and other items are assigned \p -1)
     *
     */
    template <typename InputIteratorT, typename DefaultT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
        int             valid_items,                ///< [in] Number of valid items to load
        DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
    }


    //@}  end member group

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)



( run in 0.490 second using v1.01-cache-2.11-cpan-df04353d9ac )