updated results from the CPAN

updated
Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/block/block_shuffle.cuh view on Meta::CPAN

    /// Linear thread-id
    unsigned int linear_tid;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


public:

    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
     */
    __device__ __forceinline__ BlockShuffle()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.
     */
    __device__ __forceinline__ BlockShuffle(
        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    //@}  end member group
    /******************************************************************//**
     * \name Shuffle movement
     *********************************************************************/
    //@{


    /**
     * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>. The offset \p distance may be negative.
     *
     * \par
     * - \smemreuse
     */
    __device__ __forceinline__ void Offset(
        T   input,                  ///< [in] The input item from the calling thread (<em>thread<sub>i</sub></em>)
        T&  output,                 ///< [out] The \p input item from the successor (or predecessor) thread <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub> (may be aliased to \p input).  This value is only updated for for <em>thread<sub>i</sub...
        int distance = 1)           ///< [in] Offset distance (may be negative)
    {
        temp_storage[linear_tid].prev = input;

        CTA_SYNC();

        if ((linear_tid + distance >= 0) && (linear_tid + distance < BLOCK_THREADS))
            output = temp_storage[linear_tid + distance].prev;
    }


    /**
     * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>.
     *
     * \par
     * - \smemreuse
     */
    __device__ __forceinline__ void Rotate(
        T   input,                  ///< [in] The calling thread's input item
        T&  output,                 ///< [out] The \p input item from thread <em>thread</em><sub>(<em>i</em>+<tt>distance></tt>)%<tt><BLOCK_THREADS></tt></sub> (may be aliased to \p input).  This value is not updated for <em>thread</em><sub>BLOCK_THR...
        unsigned int distance = 1)  ///< [in] Offset distance (0 < \p distance < <tt>BLOCK_THREADS</tt>)
    {
        temp_storage[linear_tid].prev = input;

        CTA_SYNC();

        unsigned int offset = threadIdx.x + distance;
        if (offset >= BLOCK_THREADS)
            offset -= BLOCK_THREADS;

        output = temp_storage[offset].prev;
    }


    /**
     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item
     *
     * \par
     * - \blocked
     * - \granularity
     * - \smemreuse
     */
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void Up(
        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
    {
        temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        #pragma unroll
        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
            prev[ITEM] = input[ITEM - 1];


        if (linear_tid > 0)
            prev[0] = temp_storage[linear_tid - 1].prev;
    }


    /**
     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item.  All threads receive the \p input provided by <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>.
     *
     * \par
     * - \blocked
     * - \granularity
     * - \smemreuse
     */
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void Up(
        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
        T &block_suffix)                ///< [out] The item \p input[ITEMS_PER_THREAD-1] from <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>, provided to all threads
    {
        Up(input, prev);
        block_suffix = temp_storage[BLOCK_THREADS - 1].prev;
    }


    /**
     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it down by one item
     *
     * \par
     * - \blocked
     * - \granularity
     * - \smemreuse
     */
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void Down(
        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The value \p prev[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
    {
        temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        #pragma unroll
        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
            prev[ITEM] = input[ITEM - 1];

        if (linear_tid > 0)
            prev[0] = temp_storage[linear_tid - 1].prev;
    }


    /**
     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of input items, shifting it down by one item.  All threads receive \p input[0] provided by <em>thread</em><sub><tt>0</tt></sub>.
     *
     * \par
     * - \blocked
     * - \granularity
     * - \smemreuse
     */
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void Down(
        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The value \p prev[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
        T &block_prefix)                ///< [out] The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
    {
        Up(input, prev);
        block_prefix = temp_storage[BLOCK_THREADS - 1].prev;
    }

    //@}  end member group


};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
( run in 0.871 second using v1.01-cache-2.11-cpan-bbcb1afb8fc )