public results from the CPAN

Alien-XGBoost
        {
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }


        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
            int             valid_items,                    ///< [in] Number of valid items to load
            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
        {
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }
    };


    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Internal load implementation to use
    typedef LoadInternal<ALGORITHM, 0> InternalLoad;


    /// Shared memory storage layout type
    typedef typename InternalLoad::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;

public:

    /// \smemstorage{BlockLoad}
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
     */
    __device__ __forceinline__ BlockLoad()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.
     */
    __device__ __forceinline__ BlockLoad(
        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}




    //@}  end member group
    /******************************************************************//**
     * \name Data movement
     *********************************************************************/
    //@{


    /**
     * \brief Load a linear segment of items from memory.
     *
     * \par
     * - \blocked
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the loading of a linear
     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
     * meaning memory references are efficiently coalesced using a warp-striped access
     * pattern (after which items are locally reordered among threads).
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
( run in 0.949 second using v1.01-cache-2.11-cpan-d7f47b0818f )