Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/block/block_load.cuh view on Meta::CPAN
{
if (linear_tid == 0)
temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
CTA_SYNC();
LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
}
/**
 * Load a linear segment of items from memory, guarded by range, with a
 * fall-back assignment of out-of-bound elements.
 *
 * The item count is first published by thread 0 into the shared temporary
 * storage and read back by every thread after a block-wide barrier; the
 * warp-striped result is then rearranged in place via BlockExchange.
 */
template <typename InputIteratorT, typename DefaultT>
__device__ __forceinline__ void Load(
    InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
    InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
    int             valid_items,                    ///< [in] Number of valid items to load
    DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
{
    // Move the count through volatile smem as a workaround to prevent
    // RF spilling on subsequent loads (only thread 0 performs the write)
    if (linear_tid == 0)
    {
        temp_storage.valid_items = valid_items;
    }

    // Barrier between the thread-0 write above and the reads below
    CTA_SYNC();

    // Guarded warp-striped load; the count is re-read from temporary storage
    LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default);

    // Convert the warp-striped arrangement to a blocked one, in place
    BlockExchange exchange(temp_storage);
    exchange.WarpStripedToBlocked(items, items);
}
};
/******************************************************************************
* Type definitions
******************************************************************************/
/// Internal load implementation to use: the LoadInternal specialization selected by ALGORITHM.
/// NOTE(review): the second template argument (0) looks like a placeholder value — confirm against the LoadInternal declaration.
typedef LoadInternal<ALGORITHM, 0> InternalLoad;
/// Shared memory storage layout type, taken from the selected load implementation's TempStorage
typedef typename InternalLoad::TempStorage _TempStorage;
/******************************************************************************
* Utility methods
******************************************************************************/
/**
 * Internal storage allocator.
 *
 * Declares a function-local __shared__ instance of _TempStorage and returns
 * a reference to it.
 */
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
    __shared__ _TempStorage block_private_storage;
    return block_private_storage;
}
/******************************************************************************
* Thread fields
******************************************************************************/
/// Thread reference to shared storage (bound by the constructors to either
/// the result of PrivateStorage() or the caller-supplied TempStorage's Alias())
_TempStorage &temp_storage;
/// Linear thread-id, set by the constructors from RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)
int linear_tid;
public:
/// \smemstorage{BlockLoad}
///
/// Public storage type handed to the collective constructor. It derives from
/// Uninitialized<_TempStorage>; the constructor obtains the underlying
/// _TempStorage reference by calling Alias() on it.
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
 * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
 *
 * The temporary storage reference is bound to the allocation returned by
 * PrivateStorage(), and the linear thread-id is taken from RowMajorTid()
 * over the block dimensions.
 */
__device__ __forceinline__ BlockLoad()
    : temp_storage(PrivateStorage())
    , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
}
/**
 * \brief Collective constructor using the specified memory allocation as temporary storage.
 *
 * The temporary storage reference is bound to the caller's allocation through
 * its Alias() accessor, and the linear thread-id is taken from RowMajorTid()
 * over the block dimensions.
 */
__device__ __forceinline__ BlockLoad(
    TempStorage &temp_storage)      ///< [in] Reference to memory allocation having layout type TempStorage
    : temp_storage(temp_storage.Alias())
    , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
}
//@} end member group
/******************************************************************//**
* \name Data movement
*********************************************************************/
//@{
/**
* \brief Load a linear segment of items from memory.
*
* \par
* - \blocked
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the loading of a linear
* segment of 512 integers into a "blocked" arrangement across 128 threads where each
* thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
* meaning memory references are efficiently coalesced using a warp-striped access
* pattern (after which items are locally reordered among threads).
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
( run in 0.949 second using v1.01-cache-2.11-cpan-d7f47b0818f )