Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/block/block_store.cuh view on Meta::CPAN
/// Store items into a linear segment of memory
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(
OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockExchange(temp_storage).BlockedToWarpStriped(items);
StoreDirectWarpStriped(linear_tid, block_itr, items);
}
/// Store items into a linear segment of memory, guarded by range
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(
OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
int valid_items) ///< [in] Number of valid items to write
{
BlockExchange(temp_storage).BlockedToWarpStriped(items);
if (linear_tid == 0)
temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
CTA_SYNC();
StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
}
};
/******************************************************************************
* Type definitions
******************************************************************************/
/// Internal load implementation to use
typedef StoreInternal<ALGORITHM, 0> InternalStore;
/// Shared memory storage layout type
typedef typename InternalStore::TempStorage _TempStorage;
/******************************************************************************
* Utility methods
******************************************************************************/
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/******************************************************************************
* Thread fields
******************************************************************************/
/// Thread reference to shared storage
_TempStorage &temp_storage;
/// Linear thread-id
int linear_tid;
public:
/// \smemstorage{BlockStore}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage.
*/
__device__ __forceinline__ BlockStore()
:
temp_storage(PrivateStorage()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage.
*/
__device__ __forceinline__ BlockStore(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
//@} end member group
/******************************************************************//**
* \name Data movement
*********************************************************************/
//@{
/**
* \brief Store items into a linear segment of memory.
*
* \par
* - \blocked
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the storing of a "blocked" arrangement
* of 512 integers across 128 threads (where each thread owns 4 consecutive items)
* into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
* meaning items are locally reordered among threads so that memory references will be
* efficiently coalesced using a warp-striped access pattern.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_store.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
( run in 0.501 second using v1.01-cache-2.11-cpan-d7f47b0818f )