Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/block/block_exchange.cuh view on Meta::CPAN
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
*
* \par Performance Considerations
* - Proper device-specific padding ensures zero bank conflicts for most types.
*
*/
template <
typename InputT,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
bool WARP_TIME_SLICING = false,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
class BlockExchange
{
private:
/******************************************************************************
* Constants
******************************************************************************/
/// Constants
enum
{
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1,
TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD,
WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,
// Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
};
/******************************************************************************
* Type definitions
******************************************************************************/
/// Shared memory storage layout type
struct __align__(16) _TempStorage
{
InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];
};
public:
/// \smemstorage{BlockExchange}
struct TempStorage : Uninitialized<_TempStorage> {};
private:
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
unsigned int linear_tid;
unsigned int lane_id;
unsigned int warp_id;
unsigned int warp_offset;
/******************************************************************************
* Utility methods
******************************************************************************/
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/**
* Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement. Specialized for no timeslicing.
*/
template <typename OutputT>
__device__ __forceinline__ void BlockedToStriped(
InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
Int2Type<false> /*time_slicing*/)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage.buff[item_offset] = input_items[ITEM];
}
CTA_SYNC();
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
output_items[ITEM] = temp_storage.buff[item_offset];
}
}
xgboost/cub/cub/block/block_exchange.cuh view on Meta::CPAN
* Exchanges data items annotated by rank into <em>striped</em> arrangement. Specialized for warp-timeslicing.
*/
template <typename OutputT, typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(
InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<true> /*time_slicing*/)
{
InputT temp_items[ITEMS_PER_THREAD];
#pragma unroll
for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
{
const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
CTA_SYNC();
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = ranks[ITEM] - SLICE_OFFSET;
if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
{
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
temp_storage.buff[item_offset] = input_items[ITEM];
}
}
CTA_SYNC();
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
// Read a strip of items
const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;
if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
{
int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
{
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_items[ITEM] = temp_storage.buff[item_offset];
}
}
}
}
// Copy
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
output_items[ITEM] = temp_items[ITEM];
}
}
public:
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage.
*/
__device__ __forceinline__ BlockExchange()
:
temp_storage(PrivateStorage()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
lane_id(LaneId()),
warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage.
*/
__device__ __forceinline__ BlockExchange(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
lane_id(LaneId()),
warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
{}
//@} end member group
/******************************************************************//**
* \name Structured exchanges
*********************************************************************/
//@{
/**
* \brief Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.
*
* \par
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
* of 512 integer items partitioned across 128 threads where each thread owns 4 items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockExchange<int, 128, 4> BlockExchange;
*
* // Allocate shared memory for BlockExchange
* __shared__ typename BlockExchange::TempStorage temp_storage;
*
xgboost/cub/cub/block/block_exchange.cuh view on Meta::CPAN
ScatterToStripedGuarded(items, items, ranks);
}
template <typename OffsetT, typename ValidFlag>
__device__ __forceinline__ void ScatterToStripedFlagged(
InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity
{
ScatterToStriped(items, items, ranks, is_valid);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <
typename T,
int ITEMS_PER_THREAD,
int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
int PTX_ARCH = CUB_PTX_ARCH>
class WarpExchange
{
private:
/******************************************************************************
* Constants
******************************************************************************/
/// Constants
enum
{
// Whether the logical warp size and the PTX warp size coincide
IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,
LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
// Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
PADDING_ITEMS = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
};
/******************************************************************************
* Type definitions
******************************************************************************/
/// Shared memory storage layout type
struct _TempStorage
{
T buff[WARP_ITEMS + PADDING_ITEMS];
};
public:
/// \smemstorage{WarpExchange}
struct TempStorage : Uninitialized<_TempStorage> {};
private:
/******************************************************************************
* Thread fields
******************************************************************************/
_TempStorage &temp_storage;
int lane_id;
public:
/******************************************************************************
* Construction
******************************************************************************/
/// Constructor
__device__ __forceinline__ WarpExchange(
TempStorage &temp_storage)
:
temp_storage(temp_storage.Alias()),
lane_id(IS_ARCH_WARP ?
LaneId() :
LaneId() % LOGICAL_WARP_THREADS)
{}
/******************************************************************************
* Interface
******************************************************************************/
/**
* \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
*
* \par
* - \smemreuse
*
* \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets
*/
template <typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]);
temp_storage.buff[ranks[ITEM]] = items[ITEM];
}
WARP_SYNC(0xffffffff);
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
items[ITEM] = temp_storage.buff[item_offset];
}
}
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
( run in 0.848 second using v1.01-cache-2.11-cpan-d7f47b0818f )