Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/warp/warp_scan.cuh view on Meta::CPAN
* __shared__ typename WarpScan::TempStorage temp_storage[4];
*
* // Obtain one input item per thread
* int thread_data = ...
*
* // Compute warp-wide prefix sums
* int warp_id = threadIdx.x / 32;
* WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
* The corresponding output \p thread_data in each of the four warps of threads will be
* <tt>{0, 1, 2, 3, ..., 31}</tt>.
*
* \par
* The code snippet below illustrates a single warp prefix sum within a block of
* 128 threads.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize WarpScan for type int
* typedef cub::WarpScan<int> WarpScan;
*
* // Allocate WarpScan shared memory for one warp
* __shared__ typename WarpScan::TempStorage temp_storage;
* ...
*
* // Only the first warp performs a prefix sum
* if (threadIdx.x < 32)
* {
* // Obtain one input item per thread
* int thread_data = ...
*
* // Compute warp-wide prefix sums
* WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the warp of threads is <tt>{1, 1, 1, 1, ...}</tt>.
* The corresponding output \p thread_data will be <tt>{0, 1, 2, 3, ..., 31}</tt>.
*
*/
template <
typename T,
int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
int PTX_ARCH = CUB_PTX_ARCH>
class WarpScan
{
private:
/******************************************************************************
* Constants and type definitions
******************************************************************************/
/// Compile-time properties derived from the template parameters
/// (T, LOGICAL_WARP_THREADS, PTX_ARCH)
enum
{
/// Whether the logical warp size and the PTX warp size coincide
/// (when false, the constructor takes the lane id modulo LOGICAL_WARP_THREADS)
IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
/// Whether the logical warp size is a power-of-two
/// (a power of two has exactly one set bit, so n & (n - 1) is zero;
/// this is one of the two conditions for selecting the SHFL-based specialization)
IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
/// Whether the data type is an integer (which has fully-associative addition),
/// i.e. its Traits category is SIGNED_INTEGER or UNSIGNED_INTEGER
IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
};
/// Internal specialization, chosen at compile time:
///   - WarpScanShfl when (PTX_ARCH >= 300, i.e. SM30 or newer) and (LOGICAL_WARP_THREADS is a power-of-two)
///   - WarpScanSmem otherwise
typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
WarpScanShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
WarpScanSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpScan;
/// Shared memory storage layout type for WarpScan
/// (whatever TempStorage the selected internal specialization declares)
typedef typename InternalWarpScan::TempStorage _TempStorage;
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference (bound by the constructor to the aliased
/// contents of the caller-supplied TempStorage)
_TempStorage &temp_storage;
/// Lane identifier of the calling thread, set once by the constructor:
/// LaneId() when the logical warp is the PTX warp, otherwise
/// LaneId() % LOGICAL_WARP_THREADS
unsigned int lane_id;
/******************************************************************************
* Public types
******************************************************************************/
public:
/// \smemstorage{WarpScan}
///
/// Public wrapper around the internal <tt>_TempStorage</tt> layout. Callers
/// allocate an instance and pass it to the WarpScan constructor, which
/// obtains the typed layout through <tt>Alias()</tt>.
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor using the specified memory allocation as temporary storage. The lane identifier is obtained from <tt>LaneId()</tt> (taken modulo <tt>LOGICAL_WARP_THREADS</tt> when the logical warp size differs from the PTX warp size).
*/
__device__ __forceinline__ WarpScan(
    TempStorage &temp_storage)              ///< [in] Reference to memory allocation having layout type TempStorage
:
    temp_storage(temp_storage.Alias()),     // bind the typed layout held inside the public wrapper
    lane_id(LaneId())                       // start from the value reported by LaneId()
{
    // IS_ARCH_WARP is a compile-time constant, so this branch is resolved
    // during compilation. When the logical warp size differs from the PTX
    // warp size, the lane id is wrapped into [0, LOGICAL_WARP_THREADS).
    if (!IS_ARCH_WARP)
    {
        lane_id %= LOGICAL_WARP_THREADS;
    }
}
//@} end member group
/******************************************************************//**
* \name Inclusive prefix sums
*********************************************************************/
( run in 1.527 second using v1.01-cache-2.11-cpan-5837b0d9d2c )