Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/block/block_scan.cuh view on Meta::CPAN
/******************************************************************************
* Constants and type definitions
******************************************************************************/
/// Compile-time constants
enum
{
    /// Number of threads in the thread block: the product of the three block dimensions
    BLOCK_THREADS = (BLOCK_DIM_X * BLOCK_DIM_Y) * BLOCK_DIM_Z
};
/**
 * Algorithm that is actually used once the template parameterization has
 * been checked against the requirements of the requested algorithm.
 *
 * The requested ALGORITHM is kept unless it is BLOCK_SCAN_WARP_SCANS and the
 * thread block size is not a multiple of the architectural warp size. In
 * that single case BLOCK_SCAN_RAKING is substituted, because the warp-scans
 * policy currently cannot be used with such thread block sizes.
 */
static const BlockScanAlgorithm SAFE_ALGORITHM =
    ((ALGORITHM != BLOCK_SCAN_WARP_SCANS) || (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) == 0))
        ? ALGORITHM
        : BLOCK_SCAN_RAKING;
/// BlockScanWarpScans specialized for this value type, block shape and PTX architecture
typedef BlockScanWarpScans<
        T,
        BLOCK_DIM_X,
        BLOCK_DIM_Y,
        BLOCK_DIM_Z,
        PTX_ARCH>
    WarpScans;

/// BlockScanRaking specialized for this value type, block shape and PTX architecture.
/// Its fifth argument is true only when SAFE_ALGORITHM is BLOCK_SCAN_RAKING_MEMOIZE
/// (presumably the switch that enables memoization -- confirm against BlockScanRaking).
typedef BlockScanRaking<
        T,
        BLOCK_DIM_X,
        BLOCK_DIM_Y,
        BLOCK_DIM_Z,
        (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE),
        PTX_ARCH>
    Raking;
/// Delegate type that carries out the scan for the selected algorithm.
/// The selector condition is (SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS); If<> is
/// defined outside this view and is expected to yield WarpScans when the
/// condition holds and Raking otherwise.
typedef typename If<
        (SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
        WarpScans,
        Raking>::Type
    InternalBlockScan;
/// Shared memory storage layout type for BlockScan: the \p TempStorage type
/// nested in the InternalBlockScan delegate selected above
typedef typename InternalBlockScan::TempStorage _TempStorage;
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference. Bound at construction either to the allocation
/// returned by PrivateStorage() or to the Alias() of a caller-supplied TempStorage.
_TempStorage &temp_storage;
/// Linear thread-id, initialized at construction from
/// RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)
unsigned int linear_tid;
/******************************************************************************
* Utility methods
******************************************************************************/
/**
 * Internal storage allocator.
 *
 * Declares a function-local \p __shared__ instance of \p _TempStorage and
 * hands back a reference to it. The default constructor uses this when the
 * caller does not supply temporary storage of its own.
 *
 * \return Reference to the privately declared shared-memory \p _TempStorage
 */
__device__ __forceinline__ _TempStorage &PrivateStorage()
{
    __shared__ _TempStorage block_scratch;
    return block_scratch;
}
/******************************************************************************
* Public types
******************************************************************************/
public:
// Public wrapper for the internal _TempStorage layout. It adds no members of
// its own and derives from Uninitialized<_TempStorage>; the storage-taking
// constructor calls Alias() on it to obtain the _TempStorage reference it uses.
/// \smemstorage{BlockScan}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
 * \brief Collective constructor that uses a private, statically declared allocation of shared memory as temporary storage.
 *
 * Binds the storage reference to the allocation returned by PrivateStorage()
 * and sets the linear thread-id from
 * RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z).
 */
__device__ __forceinline__ BlockScan()
    : temp_storage(PrivateStorage())
    , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
}
/**
 * \brief Collective constructor that uses a caller-supplied memory allocation as temporary storage.
 *
 * Binds the storage reference to \p temp_storage.Alias() and sets the linear
 * thread-id from RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z).
 *
 * \param[in] temp_storage  Reference to memory allocation having layout type TempStorage
 */
__device__ __forceinline__ BlockScan(TempStorage &temp_storage)
    : temp_storage(temp_storage.Alias())
    , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
}
//@} end member group
/******************************************************************//**
* \name Exclusive prefix sum operations
*********************************************************************/
//@{
/**
* \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in <em>thread</em><sub>0</sub>.
*
* \par
* - \identityzero
* - \rowmajor
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates an exclusive prefix sum of 128 integer items that
* are partitioned across 128 threads.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockScan for a 1D block of 128 threads on type int
* typedef cub::BlockScan<int, 128> BlockScan;
*
* // Allocate shared memory for BlockScan
( run in 0.764 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )