Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/util_ptx.cuh view on Meta::CPAN
return (source >> bit_start) & MASK;
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type.
*/
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
UnsignedBits source,
unsigned int bit_start,
unsigned int num_bits)
{
return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
}
/**
* \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start.
*/
__device__ __forceinline__ void BFI(
unsigned int &ret,
unsigned int x,
unsigned int y,
unsigned int bit_start,
unsigned int num_bits)
{
#if CUB_PTX_ARCH >= 200
asm volatile("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
#else
x <<= bit_start;
unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
unsigned int MASK_Y = ~MASK_X;
ret = (y & MASK_Y) | (x & MASK_X);
#endif
}
/**
* \brief Three-operand add. Returns \p x + \p y + \p z.
*/
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
{
#if CUB_PTX_ARCH >= 200
asm volatile("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
#else
x = x + y + z;
#endif
return x;
}
/**
* \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later.
*
* \par
* The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
* {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes
* {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
* the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0}
*
* \par Snippet
* The code snippet below illustrates byte-permute.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* int a = 0x03020100;
* int b = 0x07060504;
* int index = 0x00007531;
*
* int selected = PRMT(a, b, index); // 0x07050301
*
* \endcode
*
*/
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
{
int ret;
asm volatile("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
return ret;
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Sync-threads barrier.
*/
__device__ __forceinline__ void BAR(int count)
{
asm volatile("bar.sync 1, %0;" : : "r"(count));
}
/**
* CTA barrier
*/
__device__ __forceinline__ void CTA_SYNC()
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
__barrier_sync(0);
#else
__syncthreads();
#endif
}
/**
* CTA barrier with predicate
*/
__device__ __forceinline__ int CTA_SYNC_AND(int p)
{
return __syncthreads_and(p);
}
/**
* Warp barrier
*/
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
__syncwarp(member_mask);
#endif
}
/**
* Warp any
*/
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
( run in 0.582 second using v1.01-cache-2.11-cpan-13bb782fe5a )