Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/util_ptx.cuh view on Meta::CPAN
asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
return ret;
}
/**
* \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.
*/
__device__ __forceinline__ unsigned int WarpId()
{
unsigned int ret;
asm volatile("mov.u32 %0, %%warpid;" : "=r"(ret) );
return ret;
}
/**
* \brief Returns the warp lane mask of all lanes less than the calling thread
*/
__device__ __forceinline__ unsigned int LaneMaskLt()
{
unsigned int ret;
asm volatile("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
return ret;
}
/**
* \brief Returns the warp lane mask of all lanes less than or equal to the calling thread
*/
__device__ __forceinline__ unsigned int LaneMaskLe()
{
unsigned int ret;
asm volatile("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
return ret;
}
/**
* \brief Returns the warp lane mask of all lanes greater than the calling thread
*/
__device__ __forceinline__ unsigned int LaneMaskGt()
{
unsigned int ret;
asm volatile("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
return ret;
}
/**
* \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread
*/
__device__ __forceinline__ unsigned int LaneMaskGe()
{
unsigned int ret;
asm volatile("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
return ret;
}
/** @} */ // end group UtilPtx
/**
* \brief Shuffle-up for any data type. Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>-<tt>src_offset</tt></sub>. For thread lanes \e i < src_offset, the thread's own \p input is retu...
* \ingroup WarpModule
*
* \par
* - Available only for SM3.0 or newer
*
* \par Snippet
* The code snippet below illustrates each thread obtaining a \p double value from the
* predecessor of its predecessor.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/util_ptx.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Obtain one input item per thread
* double thread_data = ...
*
* // Obtain item from two ranks below
* double peer_data = ShuffleUp(thread_data, 2, 0, 0xffffffff);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
* The corresponding output \p peer_data will be <tt>{1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}</tt>.
*
*/
template <typename T>
__device__ __forceinline__ T ShuffleUp(
T input, ///< [in] The value to broadcast
int src_offset, ///< [in] The relative down-offset of the peer to read from
int first_lane, ///< [in] Index of first lane in segment (typically 0)
unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes
{
typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
T output;
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);
unsigned int shuffle_word;
shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_lane, member_mask);
output_alias[0] = shuffle_word;
#pragma unroll
for (int WORD = 1; WORD < WORDS; ++WORD)
{
shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_lane, member_mask);
output_alias[WORD] = shuffle_word;
}
return output;
}
/**
* \brief Shuffle-down for any data type. Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>+<tt>src_offset</tt></sub>. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is...
* \ingroup WarpModule
*
* \par
* - Available only for SM3.0 or newer
*
* \par Snippet
* The code snippet below illustrates each thread obtaining a \p double value from the
* successor of its successor.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/util_ptx.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Obtain one input item per thread
* double thread_data = ...
*
* // Obtain item from two ranks below
* double peer_data = ShuffleDown(thread_data, 2, 31, 0xffffffff);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
* The corresponding output \p peer_data will be <tt>{3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}</tt>.
*
*/
template <typename T>
__device__ __forceinline__ T ShuffleDown(
T input, ///< [in] The value to broadcast
int src_offset, ///< [in] The relative up-offset of the peer to read from
int last_lane, ///< [in] Index of first lane in segment (typically 31)
unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes
{
typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
T output;
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);
unsigned int shuffle_word;
shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_lane, member_mask);
output_alias[0] = shuffle_word;
#pragma unroll
for (int WORD = 1; WORD < WORDS; ++WORD)
{
shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_lane, member_mask);
output_alias[WORD] = shuffle_word;
}
return output;
}
/**
* \brief Shuffle-broadcast for any data type. Each <em>warp-lane<sub>i</sub></em> obtains the value \p input
* contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>. For \p src_lane < 0 or \p src_lane >= WARP_THREADS,
* then the thread's own \p input is returned to the thread. 
*
* \ingroup WarpModule
*
* \par
* - Available only for SM3.0 or newer
*
* \par Snippet
* The code snippet below illustrates each thread obtaining a \p double value from <em>warp-lane</em><sub>0</sub>.
*
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/util_ptx.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Obtain one input item per thread
* double thread_data = ...
*
* // Obtain item from thread 0
* double peer_data = ShuffleIndex(thread_data, 0, 32, 0xffffffff);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
* The corresponding output \p peer_data will be <tt>{1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}</tt>.
*
*/
template <typename T>
__device__ __forceinline__ T ShuffleIndex(
T input, ///< [in] The value to broadcast
int src_lane, ///< [in] Which warp lane is to do the broadcasting
int logical_warp_threads, ///< [in] Number of threads per logical warp
unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes
{
typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
T output;
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);
unsigned int shuffle_word;
shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
src_lane,
logical_warp_threads - 1,
member_mask);
output_alias[0] = shuffle_word;
#pragma unroll
for (int WORD = 1; WORD < WORDS; ++WORD)
{
shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
src_lane,
logical_warp_threads - 1,
member_mask);
output_alias[WORD] = shuffle_word;
}
( run in 0.581 second using v1.01-cache-2.11-cpan-5623c5533a1 )