Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/agent/agent_radix_sort_upsweep.cuh view on Meta::CPAN
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
*/
#pragma once
#include "../thread/thread_reduce.cuh"
#include "../thread/thread_load.cuh"
#include "../block/block_load.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
* Parameterizable tuning policy type for AgentRadixSortUpsweep
*/
template <
int _BLOCK_THREADS, ///< Threads per thread block
int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys
int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
struct AgentRadixSortUpsweepPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins)
};
static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys
};
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
*/
template <
typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
typename KeyT, ///< KeyT type
typename OffsetT> ///< Signed integer type for global offsets
struct AgentRadixSortUpsweep
{
//---------------------------------------------------------------------
// Type definitions and constants
//---------------------------------------------------------------------
typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
// Integer type for digit counters (to be packed into words of PackedCounters)
typedef unsigned char DigitCounter;
// Integer type for packing DigitCounters into columns of shared memory banks
typedef unsigned int PackedCounter;
static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER;
enum
{
RADIX_BITS = AgentRadixSortUpsweepPolicy::RADIX_BITS,
BLOCK_THREADS = AgentRadixSortUpsweepPolicy::BLOCK_THREADS,
KEYS_PER_THREAD = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
RADIX_DIGITS = 1 << RADIX_BITS,
LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS,
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD,
BYTES_PER_COUNTER = sizeof(DigitCounter),
LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE,
PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
COUNTER_LANES = 1 << LOG_COUNTER_LANES,
// To prevent counter overflow, we must periodically unpack and aggregate the
// digit counters back into registers. Each counter lane is assigned to a
// warp for aggregation.
LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
// Unroll tiles in batches without risk of counter overflow
UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD),
UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS,
};
// Input iterator wrapper type (for applying cache modifiers)
typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT> KeysItr;
/**
* Shared memory storage layout
*/
struct _TempStorage
{
( run in 2.834 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )