// Alien-XGBoost: xgboost/cub/cub/agent/agent_reduce.cuh
#include "../grid/grid_mapping.cuh"
#include "../grid/grid_queue.cuh"
#include "../grid/grid_even_share.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
 * Tuning policy types
 ******************************************************************************/

/**
 * Parameterizable tuning policy type for AgentReduce
 */
template <
    int                     _BLOCK_THREADS,         ///< Threads per thread block
    int                     _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items per vectorized load
    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Cooperative block-wide reduction algorithm to use
    CacheLoadModifier       _LOAD_MODIFIER,         ///< Cache load modifier for reading input elements
    GridMappingStrategy     _GRID_MAPPING>          ///< How to map tiles of input onto thread blocks
struct AgentReducePolicy
{
    enum
    {
        BLOCK_THREADS      = _BLOCK_THREADS,        ///< Threads per thread block
        ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,     ///< Items per thread (per tile of input)
        VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH,   ///< Number of items per vectorized load
    };

    static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;   ///< Cooperative block-wide reduction algorithm to use
    static const CacheLoadModifier    LOAD_MODIFIER   = _LOAD_MODIFIER;     ///< Cache load modifier for reading input elements
    static const GridMappingStrategy  GRID_MAPPING    = _GRID_MAPPING;      ///< How to map tiles of input onto thread blocks
};
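// Illustrative sketch (not part of the upstream header): one way the policy
// above might be instantiated.  The specific numbers (128 threads, 8 items
// per thread, 4-wide loads) are assumptions chosen for the example, not
// CUB's shipped tunings for any particular GPU; the enumerators are CUB's
// real BlockReduceAlgorithm / CacheLoadModifier / GridMappingStrategy values.
typedef AgentReducePolicy<
    128,                            // BLOCK_THREADS
    8,                              // ITEMS_PER_THREAD
    4,                              // VECTOR_LOAD_LENGTH
    BLOCK_REDUCE_WARP_REDUCTIONS,   // BLOCK_ALGORITHM
    LOAD_LDG,                       // LOAD_MODIFIER (read-only texture-path loads)
    GRID_MAPPING_EVEN_SHARE>        // GRID_MAPPING
    ExampleReducePolicy;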
/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
 *
 * Each thread reduces only the values it loads.  If \p FIRST_TILE, this
 * partial reduction is stored into \p thread_aggregate.  Otherwise it is
 * accumulated into \p thread_aggregate.
 */
template <
    typename AgentReducePolicy,     ///< Parameterized AgentReducePolicy tuning policy type
    typename InputIteratorT,        ///< Random-access iterator type for input
    typename OutputIteratorT,       ///< Random-access iterator type for output
    typename OffsetT,               ///< Signed integer type for global offsets
    typename ReductionOp>           ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
struct AgentReduce
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    /// The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                         // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                         // ... else the output iterator's value type

    /// Vector type of InputT for data movement
    typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;

    /// Input iterator wrapper type (for applying cache modifier)
    typedef typename If<IsPointer<InputIteratorT>::VALUE,
        CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,  // Wrap the native input pointer with CacheModifiedInputIterator
        InputIteratorT>::Type                                                           // Directly use the supplied input iterator type
        WrappedInputIteratorT;

    /// Constants
    enum
    {
        BLOCK_THREADS      = AgentReducePolicy::BLOCK_THREADS,
        ITEMS_PER_THREAD   = AgentReducePolicy::ITEMS_PER_THREAD,
        VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
        TILE_ITEMS         = BLOCK_THREADS * ITEMS_PER_THREAD,

        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
        ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) &&
                                (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
                                (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
    };

    static const CacheLoadModifier    LOAD_MODIFIER   = AgentReducePolicy::LOAD_MODIFIER;
    static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;

    /// Parameterized BlockReduce primitive
    typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;

    /// Shared memory type required by this thread block
    struct _TempStorage
    {
        typename BlockReduceT::TempStorage  reduce;
        OffsetT                             dequeue_offset;
    };

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};

    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------
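    // The excerpt elides the members between here and ConsumeRange below.
    // What follows is a minimal sketch of those members, assuming the usual
    // CUB structure: the fields, the IsAligned tests and constructor that
    // the visible code references, plus the two non-vectorized ConsumeTile
    // overloads that ConsumeRange/ConsumeTiles call.  Treat this as a
    // reconstruction for readability, not the verbatim upstream source
    // (the vectorized ConsumeTile overloads are still omitted).

    _TempStorage&           temp_storage;       ///< Reference to temp_storage
    InputIteratorT          d_in;               ///< Input data to reduce
    WrappedInputIteratorT   d_wrapped_in;       ///< Wrapped input data to reduce
    ReductionOp             reduction_op;       ///< Binary reduction operator

    /// Whether the input is aligned with the vector type (vectorizable types)
    template <typename Iterator>
    static __device__ __forceinline__ bool IsAligned(
        Iterator        d_in,
        Int2Type<true>  /*can_vectorize*/)
    {
        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
    }

    /// Whether the input is aligned with the vector type (non-vectorizable types)
    template <typename Iterator>
    static __device__ __forceinline__ bool IsAligned(
        Iterator        /*d_in*/,
        Int2Type<false> /*can_vectorize*/)
    {
        return false;
    }

    /// Constructor
    __device__ __forceinline__ AgentReduce(
        TempStorage&    temp_storage,       ///< Reference to temp_storage
        InputIteratorT  d_in,               ///< Input data to reduce
        ReductionOp     reduction_op)       ///< Binary reduction operator
    :
        temp_storage(temp_storage.Alias()),
        d_in(d_in),
        d_wrapped_in(d_in),
        reduction_op(reduction_op)
    {}

    /// Consume a full tile of input (non-vectorized sketch)
    template <int IS_FIRST_TILE>
    __device__ __forceinline__ void ConsumeTile(
        OutputT         &thread_aggregate,
        OffsetT         block_offset,       ///< The offset of the tile to consume
        int             /*valid_items*/,    ///< The number of valid items in the tile
        Int2Type<true>  /*is_full_tile*/,   ///< Whether or not this is a full tile
        Int2Type<false> /*can_vectorize*/)  ///< Whether or not we can vectorize loads
    {
        OutputT items[ITEMS_PER_THREAD];

        // Load items in striped fashion across the thread block
        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);

        // Reduce items within each thread stripe; the first tile seeds
        // thread_aggregate, later tiles fold into it
        thread_aggregate = (IS_FIRST_TILE) ?
            ThreadReduce(items, reduction_op) :
            ThreadReduce(items, reduction_op, thread_aggregate);
    }

    /// Consume a partially-full tile of input (non-vectorized sketch)
    template <int IS_FIRST_TILE>
    __device__ __forceinline__ void ConsumeTile(
        OutputT         &thread_aggregate,
        OffsetT         block_offset,       ///< The offset of the tile to consume
        int             valid_items,        ///< The number of valid items in the tile
        Int2Type<false> /*is_full_tile*/,   ///< Whether or not this is a full tile
        Int2Type<false> /*can_vectorize*/)  ///< Whether or not we can vectorize loads
    {
        // Each thread folds in its in-range items, one block-stride at a time
        int thread_offset = threadIdx.x;
        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
        {
            thread_aggregate = d_wrapped_in[block_offset + thread_offset];
            thread_offset += BLOCK_THREADS;
        }
        while (thread_offset < valid_items)
        {
            thread_aggregate = reduction_op(thread_aggregate, d_wrapped_in[block_offset + thread_offset]);
            thread_offset += BLOCK_THREADS;
        }
    }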
    /**
     * \brief Reduce a contiguous segment of input tiles
     */
    template <int CAN_VECTORIZE>
    __device__ __forceinline__ OutputT ConsumeRange(
        OffsetT                 block_offset,   ///< [in] Threadblock begin offset (inclusive)
        OffsetT                 block_end,      ///< [in] Threadblock end offset (exclusive)
        Int2Type<CAN_VECTORIZE> can_vectorize)  ///< Whether or not we can vectorize loads
    {
        OutputT thread_aggregate;

        if (block_offset + TILE_ITEMS > block_end)
        {
            // First tile isn't full (not all threads have valid items)
            int valid_items = block_end - block_offset;
            ConsumeTile<true>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
        }

        // At least one full block
        ConsumeTile<true>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
        block_offset += TILE_ITEMS;

        // Consume subsequent full tiles of input
        while (block_offset + TILE_ITEMS <= block_end)
        {
            ConsumeTile<false>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
            block_offset += TILE_ITEMS;
        }

        // Consume a partially-full tile
        if (block_offset < block_end)
        {
            int valid_items = block_end - block_offset;
            ConsumeTile<false>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
        }

        // Compute block-wide reduction (all threads have valid items)
        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
    }
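    // Why the first tile is special: ConsumeTile<true> seeds thread_aggregate
    // by assignment rather than accumulation, so ReductionOp needs no
    // identity element.  A first tile that is also partial takes the early
    // return above, where BlockReduce is told how many threads actually hold
    // valid partials.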
    /**
     * \brief Reduce a contiguous segment of input tiles
     */
    __device__ __forceinline__ OutputT ConsumeRange(
        OffsetT block_offset,               ///< [in] Threadblock begin offset (inclusive)
        OffsetT block_end)                  ///< [in] Threadblock end offset (exclusive)
    {
        return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
            ConsumeRange(block_offset, block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
            ConsumeRange(block_offset, block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());
    }
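    // The "true && ATTEMPT_VECTORIZATION" / "false && ATTEMPT_VECTORIZATION"
    // expressions keep the dispatch a compile-time constant: when
    // ATTEMPT_VECTORIZATION is 0, both branches collapse to the
    // Int2Type<false> (non-vectorized) path, so no vectorized code is
    // instantiated for iterators or value types that cannot support it.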
    /**
     * Reduce a contiguous segment of input tiles
     */
    __device__ __forceinline__ OutputT ConsumeTiles(
        OffsetT                             /*num_items*/,      ///< [in] Total number of global input items
        GridEvenShare<OffsetT>              &even_share,        ///< [in] GridEvenShare descriptor
        GridQueue<OffsetT>                  &/*queue*/,         ///< [in,out] GridQueue descriptor
        Int2Type<GRID_MAPPING_EVEN_SHARE>   /*is_even_share*/)  ///< [in] Marker type indicating this is an even-share mapping
    {
        // Initialize even-share descriptor for this thread block
        even_share.BlockInit();

        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
            ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
            ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());
    }

    //---------------------------------------------------------------------
    // Dynamically consume tiles
    //---------------------------------------------------------------------
    /**
     * Dequeue and reduce tiles of items as part of an inter-block reduction
     */
    template <int CAN_VECTORIZE>
    __device__ __forceinline__ OutputT ConsumeTiles(
        int                     num_items,      ///< Total number of input items
        GridQueue<OffsetT>      queue,          ///< Queue descriptor for assigning tiles of work to thread blocks
        Int2Type<CAN_VECTORIZE> can_vectorize)  ///< Whether or not we can vectorize loads
    {
        // We give each thread block at least one tile of input.
        OutputT thread_aggregate;
        OffsetT block_offset    = blockIdx.x * TILE_ITEMS;
        OffsetT even_share_base = gridDim.x * TILE_ITEMS;

        if (block_offset + TILE_ITEMS > num_items)
        {
            // First tile isn't full (not all threads have valid items)
            int valid_items = num_items - block_offset;
            ConsumeTile<true>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
        }

        // Consume first full tile of input
        ConsumeTile<true>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);

        if (num_items > even_share_base)
        {
            // Dequeue a tile of items
            if (threadIdx.x == 0)
                temp_storage.dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;

            CTA_SYNC();

            // Grab tile offset and check if we're done with full tiles
            block_offset = temp_storage.dequeue_offset;

            // Consume more full tiles
            while (block_offset + TILE_ITEMS <= num_items)
            {
                ConsumeTile<false>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);

                CTA_SYNC();

                // Dequeue a tile of items
                if (threadIdx.x == 0)
                    temp_storage.dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;

                CTA_SYNC();

                // Grab tile offset and check if we're done with full tiles
                block_offset = temp_storage.dequeue_offset;
            }

            // Consume partial tile
            if (block_offset < num_items)
            {
                int valid_items = num_items - block_offset;
                ConsumeTile<false>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
            }
        }

        // Compute block-wide reduction (all threads have valid items)
        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
    }
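    // Work-distribution note: the first gridDim.x tiles are assigned
    // statically (block i takes tile i), so even_share_base marks where the
    // statically-owned region ends.  Only if input remains beyond that do
    // blocks turn to the GridQueue, draining one tile per iteration until
    // the full tiles are exhausted; the dequeued offset is staged through
    // shared memory so every thread in the block agrees on its tile.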
    /**
     * Dequeue and reduce tiles of items as part of an inter-block reduction
     */
    __device__ __forceinline__ OutputT ConsumeTiles(
        OffsetT                         num_items,          ///< [in] Total number of global input items
        GridEvenShare<OffsetT>          &/*even_share*/,    ///< [in] GridEvenShare descriptor
        GridQueue<OffsetT>              &queue,             ///< [in,out] GridQueue descriptor
        Int2Type<GRID_MAPPING_DYNAMIC>  /*is_dynamic*/)     ///< [in] Marker type indicating this is a dynamic mapping
    {
        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
            ConsumeTiles(num_items, queue, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
            ConsumeTiles(num_items, queue, Int2Type<false && ATTEMPT_VECTORIZATION>());
    }
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
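
// Illustrative sketch (not part of the upstream header): a hypothetical
// kernel showing how an AgentReduce specialization is typically driven under
// the even-share mapping.  The kernel name, the ExampleReducePolicy typedef
// above, and the one-partial-per-block output convention are assumptions for
// the sketch; in CUB proper, the real driver lives in the device-level
// dispatch layer, and host code must size and initialize even_share before
// launching.

template <
    typename InputIteratorT,
    typename OutputIteratorT,
    typename OffsetT,
    typename ReductionOp>
__global__ void ExampleReduceKernel(
    InputIteratorT              d_in,           // Input data to reduce
    OutputIteratorT             d_out,          // One partial reduction per thread block
    cub::GridEvenShare<OffsetT> even_share,     // Even-share work descriptor
    ReductionOp                 reduction_op)   // Binary reduction operator
{
    typedef cub::AgentReduce<
        cub::ExampleReducePolicy, InputIteratorT, OutputIteratorT,
        OffsetT, ReductionOp> AgentReduceT;

    // Shared memory for the block-wide reduction
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // Each block reduces its contiguous even-share segment
    even_share.BlockInit();
    typename AgentReduceT::OutputT block_aggregate =
        AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
            even_share.block_offset, even_share.block_end);

    // Thread 0 holds the block-wide result; a second pass (or a single-block
    // kernel) would then reduce the per-block partials in d_out
    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_aggregate;
}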