Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/agent/agent_scan.cuh view on Meta::CPAN
#include "single_pass_scan_operators.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
* Parameterizable tuning policy type for AgentScan
*/
template <
int _BLOCK_THREADS, ///< Threads per thread block
int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use
BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
struct AgentScanPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
};
static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use
static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
};
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan .
*/
template <
typename AgentScanPolicyT, ///< Parameterized AgentScanPolicyT tuning policy type
typename InputIteratorT, ///< Random-access input iterator type
typename OutputIteratorT, ///< Random-access output iterator type
typename ScanOpT, ///< Scan functor type
typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan)
typename OffsetT> ///< Signed integer type for global offsets
struct AgentScan
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
// Tile status descriptor interface type
typedef ScanTileState<OutputT> ScanTileStateT;
// Input iterator wrapper type (for applying cache modifier)
typedef typename If<IsPointer<InputIteratorT>::VALUE,
CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
InputIteratorT>::Type // Directly use the supplied input iterator type
WrappedInputIteratorT;
// Constants
enum
{
IS_INCLUSIVE = Equals<InitValueT, NullType>::VALUE, // Inclusive scan if no init_value type is provided
BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS,
ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
};
// Parameterized BlockLoad type
typedef BlockLoad<
OutputT,
AgentScanPolicyT::BLOCK_THREADS,
AgentScanPolicyT::ITEMS_PER_THREAD,
AgentScanPolicyT::LOAD_ALGORITHM>
BlockLoadT;
// Parameterized BlockStore type
typedef BlockStore<
OutputT,
AgentScanPolicyT::BLOCK_THREADS,
AgentScanPolicyT::ITEMS_PER_THREAD,
AgentScanPolicyT::STORE_ALGORITHM>
BlockStoreT;
// Parameterized BlockScan type
typedef BlockScan<
OutputT,
AgentScanPolicyT::BLOCK_THREADS,
AgentScanPolicyT::SCAN_ALGORITHM>
BlockScanT;
// Callback type for obtaining tile prefix during block scan
typedef TilePrefixCallbackOp<
OutputT,
ScanOpT,
ScanTileStateT>
xgboost/cub/cub/agent/agent_scan.cuh view on Meta::CPAN
/**
* Exclusive scan specialization (subsequent tiles)
*/
template <typename PrefixCallback>
__device__ __forceinline__
void ScanTile(
OutputT (&items)[ITEMS_PER_THREAD],
ScanOpT scan_op,
PrefixCallback &prefix_op,
Int2Type<false> /*is_inclusive*/)
{
BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op);
}
/**
* Inclusive scan specialization (subsequent tiles)
*/
template <typename PrefixCallback>
__device__ __forceinline__
void ScanTile(
OutputT (&items)[ITEMS_PER_THREAD],
ScanOpT scan_op,
PrefixCallback &prefix_op,
Int2Type<true> /*is_inclusive*/)
{
BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op);
}
//---------------------------------------------------------------------
// Constructor
//---------------------------------------------------------------------
// Constructor
__device__ __forceinline__
AgentScan(
TempStorage& temp_storage, ///< Reference to temp_storage
InputIteratorT d_in, ///< Input data
OutputIteratorT d_out, ///< Output data
ScanOpT scan_op, ///< Binary scan operator
InitValueT init_value) ///< Initial value to seed the exclusive scan
:
temp_storage(temp_storage.Alias()),
d_in(d_in),
d_out(d_out),
scan_op(scan_op),
init_value(init_value)
{}
//---------------------------------------------------------------------
// Cooperatively scan a device-wide sequence of tiles with other CTAs
//---------------------------------------------------------------------
/**
* Process a tile of input (dynamic chained scan)
*/
template <bool IS_LAST_TILE> ///< Whether the current tile is the last tile
__device__ __forceinline__ void ConsumeTile(
OffsetT num_remaining, ///< Number of global input items remaining (including this tile)
int tile_idx, ///< Tile index
OffsetT tile_offset, ///< Tile offset
ScanTileStateT& tile_state) ///< Global tile state descriptor
{
// Load items
OutputT items[ITEMS_PER_THREAD];
if (IS_LAST_TILE)
BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining);
else
BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
CTA_SYNC();
// Perform tile scan
if (tile_idx == 0)
{
// Scan first tile
OutputT block_aggregate;
ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
if ((!IS_LAST_TILE) && (threadIdx.x == 0))
tile_state.SetInclusive(0, block_aggregate);
}
else
{
// Scan non-first tile
TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
}
CTA_SYNC();
// Store items
if (IS_LAST_TILE)
BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining);
else
BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
}
/**
* Scan tiles of items as part of a dynamic chained scan
*/
__device__ __forceinline__ void ConsumeRange(
int num_items, ///< Total number of input items
ScanTileStateT& tile_state, ///< Global tile state descriptor
int start_tile) ///< The starting tile for the current grid
{
// Blocks are launched in increasing order, so just assign one tile per block
int tile_idx = start_tile + blockIdx.x; // Current tile index
OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile
OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile)
if (num_remaining > TILE_ITEMS)
{
// Not last tile
ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
}
else if (num_remaining > 0)
{
( run in 0.424 second using v1.01-cache-2.11-cpan-39bf76dae61 )