Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
#include "single_pass_scan_operators.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
#include "../block/block_discontinuity.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../iterator/constant_input_iterator.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
 * Parameterizable tuning policy type for AgentReduceByKey.
 *
 * A purely compile-time bundle: each template argument is re-exported as a
 * nested constant of the same name (without the trailing underscore) so that
 * the agent can read its configuration from a single policy type.
 */
template <
    int                 BLOCK_THREADS_,     ///< Threads per thread block
    int                 ITEMS_PER_THREAD_,  ///< Items per thread (per tile of input)
    BlockLoadAlgorithm  LOAD_ALGORITHM_,    ///< The BlockLoad algorithm to use
    CacheLoadModifier   LOAD_MODIFIER_,     ///< Cache load modifier for reading input elements
    BlockScanAlgorithm  SCAN_ALGORITHM_>    ///< The BlockScan algorithm to use
struct AgentReduceByKeyPolicy
{
    // Algorithm selectors
    static const BlockLoadAlgorithm LOAD_ALGORITHM = LOAD_ALGORITHM_;   ///< The BlockLoad algorithm to use
    static const CacheLoadModifier  LOAD_MODIFIER  = LOAD_MODIFIER_;    ///< Cache load modifier for reading input elements
    static const BlockScanAlgorithm SCAN_ALGORITHM = SCAN_ALGORITHM_;   ///< The BlockScan algorithm to use

    // Tile geometry
    enum
    {
        BLOCK_THREADS       = BLOCK_THREADS_,       ///< Threads per thread block
        ITEMS_PER_THREAD    = ITEMS_PER_THREAD_,    ///< Items per thread (per tile of input)
    };
};
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
*/
template <
typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicy tuning policy type
typename KeysInputIteratorT, ///< Random-access input iterator type for keys
typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys
typename ValuesInputIteratorT, ///< Random-access input iterator type for values
typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of items selected
typename EqualityOpT, ///< KeyT equality operator type
typename ReductionOpT, ///< ValueT reduction operator type
typename OffsetT> ///< Signed integer type for global offsets
struct AgentReduceByKey
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// The input keys type: the value type reported by the keys input iterator
typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
// The output keys type.  Selected at compile time: when the unique-keys output
// iterator reports `void` as its value type, the keys input iterator's value
// type is used in its place.
typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<KeysInputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT; // ... else the output iterator's value type
// The input values type: the value type reported by the values input iterator
typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
// The output values type.  Same selection rule as KeyOutputT: when the
// aggregates output iterator reports `void` as its value type, the values
// input iterator's value type is used in its place.
typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<ValuesInputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT; // ... else the output iterator's value type
// Tuple type for scanning (pairs accumulated segment-value with segment-index).
// ConsumeTile fills `.key` with the item's head flag and `.value` with the item's value.
typedef KeyValuePair<OffsetT, ValueOutputT> OffsetValuePairT;
// Tuple type for pairing keys and values.  When scattered, `.key` is written
// to the unique-keys output and `.value` to the aggregates output.
typedef KeyValuePair<KeyOutputT, ValueOutputT> KeyValuePairT;
// Tile status descriptor interface type (the global tile state handed to ConsumeTile)
typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
/**
 * Guarded inequality functor.
 *
 * Adapts an equality operator into a head-flag predicate that is aware of how
 * many items are valid.  For an item index below \p num_remaining the result
 * is the negation of the wrapped equality test.  The item at index
 * \p num_remaining (the first out-of-bounds item) is always flagged, and
 * every item beyond it is never flagged; the wrapped operator is not invoked
 * for either of those cases.
 */
template <typename WrappedEqualityOpT>
struct GuardedInequalityWrapper
{
    WrappedEqualityOpT  op;             ///< Wrapped equality operator
    int                 num_remaining;  ///< Items remaining

    /// Constructor: stores the equality operator and the count of valid items
    __host__ __device__ __forceinline__
    GuardedInequalityWrapper(WrappedEqualityOpT equality_op, int remaining)
    :
        op(equality_op),
        num_remaining(remaining)
    {}

    /// Boolean inequality operator, returns <tt>(a != b)</tt> for in-bounds items
    template <typename T>
    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const
    {
        // Out of bounds: flag only the first out-of-bounds item
        if (idx >= num_remaining)
            return (idx == num_remaining);

        // In bounds: defer to the wrapped equality operator
        return !op(a, b);
    }
};
// Constants
enum
{
BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS,
ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD,
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
// Compact and scatter pairs
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
if (segment_flags[ITEM])
{
temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM];
}
}
CTA_SYNC();
for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
{
KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item];
d_unique_out[num_tile_segments_prefix + item] = pair.key;
d_aggregates_out[num_tile_segments_prefix + item] = pair.value;
}
}
/**
 * Scatter flagged items, choosing between the two-phase and direct strategies.
 *
 * The two-phase path is taken only when it is enabled at compile time
 * (TWO_PHASE_SCATTER) and the tile holds more segments than the block has
 * threads.  In every other case the items are scattered directly.
 *
 * All arguments are forwarded unchanged to the selected routine; the direct
 * path does not use \p num_tile_segments or \p num_tile_segments_prefix.
 */
__device__ __forceinline__ void Scatter(
    KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
    OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
    OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
    OffsetT         num_tile_segments,
    OffsetT         num_tile_segments_prefix)
{
    // Two-phase pays off only when enabled and the tile averages more than one segment per thread
    const bool use_two_phase = TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS);

    if (!use_two_phase)
    {
        // One-phase scatter
        ScatterDirect(scatter_items, segment_flags, segment_indices);
        return;
    }

    ScatterTwoPhase(
        scatter_items,
        segment_flags,
        segment_indices,
        num_tile_segments,
        num_tile_segments_prefix);
}
//---------------------------------------------------------------------
// Cooperatively scan a device-wide sequence of tiles with other CTAs
//---------------------------------------------------------------------
/**
* Process a tile of input (dynamic chained scan)
*/
template <bool IS_LAST_TILE> ///< Whether the current tile is the last tile
__device__ __forceinline__ void ConsumeTile(
OffsetT num_remaining, ///< Number of global input items remaining (including this tile)
int tile_idx, ///< Tile index
OffsetT tile_offset, ///< Tile offset
ScanTileStateT& tile_state) ///< Global tile state descriptor
{
KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys
KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile keys shuffled up
ValueOutputT values[ITEMS_PER_THREAD]; // Tile values
OffsetT head_flags[ITEMS_PER_THREAD]; // Segment head flags
OffsetT segment_indices[ITEMS_PER_THREAD]; // Segment indices
OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices
KeyValuePairT scatter_items[ITEMS_PER_THREAD]; // Zipped key value pairs for scattering
// Load keys
if (IS_LAST_TILE)
BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining);
else
BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);
// Load tile predecessor key in first thread
KeyOutputT tile_predecessor;
if (threadIdx.x == 0)
{
tile_predecessor = (tile_idx == 0) ?
keys[0] : // First tile gets repeat of first item (thus first item will not be flagged as a head)
d_keys_in[tile_offset - 1]; // Subsequent tiles get last key from previous tile
}
CTA_SYNC();
// Load values
if (IS_LAST_TILE)
BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining);
else
BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values);
CTA_SYNC();
// Initialize head-flags and shuffle up the previous keys
if (IS_LAST_TILE)
{
// Use custom flag operator to additionally flag the first out-of-bounds item
GuardedInequalityWrapper<EqualityOpT> flag_op(equality_op, num_remaining);
BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
head_flags, keys, prev_keys, flag_op, tile_predecessor);
}
else
{
InequalityWrapper<EqualityOpT> flag_op(equality_op);
BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
head_flags, keys, prev_keys, flag_op, tile_predecessor);
}
// Zip values and head flags
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
scan_items[ITEM].value = values[ITEM];
scan_items[ITEM].key = head_flags[ITEM];
}
( run in 1.279 second using v1.01-cache-2.11-cpan-39bf76dae61 )