Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh view on Meta::CPAN
NumRunsOutputIteratorT,
EqualityOpT,
ReductionOpT,
OffsetT>
AgentReduceByKeyT;
// Shared memory for AgentReduceByKey
__shared__ typename AgentReduceByKeyT::TempStorage temp_storage;
// Process tiles
AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange(
num_items,
tile_state,
start_tile);
}
/******************************************************************************
* Dispatch
******************************************************************************/
/**
* Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey
*/
template <
typename KeysInputIteratorT, ///< Random-access input iterator type for keys
typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys
typename ValuesInputIteratorT, ///< Random-access input iterator type for values
typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered
typename EqualityOpT, ///< KeyT equality operator type
typename ReductionOpT, ///< ValueT reduction operator type
typename OffsetT> ///< Signed integer type for global offsets
struct DispatchReduceByKey
{
//-------------------------------------------------------------------------
// Types and constants
//-------------------------------------------------------------------------
// The input keys type
typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
// The output keys type
typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<KeysInputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT; // ... else the output iterator's value type
// The input values type
typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
// The output values type
typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<ValuesInputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT; // ... else the output iterator's value type
enum
{
INIT_KERNEL_THREADS = 128,
MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)),
COMBINED_INPUT_BYTES = sizeof(KeyOutputT) + sizeof(ValueOutputT),
};
// Tile status descriptor interface type
typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
//-------------------------------------------------------------------------
// Tuning policies
//-------------------------------------------------------------------------
/// SM35
struct Policy350
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 6,
ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
};
typedef AgentReduceByKeyPolicy<
128,
ITEMS_PER_THREAD,
BLOCK_LOAD_DIRECT,
LOAD_LDG,
BLOCK_SCAN_WARP_SCANS>
ReduceByKeyPolicyT;
};
/// SM30
struct Policy300
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 6,
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
};
typedef AgentReduceByKeyPolicy<
128,
ITEMS_PER_THREAD,
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_DEFAULT,
BLOCK_SCAN_WARP_SCANS>
ReduceByKeyPolicyT;
};
/// SM20
struct Policy200
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 11,
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
};
typedef AgentReduceByKeyPolicy<
128,
ITEMS_PER_THREAD,
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_DEFAULT,
BLOCK_SCAN_WARP_SCANS>
ReduceByKeyPolicyT;
};
/// SM13
struct Policy130
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 7,
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
};
typedef AgentReduceByKeyPolicy<
128,
ITEMS_PER_THREAD,
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_DEFAULT,
BLOCK_SCAN_WARP_SCANS>
ReduceByKeyPolicyT;
};
/// SM11
struct Policy110
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 5,
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
};
typedef AgentReduceByKeyPolicy<
64,
ITEMS_PER_THREAD,
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_DEFAULT,
BLOCK_SCAN_RAKING>
ReduceByKeyPolicyT;
};
/******************************************************************************
* Tuning policies of current PTX compiler pass
******************************************************************************/
#if (CUB_PTX_ARCH >= 350)
typedef Policy350 PtxPolicy;
#elif (CUB_PTX_ARCH >= 300)
typedef Policy300 PtxPolicy;
#elif (CUB_PTX_ARCH >= 200)
typedef Policy200 PtxPolicy;
#elif (CUB_PTX_ARCH >= 130)
typedef Policy130 PtxPolicy;
#else
typedef Policy110 PtxPolicy;
#endif
// "Opaque" policies (whose parameterizations aren't reflected in the type signature)
struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {};
/******************************************************************************
* Utilities
******************************************************************************/
/**
* Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
*/
template <typename KernelConfig>
CUB_RUNTIME_FUNCTION __forceinline__
static void InitConfigs(
int ptx_version,
KernelConfig &reduce_by_key_config)
{
#if (CUB_PTX_ARCH > 0)
(void)ptx_version;
// We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();
#else
// We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
if (ptx_version >= 350)
( run in 0.994 second using v1.01-cache-2.11-cpan-39bf76dae61 )