glob results from the CPAN

Alien-XGBoost


#include "single_pass_scan_operators.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
#include "../block/block_discontinuity.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../iterator/constant_input_iterator.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/******************************************************************************
 * Tuning policy types
 ******************************************************************************/

/**
 * Parameterizable tuning policy type for AgentReduceByKey
 */
template <
    int                         _BLOCK_THREADS,                 ///< Threads per thread block
    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
struct AgentReduceByKeyPolicy
{
    enum
    {
        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
    };

    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
};


/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
 */
template <
    typename    AgentReduceByKeyPolicyT,        ///< Parameterized AgentReduceByKeyPolicy tuning policy type
    typename    KeysInputIteratorT,             ///< Random-access input iterator type for keys
    typename    UniqueOutputIteratorT,          ///< Random-access output iterator type for keys
    typename    ValuesInputIteratorT,           ///< Random-access input iterator type for values
    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
    typename    NumRunsOutputIteratorT,         ///< Output iterator type for recording number of items selected
    typename    EqualityOpT,                    ///< KeyT equality operator type
    typename    ReductionOpT,                   ///< ValueT reduction operator type
    typename    OffsetT>                        ///< Signed integer type for global offsets
struct AgentReduceByKey
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // The input keys type
    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;

    // The output keys type
    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type

    // The input values type
    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;

    // The output values type
    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type

    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
    typedef KeyValuePair<OffsetT, ValueOutputT> OffsetValuePairT;

    // Tuple type for pairing keys and values
    typedef KeyValuePair<KeyOutputT, ValueOutputT> KeyValuePairT;

    // Tile status descriptor interface type
    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;

    // Guarded inequality functor
    template <typename _EqualityOpT>
    struct GuardedInequalityWrapper
    {
        _EqualityOpT     op;             ///< Wrapped equality operator
        int             num_remaining;  ///< Items remaining

        /// Constructor
        __host__ __device__ __forceinline__
        GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {}

        /// Boolean inequality operator, returns <tt>(a != b)</tt>
        template <typename T>
        __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const
        {
            if (idx < num_remaining)
                return !op(a, b);   // In bounds

            // Return true if first out-of-bounds item, false otherwise
            return (idx == num_remaining);
       }
    };


    // Constants
    enum
    {
        BLOCK_THREADS       = AgentReduceByKeyPolicyT::BLOCK_THREADS,
        ITEMS_PER_THREAD    = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD,

xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN

        // Compact and scatter pairs
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            if (segment_flags[ITEM])
            {
                temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM];
            }
        }

        CTA_SYNC();

        for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
        {
            KeyValuePairT pair                                  = temp_storage.raw_exchange.Alias()[item];
            d_unique_out[num_tile_segments_prefix + item]       = pair.key;
            d_aggregates_out[num_tile_segments_prefix + item]   = pair.value;
        }
    }


    /**
     * Scatter flagged items
     */
    __device__ __forceinline__ void Scatter(
        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
        OffsetT         num_tile_segments,
        OffsetT         num_tile_segments_prefix)
    {
        // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one
        if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS))
        {
            ScatterTwoPhase(
                scatter_items,
                segment_flags,
                segment_indices,
                num_tile_segments,
                num_tile_segments_prefix);
        }
        else
        {
            ScatterDirect(
                scatter_items,
                segment_flags,
                segment_indices);
        }
    }


    //---------------------------------------------------------------------
    // Cooperatively scan a device-wide sequence of tiles with other CTAs
    //---------------------------------------------------------------------

    /**
     * Process a tile of input (dynamic chained scan)
     */
    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
    __device__ __forceinline__ void ConsumeTile(
        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
        int                 tile_idx,           ///< Tile index
        OffsetT             tile_offset,        ///< Tile offset
        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
    {
        KeyOutputT          keys[ITEMS_PER_THREAD];             // Tile keys
        KeyOutputT          prev_keys[ITEMS_PER_THREAD];        // Tile keys shuffled up
        ValueOutputT        values[ITEMS_PER_THREAD];           // Tile values
        OffsetT             head_flags[ITEMS_PER_THREAD];       // Segment head flags
        OffsetT             segment_indices[ITEMS_PER_THREAD];  // Segment indices
        OffsetValuePairT    scan_items[ITEMS_PER_THREAD];       // Zipped values and segment flags|indices
        KeyValuePairT       scatter_items[ITEMS_PER_THREAD];    // Zipped key value pairs for scattering

        // Load keys
        if (IS_LAST_TILE)
            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining);
        else
            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);

        // Load tile predecessor key in first thread
        KeyOutputT tile_predecessor;
        if (threadIdx.x == 0)
        {
            tile_predecessor = (tile_idx == 0) ?
                keys[0] :                       // First tile gets repeat of first item (thus first item will not be flagged as a head)
                d_keys_in[tile_offset - 1];     // Subsequent tiles get last key from previous tile
        }

        CTA_SYNC();

        // Load values
        if (IS_LAST_TILE)
            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining);
        else
            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values);

        CTA_SYNC();

        // Initialize head-flags and shuffle up the previous keys
        if (IS_LAST_TILE)
        {
            // Use custom flag operator to additionally flag the first out-of-bounds item
            GuardedInequalityWrapper<EqualityOpT> flag_op(equality_op, num_remaining);
            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
                head_flags, keys, prev_keys, flag_op, tile_predecessor);
        }
        else
        {
            InequalityWrapper<EqualityOpT> flag_op(equality_op);
            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
                head_flags, keys, prev_keys, flag_op, tile_predecessor);
        }

        // Zip values and head flags
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            scan_items[ITEM].value  = values[ITEM];
            scan_items[ITEM].key    = head_flags[ITEM];
        }

( run in 1.279 second using v1.01-cache-2.11-cpan-39bf76dae61 )