Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/cub/agent/agent_scan.cuh  view on Meta::CPAN


#include "single_pass_scan_operators.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/******************************************************************************
 * Tuning policy types
 ******************************************************************************/

/**
 * Parameterizable tuning policy type for AgentScan
 */
template <
    int                         _BLOCK_THREADS,                 ///< Threads per thread block
    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
    BlockStoreAlgorithm         _STORE_ALGORITHM,               ///< The BlockStore algorithm to use
    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
struct AgentScanPolicy
{
    enum
    {
        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
    };

    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
    static const BlockStoreAlgorithm    STORE_ALGORITHM         = _STORE_ALGORITHM;         ///< The BlockStore algorithm to use
    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;          ///< The BlockScan algorithm to use
};




/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan .
 */
template <
    typename AgentScanPolicyT,      ///< Parameterized AgentScanPolicyT tuning policy type
    typename InputIteratorT,        ///< Random-access input iterator type
    typename OutputIteratorT,       ///< Random-access output iterator type
    typename ScanOpT,               ///< Scan functor type
    typename InitValueT,            ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan)
    typename OffsetT>               ///< Signed integer type for global offsets
struct AgentScan
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    // Tile status descriptor interface type
    typedef ScanTileState<OutputT> ScanTileStateT;

    // Input iterator wrapper type (for applying cache modifier)
    typedef typename If<IsPointer<InputIteratorT>::VALUE,
            CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedInputIterator
            InputIteratorT>::Type                                                           // Directly use the supplied input iterator type
        WrappedInputIteratorT;

    // Constants
    enum
    {
        IS_INCLUSIVE        = Equals<InitValueT, NullType>::VALUE,            // Inclusive scan if no init_value type is provided
        BLOCK_THREADS       = AgentScanPolicyT::BLOCK_THREADS,
        ITEMS_PER_THREAD    = AgentScanPolicyT::ITEMS_PER_THREAD,
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    // Parameterized BlockLoad type
    typedef BlockLoad<
            OutputT,
            AgentScanPolicyT::BLOCK_THREADS,
            AgentScanPolicyT::ITEMS_PER_THREAD,
            AgentScanPolicyT::LOAD_ALGORITHM>
        BlockLoadT;

    // Parameterized BlockStore type
    typedef BlockStore<
            OutputT,
            AgentScanPolicyT::BLOCK_THREADS,
            AgentScanPolicyT::ITEMS_PER_THREAD,
            AgentScanPolicyT::STORE_ALGORITHM>
        BlockStoreT;

    // Parameterized BlockScan type
    typedef BlockScan<
            OutputT,
            AgentScanPolicyT::BLOCK_THREADS,
            AgentScanPolicyT::SCAN_ALGORITHM>
        BlockScanT;

    // Callback type for obtaining tile prefix during block scan
    typedef TilePrefixCallbackOp<
            OutputT,
            ScanOpT,
            ScanTileStateT>

xgboost/cub/cub/agent/agent_scan.cuh  view on Meta::CPAN

    /**
     * Exclusive scan specialization (subsequent tiles)
     */
    template <typename PrefixCallback>
    __device__ __forceinline__
    void ScanTile(
        OutputT             (&items)[ITEMS_PER_THREAD],
        ScanOpT             scan_op,
        PrefixCallback      &prefix_op,
        Int2Type<false>     /*is_inclusive*/)
    {
        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op);
    }


    /**
     * Inclusive scan specialization (subsequent tiles)
     */
    template <typename PrefixCallback>
    __device__ __forceinline__
    void ScanTile(
        OutputT             (&items)[ITEMS_PER_THREAD],
        ScanOpT             scan_op,
        PrefixCallback      &prefix_op,
        Int2Type<true>      /*is_inclusive*/)
    {
        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op);
    }


    //---------------------------------------------------------------------
    // Constructor
    //---------------------------------------------------------------------

    // Constructor
    __device__ __forceinline__
    AgentScan(
        TempStorage&    temp_storage,       ///< Reference to temp_storage
        InputIteratorT  d_in,               ///< Input data
        OutputIteratorT d_out,              ///< Output data
        ScanOpT         scan_op,            ///< Binary scan operator
        InitValueT      init_value)         ///< Initial value to seed the exclusive scan
    :
        temp_storage(temp_storage.Alias()),
        d_in(d_in),
        d_out(d_out),
        scan_op(scan_op),
        init_value(init_value)
    {}


    //---------------------------------------------------------------------
    // Cooperatively scan a device-wide sequence of tiles with other CTAs
    //---------------------------------------------------------------------

    /**
     * Process a tile of input (dynamic chained scan)
     */
    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
    __device__ __forceinline__ void ConsumeTile(
        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
        int                 tile_idx,           ///< Tile index
        OffsetT             tile_offset,        ///< Tile offset
        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
    {
        // Load items
        OutputT items[ITEMS_PER_THREAD];

        if (IS_LAST_TILE)
            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining);
        else
            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);

        CTA_SYNC();

        // Perform tile scan
        if (tile_idx == 0)
        {
            // Scan first tile
            OutputT block_aggregate;
            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
                tile_state.SetInclusive(0, block_aggregate);
        }
        else
        {
            // Scan non-first tile
            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
        }

        CTA_SYNC();

        // Store items
        if (IS_LAST_TILE)
            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining);
        else
            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
    }


    /**
     * Scan tiles of items as part of a dynamic chained scan
     */
    __device__ __forceinline__ void ConsumeRange(
        int                 num_items,          ///< Total number of input items
        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
        int                 start_tile)         ///< The starting tile for the current grid
    {
        // Blocks are launched in increasing order, so just assign one tile per block
        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)

        if (num_remaining > TILE_ITEMS)
        {
            // Not last tile
            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
        }
        else if (num_remaining > 0)
        {



( run in 0.424 second using v1.01-cache-2.11-cpan-39bf76dae61 )