Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/cub/agent/agent_radix_sort_upsweep.cuh  view on Meta::CPAN

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
 */

#pragma once

#include "../thread/thread_reduce.cuh"
#include "../thread/thread_load.cuh"
#include "../block/block_load.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * Tuning policy types
 ******************************************************************************/

/**
 * Parameterizable tuning policy type for AgentRadixSortUpsweep
 */
template <
    int                 _BLOCK_THREADS,     ///< Threads per thread block
    int                 _ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
    CacheLoadModifier   _LOAD_MODIFIER,     ///< Cache load modifier for reading keys
    int                 _RADIX_BITS>        ///< The number of radix bits, i.e., log2(bins)
struct AgentRadixSortUpsweepPolicy
{
    enum
    {
        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
        RADIX_BITS          = _RADIX_BITS,          ///< The number of radix bits, i.e., log2(bins)
    };

    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;      ///< Cache load modifier for reading keys
};


/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
 */
template <
    typename AgentRadixSortUpsweepPolicy,   ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
    typename KeyT,                          ///< KeyT type
    typename OffsetT>                       ///< Signed integer type for global offsets
struct AgentRadixSortUpsweep
{

    //---------------------------------------------------------------------
    // Type definitions and constants
    //---------------------------------------------------------------------

    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;

    // Integer type for digit counters (to be packed into words of PackedCounters)
    typedef unsigned char DigitCounter;

    // Integer type for packing DigitCounters into columns of shared memory banks
    typedef unsigned int PackedCounter;

    static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER;

    enum
    {
        RADIX_BITS              = AgentRadixSortUpsweepPolicy::RADIX_BITS,
        BLOCK_THREADS           = AgentRadixSortUpsweepPolicy::BLOCK_THREADS,
        KEYS_PER_THREAD         = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD,

        RADIX_DIGITS            = 1 << RADIX_BITS,

        LOG_WARP_THREADS        = CUB_PTX_LOG_WARP_THREADS,
        WARP_THREADS            = 1 << LOG_WARP_THREADS,
        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

        TILE_ITEMS              = BLOCK_THREADS * KEYS_PER_THREAD,

        BYTES_PER_COUNTER       = sizeof(DigitCounter),
        LOG_BYTES_PER_COUNTER   = Log2<BYTES_PER_COUNTER>::VALUE,

        PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),
        LOG_PACKING_RATIO       = Log2<PACKING_RATIO>::VALUE,

        LOG_COUNTER_LANES       = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
        COUNTER_LANES           = 1 << LOG_COUNTER_LANES,

        // To prevent counter overflow, we must periodically unpack and aggregate the
        // digit counters back into registers.  Each counter lane is assigned to a
        // warp for aggregation.

        LANES_PER_WARP          = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),

        // Unroll tiles in batches without risk of counter overflow
        UNROLL_COUNT            = CUB_MIN(64, 255 / KEYS_PER_THREAD),
        UNROLLED_ELEMENTS       = UNROLL_COUNT * TILE_ITEMS,
    };


    // Input iterator wrapper type (for applying cache modifier)s
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT> KeysItr;

    /**
     * Shared memory storage layout
     */
    struct _TempStorage
    {



( run in 2.834 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )