Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/test/test_block_scan.cu  view on Meta::CPAN


/// Inclusive sum (PREFIX, 1)
template <typename BlockScanT, typename T, typename PrefixCallbackOp>
__device__ __forceinline__ void DeviceTest(
    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
{
    block_scan.InclusiveSum(data[0], data[0], prefix_op);
}

/// Inclusive sum (PREFIX, ITEMS_PER_THREAD)
template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
__device__ __forceinline__ void DeviceTest(
    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
{
    block_scan.InclusiveSum(data, data, prefix_op);
}



//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------

/**
 * BlockScan test kernel.
 */
template <
    int                 BLOCK_DIM_X,
    int                 BLOCK_DIM_Y,
    int                 BLOCK_DIM_Z,
    int                 ITEMS_PER_THREAD,
    ScanMode            SCAN_MODE,
    TestMode            TEST_MODE,
    BlockScanAlgorithm  ALGORITHM,
    typename            T,
    typename            ScanOpT>
__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
__global__ void BlockScanKernel(
    T                   *d_in,
    T                   *d_out,
    T                   *d_aggregate,
    ScanOpT              scan_op,
    T                   initial_value,
    clock_t             *d_elapsed)
{
    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;

    // Parameterize BlockScan type for our thread block
    typedef BlockScan<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockScanT;

    // Allocate temp storage in shared memory
    __shared__ typename BlockScanT::TempStorage temp_storage;

    int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);

    // Per-thread tile data
    T data[ITEMS_PER_THREAD];
    LoadDirectBlocked(linear_tid, d_in, data);

    __threadfence_block();      // workaround to prevent clock hoisting
    clock_t start = clock();
    __threadfence_block();      // workaround to prevent clock hoisting

    // Test scan
    T                                   block_aggregate;
    BlockScanT                          block_scan(temp_storage);
    BlockPrefixCallbackOp<T, ScanOpT>   prefix_op(linear_tid, initial_value, scan_op);

    DeviceTest(block_scan, data, initial_value, scan_op, block_aggregate, prefix_op,
        Int2Type<SCAN_MODE>(), Int2Type<TEST_MODE>(), Int2Type<Traits<T>::PRIMITIVE>());

    // Stop cycle timer
    __threadfence_block();      // workaround to prevent clock hoisting
    clock_t stop = clock();
    __threadfence_block();      // workaround to prevent clock hoisting

    // Store output
    StoreDirectBlocked(linear_tid, d_out, data);

    // Store block_aggregate
    if (TEST_MODE != BASIC)
        d_aggregate[linear_tid] = block_aggregate;

    // Store prefix
    if (TEST_MODE == PREFIX)
    {
        if (linear_tid == 0)
            d_out[TILE_SIZE] = prefix_op.prefix;
    }

    // Store time
    if (linear_tid == 0)
        *d_elapsed = (start > stop) ? start - stop : stop - start;
}



//---------------------------------------------------------------------
// Host utility subroutines
//---------------------------------------------------------------------

/**
 * Initialize exclusive-scan problem (and solution)
 */
template <typename T, typename ScanOpT>
T Initialize(
    GenMode     gen_mode,
    T           *h_in,
    T           *h_reference,
    int         num_items,
    ScanOpT     scan_op,
    T           initial_value,
    Int2Type<EXCLUSIVE>)
{
    InitValue(gen_mode, h_in[0], 0);

    T block_aggregate   = h_in[0];
    h_reference[0]      = initial_value;
    T inclusive         = scan_op(initial_value, h_in[0]);

    for (int i = 1; i < num_items; ++i)
    {
        InitValue(gen_mode, h_in[i], i);
        h_reference[i] = inclusive;
        inclusive = scan_op(inclusive, h_in[i]);
        block_aggregate = scan_op(block_aggregate, h_in[i]);
    }

    return block_aggregate;
}


/**
 * Initialize inclusive-scan problem (and solution)
 */
template <typename T, typename ScanOpT>
T Initialize(
    GenMode     gen_mode,



( run in 1.262 second using v1.01-cache-2.11-cpan-39bf76dae61 )