locked results from the CPAN

Alien-XGBoost
{
    return block_reduce.Sum(data[0]);
}

/// Sum reduction (full, ITEMS_PER_THREAD)
template <typename BlockReduceT, typename T, int ITEMS_PER_THREAD>
__device__ __forceinline__ T DeviceTest(
    BlockReduceT &block_reduce, T (&data)[ITEMS_PER_THREAD], Sum &reduction_op)
{
    return block_reduce.Sum(data);
}

/// Sum reduction (partial, 1)
template <typename BlockReduceT, typename T>
__device__ __forceinline__ T DeviceTest(
    BlockReduceT &block_reduce, T &data, Sum &reduction_op, int valid_threads)
{
    return block_reduce.Sum(data, valid_threads);
}


/**
 * Test full-tile reduction kernel (where num_items is an even
 * multiple of BLOCK_THREADS)
 */
template <
    BlockReduceAlgorithm    ALGORITHM,
    int                     BLOCK_DIM_X,
    int                     BLOCK_DIM_Y,
    int                     BLOCK_DIM_Z,
    int                     ITEMS_PER_THREAD,
    typename                T,
    typename                ReductionOp>
__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
__global__ void FullTileReduceKernel(
    T                       *d_in,
    T                       *d_out,
    ReductionOp             reduction_op,
    int                     tiles,
    clock_t                 *d_elapsed)
{
    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;

    // Cooperative threadblock reduction utility type (returns aggregate in thread 0)
    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT;

    // Allocate temp storage in shared memory
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);

    // Per-thread tile data
    T data[ITEMS_PER_THREAD];

    // Load first tile of data
    int block_offset = 0;

    if (block_offset < TILE_SIZE * tiles)
    {
        LoadDirectBlocked(linear_tid, d_in + block_offset, data);
        block_offset += TILE_SIZE;

        // Start cycle timer
        clock_t start = clock();

        // Cooperative reduce first tile
        BlockReduceT block_reduce(temp_storage) ;
        T block_aggregate = DeviceTest(block_reduce, data, reduction_op);

        // Stop cycle timer
 #if CUB_PTX_ARCH == 100
        // Bug: recording stop clock causes mis-write of running prefix value
        clock_t stop = 0;
#else
        clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100
        clock_t elapsed = (start > stop) ? start - stop : stop - start;

        // Loop over input tiles
        while (block_offset < TILE_SIZE * tiles)
        {
            // TestBarrier between threadblock reductions
            __syncthreads();
    
            // Load tile of data
            LoadDirectBlocked(linear_tid, d_in + block_offset, data);
            block_offset += TILE_SIZE;

            // Start cycle timer
            clock_t start = clock();

            // Cooperatively reduce the tile's aggregate
            BlockReduceT block_reduce(temp_storage) ;
            T tile_aggregate = DeviceTest(block_reduce, data, reduction_op);

            // Stop cycle timer
#if CUB_PTX_ARCH == 100
            // Bug: recording stop clock causes mis-write of running prefix value
            clock_t stop = 0;
#else
            clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100
            elapsed += (start > stop) ? start - stop : stop - start;

            // Reduce threadblock aggregate
            block_aggregate = reduction_op(block_aggregate, tile_aggregate);
        }

        // Store data
        if (linear_tid == 0)
        {
            d_out[0] = block_aggregate;
            *d_elapsed = elapsed;
        }
    }
}



/**
 * Test partial-tile reduction kernel (where num_items < BLOCK_THREADS)
 */
template <
    BlockReduceAlgorithm    ALGORITHM,
    int                     BLOCK_DIM_X,
    int                     BLOCK_DIM_Y,
    int                     BLOCK_DIM_Z,
    typename                T,
    typename                ReductionOp>
__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
__global__ void PartialTileReduceKernel(
    T                       *d_in,
    T                       *d_out,
    int                     num_items,
    ReductionOp             reduction_op,
    clock_t                 *d_elapsed)
{
    // Cooperative threadblock reduction utility type (returns aggregate only in thread-0)
    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT;

    // Allocate temp storage in shared memory
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);

    // Per-thread tile data
( run in 0.318 second using v1.01-cache-2.11-cpan-bbe5e583499 )