locked results from the CPAN

Alien-XGBoost
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Simple demonstration of cub::BlockScan
 *
 * To compile using the command line:
 *   nvcc -arch=sm_XX example_block_scan.cu -I../.. -lcudart -O3
 *
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console (define before including cub.h)
#define CUB_STDERR

#include <stdio.h>
#include <iostream>

#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/block/block_scan.cuh>

#include "../../test/test_util.h"

using namespace cub;

//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

/// Verbose output
bool g_verbose = false;

/// Timing iterations
int g_timing_iterations = 100;

/// Default grid size
int g_grid_size = 1;



//---------------------------------------------------------------------
// Kernels
//---------------------------------------------------------------------

/**
 * Simple kernel for performing a block-wide exclusive prefix sum over integers
 */
template <
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    BlockScanAlgorithm      ALGORITHM>
__global__ void BlockPrefixSumKernel(
    int         *d_in,          // Tile of input
    int         *d_out,         // Tile of output
    clock_t     *d_elapsed)     // Elapsed cycle count of block scan
{
    // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
    typedef BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;

    // Specialize BlockStore type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
    typedef BlockStore<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;

    // Specialize BlockScan type for our thread block
    typedef BlockScan<int, BLOCK_THREADS, ALGORITHM> BlockScanT;

    // Shared memory
    __shared__ union
    {
        typename BlockLoadT::TempStorage    load;
        typename BlockStoreT::TempStorage   store;
        typename BlockScanT::TempStorage    scan;
    } temp_storage;

    // Per-thread tile data
    int data[ITEMS_PER_THREAD];

    // Load items into a blocked arrangement
    BlockLoadT(temp_storage.load).Load(d_in, data);

    // Barrier for smem reuse
    __syncthreads();

    // Start cycle timer
    clock_t start = clock();

    // Compute exclusive prefix sum
    int aggregate;
    BlockScanT(temp_storage.scan).ExclusiveSum(data, data, aggregate);

    // Stop cycle timer
    clock_t stop = clock();

    // Barrier for smem reuse
    __syncthreads();

    // Store items from a blocked arrangement
    BlockStoreT(temp_storage.store).Store(d_out, data);

    // Store aggregate and elapsed clocks
    if (threadIdx.x == 0)
    {
        *d_elapsed = (start > stop) ? start - stop : stop - start;
        d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
    }
}



//---------------------------------------------------------------------
// Host utilities
//---------------------------------------------------------------------

/**
 * Initialize exclusive prefix sum problem (and solution).
 * Returns the aggregate
 */
int Initialize(
    int *h_in,
    int *h_reference,
    int num_items)
{
    int inclusive = 0;

    for (int i = 0; i < num_items; ++i)
    {
        h_in[i] = i % 17;

        h_reference[i] = inclusive;
        inclusive += h_in[i];
    }

    return inclusive;
}


/**
 * Test thread block scan
 */
template <
    int                 BLOCK_THREADS,
    int                 ITEMS_PER_THREAD,
    BlockScanAlgorithm  ALGORITHM>
void Test()
{
    const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;

    // Allocate host arrays
    int *h_in           = new int[TILE_SIZE];
    int *h_reference    = new int[TILE_SIZE];
    int *h_gpu          = new int[TILE_SIZE + 1];

    // Initialize problem and reference output on host
    int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);

    // Initialize device arrays
    int *d_in           = NULL;
( run in 2.666 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )