Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/test/test_block_scan.cu  view on Meta::CPAN

        (SCAN_MODE == INCLUSIVE) ? "Inclusive" : "Exclusive", typeid(ScanOpT).name(),
        BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
        ITEMS_PER_THREAD,  TILE_SIZE,
        typeid(T).name(), (int) sizeof(T));
    fflush(stdout);

    // Initialize/clear device arrays
    T       *d_in = NULL;
    T       *d_out = NULL;
    T       *d_aggregate = NULL;
    clock_t *d_elapsed = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long)));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TILE_SIZE));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * (TILE_SIZE + 2)));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_aggregate, sizeof(T) * BLOCK_THREADS));
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * TILE_SIZE, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * (TILE_SIZE + 1)));
    CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * BLOCK_THREADS));

    // Display input problem data
    if (g_verbose)
    {
        printf("Input data: ");
        for (int i = 0; i < TILE_SIZE; i++)
        {
            std::cout << CoutCast(h_in[i]) << ", ";
        }
        printf("\n\n");
    }

    // Run block_aggregate/prefix kernel
    dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
    BlockScanKernel<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM><<<1, block_dims>>>(
        d_in,
        d_out,
        d_aggregate,
        scan_op,
        initial_value,
        d_elapsed);

    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());

    // Copy out and display results
    printf("\tScan results: ");
    int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    if (TEST_MODE == AGGREGATE)
    {
        // Copy out and display block_aggregate
        printf("\tScan block aggregate: ");
        compare = CompareDeviceResults(h_aggregate, d_aggregate, BLOCK_THREADS, g_verbose, g_verbose);
        printf("%s\n", compare ? "FAIL" : "PASS");
        AssertEquals(0, compare);
    }

    if (TEST_MODE == PREFIX)
    {
        // Copy out and display updated prefix
        printf("\tScan running total: ");
        T running_total = scan_op(initial_value, block_aggregate);
        compare = CompareDeviceResults(&running_total, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
        printf("%s\n", compare ? "FAIL" : "PASS");
        AssertEquals(0, compare);
    }

    printf("\tElapsed clocks: ");
    DisplayDeviceResults(d_elapsed, 1);

    // Cleanup
    if (h_in) delete[] h_in;
    if (h_reference) delete[] h_reference;
    if (h_aggregate) delete[] h_aggregate;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_aggregate) CubDebugExit(g_allocator.DeviceFree(d_aggregate));
    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
}


/**
 * Test threadblock scan.  (Specialized for insufficient resources)
 */
template <
    int                 BLOCK_DIM_X,
    int                 BLOCK_DIM_Y,
    int                 BLOCK_DIM_Z,
    int                 ITEMS_PER_THREAD,
    ScanMode            SCAN_MODE,
    TestMode            TEST_MODE,
    BlockScanAlgorithm  ALGORITHM,
    typename            ScanOpT,
    typename            T>
void Test(
    GenMode             gen_mode,
    ScanOpT             scan_op,
    T                   initial_value,
    Int2Type<false>     sufficient_resources)
{}


/**
 * Test threadblock scan.
 */
template <
    int                 BLOCK_DIM_X,
    int                 BLOCK_DIM_Y,
    int                 BLOCK_DIM_Z,
    int                 ITEMS_PER_THREAD,
    ScanMode            SCAN_MODE,
    TestMode            TEST_MODE,
    BlockScanAlgorithm  ALGORITHM,
    typename            ScanOpT,
    typename            T>
void Test(
    GenMode             gen_mode,
    ScanOpT             scan_op,
    T                   initial_value)
{



( run in 0.669 second using v1.01-cache-2.11-cpan-39bf76dae61 )