Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/experimental/spmv_compare.cu  view on Meta::CPAN

                    params.d_vector_y[row_idx] = nonzero;
            }
        }
    }

}


/**
 * Run GPU I/O proxy
 */
template <
    typename ValueT,
    typename OffsetT>
float TestGpuCsrIoProxy(
    SpmvParams<ValueT, OffsetT>&    params,
    int                             timing_iterations)
{
    enum {
        BLOCK_THREADS       = 128,
        ITEMS_PER_THREAD    = 7,
        TILE_SIZE           = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

//    size_t smem = 1024 * 16;
    size_t smem = 1024 * 0;

    unsigned int nonzero_blocks = (params.num_nonzeros + TILE_SIZE - 1) / TILE_SIZE;
    unsigned int row_blocks = (params.num_rows + TILE_SIZE - 1) / TILE_SIZE;
    unsigned int blocks = std::max(nonzero_blocks, row_blocks);

    typedef TexRefInputIterator<ValueT, 1234, int> TexItr;
    TexItr x_itr;
    CubDebugExit(x_itr.BindTexture(params.d_vector_x));

    // Get device ordinal
    int device_ordinal;
    CubDebugExit(cudaGetDevice(&device_ordinal));

    // Get device SM version
    int sm_version;
    CubDebugExit(SmVersion(sm_version, device_ordinal));

    void (*kernel)(SpmvParams<ValueT, OffsetT>, TexItr) = NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD>;


    int spmv_sm_occupancy;
    CubDebugExit(MaxSmOccupancy(spmv_sm_occupancy, kernel, BLOCK_THREADS, smem));

    if (!g_quiet)
        printf("NonZeroIoKernel<%d,%d><<<%d, %d>>>, sm occupancy %d\n", BLOCK_THREADS, ITEMS_PER_THREAD, blocks, BLOCK_THREADS, spmv_sm_occupancy);

    // Warmup
    NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);

    // Check for failures
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(SyncStream(0));

    // Timing
    GpuTimer timer;
    float elapsed_millis = 0.0;
    timer.Start();
    for (int it = 0; it < timing_iterations; ++it)
    {
        NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    CubDebugExit(x_itr.UnbindTexture());

    return elapsed_millis / timing_iterations;
}



//---------------------------------------------------------------------
// cuSparse HybMV
//---------------------------------------------------------------------

/**
 * Run cuSparse HYB SpMV (specialized for fp32)
 */
template <
    typename OffsetT>
float TestCusparseHybmv(
    float*                          vector_y_in,
    float*                          reference_vector_y_out,
    SpmvParams<float, OffsetT>&     params,
    int                             timing_iterations,
    cusparseHandle_t                cusparse)
{
    CpuTimer cpu_timer;
    cpu_timer.Start();

    // Construct Hyb matrix
    cusparseMatDescr_t mat_desc;
    cusparseHybMat_t hyb_desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
    cusparseStatus_t status = cusparseScsr2hyb(
        cusparse,
        params.num_rows, params.num_cols,
        mat_desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        hyb_desc,
        0,
        CUSPARSE_HYB_PARTITION_AUTO);
    AssertEquals(CUSPARSE_STATUS_SUCCESS, status);

    cudaDeviceSynchronize();
    cpu_timer.Stop();
    float elapsed_millis = cpu_timer.ElapsedMillis();
    printf("HYB setup ms, %.5f, ", elapsed_millis);

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
        cusparse,
        CUSPARSE_OPERATION_NON_TRANSPOSE,
        &params.alpha, mat_desc,
        hyb_desc,
        params.d_vector_x, &params.beta, params.d_vector_y));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    elapsed_millis    = 0.0;
    GpuTimer timer;

    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
            cusparse,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            &params.alpha, mat_desc,
            hyb_desc,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    // Cleanup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));

    return elapsed_millis / timing_iterations;
}


/**
 * Run cuSparse HYB SpMV (specialized for fp64)
 */
template <
    typename OffsetT>
float TestCusparseHybmv(
    double*                         vector_y_in,
    double*                         reference_vector_y_out,
    SpmvParams<double, OffsetT>&    params,
    int                             timing_iterations,
    cusparseHandle_t                cusparse)
{
    CpuTimer cpu_timer;
    cpu_timer.Start();

    // Construct Hyb matrix
    cusparseMatDescr_t mat_desc;
    cusparseHybMat_t hyb_desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsr2hyb(
        cusparse,
        params.num_rows, params.num_cols,
        mat_desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        hyb_desc,
        0,
        CUSPARSE_HYB_PARTITION_AUTO));

    cudaDeviceSynchronize();
    cpu_timer.Stop();
    float elapsed_millis = cpu_timer.ElapsedMillis();
    printf("HYB setup ms, %.5f, ", elapsed_millis);

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
        cusparse,
        CUSPARSE_OPERATION_NON_TRANSPOSE,
        &params.alpha, mat_desc,
        hyb_desc,
        params.d_vector_x, &params.beta, params.d_vector_y));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    elapsed_millis    = 0.0;
    GpuTimer timer;

    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
            cusparse,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            &params.alpha, mat_desc,
            hyb_desc,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    // Cleanup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));

    return elapsed_millis / timing_iterations;
}



//---------------------------------------------------------------------
// cuSparse CsrMV
//---------------------------------------------------------------------

/**
 * Run cuSparse SpMV (specialized for fp32)
 */
template <
    typename OffsetT>
float TestCusparseCsrmv(
    float*                          vector_y_in,
    float*                          reference_vector_y_out,
    SpmvParams<float, OffsetT>&     params,
    int                             timing_iterations,
    cusparseHandle_t                cusparse)
{
    cusparseMatDescr_t desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
        cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
        params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, &params.beta, params.d_vector_y));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    float elapsed_millis    = 0.0;
    GpuTimer timer;

    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
    return elapsed_millis / timing_iterations;
}


/**
 * Run cuSparse SpMV (specialized for fp64)
 */
template <
    typename OffsetT>
float TestCusparseCsrmv(
    double*                         vector_y_in,
    double*                         reference_vector_y_out,
    SpmvParams<double, OffsetT>&    params,
    int                             timing_iterations,
    cusparseHandle_t                cusparse)
{
    cusparseMatDescr_t desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
        cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
        params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, &params.beta, params.d_vector_y));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    float elapsed_millis = 0.0;
    GpuTimer timer;
    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, &params.beta, params.d_vector_y));

    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
    return elapsed_millis / timing_iterations;
}

//---------------------------------------------------------------------
// GPU Merge-based SpMV
//---------------------------------------------------------------------

/**
 * Run CUB SpMV
 */
template <
    typename ValueT,
    typename OffsetT>
float TestGpuMergeCsrmv(
    ValueT*                         vector_y_in,
    ValueT*                         reference_vector_y_out,
    SpmvParams<ValueT, OffsetT>&    params,
    int                             timing_iterations)
{
    // Allocate temporary storage
    size_t temp_storage_bytes = 0;
    void *d_temp_storage = NULL;

    // Get amount of temporary storage needed
    CubDebugExit(DeviceSpmv::CsrMV(
        d_temp_storage, temp_storage_bytes,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, params.d_vector_y,
        params.num_rows, params.num_cols, params.num_nonzeros,
// params.alpha, params.beta,
        (cudaStream_t) 0, false));

    // Allocate
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(ValueT) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup
    CubDebugExit(DeviceSpmv::CsrMV(
        d_temp_storage, temp_storage_bytes,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, params.d_vector_y,
        params.num_rows, params.num_cols, params.num_nonzeros, 
// params.alpha, params.beta,
        (cudaStream_t) 0, !g_quiet));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    GpuTimer timer;
    float elapsed_millis = 0.0;

    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        CubDebugExit(DeviceSpmv::CsrMV(
            d_temp_storage, temp_storage_bytes,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, params.d_vector_y,
            params.num_rows, params.num_cols, params.num_nonzeros, 
// params.alpha, params.beta,
            (cudaStream_t) 0, false));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    return elapsed_millis / timing_iterations;
}

//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------

/**
 * Display perf
 */
template <typename ValueT, typename OffsetT>
void DisplayPerf(
    float                           device_giga_bandwidth,
    double                          avg_millis,
    CsrMatrix<ValueT, OffsetT>&     csr_matrix)
{
    double nz_throughput, effective_bandwidth;
    size_t total_bytes = (csr_matrix.num_nonzeros * (sizeof(ValueT) * 2 + sizeof(OffsetT))) +
        (csr_matrix.num_rows) * (sizeof(OffsetT) + sizeof(ValueT));

    nz_throughput       = double(csr_matrix.num_nonzeros) / avg_millis / 1.0e6;
    effective_bandwidth = double(total_bytes) / avg_millis / 1.0e6;

    if (!g_quiet)
        printf("fp%d: %.4f avg ms, %.5f gflops, %.3lf effective GB/s (%.2f%% peak)\n",
            sizeof(ValueT) * 8,
            avg_millis,
            2 * nz_throughput,
            effective_bandwidth,
            effective_bandwidth / device_giga_bandwidth * 100);
    else
        printf("%.5f, %.6f, %.3lf, %.2f%%, ",
            avg_millis,
            2 * nz_throughput,
            effective_bandwidth,
            effective_bandwidth / device_giga_bandwidth * 100);

    fflush(stdout);
}



/**
 * Run tests
 */
template <
    typename ValueT,
    typename OffsetT>
void RunTest(
    bool                        rcm_relabel,
    ValueT                      alpha,
    ValueT                      beta,
    CooMatrix<ValueT, OffsetT>& coo_matrix,
    int                         timing_iterations,
    CommandLineArgs&            args)
{
    // Adaptive timing iterations: run 16 billion nonzeros through
    if (timing_iterations == -1)
        timing_iterations = std::min(50000ull, std::max(100ull, ((16ull << 30) / coo_matrix.num_nonzeros)));



( run in 0.609 second using v1.01-cache-2.11-cpan-d7f47b0818f )