Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
params.d_vector_y[row_idx] = nonzero;
}
}
}
}
/**
* Run GPU I/O proxy
*/
template <
typename ValueT,
typename OffsetT>
float TestGpuCsrIoProxy(
SpmvParams<ValueT, OffsetT>& params,
int timing_iterations)
{
enum {
BLOCK_THREADS = 128,
ITEMS_PER_THREAD = 7,
TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD,
};
// size_t smem = 1024 * 16;
size_t smem = 1024 * 0;
unsigned int nonzero_blocks = (params.num_nonzeros + TILE_SIZE - 1) / TILE_SIZE;
unsigned int row_blocks = (params.num_rows + TILE_SIZE - 1) / TILE_SIZE;
unsigned int blocks = std::max(nonzero_blocks, row_blocks);
typedef TexRefInputIterator<ValueT, 1234, int> TexItr;
TexItr x_itr;
CubDebugExit(x_itr.BindTexture(params.d_vector_x));
// Get device ordinal
int device_ordinal;
CubDebugExit(cudaGetDevice(&device_ordinal));
// Get device SM version
int sm_version;
CubDebugExit(SmVersion(sm_version, device_ordinal));
void (*kernel)(SpmvParams<ValueT, OffsetT>, TexItr) = NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD>;
int spmv_sm_occupancy;
CubDebugExit(MaxSmOccupancy(spmv_sm_occupancy, kernel, BLOCK_THREADS, smem));
if (!g_quiet)
printf("NonZeroIoKernel<%d,%d><<<%d, %d>>>, sm occupancy %d\n", BLOCK_THREADS, ITEMS_PER_THREAD, blocks, BLOCK_THREADS, spmv_sm_occupancy);
// Warmup
NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
// Check for failures
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(SyncStream(0));
// Timing
GpuTimer timer;
float elapsed_millis = 0.0;
timer.Start();
for (int it = 0; it < timing_iterations; ++it)
{
NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
CubDebugExit(x_itr.UnbindTexture());
return elapsed_millis / timing_iterations;
}
//---------------------------------------------------------------------
// cuSparse HybMV
//---------------------------------------------------------------------
/**
* Run cuSparse HYB SpMV (specialized for fp32)
*/
template <
typename OffsetT>
float TestCusparseHybmv(
float* vector_y_in,
float* reference_vector_y_out,
SpmvParams<float, OffsetT>& params,
int timing_iterations,
cusparseHandle_t cusparse)
{
CpuTimer cpu_timer;
cpu_timer.Start();
// Construct Hyb matrix
cusparseMatDescr_t mat_desc;
cusparseHybMat_t hyb_desc;
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
cusparseStatus_t status = cusparseScsr2hyb(
cusparse,
params.num_rows, params.num_cols,
mat_desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
hyb_desc,
0,
CUSPARSE_HYB_PARTITION_AUTO);
AssertEquals(CUSPARSE_STATUS_SUCCESS, status);
cudaDeviceSynchronize();
cpu_timer.Stop();
float elapsed_millis = cpu_timer.ElapsedMillis();
printf("HYB setup ms, %.5f, ", elapsed_millis);
// Reset input/output vector y
CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
// Warmup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
cusparse,
CUSPARSE_OPERATION_NON_TRANSPOSE,
¶ms.alpha, mat_desc,
hyb_desc,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
if (!g_quiet)
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
elapsed_millis = 0.0;
GpuTimer timer;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
cusparse,
CUSPARSE_OPERATION_NON_TRANSPOSE,
¶ms.alpha, mat_desc,
hyb_desc,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
// Cleanup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));
return elapsed_millis / timing_iterations;
}
/**
* Run cuSparse HYB SpMV (specialized for fp64)
*/
template <
typename OffsetT>
float TestCusparseHybmv(
double* vector_y_in,
double* reference_vector_y_out,
SpmvParams<double, OffsetT>& params,
int timing_iterations,
cusparseHandle_t cusparse)
{
CpuTimer cpu_timer;
cpu_timer.Start();
// Construct Hyb matrix
cusparseMatDescr_t mat_desc;
cusparseHybMat_t hyb_desc;
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsr2hyb(
cusparse,
params.num_rows, params.num_cols,
mat_desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
hyb_desc,
0,
CUSPARSE_HYB_PARTITION_AUTO));
cudaDeviceSynchronize();
cpu_timer.Stop();
float elapsed_millis = cpu_timer.ElapsedMillis();
printf("HYB setup ms, %.5f, ", elapsed_millis);
// Reset input/output vector y
CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
// Warmup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
cusparse,
CUSPARSE_OPERATION_NON_TRANSPOSE,
¶ms.alpha, mat_desc,
hyb_desc,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
if (!g_quiet)
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
elapsed_millis = 0.0;
GpuTimer timer;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
cusparse,
CUSPARSE_OPERATION_NON_TRANSPOSE,
¶ms.alpha, mat_desc,
hyb_desc,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
// Cleanup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));
return elapsed_millis / timing_iterations;
}
//---------------------------------------------------------------------
// cuSparse CsrMV
//---------------------------------------------------------------------
/**
* Run cuSparse SpMV (specialized for fp32)
*/
template <
typename OffsetT>
float TestCusparseCsrmv(
float* vector_y_in,
float* reference_vector_y_out,
SpmvParams<float, OffsetT>& params,
int timing_iterations,
cusparseHandle_t cusparse)
{
cusparseMatDescr_t desc;
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));
// Reset input/output vector y
CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
// Warmup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
params.num_rows, params.num_cols, params.num_nonzeros, ¶ms.alpha, desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
if (!g_quiet)
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
float elapsed_millis = 0.0;
GpuTimer timer;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
params.num_rows, params.num_cols, params.num_nonzeros, ¶ms.alpha, desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
return elapsed_millis / timing_iterations;
}
/**
* Run cuSparse SpMV (specialized for fp64)
*/
template <
typename OffsetT>
float TestCusparseCsrmv(
double* vector_y_in,
double* reference_vector_y_out,
SpmvParams<double, OffsetT>& params,
int timing_iterations,
cusparseHandle_t cusparse)
{
cusparseMatDescr_t desc;
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));
// Reset input/output vector y
CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
// Warmup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
params.num_rows, params.num_cols, params.num_nonzeros, ¶ms.alpha, desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
if (!g_quiet)
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
float elapsed_millis = 0.0;
GpuTimer timer;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
params.num_rows, params.num_cols, params.num_nonzeros, ¶ms.alpha, desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
return elapsed_millis / timing_iterations;
}
//---------------------------------------------------------------------
// GPU Merge-based SpMV
//---------------------------------------------------------------------
/**
* Run CUB SpMV
*/
template <
typename ValueT,
typename OffsetT>
float TestGpuMergeCsrmv(
ValueT* vector_y_in,
ValueT* reference_vector_y_out,
SpmvParams<ValueT, OffsetT>& params,
int timing_iterations)
{
// Allocate temporary storage
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
// Get amount of temporary storage needed
CubDebugExit(DeviceSpmv::CsrMV(
d_temp_storage, temp_storage_bytes,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, params.d_vector_y,
params.num_rows, params.num_cols, params.num_nonzeros,
// params.alpha, params.beta,
(cudaStream_t) 0, false));
// Allocate
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Reset input/output vector y
CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(ValueT) * params.num_rows, cudaMemcpyHostToDevice));
// Warmup
CubDebugExit(DeviceSpmv::CsrMV(
d_temp_storage, temp_storage_bytes,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, params.d_vector_y,
params.num_rows, params.num_cols, params.num_nonzeros,
// params.alpha, params.beta,
(cudaStream_t) 0, !g_quiet));
if (!g_quiet)
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
GpuTimer timer;
float elapsed_millis = 0.0;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
CubDebugExit(DeviceSpmv::CsrMV(
d_temp_storage, temp_storage_bytes,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, params.d_vector_y,
params.num_rows, params.num_cols, params.num_nonzeros,
// params.alpha, params.beta,
(cudaStream_t) 0, false));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
return elapsed_millis / timing_iterations;
}
//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------
/**
* Display perf
*/
template <typename ValueT, typename OffsetT>
void DisplayPerf(
float device_giga_bandwidth,
double avg_millis,
CsrMatrix<ValueT, OffsetT>& csr_matrix)
{
double nz_throughput, effective_bandwidth;
size_t total_bytes = (csr_matrix.num_nonzeros * (sizeof(ValueT) * 2 + sizeof(OffsetT))) +
(csr_matrix.num_rows) * (sizeof(OffsetT) + sizeof(ValueT));
nz_throughput = double(csr_matrix.num_nonzeros) / avg_millis / 1.0e6;
effective_bandwidth = double(total_bytes) / avg_millis / 1.0e6;
if (!g_quiet)
printf("fp%d: %.4f avg ms, %.5f gflops, %.3lf effective GB/s (%.2f%% peak)\n",
sizeof(ValueT) * 8,
avg_millis,
2 * nz_throughput,
effective_bandwidth,
effective_bandwidth / device_giga_bandwidth * 100);
else
printf("%.5f, %.6f, %.3lf, %.2f%%, ",
avg_millis,
2 * nz_throughput,
effective_bandwidth,
effective_bandwidth / device_giga_bandwidth * 100);
fflush(stdout);
}
/**
* Run tests
*/
template <
typename ValueT,
typename OffsetT>
void RunTest(
bool rcm_relabel,
ValueT alpha,
ValueT beta,
CooMatrix<ValueT, OffsetT>& coo_matrix,
int timing_iterations,
CommandLineArgs& args)
{
// Adaptive timing iterations: run 16 billion nonzeros through
if (timing_iterations == -1)
timing_iterations = std::min(50000ull, std::max(100ull, ((16ull << 30) / coo_matrix.num_nonzeros)));
( run in 0.609 second using v1.01-cache-2.11-cpan-d7f47b0818f )