Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
*/
template <
typename ValueT,
typename OffsetT>
float TestGpuCsrIoProxy(
SpmvParams<ValueT, OffsetT>& params,
int timing_iterations)
{
enum {
BLOCK_THREADS = 128,
ITEMS_PER_THREAD = 7,
TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD,
};
// size_t smem = 1024 * 16;
size_t smem = 1024 * 0;
unsigned int nonzero_blocks = (params.num_nonzeros + TILE_SIZE - 1) / TILE_SIZE;
unsigned int row_blocks = (params.num_rows + TILE_SIZE - 1) / TILE_SIZE;
unsigned int blocks = std::max(nonzero_blocks, row_blocks);
typedef TexRefInputIterator<ValueT, 1234, int> TexItr;
TexItr x_itr;
CubDebugExit(x_itr.BindTexture(params.d_vector_x));
// Get device ordinal
int device_ordinal;
CubDebugExit(cudaGetDevice(&device_ordinal));
// Get device SM version
int sm_version;
CubDebugExit(SmVersion(sm_version, device_ordinal));
void (*kernel)(SpmvParams<ValueT, OffsetT>, TexItr) = NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD>;
int spmv_sm_occupancy;
CubDebugExit(MaxSmOccupancy(spmv_sm_occupancy, kernel, BLOCK_THREADS, smem));
if (!g_quiet)
printf("NonZeroIoKernel<%d,%d><<<%d, %d>>>, sm occupancy %d\n", BLOCK_THREADS, ITEMS_PER_THREAD, blocks, BLOCK_THREADS, spmv_sm_occupancy);
// Warmup
NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
// Check for failures
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(SyncStream(0));
// Timing
GpuTimer timer;
float elapsed_millis = 0.0;
timer.Start();
for (int it = 0; it < timing_iterations; ++it)
{
NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
CubDebugExit(x_itr.UnbindTexture());
return elapsed_millis / timing_iterations;
}
//---------------------------------------------------------------------
// cuSparse HybMV
//---------------------------------------------------------------------
/**
* Run cuSparse HYB SpMV (specialized for fp32)
*/
template <
typename OffsetT>
float TestCusparseHybmv(
float* vector_y_in,
float* reference_vector_y_out,
SpmvParams<float, OffsetT>& params,
int timing_iterations,
cusparseHandle_t cusparse)
{
CpuTimer cpu_timer;
cpu_timer.Start();
// Construct Hyb matrix
cusparseMatDescr_t mat_desc;
cusparseHybMat_t hyb_desc;
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
cusparseStatus_t status = cusparseScsr2hyb(
cusparse,
params.num_rows, params.num_cols,
mat_desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
hyb_desc,
0,
CUSPARSE_HYB_PARTITION_AUTO);
AssertEquals(CUSPARSE_STATUS_SUCCESS, status);
cudaDeviceSynchronize();
cpu_timer.Stop();
float elapsed_millis = cpu_timer.ElapsedMillis();
printf("HYB setup ms, %.5f, ", elapsed_millis);
// Reset input/output vector y
CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
// Warmup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
cusparse,
CUSPARSE_OPERATION_NON_TRANSPOSE,
¶ms.alpha, mat_desc,
hyb_desc,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
if (!g_quiet)
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
( run in 1.259 second using v1.01-cache-2.11-cpan-2398b32b56e )