Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN
PartialProduct *d_block_partials; // Temporary storage for communicating dot product partials between threadblocks
// Create SOA version of coo_graph on host
int num_edges = coo_graph.coo_tuples.size();
VertexId *h_rows = new VertexId[num_edges];
VertexId *h_columns = new VertexId[num_edges];
Value *h_values = new Value[num_edges];
for (int i = 0; i < num_edges; i++)
{
h_rows[i] = coo_graph.coo_tuples[i].row;
h_columns[i] = coo_graph.coo_tuples[i].col;
h_values[i] = coo_graph.coo_tuples[i].val;
}
// Get CUDA properties
Device device_props;
CubDebugExit(device_props.Init());
// Determine launch configuration from kernel properties
int coo_sm_occupancy;
CubDebugExit(device_props.MaxSmOccupancy(
coo_sm_occupancy,
CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, VertexId, Value>,
COO_BLOCK_THREADS));
int max_coo_grid_size = device_props.sm_count * coo_sm_occupancy * COO_SUBSCRIPTION_FACTOR;
// Construct an even-share work distribution
GridEvenShare<int> even_share(num_edges, max_coo_grid_size, COO_TILE_SIZE);
int coo_grid_size = even_share.grid_size;
int num_partials = coo_grid_size * 2;
// Allocate COO device arrays
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_rows, sizeof(VertexId) * num_edges));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_columns, sizeof(VertexId) * num_edges));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * num_edges));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_vector, sizeof(Value) * coo_graph.col_dim));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_result, sizeof(Value) * coo_graph.row_dim));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_block_partials, sizeof(PartialProduct) * num_partials));
// Copy host arrays to device
CubDebugExit(cudaMemcpy(d_rows, h_rows, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_columns, h_columns, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_edges, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_vector, h_vector, sizeof(Value) * coo_graph.col_dim, cudaMemcpyHostToDevice));
// Bind textures
TexVector<Value>::BindTexture(d_vector, coo_graph.col_dim);
// Print debug info
printf("CooKernel<%d, %d><<<%d, %d>>>(...), Max SM occupancy: %d\n",
COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_sm_occupancy);
if (coo_grid_size > 1)
{
printf("CooFinalizeKernel<<<1, %d>>>(...)\n", FINALIZE_BLOCK_THREADS);
}
fflush(stdout);
CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
// Run kernel (always run one iteration without timing)
GpuTimer gpu_timer;
float elapsed_millis = 0.0;
for (int i = 0; i <= g_timing_iterations; i++)
{
gpu_timer.Start();
// Initialize output
CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));
// Run the COO kernel
CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
even_share,
d_block_partials,
d_rows,
d_columns,
d_values,
d_vector,
d_result);
if (coo_grid_size > 1)
{
// Run the COO finalize kernel
CooFinalizeKernel<FINALIZE_BLOCK_THREADS, FINALIZE_ITEMS_PER_THREAD><<<1, FINALIZE_BLOCK_THREADS>>>(
d_block_partials,
num_partials,
d_result);
}
gpu_timer.Stop();
if (i > 0)
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Force any kernel stdio to screen
CubDebugExit(cudaThreadSynchronize());
fflush(stdout);
// Display timing
if (g_timing_iterations > 0)
{
float avg_elapsed = elapsed_millis / g_timing_iterations;
int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim);
printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n",
g_timing_iterations,
avg_elapsed,
total_bytes / avg_elapsed / 1000.0 / 1000.0,
num_edges * 2 / avg_elapsed / 1000.0 / 1000.0);
}
// Check results
int compare = CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, true, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
TexVector<Value>::UnbindTexture();
CubDebugExit(g_allocator.DeviceFree(d_block_partials));
CubDebugExit(g_allocator.DeviceFree(d_rows));
CubDebugExit(g_allocator.DeviceFree(d_columns));
CubDebugExit(g_allocator.DeviceFree(d_values));
CubDebugExit(g_allocator.DeviceFree(d_vector));
CubDebugExit(g_allocator.DeviceFree(d_result));
delete[] h_rows;
delete[] h_columns;
delete[] h_values;
}
/**
* Compute reference answer on CPU
*/
template <typename VertexId, typename Value>
void ComputeReference(
CooGraph<VertexId, Value>& coo_graph,
Value* h_vector,
Value* h_reference)
{
for (VertexId i = 0; i < coo_graph.row_dim; i++)
{
h_reference[i] = 0.0;
}
for (VertexId i = 0; i < coo_graph.coo_tuples.size(); i++)
{
h_reference[coo_graph.coo_tuples[i].row] +=
coo_graph.coo_tuples[i].val *
h_vector[coo_graph.coo_tuples[i].col];
}
}
/**
* Assign arbitrary values to vector items
*/
template <typename Value>
void AssignVectorValues(Value *vector, int col_dim)
{
for (int i = 0; i < col_dim; i++)
{
vector[i] = 1.0;
}
}
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("i", g_timing_iterations);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s\n [--device=<device-id>] [--v] [--iterations=<test iterations>] [--grid-size=<grid-size>]\n"
"\t--type=wheel --spokes=<spokes>\n"
"\t--type=grid2d --width=<width> [--no-self-loops]\n"
"\t--type=grid3d --width=<width> [--no-self-loops]\n"
"\t--type=market --file=<file>\n"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// Get graph type
string type;
args.GetCmdLineArgument("type", type);
// Generate graph structure
CpuTimer timer;
timer.Start();
CooGraph<VertexId, Value> coo_graph;
if (type == string("grid2d"))
{
VertexId width;
args.GetCmdLineArgument("width", width);
bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
printf("Generating %s grid2d width(%d)... ", (self_loops) ? "5-pt" : "4-pt", width); fflush(stdout);
if (coo_graph.InitGrid2d(width, self_loops)) exit(1);
} else if (type == string("grid3d"))
{
VertexId width;
args.GetCmdLineArgument("width", width);
bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
printf("Generating %s grid3d width(%d)... ", (self_loops) ? "7-pt" : "6-pt", width); fflush(stdout);
if (coo_graph.InitGrid3d(width, self_loops)) exit(1);
}
else if (type == string("wheel"))
{
VertexId spokes;
args.GetCmdLineArgument("spokes", spokes);
printf("Generating wheel spokes(%d)... ", spokes); fflush(stdout);
if (coo_graph.InitWheel(spokes)) exit(1);
}
else if (type == string("market"))
{
string filename;
args.GetCmdLineArgument("file", filename);
printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout);
if (coo_graph.InitMarket(filename)) exit(1);
}
else
{
printf("Unsupported graph type\n");
exit(1);
}
timer.Stop();
printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n",
timer.ElapsedMillis() / 1000.0,
coo_graph.coo_tuples.size(),
coo_graph.row_dim,
coo_graph.col_dim);
fflush(stdout);
if (g_verbose)
{
cout << coo_graph << "\n";
}
// Create vector
Value *h_vector = new Value[coo_graph.col_dim];
AssignVectorValues(h_vector, coo_graph.col_dim);
if (g_verbose)
{
printf("Vector[%d]: ", coo_graph.col_dim);
DisplayResults(h_vector, coo_graph.col_dim);
printf("\n\n");
}
// Compute reference answer
Value *h_reference = new Value[coo_graph.row_dim];
ComputeReference(coo_graph, h_vector, h_reference);
if (g_verbose)
{
printf("Results[%d]: ", coo_graph.row_dim);
DisplayResults(h_reference, coo_graph.row_dim);
printf("\n\n");
}
// Parameterization for SM35
enum
{
COO_BLOCK_THREADS = 64,
COO_ITEMS_PER_THREAD = 10,
COO_SUBSCRIPTION_FACTOR = 4,
FINALIZE_BLOCK_THREADS = 256,
FINALIZE_ITEMS_PER_THREAD = 4,
};
// Run GPU version
TestDevice<
COO_BLOCK_THREADS,
COO_ITEMS_PER_THREAD,
COO_SUBSCRIPTION_FACTOR,
FINALIZE_BLOCK_THREADS,
FINALIZE_ITEMS_PER_THREAD>(coo_graph, h_vector, h_reference);
// Cleanup
delete[] h_vector;
delete[] h_reference;
return 0;
}
( run in 0.752 second using v1.01-cache-2.11-cpan-d7f47b0818f )