Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN
#include <iterator>
#include <vector>
#include <string>
#include <algorithm>
#include <stdio.h>
#include <cub/cub.cuh>
#include "coo_graph.cuh"
#include "../test/test_util.h"
using namespace cub;
using namespace std;
/******************************************************************************
* Globals, constants, and typedefs
******************************************************************************/
typedef int VertexId; // uint32s as vertex ids
typedef double Value; // double-precision floating point values
bool g_verbose = false;
int g_timing_iterations = 1;
CachingDeviceAllocator g_allocator;
/******************************************************************************
* Texture referencing
******************************************************************************/
/**
* Templated texture reference type for multiplicand vector
*/
template <typename Value>
struct TexVector
{
// Texture type to actually use (e.g., because CUDA doesn't load doubles as texture items)
typedef typename If<(Equals<Value, double>::VALUE), uint2, Value>::Type CastType;
// Texture reference type
typedef texture<CastType, cudaTextureType1D, cudaReadModeElementType> TexRef;
static TexRef ref;
/**
* Bind textures
*/
static void BindTexture(void *d_in, int elements)
{
cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<CastType>();
if (d_in)
{
size_t offset;
size_t bytes = sizeof(CastType) * elements;
CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes));
}
}
/**
* Unbind textures
*/
static void UnbindTexture()
{
CubDebugExit(cudaUnbindTexture(ref));
}
/**
* Load
*/
static __device__ __forceinline__ Value Load(int offset)
{
Value output;
reinterpret_cast<typename TexVector<Value>::CastType &>(output) = tex1Dfetch(TexVector<Value>::ref, offset);
return output;
}
};
// Texture reference definitions
template <typename Value>
typename TexVector<Value>::TexRef TexVector<Value>::ref = 0;
/******************************************************************************
* Utility types
******************************************************************************/
/**
* A partial dot-product sum paired with a corresponding row-id
*/
template <typename VertexId, typename Value>
struct PartialProduct
{
VertexId row; /// Row-id
Value partial; /// PartialProduct sum
};
/**
* A partial dot-product sum paired with a corresponding row-id (specialized for double-int pairings)
*/
template <>
struct PartialProduct<int, double>
{
long long row; /// Row-id
double partial; /// PartialProduct sum
};
/**
* Reduce-value-by-row scan operator
*/
struct ReduceByKeyOp
{
template <typename PartialProduct>
__device__ __forceinline__ PartialProduct operator()(
const PartialProduct &first,
const PartialProduct &second)
{
PartialProduct retval;
retval.partial = (second.row != first.row) ?
second.partial :
first.partial + second.partial;
xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN
CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
// Run kernel (always run one iteration without timing)
GpuTimer gpu_timer;
float elapsed_millis = 0.0;
for (int i = 0; i <= g_timing_iterations; i++)
{
gpu_timer.Start();
// Initialize output
CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));
// Run the COO kernel
CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
even_share,
d_block_partials,
d_rows,
d_columns,
d_values,
d_vector,
d_result);
if (coo_grid_size > 1)
{
// Run the COO finalize kernel
CooFinalizeKernel<FINALIZE_BLOCK_THREADS, FINALIZE_ITEMS_PER_THREAD><<<1, FINALIZE_BLOCK_THREADS>>>(
d_block_partials,
num_partials,
d_result);
}
gpu_timer.Stop();
if (i > 0)
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Force any kernel stdio to screen
CubDebugExit(cudaThreadSynchronize());
fflush(stdout);
// Display timing
if (g_timing_iterations > 0)
{
float avg_elapsed = elapsed_millis / g_timing_iterations;
int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim);
printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n",
g_timing_iterations,
avg_elapsed,
total_bytes / avg_elapsed / 1000.0 / 1000.0,
num_edges * 2 / avg_elapsed / 1000.0 / 1000.0);
}
// Check results
int compare = CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, true, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
TexVector<Value>::UnbindTexture();
CubDebugExit(g_allocator.DeviceFree(d_block_partials));
CubDebugExit(g_allocator.DeviceFree(d_rows));
CubDebugExit(g_allocator.DeviceFree(d_columns));
CubDebugExit(g_allocator.DeviceFree(d_values));
CubDebugExit(g_allocator.DeviceFree(d_vector));
CubDebugExit(g_allocator.DeviceFree(d_result));
delete[] h_rows;
delete[] h_columns;
delete[] h_values;
}
/**
* Compute reference answer on CPU
*/
template <typename VertexId, typename Value>
void ComputeReference(
CooGraph<VertexId, Value>& coo_graph,
Value* h_vector,
Value* h_reference)
{
for (VertexId i = 0; i < coo_graph.row_dim; i++)
{
h_reference[i] = 0.0;
}
for (VertexId i = 0; i < coo_graph.coo_tuples.size(); i++)
{
h_reference[coo_graph.coo_tuples[i].row] +=
coo_graph.coo_tuples[i].val *
h_vector[coo_graph.coo_tuples[i].col];
}
}
/**
* Assign arbitrary values to vector items
*/
template <typename Value>
void AssignVectorValues(Value *vector, int col_dim)
{
for (int i = 0; i < col_dim; i++)
{
vector[i] = 1.0;
}
}
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("i", g_timing_iterations);
// Print usage
if (args.CheckCmdLineFlag("help"))
( run in 0.676 second using v1.01-cache-2.11-cpan-2398b32b56e )