Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN
/******************************************************************************
* Simple demonstration of cub::BlockRadixSort
*
* To compile using the command line:
* nvcc -arch=sm_XX example_block_radix_sort.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console (define before including cub.h)
#define CUB_STDERR
#include <stdio.h>
#include <iostream>
#include <algorithm>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/block/block_radix_sort.cuh>
#include "../../test/test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
/// Verbose output (set by the --v command-line flag; also enables per-item result printing)
bool g_verbose = false;
/// Timing iterations averaged over by Test() (overridden by --i=<n>)
int g_timing_iterations = 100;
/// Default grid size, i.e. number of independent tiles sorted per launch (overridden by --grid-size=<n>)
int g_grid_size = 1;
/// When set (--uniform), all keys are zero instead of random bits; static storage so it zero-initializes to false
bool g_uniform_keys;
//---------------------------------------------------------------------
// Kernels
//---------------------------------------------------------------------
/**
* Simple kernel for performing a block-wide sorting over integers
*/
/**
 * Kernel that sorts one tile of keys per thread block using cub::BlockRadixSort.
 * Expects a 1-D launch of BLOCK_THREADS threads per block; each block consumes
 * the BLOCK_THREADS * ITEMS_PER_THREAD consecutive keys at its tile offset and
 * writes the sorted tile (striped arrangement) to d_out, recording the elapsed
 * in-kernel clock count for the sort in d_elapsed[blockIdx.x].
 */
template <
    typename    Key,
    int         BLOCK_THREADS,
    int         ITEMS_PER_THREAD>
__launch_bounds__ (BLOCK_THREADS)
__global__ void BlockSortKernel(
    Key         *d_in,          // Tile of input
    Key         *d_out,         // Tile of output
    clock_t     *d_elapsed)     // Elapsed cycle count of block scan
{
    enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };

    // Parameterized CUB collectives: a warp-transposing loader (coalesced global
    // reads, then shared-memory transpose to a blocked arrangement) and the sorter
    typedef BlockLoad<Key, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE>  BlockLoadT;
    typedef BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD>                        BlockRadixSortT;

    // Union the temp storage so the load and sort phases reuse one shared allocation
    __shared__ union
    {
        typename BlockLoadT::TempStorage        load;
        typename BlockRadixSortT::TempStorage   sort;
    } temp_storage;

    // This thread's segment of the tile (blocked arrangement after the load)
    Key thread_keys[ITEMS_PER_THREAD];

    const int tile_offset = blockIdx.x * TILE_SIZE;

    // Cooperatively load the block's tile
    BlockLoadT(temp_storage.load).Load(d_in + tile_offset, thread_keys);

    // Barrier before repurposing the shared-memory union for sorting
    __syncthreads();

    // Time only the sort itself
    clock_t start = clock();
    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(thread_keys);
    clock_t stop = clock();

    // Striped store matches the striped output arrangement (coalesced writes)
    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + tile_offset, thread_keys);

    // One lane publishes the block's elapsed clocks (order-insensitive in case
    // the counter wrapped between the two samples)
    if (threadIdx.x == 0)
    {
        d_elapsed[blockIdx.x] = (start > stop) ? start - stop : stop - start;
    }
}
//---------------------------------------------------------------------
// Host utilities
//---------------------------------------------------------------------
/**
* Initialize sorting problem (and solution).
*/
/**
 * Generate a sorting problem and its expected solution.
 *
 * Fills h_in with num_items keys — all zero when g_uniform_keys is set,
 * random bits otherwise — mirrors them into h_reference, and sorts only the
 * first tile_size entries of h_reference (the test verifies just one tile).
 */
template <typename Key>
void Initialize(
    Key *h_in,
    Key *h_reference,
    int num_items,
    int tile_size)
{
    for (int i = 0; i < num_items; ++i)
    {
        if (g_uniform_keys)
            h_in[i] = 0;
        else
            RandomBits(h_in[i]);

        h_reference[i] = h_in[i];
    }

    // Only the first tile is compared against device output, so only that
    // prefix needs sorting
    std::sort(h_reference, h_reference + tile_size);
}
/**
* Test BlockScan
*/
/**
 * Test harness for BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD>:
 * runs the kernel once to verify the first tile against a host reference,
 * then g_timing_iterations more times to report average kernel millis and
 * average in-kernel clock counts per sort.
 */
template <
typename Key,
int BLOCK_THREADS,
int ITEMS_PER_THREAD>
void Test()
{
const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
// NOTE(review): the next line is residue from the web page this file was
// scraped from. The upstream example's allocations of h_in / h_reference /
// h_elapsed and d_in / d_out / d_elapsed, plus the Initialize(...) call,
// were lost at this point — restore them from the original CUB repository
// before attempting to compile.
xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN
// Theoretical SM occupancy for this kernel instantiation (reported below)
int max_sm_occupancy;
CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD>, BLOCK_THREADS));
// Copy problem to device
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(Key) * TILE_SIZE * g_grid_size, cudaMemcpyHostToDevice));
printf("BlockRadixSort %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
TILE_SIZE * g_grid_size, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
fflush(stdout);
// Run kernel once to prime caches and check result
BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
// Check for kernel errors and STDIO from the kernel, if any
// (cudaPeekAtLastError reads without clearing; the sync surfaces async faults)
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
// Check results (only the first tile is compared; 0 means match)
printf("\tOutput items: ");
int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
fflush(stdout);
// Run this several times and average the performance results
GpuTimer timer;
float elapsed_millis = 0.0;
unsigned long long elapsed_clocks = 0;
for (int i = 0; i < g_timing_iterations; ++i)
{
timer.Start();
// Run kernel
BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
// Copy clocks from device (blocking copy also synchronizes with the kernel)
CubDebugExit(cudaMemcpy(h_elapsed, d_elapsed, sizeof(clock_t) * g_grid_size, cudaMemcpyDeviceToHost));
// NOTE(review): this inner `i` shadows the timing-loop `i`; harmless here
// (the inner scope owns its own counter) but worth renaming upstream
for (int i = 0; i < g_grid_size; i++)
elapsed_clocks += h_elapsed[i];
}
// Check for kernel errors and STDIO from the kernel, if any
CubDebugExit(cudaDeviceSynchronize());
// Display timing results
float avg_millis = elapsed_millis / g_timing_iterations;
// items/ms == thousands of items/s, so dividing by 1000 yields millions/sec
float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
double avg_clocks = double(elapsed_clocks) / g_timing_iterations / g_grid_size;
double avg_clocks_per_item = avg_clocks / TILE_SIZE;
printf("\tAverage BlockRadixSort::SortBlocked clocks: %.3f\n", avg_clocks);
printf("\tAverage BlockRadixSort::SortBlocked clocks per item: %.3f\n", avg_clocks_per_item);
printf("\tAverage kernel millis: %.4f\n", avg_millis);
printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
fflush(stdout);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (h_elapsed) delete[] h_elapsed;
if (d_in) CubDebugExit(cudaFree(d_in));
if (d_out) CubDebugExit(cudaFree(d_out));
if (d_elapsed) CubDebugExit(cudaFree(d_elapsed));
}
/**
* Main
*/
/**
 * Main: parse the command line, initialize the device, and run the block
 * radix sort test for three key types (uint32, fp32, uint8).
 *
 * Recognized arguments:
 *   --device=<id>       select CUDA device (handled by args.DeviceInit())
 *   --i=<n>             timing iterations (default: g_timing_iterations)
 *   --grid-size=<n>     number of thread blocks (default: g_grid_size)
 *   --uniform           use all-zero keys instead of random bits
 *   --v                 verbose output
 *   --help              print usage and exit
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    g_uniform_keys = args.CheckCmdLineFlag("uniform");
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("grid-size", g_grid_size);

    // Print usage
    // (fixed: the option list was missing separator spaces between the
    // --i and --grid-size entries, and the parsed --uniform flag was
    // undocumented)
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>] "
            "[--i=<timing iterations (default:%d)>] "
            "[--grid-size=<grid size (default:%d)>] "
            "[--uniform] "
            "[--v] "
            "\n", argv[0], g_timing_iterations, g_grid_size);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());
    fflush(stdout);

    // Run tests: 128 threads/block, 13 items/thread (1664-key tiles)
    printf("\nuint32:\n"); fflush(stdout);
    Test<unsigned int, 128, 13>();
    printf("\n"); fflush(stdout);

    printf("\nfp32:\n"); fflush(stdout);
    Test<float, 128, 13>();
    printf("\n"); fflush(stdout);

    printf("\nuint8:\n"); fflush(stdout);
    Test<unsigned char, 128, 13>();
    printf("\n"); fflush(stdout);

    return 0;
}
( run in 0.666 second using v1.01-cache-2.11-cpan-39bf76dae61 )