view release on metacpan or search on metacpan
xgboost/dmlc-core/include/dmlc/memory.h
xgboost/dmlc-core/include/dmlc/memory_io.h
xgboost/dmlc-core/include/dmlc/omp.h
xgboost/dmlc-core/include/dmlc/optional.h
xgboost/dmlc-core/include/dmlc/parameter.h
xgboost/dmlc-core/include/dmlc/recordio.h
xgboost/dmlc-core/include/dmlc/registry.h
xgboost/dmlc-core/include/dmlc/serializer.h
xgboost/dmlc-core/include/dmlc/thread_local.h
xgboost/dmlc-core/include/dmlc/threadediter.h
xgboost/dmlc-core/include/dmlc/timer.h
xgboost/dmlc-core/include/dmlc/type_traits.h
xgboost/dmlc-core/make/config.mk
xgboost/dmlc-core/make/dmlc.mk
xgboost/dmlc-core/scripts/lint.py
xgboost/dmlc-core/scripts/packages.mk
xgboost/dmlc-core/scripts/setup_nvcc.sh
xgboost/dmlc-core/scripts/travis/travis_before_cache.sh
xgboost/dmlc-core/scripts/travis/travis_osx_install.sh
xgboost/dmlc-core/scripts/travis/travis_script.sh
xgboost/dmlc-core/scripts/travis/travis_setup_env.sh
xgboost/rabit/include/dmlc/README.md
xgboost/rabit/include/dmlc/base.h
xgboost/rabit/include/dmlc/io.h
xgboost/rabit/include/dmlc/logging.h
xgboost/rabit/include/dmlc/serializer.h
xgboost/rabit/include/dmlc/type_traits.h
xgboost/rabit/include/rabit/c_api.h
xgboost/rabit/include/rabit/internal/engine.h
xgboost/rabit/include/rabit/internal/io.h
xgboost/rabit/include/rabit/internal/rabit-inl.h
xgboost/rabit/include/rabit/internal/timer.h
xgboost/rabit/include/rabit/internal/utils.h
xgboost/rabit/include/rabit/rabit.h
xgboost/rabit/include/rabit/serializable.h
xgboost/rabit/lib/README.md
xgboost/rabit/python/rabit.py
xgboost/rabit/scripts/travis_runtest.sh
xgboost/rabit/scripts/travis_script.sh
xgboost/rabit/src/README.md
xgboost/rabit/src/allreduce_base.cc
xgboost/rabit/src/allreduce_base.h
xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN
// Our current block's offset
int block_offset = blockIdx.x * TILE_SIZE;
// Load items into a blocked arrangement
BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
// Barrier for smem reuse
__syncthreads();
// Start cycle timer
clock_t start = clock();
// Sort keys
BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items);
// Stop cycle timer
clock_t stop = clock();
// Store output in striped fashion
StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
// Store elapsed clocks
if (threadIdx.x == 0)
{
d_elapsed[blockIdx.x] = (start > stop) ? start - stop : stop - start;
}
xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN
CubDebugExit(cudaDeviceSynchronize());
// Check results
printf("\tOutput items: ");
int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
fflush(stdout);
// Run this several times and average the performance results
GpuTimer timer;
float elapsed_millis = 0.0;
unsigned long long elapsed_clocks = 0;
for (int i = 0; i < g_timing_iterations; ++i)
{
timer.Start();
// Run kernel
BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
// Copy clocks from device
CubDebugExit(cudaMemcpy(h_elapsed, d_elapsed, sizeof(clock_t) * g_grid_size, cudaMemcpyDeviceToHost));
for (int i = 0; i < g_grid_size; i++)
elapsed_clocks += h_elapsed[i];
}
// Check for kernel errors and STDIO from the kernel, if any
CubDebugExit(cudaDeviceSynchronize());
xgboost/cub/examples/block/example_block_reduce.cu view on Meta::CPAN
// Specialize BlockReduce type for our thread block
typedef BlockReduce<int, BLOCK_THREADS, ALGORITHM> BlockReduceT;
// Shared memory
__shared__ typename BlockReduceT::TempStorage temp_storage;
// Per-thread tile data
int data[ITEMS_PER_THREAD];
LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, data);
// Start cycle timer
clock_t start = clock();
// Compute sum
int aggregate = BlockReduceT(temp_storage).Sum(data);
// Stop cycle timer
clock_t stop = clock();
// Store aggregate and elapsed clocks
if (threadIdx.x == 0)
{
*d_elapsed = (start > stop) ? start - stop : stop - start;
*d_out = aggregate;
}
}
xgboost/cub/examples/block/example_block_reduce.cu view on Meta::CPAN
d_out,
d_elapsed);
// Check total aggregate
printf("\tAggregate: ");
int compare = CompareDeviceResults(&h_aggregate, d_out, 1, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Run this several times and average the performance results
GpuTimer timer;
float elapsed_millis = 0.0;
clock_t elapsed_clocks = 0;
for (int i = 0; i < g_timing_iterations; ++i)
{
// Copy problem to device
cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
timer.Start();
// Run aggregate/prefix kernel
BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
// Copy clocks from device
clock_t clocks;
CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
elapsed_clocks += clocks;
}
// Check for kernel errors and STDIO from the kernel, if any
CubDebugExit(cudaPeekAtLastError());
xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN
// Per-thread tile data
int data[ITEMS_PER_THREAD];
// Load items into a blocked arrangement
BlockLoadT(temp_storage.load).Load(d_in, data);
// Barrier for smem reuse
__syncthreads();
// Start cycle timer
clock_t start = clock();
// Compute exclusive prefix sum
int aggregate;
BlockScanT(temp_storage.scan).ExclusiveSum(data, data, aggregate);
// Stop cycle timer
clock_t stop = clock();
// Barrier for smem reuse
__syncthreads();
// Store items from a blocked arrangement
BlockStoreT(temp_storage.store).Store(d_out, data);
// Store aggregate and elapsed clocks
if (threadIdx.x == 0)
xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Check total aggregate
printf("\tAggregate: ");
compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Run this several times and average the performance results
GpuTimer timer;
float elapsed_millis = 0.0;
clock_t elapsed_clocks = 0;
for (int i = 0; i < g_timing_iterations; ++i)
{
// Copy problem to device
cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
timer.Start();
// Run aggregate/prefix kernel
BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
// Copy clocks from device
clock_t clocks;
CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
elapsed_clocks += clocks;
}
// Check for kernel errors and STDIO from the kernel, if any
CubDebugExit(cudaPeekAtLastError());
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN
printf("Computing reference solution on CPU for %d items (max key %d)\n", num_items, max_key);
fflush(stdout);
Initialize(h_keys, h_values, num_items, max_key);
int num_runs = Solve(h_keys, h_values, num_items, h_offsets_reference, h_lengths_reference);
printf("%d non-trivial runs\n", num_runs);
fflush(stdout);
// Repeat for performance timing
GpuTimer gpu_timer;
GpuTimer gpu_rle_timer;
float elapsed_millis = 0.0;
float elapsed_rle_millis = 0.0;
for (int i = 0; i <= timing_iterations; ++i)
{
// Allocate and initialize device arrays for sorting
DoubleBuffer<Key> d_keys;
DoubleBuffer<Value> d_values;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(Key) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(Key) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(Value) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(Value) * num_items));
CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice));
// Start timer
gpu_timer.Start();
// Allocate temporary storage for sorting
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Do the sort
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
// Free unused buffers and sorting temporary storage
if (d_keys.d_buffers[d_keys.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector ^ 1]));
if (d_values.d_buffers[d_values.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector ^ 1]));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Start timer
gpu_rle_timer.Start();
// Allocate device arrays for enumerating non-trivial runs
int *d_offests_out = NULL;
int *d_lengths_out = NULL;
int *d_num_runs = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offests_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int) * 1));
// Allocate temporary storage for isolating non-trivial runs
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN
d_num_runs,
num_items));
// Free keys buffer
if (d_keys.d_buffers[d_keys.selector]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector]));
//
// Hypothetically do stuff with the original key-indices corresponding to non-trivial runs of identical keys
//
// Stop sort timer
gpu_timer.Stop();
gpu_rle_timer.Stop();
if (i == 0)
{
// First iteration is a warmup: // Check for correctness (and display results, if specified)
printf("\nRUN OFFSETS: \n");
int compare = CompareDeviceResults(h_offsets_reference, d_offests_out, num_runs, true, g_verbose);
printf("\t\t %s ", compare ? "FAIL" : "PASS");
printf("\nRUN LENGTHS: \n");
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN
printf("\t\t %s ", compare ? "FAIL" : "PASS");
printf("\nNUM RUNS: \n");
compare |= CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
printf("\t\t %s ", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
}
else
{
elapsed_millis += gpu_timer.ElapsedMillis();
elapsed_rle_millis += gpu_rle_timer.ElapsedMillis();
}
// GPU cleanup
if (d_values.d_buffers[d_values.selector]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector]));
if (d_offests_out) CubDebugExit(g_allocator.DeviceFree(d_offests_out));
if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
}
xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN
COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_sm_occupancy);
if (coo_grid_size > 1)
{
printf("CooFinalizeKernel<<<1, %d>>>(...)\n", FINALIZE_BLOCK_THREADS);
}
fflush(stdout);
CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
// Run kernel (always run one iteration without timing)
GpuTimer gpu_timer;
float elapsed_millis = 0.0;
for (int i = 0; i <= g_timing_iterations; i++)
{
gpu_timer.Start();
// Initialize output
CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));
// Run the COO kernel
CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
even_share,
d_block_partials,
d_rows,
d_columns,
xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN
if (coo_grid_size > 1)
{
// Run the COO finalize kernel
CooFinalizeKernel<FINALIZE_BLOCK_THREADS, FINALIZE_ITEMS_PER_THREAD><<<1, FINALIZE_BLOCK_THREADS>>>(
d_block_partials,
num_partials,
d_result);
}
gpu_timer.Stop();
if (i > 0)
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Force any kernel stdio to screen
CubDebugExit(cudaThreadSynchronize());
fflush(stdout);
// Display timing
if (g_timing_iterations > 0)
{
float avg_elapsed = elapsed_millis / g_timing_iterations;
xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN
// Initialize device
CubDebugExit(args.DeviceInit());
// Get graph type
string type;
args.GetCmdLineArgument("type", type);
// Generate graph structure
CpuTimer timer;
timer.Start();
CooGraph<VertexId, Value> coo_graph;
if (type == string("grid2d"))
{
VertexId width;
args.GetCmdLineArgument("width", width);
bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
printf("Generating %s grid2d width(%d)... ", (self_loops) ? "5-pt" : "4-pt", width); fflush(stdout);
if (coo_graph.InitGrid2d(width, self_loops)) exit(1);
} else if (type == string("grid3d"))
{
xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN
string filename;
args.GetCmdLineArgument("file", filename);
printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout);
if (coo_graph.InitMarket(filename)) exit(1);
}
else
{
printf("Unsupported graph type\n");
exit(1);
}
timer.Stop();
printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n",
timer.ElapsedMillis() / 1000.0,
coo_graph.coo_tuples.size(),
coo_graph.row_dim,
coo_graph.col_dim);
fflush(stdout);
if (g_verbose)
{
cout << coo_graph << "\n";
}
xgboost/cub/experimental/defunct/test_device_seg_reduce.cu view on Meta::CPAN
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_output, num_segments, true, g_verbose);
printf("\t%s", compare ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
for (int i = 0; i < g_timing_iterations; ++i)
{
CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
}
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_values) / avg_millis / 1000.0 / 1000.0;
float giga_bandwidth = giga_rate *
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
}
xgboost/cub/experimental/histogram/histogram_cub.h view on Meta::CPAN
d_histogram,
num_levels,
lower_level,
upper_level,
width * height,
(cudaStream_t) 0,
is_warmup);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
GpuTimer gpu_timer;
gpu_timer.Start();
// Compute histogram
DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
d_image_samples,
d_histogram,
num_levels,
lower_level,
upper_level,
width * height,
(cudaStream_t) 0,
is_warmup);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
cudaFree(d_temp_storage);
return elapsed_millis;
}
xgboost/cub/experimental/histogram/histogram_gmem_atomics.h view on Meta::CPAN
dim3 grid(16, 16);
int total_blocks = grid.x * grid.y;
// allocate partial histogram
unsigned int *d_part_hist;
cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));
dim3 block2(128);
dim3 grid2((3 * NUM_BINS + block.x - 1) / block.x);
GpuTimer gpu_timer;
gpu_timer.Start();
histogram_gmem_atomics::histogram_gmem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
d_image,
width,
height,
d_part_hist);
histogram_gmem_atomics::histogram_gmem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
d_part_hist,
total_blocks,
d_hist);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
cudaFree(d_part_hist);
return elapsed_millis;
}
xgboost/cub/experimental/histogram/histogram_smem_atomics.h view on Meta::CPAN
dim3 grid(16, 16);
int total_blocks = grid.x * grid.y;
// allocate partial histogram
unsigned int *d_part_hist;
cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));
dim3 block2(128);
dim3 grid2((ACTIVE_CHANNELS * NUM_BINS + block.x - 1) / block.x);
GpuTimer gpu_timer;
gpu_timer.Start();
histogram_smem_atomics::histogram_smem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
d_image,
width,
height,
d_part_hist);
histogram_smem_atomics::histogram_smem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
d_part_hist,
total_blocks,
d_hist);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
cudaFree(d_part_hist);
return elapsed_millis;
}
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
printf("NonZeroIoKernel<%d,%d><<<%d, %d>>>, sm occupancy %d\n", BLOCK_THREADS, ITEMS_PER_THREAD, blocks, BLOCK_THREADS, spmv_sm_occupancy);
// Warmup
NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
// Check for failures
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(SyncStream(0));
// Timing
GpuTimer timer;
float elapsed_millis = 0.0;
timer.Start();
for (int it = 0; it < timing_iterations; ++it)
{
NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
CubDebugExit(x_itr.UnbindTexture());
return elapsed_millis / timing_iterations;
}
//---------------------------------------------------------------------
// cuSparse HybMV
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
*/
template <
typename OffsetT>
float TestCusparseHybmv(
float* vector_y_in,
float* reference_vector_y_out,
SpmvParams<float, OffsetT>& params,
int timing_iterations,
cusparseHandle_t cusparse)
{
CpuTimer cpu_timer;
cpu_timer.Start();
// Construct Hyb matrix
cusparseMatDescr_t mat_desc;
cusparseHybMat_t hyb_desc;
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
cusparseStatus_t status = cusparseScsr2hyb(
cusparse,
params.num_rows, params.num_cols,
mat_desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
hyb_desc,
0,
CUSPARSE_HYB_PARTITION_AUTO);
AssertEquals(CUSPARSE_STATUS_SUCCESS, status);
cudaDeviceSynchronize();
cpu_timer.Stop();
float elapsed_millis = cpu_timer.ElapsedMillis();
printf("HYB setup ms, %.5f, ", elapsed_millis);
// Reset input/output vector y
CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
// Warmup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
cusparse,
CUSPARSE_OPERATION_NON_TRANSPOSE,
¶ms.alpha, mat_desc,
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
params.d_vector_x, ¶ms.beta, params.d_vector_y));
if (!g_quiet)
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
elapsed_millis = 0.0;
GpuTimer timer;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
cusparse,
CUSPARSE_OPERATION_NON_TRANSPOSE,
¶ms.alpha, mat_desc,
hyb_desc,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
// Cleanup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));
return elapsed_millis / timing_iterations;
}
/**
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
*/
template <
typename OffsetT>
float TestCusparseHybmv(
double* vector_y_in,
double* reference_vector_y_out,
SpmvParams<double, OffsetT>& params,
int timing_iterations,
cusparseHandle_t cusparse)
{
CpuTimer cpu_timer;
cpu_timer.Start();
// Construct Hyb matrix
cusparseMatDescr_t mat_desc;
cusparseHybMat_t hyb_desc;
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsr2hyb(
cusparse,
params.num_rows, params.num_cols,
mat_desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
hyb_desc,
0,
CUSPARSE_HYB_PARTITION_AUTO));
cudaDeviceSynchronize();
cpu_timer.Stop();
float elapsed_millis = cpu_timer.ElapsedMillis();
printf("HYB setup ms, %.5f, ", elapsed_millis);
// Reset input/output vector y
CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
// Warmup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
cusparse,
CUSPARSE_OPERATION_NON_TRANSPOSE,
¶ms.alpha, mat_desc,
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
params.d_vector_x, ¶ms.beta, params.d_vector_y));
if (!g_quiet)
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
elapsed_millis = 0.0;
GpuTimer timer;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
cusparse,
CUSPARSE_OPERATION_NON_TRANSPOSE,
¶ms.alpha, mat_desc,
hyb_desc,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
// Cleanup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));
return elapsed_millis / timing_iterations;
}
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
params.d_vector_x, ¶ms.beta, params.d_vector_y));
if (!g_quiet)
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
float elapsed_millis = 0.0;
GpuTimer timer;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
params.num_rows, params.num_cols, params.num_nonzeros, ¶ms.alpha, desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
return elapsed_millis / timing_iterations;
}
/**
* Run cuSparse SpMV (specialized for fp64)
*/
template <
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
params.d_vector_x, ¶ms.beta, params.d_vector_y));
if (!g_quiet)
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
float elapsed_millis = 0.0;
GpuTimer timer;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
params.num_rows, params.num_cols, params.num_nonzeros, ¶ms.alpha, desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
return elapsed_millis / timing_iterations;
}
//---------------------------------------------------------------------
// GPU Merge-based SpMV
//---------------------------------------------------------------------
/**
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
// params.alpha, params.beta,
(cudaStream_t) 0, !g_quiet));
if (!g_quiet)
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
GpuTimer timer;
float elapsed_millis = 0.0;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
CubDebugExit(DeviceSpmv::CsrMV(
d_temp_storage, temp_storage_bytes,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, params.d_vector_y,
params.num_rows, params.num_cols, params.num_nonzeros,
// params.alpha, params.beta,
(cudaStream_t) 0, false));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
return elapsed_millis / timing_iterations;
}
//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------
/**
* Display perf
xgboost/cub/test/test_allocator.cu view on Meta::CPAN
#endif // CUB_CDP
//
// Performance
//
printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
fflush(stdout); fflush(stderr);
// CPU performance comparisons vs cached. Allocate and free a 1MB block 2000 times
CpuTimer cpu_timer;
char *d_1024MB = NULL;
allocator.debug = false;
// Prime the caching allocator and the kernel
CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
CubDebugExit(allocator.DeviceFree(d_1024MB));
cub::EmptyKernel<void><<<1, 32>>>();
// CUDA
cpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
CubDebugExit(cudaFree(d_1024MB));
}
cpu_timer.Stop();
float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();
// CUB
cpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
CubDebugExit(allocator.DeviceFree(d_1024MB));
}
cpu_timer.Stop();
float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();
printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
cuda_malloc_elapsed_millis / timing_iterations,
cub_calloc_elapsed_millis / timing_iterations);
// GPU performance comparisons. Allocate and free a 1MB block 2000 times
GpuTimer gpu_timer;
printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
fflush(stdout); fflush(stderr);
// Kernel-only
gpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
cub::EmptyKernel<void><<<1, 32>>>();
}
gpu_timer.Stop();
float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();
// CUDA
gpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
cub::EmptyKernel<void><<<1, 32>>>();
CubDebugExit(cudaFree(d_1024MB));
}
gpu_timer.Stop();
cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
// CUB
gpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
cub::EmptyKernel<void><<<1, 32>>>();
CubDebugExit(allocator.DeviceFree(d_1024MB));
}
gpu_timer.Stop();
cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
cuda_malloc_elapsed_millis / timing_iterations,
cub_calloc_elapsed_millis / timing_iterations);
#endif
printf("Success\n");
xgboost/cub/test/test_block_radix_sort.cu view on Meta::CPAN
// Allocate temp storage in shared memory
__shared__ typename BlockRadixSortT::TempStorage temp_storage;
// Items per thread
Key keys[ITEMS_PER_THREAD];
Value values[ITEMS_PER_THREAD];
LoadDirectBlocked(threadIdx.x, d_keys, keys);
LoadDirectBlocked(threadIdx.x, d_values, values);
// Start cycle timer
clock_t stop;
clock_t start = clock();
TestBlockSort<BLOCK_THREADS, BlockRadixSortT>(
temp_storage, keys, values, d_keys, d_values, begin_bit, end_bit, stop, Int2Type<DESCENDING>(), Int2Type<BLOCKED_OUTPUT>());
// Store time
if (threadIdx.x == 0)
*d_elapsed = (start > stop) ? start - stop : stop - start;
}
xgboost/cub/test/test_block_reduce.cu view on Meta::CPAN
T data[ITEMS_PER_THREAD];
// Load first tile of data
int block_offset = 0;
if (block_offset < TILE_SIZE * tiles)
{
LoadDirectBlocked(linear_tid, d_in + block_offset, data);
block_offset += TILE_SIZE;
// Start cycle timer
clock_t start = clock();
// Cooperative reduce first tile
BlockReduceT block_reduce(temp_storage) ;
T block_aggregate = DeviceTest(block_reduce, data, reduction_op);
// Stop cycle timer
#if CUB_PTX_ARCH == 100
// Bug: recording stop clock causes mis-write of running prefix value
clock_t stop = 0;
#else
clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100
clock_t elapsed = (start > stop) ? start - stop : stop - start;
// Loop over input tiles
while (block_offset < TILE_SIZE * tiles)
{
// TestBarrier between threadblock reductions
__syncthreads();
// Load tile of data
LoadDirectBlocked(linear_tid, d_in + block_offset, data);
block_offset += TILE_SIZE;
// Start cycle timer
clock_t start = clock();
// Cooperatively reduce the tile's aggregate
BlockReduceT block_reduce(temp_storage) ;
T tile_aggregate = DeviceTest(block_reduce, data, reduction_op);
// Stop cycle timer
#if CUB_PTX_ARCH == 100
// Bug: recording stop clock causes mis-write of running prefix value
clock_t stop = 0;
#else
clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100
elapsed += (start > stop) ? start - stop : stop - start;
// Reduce threadblock aggregate
block_aggregate = reduction_op(block_aggregate, tile_aggregate);
xgboost/cub/test/test_block_reduce.cu view on Meta::CPAN
// Per-thread tile data
T partial;
// Load partial tile data
if (linear_tid < num_items)
{
partial = d_in[linear_tid];
}
// Start cycle timer
clock_t start = clock();
// Cooperatively reduce the tile's aggregate
BlockReduceT block_reduce(temp_storage) ;
T tile_aggregate = DeviceTest(block_reduce, partial, reduction_op, num_items);
// Stop cycle timer
#if CUB_PTX_ARCH == 100
// Bug: recording stop clock causes mis-write of running prefix value
clock_t stop = 0;
#else
clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100
clock_t elapsed = (start > stop) ? start - stop : stop - start;
// Store data
xgboost/cub/test/test_block_scan.cu view on Meta::CPAN
__threadfence_block(); // workaround to prevent clock hoisting
// Test scan
T block_aggregate;
BlockScanT block_scan(temp_storage);
BlockPrefixCallbackOp<T, ScanOpT> prefix_op(linear_tid, initial_value, scan_op);
DeviceTest(block_scan, data, initial_value, scan_op, block_aggregate, prefix_op,
Int2Type<SCAN_MODE>(), Int2Type<TEST_MODE>(), Int2Type<Traits<T>::PRIMITIVE>());
// Stop cycle timer
__threadfence_block(); // workaround to prevent clock hoisting
clock_t stop = clock();
__threadfence_block(); // workaround to prevent clock hoisting
// Store output
StoreDirectBlocked(linear_tid, d_out, data);
// Store block_aggregate
if (TEST_MODE != BASIC)
d_aggregate[linear_tid] = block_aggregate;
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
// Check for correctness (and display results, if specified)
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
error |= channel_error;
}
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
DispatchEven(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
0, false);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * sizeof(SampleT);
printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
avg_millis,
giga_rate,
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
// Check for correctness (and display results, if specified)
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
error |= channel_error;
}
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
DispatchRange(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, d_levels,
num_row_pixels, num_rows, row_stride_bytes,
0, false);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * sizeof(SampleT);
printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
avg_millis,
giga_rate,
xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN
// Check that input isn't overwritten
int input_compare = CompareDeviceResults(h_keys, d_keys.d_buffers[0], num_items, true, g_verbose);
compare |= input_compare;
printf("\t Compare input keys: %s ", input_compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Performance
if (g_timing_iterations)
printf("\nPerforming timing iterations:\n"); fflush(stdout);
GpuTimer gpu_timer;
float elapsed_millis = 0.0f;
for (int i = 0; i < g_timing_iterations; ++i)
{
// Initialize/clear device arrays
CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
if (!KEYS_ONLY)
{
CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
}
gpu_timer.Start();
CubDebugExit(Dispatch(
Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
num_items, num_segments, d_segment_offsets,
begin_bit, end_bit, 0, false));
gpu_timer.Stop();
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = (KEYS_ONLY) ?
giga_rate * sizeof(KeyT) * 2 :
giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN
int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose);
printf("\t%s", compare ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
if (g_timing_iterations > 0)
{
GpuTimer gpu_timer;
gpu_timer.Start();
CubDebugExit(Dispatch(backend, g_timing_iterations,
d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items, num_segments, d_segment_offsets,
reduction_op, 0, false));
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * sizeof(InputT);
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
}
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN
printf("\t Values %s ", compare2 ? "FAIL" : "PASS");
int compare3 = CompareDeviceResults(&num_segments, d_num_runs, 1, true, g_verbose);
printf("\t Count %s ", compare3 ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, false));
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
int bytes_moved = ((num_items + num_segments) * sizeof(KeyT)) + ((num_items + num_segments) * sizeof(ValueT));
float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
}
xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN
}
compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
printf("\t Count %s\n", compare3 ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0...
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
int bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT)));
float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
}
xgboost/cub/test/test_device_scan.cu view on Meta::CPAN
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
printf("\t%s", compare ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
CubDebugExit(Dispatch(Int2Type<BACKEND>(),
Int2Type<Traits<OutputT>::PRIMITIVE>(),
g_timing_iterations,
d_temp_storage_bytes,
d_cdp_error,
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
initial_value,
num_items,
0,
false));
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT));
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
}
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
printf("\t Data %s\n", compare1 ? "FAIL" : "PASS");
int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s\n", compare2 ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, false));
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
int num_output_items = (IS_PARTITION) ? num_items : num_selected;
int num_flag_items = (IS_FLAGGED) ? num_items : 0;
size_t num_bytes = sizeof(T) * (num_items + num_output_items) + sizeof(FlagT) * num_flag_items;
float giga_bandwidth = float(num_bytes) / avg_millis / 1000.0f / 1000.0f;
xgboost/cub/test/test_device_select_unique.cu view on Meta::CPAN
printf("\t Data %s ", compare1 ? "FAIL" : "PASS");
int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s ", compare2 ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, false));
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = float((num_items + num_selected) * sizeof(T)) / avg_millis / 1000.0f / 1000.0f;
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
}
printf("\n\n");
xgboost/cub/test/test_grid_barrier.cu view on Meta::CPAN
printf("Initializing software global barrier for Kernel<<<%d,%d>>> with %d occupancy\n",
grid_size, block_size, occupancy);
fflush(stdout);
// Init global barrier
GridBarrierLifetime global_barrier;
global_barrier.Setup(grid_size);
// Time kernel
GpuTimer gpu_timer;
gpu_timer.Start();
Kernel<<<grid_size, block_size>>>(global_barrier, iterations);
gpu_timer.Stop();
retval = CubDebug(cudaThreadSynchronize());
// Output timing results
float avg_elapsed = gpu_timer.ElapsedMillis() / float(iterations);
printf("%d iterations, %f total elapsed millis, %f avg elapsed millis\n",
iterations,
gpu_timer.ElapsedMillis(),
avg_elapsed);
return retval;
}
xgboost/cub/test/test_warp_scan.cu view on Meta::CPAN
{
// Cooperative warp-scan utility type (1 warp)
typedef WarpScan<T, LOGICAL_WARP_THREADS> WarpScanT;
// Allocate temp storage in shared memory
__shared__ typename WarpScanT::TempStorage temp_storage;
// Per-thread tile data
T data = d_in[threadIdx.x];
// Start cycle timer
__threadfence_block(); // workaround to prevent clock hoisting
clock_t start = clock();
__threadfence_block(); // workaround to prevent clock hoisting
T aggregate;
// Test scan
WarpScanT warp_scan(temp_storage);
DeviceTest(
warp_scan,
data,
initial_value,
scan_op,
aggregate,
Int2Type<TEST_MODE>(),
Int2Type<Traits<T>::PRIMITIVE>());
// Stop cycle timer
__threadfence_block(); // workaround to prevent clock hoisting
clock_t stop = clock();
__threadfence_block(); // workaround to prevent clock hoisting
// Store data
d_out[threadIdx.x] = data;
if (TEST_MODE != BASIC)
{
// Store aggregate
xgboost/cub/tune/tune_device_reduce.cu view on Meta::CPAN
reduction_op));
if (g_verify) CubDebugExit(cudaDeviceSynchronize());
// Copy out and display results
int compare = (g_verify) ?
CompareDeviceResults(h_reference, d_out, 1, true, false) :
0;
// Performance
GpuTimer gpu_timer;
float elapsed_millis = 0.0;
for (int i = 0; i < g_timing_iterations; i++)
{
gpu_timer.Start();
CubDebugExit(DeviceReduce::Dispatch(
d_temp_storage,
temp_storage_bytes,
multi_dispatch.kernel_ptr,
single_dispatch.kernel_ptr,
FillAndResetDrainKernel<OffsetT>,
multi_dispatch.params,
single_dispatch.params,
d_in,
d_out,
num_items,
reduction_op));
gpu_timer.Stop();
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Mooch
CubDebugExit(cudaDeviceSynchronize());
float avg_elapsed = elapsed_millis / g_timing_iterations;
float avg_throughput = float(num_items) / avg_elapsed / 1000.0 / 1000.0;
float avg_bandwidth = avg_throughput * sizeof(T);
multi_dispatch.avg_throughput = CUB_MAX(avg_throughput, multi_dispatch.avg_throughput);
xgboost/dmlc-core/include/dmlc/timer.h view on Meta::CPAN
/*!
* Copyright (c) 2015 by Contributors
* \file timer.h
* \brief cross platform timer for timing
* \author Tianqi Chen
*/
#ifndef DMLC_TIMER_H_
#define DMLC_TIMER_H_
#include "base.h"
#if DMLC_USE_CXX11
#include <chrono>
#endif
xgboost/dmlc-core/scripts/lint.py view on Meta::CPAN
# singleton helper for lint check
_HELPER = LintHelper()
def get_header_guard_dmlc(filename):
"""Get Header Guard Convention for DMLC Projects.
For headers in include, directly use the path
For headers in src, use project name plus path
Examples: with project-name = dmlc
include/dmlc/timer.h -> DMLC_TIMTER_H_
src/io/libsvm_parser.h -> DMLC_IO_LIBSVM_PARSER_H_
"""
fileinfo = cpplint.FileInfo(filename)
file_path_from_root = fileinfo.RepositoryName()
inc_list = ['include', 'api', 'wrapper']
if file_path_from_root.find('src/') != -1 and _HELPER.project_name is not None:
idx = file_path_from_root.find('src/')
file_path_from_root = _HELPER.project_name + file_path_from_root[idx + 3:]
else:
xgboost/dmlc-core/src/data/basic_row_iter.h view on Meta::CPAN
* \file basic_row_iter.h
* \brief row based iterator that
* loads in everything into memory and returns
* \author Tianqi Chen
*/
#ifndef DMLC_DATA_BASIC_ROW_ITER_H_
#define DMLC_DATA_BASIC_ROW_ITER_H_
#include <dmlc/io.h>
#include <dmlc/logging.h>
#include <dmlc/data.h>
#include <dmlc/timer.h>
#include "./row_block.h"
#include "./parser.h"
namespace dmlc {
namespace data {
/*!
* \brief basic set of row iterators that provides
* \tparam IndexType the type of index we are using
*/
template<typename IndexType>
xgboost/dmlc-core/src/data/disk_row_iter.h view on Meta::CPAN
* \brief row based iterator that
* caches things into disk and then load segments
* \author Tianqi Chen
*/
#ifndef DMLC_DATA_DISK_ROW_ITER_H_
#define DMLC_DATA_DISK_ROW_ITER_H_
#include <dmlc/io.h>
#include <dmlc/logging.h>
#include <dmlc/data.h>
#include <dmlc/timer.h>
#include <dmlc/threadediter.h>
#include <algorithm>
#include <string>
#include "./row_block.h"
#include "./libsvm_parser.h"
#if DMLC_ENABLE_STD_THREAD
namespace dmlc {
namespace data {
/*!