Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Test of DeviceRadixSort utilities
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <algorithm>
#include <typeinfo>
#include <cub/util_allocator.cuh>
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_segmented_radix_sort.cuh>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/reverse.h>
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
int g_timing_iterations = 0;
int g_repeat = 0;
CachingDeviceAllocator g_allocator(true);
// Dispatch types
enum Backend
{
CUB, // CUB method (allows overwriting of input)
CUB_NO_OVERWRITE, // CUB method (disallows overwriting of input)
CUB_SEGMENTED, // CUB method (allows overwriting of input)
CUB_SEGMENTED_NO_OVERWRITE, // CUB method (disallows overwriting of input)
THRUST, // Thrust method
CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
};
//---------------------------------------------------------------------
// Dispatch to different DeviceRadixSort entrypoints
//---------------------------------------------------------------------
/**
* Dispatch to CUB sorting entrypoint (specialized for ascending)
*/
template <typename KeyT, typename ValueT>
CUB_RUNTIME_FUNCTION
__forceinline__
cudaError_t Dispatch(
Int2Type<false> is_descending,
Int2Type<CUB> dispatch_to,
int *d_selector,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
int num_items,
int num_segments,
const int *d_segment_offsets,
int begin_bit,
int end_bit,
cudaStream_t stream,
bool debug_synchronous)
{
return DeviceRadixSort::SortPairs(
d_temp_storage, temp_storage_bytes,
d_keys, d_values,
num_items, begin_bit, end_bit, stream, debug_synchronous);
}
/**
* Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for ascending)
*/
template <typename KeyT, typename ValueT>
CUB_RUNTIME_FUNCTION
__forceinline__
cudaError_t Dispatch(
Int2Type<false> is_descending,
xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN
if (!KEYS_ONLY)
{
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(ValueT) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(ValueT) * num_items));
}
// Allocate temporary storage (and make it un-aligned)
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
CubDebugExit(Dispatch(
Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes, d_keys, d_values,
num_items, num_segments, d_segment_offsets,
begin_bit, end_bit, 0, true));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + 1));
void* mis_aligned_temp = static_cast<char*>(d_temp_storage) + 1;
// Initialize/clear device arrays
d_keys.selector = 0;
CubDebugExit(cudaMemcpy(d_keys.d_buffers[0], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_keys.d_buffers[1], 0, sizeof(KeyT) * num_items));
if (!KEYS_ONLY)
{
d_values.selector = 0;
CubDebugExit(cudaMemcpy(d_values.d_buffers[0], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_values.d_buffers[1], 0, sizeof(ValueT) * num_items));
}
CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(int) * (num_segments + 1), cudaMemcpyHostToDevice));
// Run warmup/correctness iteration
CubDebugExit(Dispatch(
Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
num_items, num_segments, d_segment_offsets,
begin_bit, end_bit, 0, true));
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Check for correctness (and display results, if specified)
printf("Warmup done. Checking results:\n"); fflush(stdout);
int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose);
printf("\t Compare keys (selector %d): %s ", d_keys.selector, compare ? "FAIL" : "PASS"); fflush(stdout);
if (!KEYS_ONLY)
{
int values_compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
compare |= values_compare;
printf("\t Compare values (selector %d): %s ", d_values.selector, values_compare ? "FAIL" : "PASS"); fflush(stdout);
}
if (BACKEND == CUB_NO_OVERWRITE)
{
// Check that input isn't overwritten
int input_compare = CompareDeviceResults(h_keys, d_keys.d_buffers[0], num_items, true, g_verbose);
compare |= input_compare;
printf("\t Compare input keys: %s ", input_compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Performance
if (g_timing_iterations)
printf("\nPerforming timing iterations:\n"); fflush(stdout);
GpuTimer gpu_timer;
float elapsed_millis = 0.0f;
for (int i = 0; i < g_timing_iterations; ++i)
{
// Initialize/clear device arrays
CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
if (!KEYS_ONLY)
{
CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
}
gpu_timer.Start();
CubDebugExit(Dispatch(
Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
num_items, num_segments, d_segment_offsets,
begin_bit, end_bit, 0, false));
gpu_timer.Stop();
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = (KEYS_ONLY) ?
giga_rate * sizeof(KeyT) * 2 :
giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth);
}
printf("\n\n");
// Cleanup
if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
if (d_selector) CubDebugExit(g_allocator.DeviceFree(d_selector));
if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
// Correctness asserts
AssertEquals(0, compare);
}
/**
* Test backend
*/
template <bool IS_DESCENDING, typename KeyT, typename ValueT>
void TestBackend(
KeyT *h_keys,
int num_items,
int num_segments,
int *h_segment_offsets,
int begin_bit,
int end_bit,
KeyT *h_reference_keys,
int *h_reference_ranks)
{
const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
ValueT *h_values = NULL;
ValueT *h_reference_values = NULL;
if (!KEYS_ONLY)
{
h_values = new ValueT[num_items];
h_reference_values = new ValueT[num_items];
for (int i = 0; i < num_items; ++i)
{
InitValue(INTEGER_SEED, h_values[i], i);
InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
}
}
if (num_segments == 1)
{
// Test single-segment implementations
Test<CUB, IS_DESCENDING>( h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN
ValueT *h_values = NULL;
ValueT *h_reference_values = NULL;
int *h_segment_offsets = new int[num_segments + 1];
if (end_bit < 0)
end_bit = sizeof(KeyT) * 8;
InitializeKeyBits(gen_mode, h_keys, num_items, entropy_reduction);
InitializeSegments(num_items, num_segments, h_segment_offsets);
InitializeSolution<IS_DESCENDING>(
h_keys, num_items, num_segments, h_segment_offsets,
begin_bit, end_bit, h_reference_ranks, h_reference_keys);
if (!KEYS_ONLY)
{
h_values = new ValueT[num_items];
h_reference_values = new ValueT[num_items];
for (int i = 0; i < num_items; ++i)
{
InitValue(INTEGER_SEED, h_values[i], i);
InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
}
}
if (h_reference_ranks) delete[] h_reference_ranks;
printf("\nTesting bits [%d,%d) of %s keys with gen-mode %d\n", begin_bit, end_bit, typeid(KeyT).name(), gen_mode); fflush(stdout);
Test<BACKEND, IS_DESCENDING>(
h_keys, h_values,
num_items, num_segments, h_segment_offsets,
begin_bit, end_bit, h_reference_keys, h_reference_values);
if (h_keys) delete[] h_keys;
if (h_reference_keys) delete[] h_reference_keys;
if (h_values) delete[] h_values;
if (h_reference_values) delete[] h_reference_values;
if (h_segment_offsets) delete[] h_segment_offsets;
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
int main(int argc, char** argv)
{
int bits = -1;
int num_items = -1;
int num_segments = -1;
int entropy_reduction = 0;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("n", num_items);
args.GetCmdLineArgument("s", num_segments);
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("repeat", g_repeat);
args.GetCmdLineArgument("bits", bits);
args.GetCmdLineArgument("entropy", entropy_reduction);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--bits=<valid key bits>]"
"[--n=<input items> "
"[--s=<num segments> "
"[--i=<timing iterations> "
"[--device=<device-id>] "
"[--repeat=<repetitions of entire test suite>]"
"[--v] "
"[--entropy=<entropy-reduction factor (default 0)>]"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// Get ptx version
int ptx_version;
CubDebugExit(PtxVersion(ptx_version));
#ifdef QUICKER_TEST
enum {
IS_DESCENDING = false
};
// Compile/run basic CUB test
if (num_items < 0) num_items = 48000000;
if (num_segments < 0) num_segments = 5000;
Test<CUB_SEGMENTED, unsigned int, NullType, IS_DESCENDING>( num_items, num_segments, RANDOM, entropy_reduction, 0, bits);
Test<CUB, unsigned int, NullType, IS_DESCENDING>( num_items, 1, RANDOM, entropy_reduction, 0, bits);
Test<CUB, unsigned long long, NullType, IS_DESCENDING>( num_items, 1, RANDOM, entropy_reduction, 0, bits);
Test<CUB, unsigned int, unsigned int, IS_DESCENDING>( num_items, 1, RANDOM, entropy_reduction, 0, bits);
Test<CUB, unsigned long long, unsigned int, IS_DESCENDING>( num_items, 1, RANDOM, entropy_reduction, 0, bits);
#elif defined(QUICK_TEST)
// Compile/run quick tests
if (num_items < 0) num_items = 48000000;
if (num_segments < 0) num_segments = 5000;
// Compare CUB and thrust on 32b keys-only
Test<CUB, unsigned int, NullType, false> ( num_items, 1, RANDOM, entropy_reduction, 0, bits);
Test<THRUST, unsigned int, NullType, false> ( num_items, 1, RANDOM, entropy_reduction, 0, bits);
// Compare CUB and thrust on 64b keys-only
Test<CUB, unsigned long long, NullType, false> ( num_items, 1, RANDOM, entropy_reduction, 0, bits);
Test<THRUST, unsigned long long, NullType, false> ( num_items, 1, RANDOM, entropy_reduction, 0, bits);
// Compare CUB and thrust on 32b key-value pairs
Test<CUB, unsigned int, unsigned int, false> ( num_items, 1, RANDOM, entropy_reduction, 0, bits);
Test<THRUST, unsigned int, unsigned int, false> ( num_items, 1, RANDOM, entropy_reduction, 0, bits);
// Compare CUB and thrust on 64b key-value pairs
Test<CUB, unsigned long long, unsigned long long, false> ( num_items, 1, RANDOM, entropy_reduction, 0, bits);
Test<THRUST, unsigned long long, unsigned long long, false> ( num_items, 1, RANDOM, entropy_reduction, 0, bits);
#else
( run in 0.755 second using v1.01-cache-2.11-cpan-71847e10f99 )