Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/test/test_block_radix_sort.cu view on Meta::CPAN
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Test of BlockRadixSort utilities
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <algorithm>
#include <iostream>
#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/util_allocator.cuh>
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
CachingDeviceAllocator g_allocator(true);
//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------
/// Specialized descending, blocked -> blocked
template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
__device__ __forceinline__ void TestBlockSort(
typename BlockRadixSort::TempStorage &temp_storage,
Key (&keys)[ITEMS_PER_THREAD],
Value (&values)[ITEMS_PER_THREAD],
Key *d_keys,
Value *d_values,
int begin_bit,
int end_bit,
clock_t &stop,
Int2Type<true> is_descending,
Int2Type<true> is_blocked_output)
{
BlockRadixSort(temp_storage).SortDescending(keys, values, begin_bit, end_bit);
stop = clock();
StoreDirectBlocked(threadIdx.x, d_keys, keys);
StoreDirectBlocked(threadIdx.x, d_values, values);
}
/// Specialized descending, blocked -> striped
template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
__device__ __forceinline__ void TestBlockSort(
typename BlockRadixSort::TempStorage &temp_storage,
Key (&keys)[ITEMS_PER_THREAD],
Value (&values)[ITEMS_PER_THREAD],
Key *d_keys,
Value *d_values,
int begin_bit,
int end_bit,
clock_t &stop,
Int2Type<true> is_descending,
Int2Type<false> is_blocked_output)
{
BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
stop = clock();
StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys, keys);
StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values, values);
}
/// Specialized ascending, blocked -> blocked
template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
__device__ __forceinline__ void TestBlockSort(
typename BlockRadixSort::TempStorage &temp_storage,
Key (&keys)[ITEMS_PER_THREAD],
Value (&values)[ITEMS_PER_THREAD],
Key *d_keys,
Value *d_values,
int begin_bit,
int end_bit,
clock_t &stop,
Int2Type<false> is_descending,
Int2Type<true> is_blocked_output)
{
BlockRadixSort(temp_storage).Sort(keys, values, begin_bit, end_bit);
stop = clock();
StoreDirectBlocked(threadIdx.x, d_keys, keys);
StoreDirectBlocked(threadIdx.x, d_values, values);
}
/// Specialized ascending, blocked -> striped
template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
__device__ __forceinline__ void TestBlockSort(
typename BlockRadixSort::TempStorage &temp_storage,
Key (&keys)[ITEMS_PER_THREAD],
Value (&values)[ITEMS_PER_THREAD],
Key *d_keys,
Value *d_values,
int begin_bit,
int end_bit,
clock_t &stop,
Int2Type<false> is_descending,
Int2Type<false> is_blocked_output)
{
BlockRadixSort(temp_storage).SortBlockedToStriped(keys, values, begin_bit, end_bit);
stop = clock();
StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys, keys);
StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values, values);
}
/**
* BlockRadixSort kernel
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int RADIX_BITS,
bool MEMOIZE_OUTER_SCAN,
BlockScanAlgorithm INNER_SCAN_ALGORITHM,
cudaSharedMemConfig SMEM_CONFIG,
int DESCENDING,
int BLOCKED_OUTPUT,
typename Key,
typename Value>
__launch_bounds__ (BLOCK_THREADS, 1)
__global__ void Kernel(
Key *d_keys,
Value *d_values,
int begin_bit,
int end_bit,
clock_t *d_elapsed)
{
// Threadblock load/store abstraction types
typedef BlockRadixSort<
Key,
BLOCK_THREADS,
ITEMS_PER_THREAD,
Value,
RADIX_BITS,
MEMOIZE_OUTER_SCAN,
INNER_SCAN_ALGORITHM,
SMEM_CONFIG>
BlockRadixSortT;
// Allocate temp storage in shared memory
__shared__ typename BlockRadixSortT::TempStorage temp_storage;
// Items per thread
Key keys[ITEMS_PER_THREAD];
Value values[ITEMS_PER_THREAD];
LoadDirectBlocked(threadIdx.x, d_keys, keys);
LoadDirectBlocked(threadIdx.x, d_values, values);
// Start cycle timer
clock_t stop;
clock_t start = clock();
TestBlockSort<BLOCK_THREADS, BlockRadixSortT>(
temp_storage, keys, values, d_keys, d_values, begin_bit, end_bit, stop, Int2Type<DESCENDING>(), Int2Type<BLOCKED_OUTPUT>());
// Store time
if (threadIdx.x == 0)
*d_elapsed = (start > stop) ? start - stop : stop - start;
}
//---------------------------------------------------------------------
// Host testing subroutines
//---------------------------------------------------------------------
/**
* Simple key-value pairing
*/
template <
typename Key,
typename Value,
bool IS_FLOAT = (Traits<Key>::CATEGORY == FLOATING_POINT)>
struct Pair
{
Key key;
Value value;
bool operator<(const Pair &b) const
{
return (key < b.key);
}
};
/**
* Simple key-value pairing (specialized for floating point types)
*/
template <typename Key, typename Value>
struct Pair<Key, Value, true>
{
Key key;
Value value;
bool operator<(const Pair &b) const
{
if (key < b.key)
return true;
if (key > b.key)
return false;
// Key in unsigned bits
typedef typename Traits<Key>::UnsignedBits UnsignedBits;
// Return true if key is negative zero and b.key is positive zero
UnsignedBits key_bits = *reinterpret_cast<UnsignedBits*>(const_cast<Key*>(&key));
xgboost/cub/test/test_block_radix_sort.cu view on Meta::CPAN
/**
* Test driver (valid tile size <= MAX_SMEM_BYTES)
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int RADIX_BITS,
bool MEMOIZE_OUTER_SCAN,
BlockScanAlgorithm INNER_SCAN_ALGORITHM,
cudaSharedMemConfig SMEM_CONFIG,
bool DESCENDING,
bool BLOCKED_OUTPUT,
typename Key,
typename Value>
void TestValid(Int2Type<true> fits_smem_capacity)
{
// Iterate begin_bit
for (int begin_bit = 0; begin_bit <= 1; begin_bit++)
{
// Iterate end bit
for (int end_bit = begin_bit + 1; end_bit <= sizeof(Key) * 8; end_bit = end_bit * 2 + begin_bit)
{
// Uniform key distribution
TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
UNIFORM, 0, begin_bit, end_bit);
// Sequential key distribution
TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
INTEGER_SEED, 0, begin_bit, end_bit);
// Iterate random with entropy_reduction
for (int entropy_reduction = 0; entropy_reduction <= 9; entropy_reduction += 3)
{
TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
RANDOM, entropy_reduction, begin_bit, end_bit);
}
}
}
}
/**
* Test driver (invalid tile size)
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int RADIX_BITS,
bool MEMOIZE_OUTER_SCAN,
BlockScanAlgorithm INNER_SCAN_ALGORITHM,
cudaSharedMemConfig SMEM_CONFIG,
bool DESCENDING,
bool BLOCKED_OUTPUT,
typename Key,
typename Value>
void TestValid(Int2Type<false> fits_smem_capacity)
{}
/**
* Test ascending/descending and to-blocked/to-striped
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int RADIX_BITS,
bool MEMOIZE_OUTER_SCAN,
BlockScanAlgorithm INNER_SCAN_ALGORITHM,
cudaSharedMemConfig SMEM_CONFIG,
typename Key,
typename Value>
void Test()
{
// Check size of smem storage for the target arch to make sure it will fit
typedef BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD, Value, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG> BlockRadixSortT;
#if defined(SM100) || defined(SM110) || defined(SM130)
Int2Type<sizeof(typename BlockRadixSortT::TempStorage) <= 16 * 1024> fits_smem_capacity;
#else
Int2Type<(sizeof(typename BlockRadixSortT::TempStorage) <= 48 * 1024)> fits_smem_capacity;
#endif
// Sort-ascending, to-striped
TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, true, false, Key, Value>(fits_smem_capacity);
// Sort-descending, to-blocked
TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, false, true, Key, Value>(fits_smem_capacity);
// Not necessary
// TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, false, false, Key, Value>(fits_smem_capacity);
// TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, true, true, Key, Value>(fits_smem_capacity);
}
/**
* Test value type and smem config
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int RADIX_BITS,
bool MEMOIZE_OUTER_SCAN,
BlockScanAlgorithm INNER_SCAN_ALGORITHM,
typename Key>
void TestKeys()
{
// Test keys-only sorting with both smem configs
Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, NullType>(); // Keys-only (4-byte smem bank config)
#if !defined(SM100) && !defined(SM110) && !defined(SM130) && !defined(SM200)
Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeEightByte, Key, NullType>(); // Keys-only (8-byte smem bank config)
#endif
}
/**
* Test value type and smem config
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int RADIX_BITS,
bool MEMOIZE_OUTER_SCAN,
BlockScanAlgorithm INNER_SCAN_ALGORITHM,
typename Key>
void TestKeysAndPairs()
{
// Test pairs sorting with only 4-byte configs
Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, char>(); // With small-values
Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, Key>(); // With same-values
Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, TestFoo>(); // With large values
}
/**
* Test key type
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int RADIX_BITS,
bool MEMOIZE_OUTER_SCAN,
BlockScanAlgorithm INNER_SCAN_ALGORITHM>
void Test()
{
// Get ptx version
int ptx_version;
( run in 1.012 second using v1.01-cache-2.11-cpan-39bf76dae61 )