locked results from the CPAN

Alien-XGBoost

 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Test of BlockRadixSort utilities
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <stdio.h>
#include <algorithm>
#include <iostream>

#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/util_allocator.cuh>

#include "test_util.h"

using namespace cub;

//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose = false;
CachingDeviceAllocator  g_allocator(true);


//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------


/// Specialized descending, blocked -> blocked
template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
__device__ __forceinline__ void TestBlockSort(
    typename BlockRadixSort::TempStorage &temp_storage,
    Key                         (&keys)[ITEMS_PER_THREAD],
    Value                       (&values)[ITEMS_PER_THREAD],
    Key                         *d_keys,
    Value                       *d_values,
    int                         begin_bit,
    int                         end_bit,
    clock_t                     &stop,
    Int2Type<true>              is_descending,
    Int2Type<true>              is_blocked_output)
{
    BlockRadixSort(temp_storage).SortDescending(keys, values, begin_bit, end_bit);
    stop = clock();
    StoreDirectBlocked(threadIdx.x, d_keys, keys);
    StoreDirectBlocked(threadIdx.x, d_values, values);
}

/// Specialized descending, blocked -> striped
template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
__device__ __forceinline__ void TestBlockSort(
    typename BlockRadixSort::TempStorage &temp_storage,
    Key                         (&keys)[ITEMS_PER_THREAD],
    Value                       (&values)[ITEMS_PER_THREAD],
    Key                         *d_keys,
    Value                       *d_values,
    int                         begin_bit,
    int                         end_bit,
    clock_t                     &stop,
    Int2Type<true>              is_descending,
    Int2Type<false>             is_blocked_output)
{
    BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
    stop = clock();
    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys, keys);
    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values, values);
}

/// Specialized ascending, blocked -> blocked
template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
__device__ __forceinline__ void TestBlockSort(
    typename BlockRadixSort::TempStorage &temp_storage,
    Key                         (&keys)[ITEMS_PER_THREAD],
    Value                       (&values)[ITEMS_PER_THREAD],
    Key                         *d_keys,
    Value                       *d_values,
    int                         begin_bit,
    int                         end_bit,
    clock_t                     &stop,
    Int2Type<false>             is_descending,
    Int2Type<true>              is_blocked_output)
{
    BlockRadixSort(temp_storage).Sort(keys, values, begin_bit, end_bit);
    stop = clock();
    StoreDirectBlocked(threadIdx.x, d_keys, keys);
    StoreDirectBlocked(threadIdx.x, d_values, values);
}

/// Specialized ascending, blocked -> striped
template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
__device__ __forceinline__ void TestBlockSort(
    typename BlockRadixSort::TempStorage &temp_storage,
    Key                         (&keys)[ITEMS_PER_THREAD],
    Value                       (&values)[ITEMS_PER_THREAD],
    Key                         *d_keys,
    Value                       *d_values,
    int                         begin_bit,
    int                         end_bit,
    clock_t                     &stop,
    Int2Type<false>             is_descending,
    Int2Type<false>             is_blocked_output)
{
    BlockRadixSort(temp_storage).SortBlockedToStriped(keys, values, begin_bit, end_bit);
    stop = clock();
    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys, keys);
    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values, values);
}



/**
 * BlockRadixSort kernel
 */
template <
    int                 BLOCK_THREADS,
    int                 ITEMS_PER_THREAD,
    int                 RADIX_BITS,
    bool                MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm  INNER_SCAN_ALGORITHM,
    cudaSharedMemConfig SMEM_CONFIG,
    int                 DESCENDING,
    int                 BLOCKED_OUTPUT,
    typename            Key,
    typename            Value>
__launch_bounds__ (BLOCK_THREADS, 1)
__global__ void Kernel(
    Key                         *d_keys,
    Value                       *d_values,
    int                         begin_bit,
    int                         end_bit,
    clock_t                     *d_elapsed)
{
    // Threadblock load/store abstraction types
    typedef BlockRadixSort<
            Key,
            BLOCK_THREADS,
            ITEMS_PER_THREAD,
            Value,
            RADIX_BITS,
            MEMOIZE_OUTER_SCAN,
            INNER_SCAN_ALGORITHM,
            SMEM_CONFIG>
        BlockRadixSortT;

    // Allocate temp storage in shared memory
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    // Items per thread
    Key     keys[ITEMS_PER_THREAD];
    Value   values[ITEMS_PER_THREAD];

    LoadDirectBlocked(threadIdx.x, d_keys, keys);
    LoadDirectBlocked(threadIdx.x, d_values, values);

    // Start cycle timer
    clock_t stop;
    clock_t start = clock();

    TestBlockSort<BLOCK_THREADS, BlockRadixSortT>(
        temp_storage, keys, values, d_keys, d_values, begin_bit, end_bit, stop, Int2Type<DESCENDING>(), Int2Type<BLOCKED_OUTPUT>());

    // Store time
    if (threadIdx.x == 0)
        *d_elapsed = (start > stop) ? start - stop : stop - start;
}



//---------------------------------------------------------------------
// Host testing subroutines
//---------------------------------------------------------------------


/**
 * Simple key-value pairing
 */
template <
    typename Key,
    typename Value,
    bool IS_FLOAT = (Traits<Key>::CATEGORY == FLOATING_POINT)>
struct Pair
{
    Key     key;
    Value   value;

    bool operator<(const Pair &b) const
    {
        return (key < b.key);
    }
};

/**
 * Simple key-value pairing (specialized for floating point types)
 */
template <typename Key, typename Value>
struct Pair<Key, Value, true>
{
    Key     key;
    Value   value;

    bool operator<(const Pair &b) const
    {
        if (key < b.key)
            return true;

        if (key > b.key)
            return false;

        // Key in unsigned bits
        typedef typename Traits<Key>::UnsignedBits UnsignedBits;

        // Return true if key is negative zero and b.key is positive zero
        UnsignedBits key_bits   = *reinterpret_cast<UnsignedBits*>(const_cast<Key*>(&key));

xgboost/cub/test/test_block_radix_sort.cu view on Meta::CPAN

/**
 * Test driver (valid tile size <= MAX_SMEM_BYTES)
 */
template <
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    int                     RADIX_BITS,
    bool                    MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
    cudaSharedMemConfig     SMEM_CONFIG,
    bool                    DESCENDING,
    bool                    BLOCKED_OUTPUT,
    typename                Key,
    typename                Value>
void TestValid(Int2Type<true> fits_smem_capacity)
{
    // Iterate begin_bit
    for (int begin_bit = 0; begin_bit <= 1; begin_bit++)
    {
        // Iterate end bit
        for (int end_bit = begin_bit + 1; end_bit <= sizeof(Key) * 8; end_bit = end_bit * 2 + begin_bit)
        {
            // Uniform key distribution
            TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
                UNIFORM, 0, begin_bit, end_bit);

            // Sequential key distribution
            TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
                INTEGER_SEED, 0, begin_bit, end_bit);

            // Iterate random with entropy_reduction
            for (int entropy_reduction = 0; entropy_reduction <= 9; entropy_reduction += 3)
            {
                TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
                    RANDOM, entropy_reduction, begin_bit, end_bit);
            }
        }
    }
}


/**
 * Test driver (invalid tile size)
 */
template <
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    int                     RADIX_BITS,
    bool                    MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
    cudaSharedMemConfig     SMEM_CONFIG,
    bool                    DESCENDING,
    bool                    BLOCKED_OUTPUT,
    typename                Key,
    typename                Value>
void TestValid(Int2Type<false> fits_smem_capacity)
{}


/**
 * Test ascending/descending and to-blocked/to-striped
 */
template <
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    int                     RADIX_BITS,
    bool                    MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
    cudaSharedMemConfig     SMEM_CONFIG,
    typename                Key,
    typename                Value>
void Test()
{
    // Check size of smem storage for the target arch to make sure it will fit
    typedef BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD, Value, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG> BlockRadixSortT;

#if defined(SM100) || defined(SM110) || defined(SM130)
    Int2Type<sizeof(typename BlockRadixSortT::TempStorage) <= 16 * 1024> fits_smem_capacity;
#else
    Int2Type<(sizeof(typename BlockRadixSortT::TempStorage) <= 48 * 1024)> fits_smem_capacity;
#endif

    // Sort-ascending, to-striped
    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, true, false, Key, Value>(fits_smem_capacity);

    // Sort-descending, to-blocked
    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, false, true, Key, Value>(fits_smem_capacity);

    // Not necessary
//    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, false, false, Key, Value>(fits_smem_capacity);
//    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, true, true, Key, Value>(fits_smem_capacity);
}


/**
 * Test value type and smem config
 */
template <
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    int                     RADIX_BITS,
    bool                    MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
    typename                Key>
void TestKeys()
{
    // Test keys-only sorting with both smem configs
    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, NullType>();    // Keys-only (4-byte smem bank config)
#if !defined(SM100) && !defined(SM110) && !defined(SM130) && !defined(SM200)
    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeEightByte, Key, NullType>();   // Keys-only (8-byte smem bank config)
#endif
}


/**
 * Test value type and smem config
 */
template <
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    int                     RADIX_BITS,
    bool                    MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
    typename                Key>
void TestKeysAndPairs()
{
    // Test pairs sorting with only 4-byte configs
    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, char>();        // With small-values
    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, Key>();         // With same-values
    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, TestFoo>();     // With large values
}


/**
 * Test key type
 */
template <
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    int                     RADIX_BITS,
    bool                    MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm      INNER_SCAN_ALGORITHM>
void Test()
{
    // Get ptx version
    int ptx_version;

( run in 0.308 second using v1.01-cache-2.11-cpan-bbe5e583499 )