PUT results from the CPAN

Alien-XGBoost

{
    BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
    stop = clock();
    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys, keys);
    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values, values);
}

/// Specialized ascending, blocked -> blocked
template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
__device__ __forceinline__ void TestBlockSort(
    typename BlockRadixSort::TempStorage &temp_storage,
    Key                         (&keys)[ITEMS_PER_THREAD],
    Value                       (&values)[ITEMS_PER_THREAD],
    Key                         *d_keys,
    Value                       *d_values,
    int                         begin_bit,
    int                         end_bit,
    clock_t                     &stop,
    Int2Type<false>             is_descending,
    Int2Type<true>              is_blocked_output)
{
    BlockRadixSort(temp_storage).Sort(keys, values, begin_bit, end_bit);
    stop = clock();
    StoreDirectBlocked(threadIdx.x, d_keys, keys);
    StoreDirectBlocked(threadIdx.x, d_values, values);
}

/// Specialized ascending, blocked -> striped
template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
__device__ __forceinline__ void TestBlockSort(
    typename BlockRadixSort::TempStorage &temp_storage,
    Key                         (&keys)[ITEMS_PER_THREAD],
    Value                       (&values)[ITEMS_PER_THREAD],
    Key                         *d_keys,
    Value                       *d_values,
    int                         begin_bit,
    int                         end_bit,
    clock_t                     &stop,
    Int2Type<false>             is_descending,
    Int2Type<false>             is_blocked_output)
{
    BlockRadixSort(temp_storage).SortBlockedToStriped(keys, values, begin_bit, end_bit);
    stop = clock();
    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys, keys);
    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values, values);
}



/**
 * BlockRadixSort kernel
 */
template <
    int                 BLOCK_THREADS,
    int                 ITEMS_PER_THREAD,
    int                 RADIX_BITS,
    bool                MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm  INNER_SCAN_ALGORITHM,
    cudaSharedMemConfig SMEM_CONFIG,
    int                 DESCENDING,
    int                 BLOCKED_OUTPUT,
    typename            Key,
    typename            Value>
__launch_bounds__ (BLOCK_THREADS, 1)
__global__ void Kernel(
    Key                         *d_keys,
    Value                       *d_values,
    int                         begin_bit,
    int                         end_bit,
    clock_t                     *d_elapsed)
{
    // Threadblock load/store abstraction types
    typedef BlockRadixSort<
            Key,
            BLOCK_THREADS,
            ITEMS_PER_THREAD,
            Value,
            RADIX_BITS,
            MEMOIZE_OUTER_SCAN,
            INNER_SCAN_ALGORITHM,
            SMEM_CONFIG>
        BlockRadixSortT;

    // Allocate temp storage in shared memory
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    // Items per thread
    Key     keys[ITEMS_PER_THREAD];
    Value   values[ITEMS_PER_THREAD];

    LoadDirectBlocked(threadIdx.x, d_keys, keys);
    LoadDirectBlocked(threadIdx.x, d_values, values);

    // Start cycle timer
    clock_t stop;
    clock_t start = clock();

    TestBlockSort<BLOCK_THREADS, BlockRadixSortT>(
        temp_storage, keys, values, d_keys, d_values, begin_bit, end_bit, stop, Int2Type<DESCENDING>(), Int2Type<BLOCKED_OUTPUT>());

    // Store time
    if (threadIdx.x == 0)
        *d_elapsed = (start > stop) ? start - stop : stop - start;
}



//---------------------------------------------------------------------
// Host testing subroutines
//---------------------------------------------------------------------


/**
 * Simple key-value pairing
 */
template <
    typename Key,
    typename Value,
    bool IS_FLOAT = (Traits<Key>::CATEGORY == FLOATING_POINT)>
struct Pair
{
    Key     key;
    Value   value;

    bool operator<(const Pair &b) const
    {
        return (key < b.key);
    }
};

/**
 * Simple key-value pairing (specialized for floating point types)
 */
template <typename Key, typename Value>
struct Pair<Key, Value, true>
{
    Key     key;
    Value   value;

    bool operator<(const Pair &b) const
    {
        if (key < b.key)
            return true;

        if (key > b.key)
            return false;

        // Key in unsigned bits
        typedef typename Traits<Key>::UnsignedBits UnsignedBits;

        // Return true if key is negative zero and b.key is positive zero
        UnsignedBits key_bits   = *reinterpret_cast<UnsignedBits*>(const_cast<Key*>(&key));
        UnsignedBits b_key_bits = *reinterpret_cast<UnsignedBits*>(const_cast<Key*>(&b.key));
        UnsignedBits HIGH_BIT   = Traits<Key>::HIGH_BIT;

        return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0);
    }
};

xgboost/cub/test/test_block_radix_sort.cu view on Meta::CPAN

void Initialize(
    GenMode         gen_mode,
    Key             *h_keys,
    Value           *h_values,
    Key             *h_reference_keys,
    Value           *h_reference_values,
    int             num_items,
    int             entropy_reduction,
    int             begin_bit,
    int             end_bit)
{
    Pair<Key, Value> *h_pairs = new Pair<Key, Value>[num_items];

    for (int i = 0; i < num_items; ++i)
    {
        InitValue(gen_mode, h_keys[i], i);

        RandomBits(h_values[i]);

        // Mask off unwanted portions
        int num_bits = end_bit - begin_bit;
        if ((begin_bit > 0) || (end_bit < sizeof(Key) * 8))
        {
            unsigned long long base = 0;
            memcpy(&base, &h_keys[i], sizeof(Key));
            base &= ((1ull << num_bits) - 1) << begin_bit;
            memcpy(&h_keys[i], &base, sizeof(Key));
        }

        h_pairs[i].key    = h_keys[i];
        h_pairs[i].value  = h_values[i];
    }

    if (DESCENDING) std::reverse(h_pairs, h_pairs + num_items);
    std::stable_sort(h_pairs, h_pairs + num_items);
    if (DESCENDING) std::reverse(h_pairs, h_pairs + num_items);

    for (int i = 0; i < num_items; ++i)
    {
        h_reference_keys[i]     = h_pairs[i].key;
        h_reference_values[i]   = h_pairs[i].value;
    }

    delete[] h_pairs;
}




/**
 * Test BlockRadixSort kernel
 */
template <
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    int                     RADIX_BITS,
    bool                    MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
    cudaSharedMemConfig     SMEM_CONFIG,
    bool                    DESCENDING,
    bool                    BLOCKED_OUTPUT,
    typename                Key,
    typename                Value>
void TestDriver(
    GenMode                 gen_mode,
    int                     entropy_reduction,
    int                     begin_bit,
    int                     end_bit)
{
    enum
    {
        TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD,
        KEYS_ONLY = Equals<Value, NullType>::VALUE,
    };

    // Allocate host arrays
    Key     *h_keys             = new Key[TILE_SIZE];
    Key     *h_reference_keys   = new Key[TILE_SIZE];
    Value   *h_values           = new Value[TILE_SIZE];
    Value   *h_reference_values = new Value[TILE_SIZE];

    // Allocate device arrays
    Key     *d_keys     = NULL;
    Value   *d_values   = NULL;
    clock_t *d_elapsed  = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys, sizeof(Key) * TILE_SIZE));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * TILE_SIZE));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));

    // Initialize problem and solution on host
    Initialize<DESCENDING>(gen_mode, h_keys, h_values, h_reference_keys, h_reference_values,
        TILE_SIZE, entropy_reduction, begin_bit, end_bit);

    // Copy problem to device
    CubDebugExit(cudaMemcpy(d_keys, h_keys, sizeof(Key) * TILE_SIZE, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * TILE_SIZE, cudaMemcpyHostToDevice));

    printf("%s "
        "BLOCK_THREADS(%d) "
        "ITEMS_PER_THREAD(%d) "
        "RADIX_BITS(%d) "
        "MEMOIZE_OUTER_SCAN(%d) "
        "INNER_SCAN_ALGORITHM(%d) "
        "SMEM_CONFIG(%d) "
        "DESCENDING(%d) "
        "BLOCKED_OUTPUT(%d) "
        "sizeof(Key)(%d) "
        "sizeof(Value)(%d) "
        "gen_mode(%d), "
        "entropy_reduction(%d) "
        "begin_bit(%d) "
        "end_bit(%d), "
        "samples(%d)\n",
            ((KEYS_ONLY) ? "Keys-only" : "Key-value"),
            BLOCK_THREADS,
            ITEMS_PER_THREAD,
            RADIX_BITS,
            MEMOIZE_OUTER_SCAN,
            INNER_SCAN_ALGORITHM,
            SMEM_CONFIG,
            DESCENDING,
            BLOCKED_OUTPUT,
            (int) sizeof(Key),
            (int) sizeof(Value),
            gen_mode,
            entropy_reduction,
            begin_bit,
            end_bit,
            g_num_rand_samples);

    // Set shared memory config
    cudaDeviceSetSharedMemConfig(SMEM_CONFIG);

    // Run kernel
    Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT><<<1, BLOCK_THREADS>>>(
        d_keys, d_values, begin_bit, end_bit, d_elapsed);

    // Flush kernel output / errors
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());

    // Check keys results
    printf("\tKeys: ");
    int compare = CompareDeviceResults(h_reference_keys, d_keys, TILE_SIZE, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    // Check value results
    if (!KEYS_ONLY)
    {
        printf("\tValues: ");
        int compare = CompareDeviceResults(h_reference_values, d_values, TILE_SIZE, g_verbose, g_verbose);
        printf("%s\n", compare ? "FAIL" : "PASS");
        AssertEquals(0, compare);
    }
    printf("\n");

    printf("\tElapsed clocks: ");
    DisplayDeviceResults(d_elapsed, 1);
    printf("\n");

    // Cleanup
    if (h_keys)             delete[] h_keys;
    if (h_reference_keys)   delete[] h_reference_keys;
    if (h_values)           delete[] h_values;
    if (h_reference_values) delete[] h_reference_values;
    if (d_keys)             CubDebugExit(g_allocator.DeviceFree(d_keys));
    if (d_values)           CubDebugExit(g_allocator.DeviceFree(d_values));
    if (d_elapsed)          CubDebugExit(g_allocator.DeviceFree(d_elapsed));
}


/**
 * Test driver (valid tile size <= MAX_SMEM_BYTES)
 */
template <
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    int                     RADIX_BITS,
    bool                    MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
    cudaSharedMemConfig     SMEM_CONFIG,
    bool                    DESCENDING,
    bool                    BLOCKED_OUTPUT,
    typename                Key,
    typename                Value>
void TestValid(Int2Type<true> fits_smem_capacity)
{
    // Iterate begin_bit
    for (int begin_bit = 0; begin_bit <= 1; begin_bit++)
    {
        // Iterate end bit
        for (int end_bit = begin_bit + 1; end_bit <= sizeof(Key) * 8; end_bit = end_bit * 2 + begin_bit)
        {
            // Uniform key distribution
            TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
                UNIFORM, 0, begin_bit, end_bit);

            // Sequential key distribution
            TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
                INTEGER_SEED, 0, begin_bit, end_bit);

            // Iterate random with entropy_reduction
            for (int entropy_reduction = 0; entropy_reduction <= 9; entropy_reduction += 3)
            {
                TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
                    RANDOM, entropy_reduction, begin_bit, end_bit);
            }
        }
    }
}


/**
 * Test driver (invalid tile size)
 */
template <
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    int                     RADIX_BITS,
    bool                    MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
    cudaSharedMemConfig     SMEM_CONFIG,
    bool                    DESCENDING,
    bool                    BLOCKED_OUTPUT,
    typename                Key,
    typename                Value>
void TestValid(Int2Type<false> fits_smem_capacity)
{}


/**
 * Test ascending/descending and to-blocked/to-striped
 */
template <
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    int                     RADIX_BITS,
    bool                    MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
    cudaSharedMemConfig     SMEM_CONFIG,
    typename                Key,
    typename                Value>
void Test()
{
    // Check size of smem storage for the target arch to make sure it will fit
    typedef BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD, Value, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG> BlockRadixSortT;

#if defined(SM100) || defined(SM110) || defined(SM130)
    Int2Type<sizeof(typename BlockRadixSortT::TempStorage) <= 16 * 1024> fits_smem_capacity;
#else
    Int2Type<(sizeof(typename BlockRadixSortT::TempStorage) <= 48 * 1024)> fits_smem_capacity;
#endif

    // Sort-ascending, to-striped
    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, true, false, Key, Value>(fits_smem_capacity);

    // Sort-descending, to-blocked
    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, false, true, Key, Value>(fits_smem_capacity);

    // Not necessary
//    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, false, false, Key, Value>(fits_smem_capacity);
//    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, true, true, Key, Value>(fits_smem_capacity);
}


/**
 * Test value type and smem config
 */
template <
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    int                     RADIX_BITS,
    bool                    MEMOIZE_OUTER_SCAN,
    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
    typename                Key>
void TestKeys()
{
    // Test keys-only sorting with both smem configs
    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, NullType>();    // Keys-only (4-byte smem bank config)
#if !defined(SM100) && !defined(SM110) && !defined(SM130) && !defined(SM200)
    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeEightByte, Key, NullType>();   // Keys-only (8-byte smem bank config)
#endif
}

( run in 1.034 second using v1.01-cache-2.11-cpan-39bf76dae61 )