iterations results from the CPAN

Alien-XGBoost

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * An implementation of segmented reduction using a load-balanced parallelization
 * strategy based on the MergePath decision path.
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <iterator>
#include <vector>
#include <string>
#include <algorithm>
#include <stdio.h>

#include <cub/cub.cuh>

#include "test_util.h"

using namespace cub;
using namespace std;


/******************************************************************************
 * Globals, constants, and typedefs
 ******************************************************************************/

bool                    g_verbose           = false;
int                     g_timing_iterations = 1;
CachingDeviceAllocator  g_allocator(true);


/******************************************************************************
 * Utility routines
 ******************************************************************************/


/**
 * An pair of index offsets
 */
template <typename OffsetT>
struct IndexPair
{
    OffsetT a_idx;
    OffsetT b_idx;
};


/**
 * Computes the begin offsets into A and B for the specified
 * location (diagonal) along the merge decision path
 */
template <
    int                 BLOCK_THREADS,
    typename            IteratorA,
    typename            IteratorB,
    typename            OffsetT>
__device__ __forceinline__ void ParallelMergePathSearch(
    OffsetT             diagonal,
    IteratorA           a,
    IteratorB           b,
    IndexPair<OffsetT>  begin,          // Begin offsets into a and b
    IndexPair<OffsetT>  end,            // End offsets into a and b
    IndexPair<OffsetT>  &intersection)  // [out] Intersection offsets into a and b
{
    OffsetT a_split_min = CUB_MAX(diagonal - end.b_idx, begin.a_idx);
    OffsetT a_split_max = CUB_MIN(diagonal, end.a_idx);

    while (a_split_min < a_split_max)
    {
        OffsetT a_distance       = a_split_max - a_split_min;
        OffsetT a_slice          = (a_distance + BLOCK_THREADS - 1) >> Log2<BLOCK_THREADS>::VALUE;
        OffsetT a_split_pivot    = CUB_MIN(a_split_min + (threadIdx.x * a_slice), end.a_idx - 1);

        int move_up = (a[a_split_pivot] <= b[diagonal - a_split_pivot - 1]);
        int num_up = __syncthreads_count(move_up);
/*
        _CubLog("a_split_min(%d), a_split_max(%d) a_distance(%d), a_slice(%d), a_split_pivot(%d), move_up(%d), num_up(%d), a_begin(%d), a_end(%d)\n",
            a_split_min, a_split_max, a_distance, a_slice, a_split_pivot, move_up, num_up, a_begin, a_end);
*/
        a_split_max = CUB_MIN(num_up * a_slice, end.a_idx);
        a_split_min = CUB_MAX(a_split_max - a_slice, begin.a_idx) + 1;
    }

    intersection.a_idx = CUB_MIN(a_split_min, end.a_idx);
    intersection.b_idx = CUB_MIN(diagonal - a_split_min, end.b_idx);
}

/**

xgboost/cub/experimental/defunct/test_device_seg_reduce.cu view on Meta::CPAN

    Initialize(UNIFORM, h_values, segment_offsets, num_values, avg_segment_size);

    // Allocate simple offsets array and copy STL vector into it
    h_segment_offsets = new OffsetT[segment_offsets.size()];
    for (int i = 0; i < segment_offsets.size(); ++i)
        h_segment_offsets[i] = segment_offsets[i];

    OffsetT num_segments = segment_offsets.size() - 1;
    if (g_verbose)
    {
        printf("%d segment offsets: ", num_segments);
        for (int i = 0; i < num_segments; ++i)
            std::cout << h_segment_offsets[i] << "(" << h_segment_offsets[i + 1] - h_segment_offsets[i] << "), ";
        if (g_verbose) std::cout << std::endl << std::endl;
    }

    // Solve problem on host
    h_reference = new Value[num_segments];
    ComputeReference(h_values, h_segment_offsets, h_reference, num_segments, identity);

    printf("\n\n%s cub::DeviceSegReduce::%s %d items (%d-byte %s), %d segments (%d-byte offset indices)\n",
        (CDP) ? "CDP device invoked" : "Host-invoked",
        (Equals<ReductionOp, Sum>::VALUE) ? "Sum" : "Reduce",
        num_values, (int) sizeof(Value), type_string,
        num_segments, (int) sizeof(OffsetT));
    fflush(stdout);

    // Allocate and initialize problem on device
    Value   *d_values = NULL;
    OffsetT *d_segment_offsets = NULL;
    Value   *d_output = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * num_values));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_output, sizeof(Value) * num_segments));
    CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_values, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));

    // Request and allocate temporary storage
    void    *d_temp_storage = NULL;
    size_t  temp_storage_bytes = 0;
    CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Clear device output
    CubDebugExit(cudaMemset(d_output, 0, sizeof(Value) * num_segments));

    // Run warmup/correctness iteration
    CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, true));

    // Check for correctness (and display results, if specified)
    int compare = CompareDeviceResults(h_reference, d_output, num_segments, true, g_verbose);
    printf("\t%s", compare ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    for (int i = 0; i < g_timing_iterations; ++i)
    {
        CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
    }
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_values) / avg_millis / 1000.0 / 1000.0;
        float giga_bandwidth = giga_rate *
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }

    // Device cleanup
    if (d_values) CubDebugExit(g_allocator.DeviceFree(d_values));
    if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
    if (d_output) CubDebugExit(g_allocator.DeviceFree(d_output));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Host cleanup
    if (h_values)           delete[] h_values;
    if (h_segment_offsets)  delete[] h_segment_offsets;
    if (h_reference)        delete[] h_reference;
}


/**
 * Main
 */
int main(int argc, char** argv)
{
    int num_values          = 32 * 1024 * 1024;
    int avg_segment_size    = 500;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_values);
    args.GetCmdLineArgument("ss", avg_segment_size);
    args.GetCmdLineArgument("i", g_timing_iterations);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>] "
            "[--v] "
            "[--i=<timing iterations>] "
            "[--n=<input samples>]\n"
            "[--ss=<average segment size>]\n"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

    Test<false>((int) num_values, avg_segment_size, Sum(), (long long) 0, CUB_TYPE_STRING(long long));

    return 0;
}

( run in 0.849 second using v1.01-cache-2.11-cpan-71847e10f99 )