Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/test/test_device_reduce.cu  view on Meta::CPAN

    typename                ReductionOpT>
void Test(
    BackendT                backend,
    DeviceInputIteratorT    d_in,
    int                     num_items,
    int                     num_segments,
    int                     *d_segment_offsets,
    ReductionOpT            reduction_op,
    HostReferenceIteratorT  h_reference)
{
    // Input and output data types
    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type     InputT;
    typedef typename std::iterator_traits<HostReferenceIteratorT>::value_type   OutputT;

    // Allocate CUB_CDP device arrays for temp storage size and error
    OutputT         *d_out = NULL;
    size_t          *d_temp_storage_bytes = NULL;
    cudaError_t     *d_cdp_error = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out,                 sizeof(OutputT) * num_segments));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));

    // Inquire temp device storage
    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;
    CubDebugExit(Dispatch(backend, 1,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, num_segments, d_segment_offsets,
        reduction_op, 0, true));

    // Allocate temp device storage
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Clear device output
    CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments));

    // Run once with discard iterator
    DiscardOutputIterator<int> discard_itr;
    CubDebugExit(Dispatch(backend, 1,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, discard_itr, num_items, num_segments, d_segment_offsets,
        reduction_op, 0, true));

    // Run warmup/correctness iteration
    CubDebugExit(Dispatch(backend, 1,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, num_segments, d_segment_offsets,
        reduction_op, 0, true));

    // Check for correctness (and display results, if specified)
    int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose);
    printf("\t%s", compare ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    if (g_timing_iterations > 0)
    {
        GpuTimer gpu_timer;
        gpu_timer.Start();

        CubDebugExit(Dispatch(backend, g_timing_iterations,
            d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, num_segments, d_segment_offsets,
            reduction_op, 0, false));

        gpu_timer.Stop();
        float elapsed_millis = gpu_timer.ElapsedMillis();

        // Display performance
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(InputT);
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }

    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, compare);
}


/// Test DeviceReduce
template <
    Backend                 BACKEND,
    typename                OutputValueT,
    typename                HostInputIteratorT,
    typename                DeviceInputIteratorT,
    typename                ReductionOpT>
void SolveAndTest(
    HostInputIteratorT      h_in,
    DeviceInputIteratorT    d_in,
    int                     num_items,
    int                     num_segments,
    int                     *h_segment_offsets,
    int                     *d_segment_offsets,
    ReductionOpT            reduction_op)
{
    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type     InputValueT;
    typedef Solution<ReductionOpT, InputValueT, OutputValueT>                   SolutionT;
    typedef typename SolutionT::OutputT                                         OutputT;

    printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n",
        (BACKEND == CUB_CDP) ? "CUB_CDP" : (BACKEND == THRUST) ? "Thrust" : (BACKEND == CUB_SEGMENTED) ? "CUB_SEGMENTED" : "CUB",
        typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments);
    fflush(stdout);

    // Allocate and solve solution
    OutputT *h_reference = new OutputT[num_segments];
    SolutionT::Solve(h_in, h_reference, num_segments, h_segment_offsets, reduction_op);

    // Run test
    Test(Int2Type<BACKEND>(), d_in, num_items, num_segments, d_segment_offsets, reduction_op, h_reference);

    // Cleanup
    if (h_reference) delete[] h_reference;
}


/// Test specific problem type
template <
    Backend         BACKEND,
    typename        InputT,
    typename        OutputT,



( run in 1.762 second using v1.01-cache-2.11-cpan-524268b4103 )