Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/test/test_device_histogram.cu  view on Meta::CPAN

 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Test of DeviceHistogram utilities
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <stdio.h>
#include <limits>
#include <algorithm>
#include <typeinfo>

#if defined(QUICK_TEST) || defined(QUICKER_TEST)
    #include <npp.h>
#endif

#include <cub/util_allocator.cuh>
#include <cub/device/device_histogram.cuh>

#include "test_util.h"

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------


// Dispatch types
enum Backend
{
    CUB,        // CUB method
    NPP,        // NPP method
    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
};


bool                    g_verbose_input     = false;
bool                    g_verbose           = false;
int                     g_timing_iterations = 0;
int                     g_repeat            = 0;
CachingDeviceAllocator  g_allocator(true);




//---------------------------------------------------------------------
// Dispatch to NPP histogram
//---------------------------------------------------------------------

#if defined(QUICK_TEST) || defined(QUICKER_TEST)

/**
 * Dispatch to single-channel 8b NPP histo-even
 */
template <typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
    Int2Type<1>             num_channels,
    Int2Type<1>             num_active_channels,
    Int2Type<NPP>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    unsigned char       *d_samples,               ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists ...
    CounterT            *d_histogram[1],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_leve...
    int                 num_levels[1],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
    LevelT              lower_level[1],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
    LevelT              upper_level[1],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
    OffsetT             num_row_pixels,           ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT             num_rows,                 ///< [in] The number of rows in the region of interest
    OffsetT             row_stride_bytes,         ///< [in] The number of bytes between starts of consecutive rows in the region of interest
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    typedef unsigned char SampleT;

    cudaError_t error = cudaSuccess;
    NppiSize oSizeROI = {
        num_row_pixels,
        num_rows
    };

    if (d_temp_storage_bytes == NULL)
    {
        int nDeviceBufferSize;
        nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, num_levels[0] ,&nDeviceBufferSize);
        temp_storage_bytes = nDeviceBufferSize;
    }
    else
    {
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            // compute the histogram
            nppiHistogramEven_8u_C1R(
                d_samples,
                row_stride_bytes,
                oSizeROI,
                d_histogram[0],
                num_levels[0],
                lower_level[0],
                upper_level[0],
                (Npp8u*) d_temp_storage);
        }
    }

    return error;
}


/**
 * Dispatch to 3/4 8b NPP histo-even
 */
template <typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
    Int2Type<4>          num_channels,
    Int2Type<3>   num_active_channels,
    Int2Type<NPP>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    unsigned char       *d_samples,               ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists ...
    CounterT            *d_histogram[3],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_leve...
    int                 num_levels[3],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
    LevelT              lower_level[3],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
    LevelT              upper_level[3],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
    OffsetT             num_row_pixels,           ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT             num_rows,                 ///< [in] The number of rows in the region of interest
    OffsetT             row_stride_bytes,         ///< [in] The number of bytes between starts of consecutive rows in the region of interest
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    typedef unsigned char SampleT;

    cudaError_t error = cudaSuccess;
    NppiSize oSizeROI = {
        num_row_pixels,
        num_rows
    };

    if (d_temp_storage_bytes == NULL)
    {
        int nDeviceBufferSize;
        nppiHistogramEvenGetBufferSize_8u_AC4R(oSizeROI, num_levels ,&nDeviceBufferSize);
        temp_storage_bytes = nDeviceBufferSize;
    }
    else
    {
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            // compute the histogram
            nppiHistogramEven_8u_AC4R(
                d_samples,
                row_stride_bytes,
                oSizeROI,
                d_histogram,
                num_levels,
                lower_level,
                upper_level,
                (Npp8u*) d_temp_storage);
        }
    }

    return error;
}


#endif // #if defined(QUICK_TEST) || defined(QUICKER_TEST)


//---------------------------------------------------------------------
// Dispatch to different DeviceHistogram entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to CUB single histogram-even entrypoint
 */
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
    Int2Type<1>             num_channels,
    Int2Type<1>             num_active_channels,
    Int2Type<CUB>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
    CounterT            *d_histogram[1],                            ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
    int                 num_levels[1],                              ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
    LevelT              lower_level[1],                             ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
    LevelT              upper_level[1],                             ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceHistogram::HistogramEven(
            d_temp_storage,
            temp_storage_bytes,
            (const SampleT *) d_samples,
            d_histogram[0],
            num_levels[0],
            lower_level[0],
            upper_level[0],
            num_row_pixels,
            num_rows,
            row_stride_bytes,
            stream,
            debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to CUB multi histogram-even entrypoint
 */
template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
    Int2Type<NUM_CHANNELS>          num_channels,
    Int2Type<NUM_ACTIVE_CHANNELS>   num_active_channels,
    Int2Type<CUB>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
    CounterT            *d_histogram[NUM_ACTIVE_CHANNELS],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
    int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
    LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
    LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
            d_temp_storage,
            temp_storage_bytes,
            (const SampleT *) d_samples,
            d_histogram,
            num_levels,
            lower_level,
            upper_level,
            num_row_pixels,
            num_rows,
            row_stride_bytes,
            stream,
            debug_synchronous);
    }
    return error;
}


/**
 * Dispatch to CUB single histogram-range entrypoint
 */
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchRange(
    Int2Type<1>             num_channels,
    Int2Type<1>             num_active_channels,
    Int2Type<CUB>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
    CounterT            *d_histogram[1],                            ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
    int                 num_levels[1],                              ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
    LevelT              *d_levels[1],                               ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceHistogram::HistogramRange(
            d_temp_storage,
            temp_storage_bytes,
            (const SampleT *) d_samples,
            d_histogram[0],
            num_levels[0],
            d_levels[0],
            num_row_pixels,
            num_rows,
            row_stride_bytes,
            stream,
            debug_synchronous);
    }
    return error;
}


/**
 * Dispatch to CUB multi histogram-range entrypoint
 */
template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchRange(
    Int2Type<NUM_CHANNELS>          num_channels,
    Int2Type<NUM_ACTIVE_CHANNELS>   num_active_channels,
    Int2Type<CUB>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
    CounterT            *d_histogram[NUM_ACTIVE_CHANNELS],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
    int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
    LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
            d_temp_storage,
            temp_storage_bytes,
            (const SampleT *) d_samples,
            d_histogram,
            num_levels,
            d_levels,
            num_row_pixels,
            num_rows,
            row_stride_bytes,
            stream,
            debug_synchronous);
    }
    return error;
}



//---------------------------------------------------------------------
// CUDA nested-parallelism test kernel
//---------------------------------------------------------------------

/**
 * Simple wrapper kernel to invoke DeviceHistogram
 * /
template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
__global__ void CnpDispatchKernel(
    Int2Type<ALGORITHM> algorithm,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t              temp_storage_bytes,
    SampleT             *d_samples,
    SampleIteratorT      d_sample_itr,
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_out_histograms,
    int                 num_samples,
    bool                debug_synchronous)
{
#ifndef CUB_CDP
    *d_cdp_error = cudaErrorNotSupported;
#else
    *d_cdp_error = Dispatch<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(algorithm, Int2Type<false>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_out_histograms.array, num_s...
    *d_temp_storage_bytes = temp_storage_bytes;
#endif
}


/ **
 * Dispatch to CDP kernel
 * /
template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
cudaError_t Dispatch(
    Int2Type<ALGORITHM> algorithm,
    Int2Type<true>      use_cdp,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    SampleT             *d_samples,
    SampleIteratorT      d_sample_itr,
    CounterT        *d_histograms[NUM_ACTIVE_CHANNELS],
    int                 num_samples,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_histo_wrapper;
    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];

    // Invoke kernel to invoke device-side dispatch
    CnpDispatchKernel<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, ALGORITHM><<<1,1>>>(algorithm, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_histo...

    // Copy out temp_storage_bytes
    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

    // Copy out error
    cudaError_t retval;
    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
    return retval;
}
*/


//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------

// Searches for bin given a list of bin-boundary levels
template <typename LevelT>
struct SearchTransform
{
    LevelT          *levels;      // Pointer to levels array
    int             num_levels;   // Number of levels in array

    // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
    template <typename SampleT>
    int operator()(SampleT sample)
    {
        int bin = int(std::upper_bound(levels, levels + num_levels, (LevelT) sample) - levels - 1);
        if (bin < 0)
        {
            // Sample out of range
            return num_levels;
        }
        return bin;
    }
};


// Scales samples to evenly-spaced bins
template <typename LevelT>
struct ScaleTransform
{
    int    num_levels;  // Number of levels in array
    LevelT max;         // Max sample level (exclusive)
    LevelT min;         // Min sample level (inclusive)
    LevelT scale;       // Bin scaling factor

    void Init(
        int    num_levels,  // Number of levels in array
        LevelT max,         // Max sample level (exclusive)
        LevelT min,         // Min sample level (inclusive)
        LevelT scale)       // Bin scaling factor
    {
        this->num_levels = num_levels;
        this->max = max;
        this->min = min;
        this->scale = scale;
    }

    // Functor for converting samples to bin-ids  (num_levels is returned if sample is out of range)

xgboost/cub/test/test_device_histogram.cu  view on Meta::CPAN


    // Allocate CDP device arrays
    size_t          *d_temp_storage_bytes = NULL;
    cudaError_t     *d_cdp_error = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));

    // Allocate temporary storage
    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;

    DispatchEven(
        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes,
        0, true);

    // Allocate temporary storage with "canary" zones
    int 	canary_bytes 	= 256;
    char 	canary_token 	= 8;
    char* 	canary_zone 	= new char[canary_bytes];

    memset(canary_zone, canary_token, canary_bytes);
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));

    // Run warmup/correctness iteration
    DispatchEven(
        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes,
        0, true);

    // Check canary zones
    int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
    AssertEquals(0, error);
    error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
    AssertEquals(0, error);

    // Flush any stdout/stderr
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());
    fflush(stdout);
    fflush(stderr);

    // Check for correctness (and display results, if specified)
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
        error |= channel_error;
    }

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();

    DispatchEven(
        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes,
        0, false);

    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(SampleT);
        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
            avg_millis,
            giga_rate,
            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
            giga_rate / NUM_CHANNELS,
            giga_bandwidth);
    }

    printf("\n\n");

    // Cleanup
    if (h_samples) delete[] h_samples;

    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        if (h_histogram[channel])
            delete[] h_histogram[channel];

        if (d_histogram[channel])
            CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
    }

    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, error);
}





/**
 * Test histogram-range
 */
template <
    Backend         BACKEND,
    int             NUM_CHANNELS,
    int             NUM_ACTIVE_CHANNELS,
    typename        SampleT,
    typename        CounterT,
    typename        LevelT,
    typename        OffsetT>
void TestRange(
    LevelT          max_level,
    int             entropy_reduction,
    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[...
    LevelT*         levels[NUM_ACTIVE_CHANNELS],                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
{
    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));

    printf("\n----------------------------\n");

xgboost/cub/test/test_device_histogram.cu  view on Meta::CPAN

    // Allocate CDP device arrays
    size_t          *d_temp_storage_bytes = NULL;
    cudaError_t     *d_cdp_error = NULL;

    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));

    // Allocate temporary storage
    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;

    DispatchRange(
        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, d_levels,
        num_row_pixels, num_rows, row_stride_bytes,
        0, true);

    // Allocate temporary storage with "canary" zones
    int     canary_bytes    = 256;
    char    canary_token    = 9;
    char*   canary_zone     = new char[canary_bytes];

    memset(canary_zone, canary_token, canary_bytes);
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));

    // Run warmup/correctness iteration
    DispatchRange(
        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
        d_samples, d_histogram, num_levels, d_levels,
        num_row_pixels, num_rows, row_stride_bytes,
        0, true);

    // Check canary zones
    int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
    AssertEquals(0, error);
    error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
    AssertEquals(0, error);

    // Flush any stdout/stderr
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());
    fflush(stdout);
    fflush(stderr);

    // Check for correctness (and display results, if specified)
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
        error |= channel_error;
    }

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();

    DispatchRange(
        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, d_levels,
        num_row_pixels, num_rows, row_stride_bytes,
        0, false);

    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(SampleT);
        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
            avg_millis,
            giga_rate,
            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
            giga_rate / NUM_CHANNELS,
            giga_bandwidth);
    }

    printf("\n\n");

    // Cleanup
    if (h_samples) delete[] h_samples;

    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        if (h_histogram[channel])
            delete[] h_histogram[channel];

        if (d_histogram[channel])
            CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));

        if (d_levels[channel])
            CubDebugExit(g_allocator.DeviceFree(d_levels[channel]));
    }

    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, error);
}


/**
 * Test histogram-even
 */
template <
    Backend         BACKEND,
    typename        SampleT,
    int             NUM_CHANNELS,
    int             NUM_ACTIVE_CHANNELS,
    typename        CounterT,
    typename        LevelT,
    typename        OffsetT>
void TestEven(
    OffsetT         num_row_pixels,
    OffsetT         num_rows,
    OffsetT         row_stride_bytes,
    int             entropy_reduction,
    int             num_levels[NUM_ACTIVE_CHANNELS],
    LevelT          max_level,
    int             max_num_levels)
{
    LevelT lower_level[NUM_ACTIVE_CHANNELS];
    LevelT upper_level[NUM_ACTIVE_CHANNELS];

xgboost/cub/test/test_device_histogram.cu  view on Meta::CPAN

 */
template <
    typename        SampleT,
    typename        CounterT,
    typename        LevelT,
    typename        OffsetT>
void TestChannels(
    LevelT          max_level,
    int             max_num_levels,
    Int2Type<true>  is_valid_tag)
{
    Test<SampleT, 1, 1, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
    Test<SampleT, 4, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
    Test<SampleT, 3, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
    Test<SampleT, 4, 4, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
}


/**
 * Test different channel interleavings (invalid specialiation)
 */
template <
    typename        SampleT,
    typename        CounterT,
    typename        LevelT,
    typename        OffsetT>
void TestChannels(
    LevelT          max_level,
    int             max_num_levels,
    Int2Type<false> is_valid_tag)
{}



//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------




/**
 * Main
 */
int main(int argc, char** argv)
{
    int num_row_pixels = -1;
    int entropy_reduction = 0;
    int num_rows = 1;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    g_verbose_input = args.CheckCmdLineFlag("v2");
    args.GetCmdLineArgument("n", num_row_pixels);

    int row_stride_pixels = num_row_pixels;

    args.GetCmdLineArgument("rows", num_rows);
    args.GetCmdLineArgument("stride", row_stride_pixels);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);
    args.GetCmdLineArgument("entropy", entropy_reduction);

    bool compare_npp = args.CheckCmdLineFlag("npp");


    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<pixels per row> "
            "[--rows=<number of rows> "
            "[--stride=<row stride in pixels> "
            "[--i=<timing iterations> "
            "[--device=<device-id>] "
            "[--repeat=<repetitions of entire test suite>]"
            "[--entropy=<entropy-reduction factor (default 0)>]"
            "[--v] "
            "[--cdp]"
            "[--npp]"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Get ptx version
    int ptx_version;
    CubDebugExit(PtxVersion(ptx_version));

    if (num_row_pixels < 0)
    {
        num_row_pixels      = 1920 * 1080;
        row_stride_pixels   = num_row_pixels;
    }

#if defined(QUICKER_TEST)

    // Compile/run quick tests
    {
        // HistogramEven: unsigned char 256 bins
        typedef unsigned char       SampleT;
        typedef int                 LevelT;

        LevelT  max_level           = 256;
        int     num_levels[1]       = {257};
        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;

        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
        if (compare_npp)
            TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
    }


#elif defined(QUICK_TEST)

    // Compile/run quick tests
    {
        // HistogramEven: unsigned char 256 bins
        typedef unsigned char       SampleT;
        typedef int                 LevelT;

        LevelT  max_level           = 256;
        int     num_levels[1]       = {257};
        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;

        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
        if (compare_npp)
            TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
    }

    {
        // HistogramEven: 4/4 multichannel Unsigned char 256 bins



( run in 0.742 second using v1.01-cache-2.11-cpan-71847e10f99 )