Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Test of DeviceHistogram utilities
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <limits>
#include <algorithm>
#include <typeinfo>
#if defined(QUICK_TEST) || defined(QUICKER_TEST)
#include <npp.h>
#endif
#include <cub/util_allocator.cuh>
#include <cub/device/device_histogram.cuh>
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
// Dispatch types
enum Backend
{
CUB, // CUB method
NPP, // NPP method
CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
};
bool g_verbose_input = false;
bool g_verbose = false;
int g_timing_iterations = 0;
int g_repeat = 0;
CachingDeviceAllocator g_allocator(true);
//---------------------------------------------------------------------
// Dispatch to NPP histogram
//---------------------------------------------------------------------
#if defined(QUICK_TEST) || defined(QUICKER_TEST)
/**
* Dispatch to single-channel 8b NPP histo-even
*/
template <typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
Int2Type<1> num_channels,
Int2Type<1> num_active_channels,
Int2Type<NPP> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
unsigned char *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists ...
CounterT *d_histogram[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_leve...
int num_levels[1], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT lower_level[1], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[1], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef unsigned char SampleT;
cudaError_t error = cudaSuccess;
NppiSize oSizeROI = {
num_row_pixels,
num_rows
};
if (d_temp_storage_bytes == NULL)
{
int nDeviceBufferSize;
nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, num_levels[0] ,&nDeviceBufferSize);
temp_storage_bytes = nDeviceBufferSize;
}
else
{
for (int i = 0; i < timing_timing_iterations; ++i)
{
// compute the histogram
nppiHistogramEven_8u_C1R(
d_samples,
row_stride_bytes,
oSizeROI,
d_histogram[0],
num_levels[0],
lower_level[0],
upper_level[0],
(Npp8u*) d_temp_storage);
}
}
return error;
}
/**
* Dispatch to 3/4 8b NPP histo-even
*/
template <typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
Int2Type<4> num_channels,
Int2Type<3> num_active_channels,
Int2Type<NPP> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
unsigned char *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists ...
CounterT *d_histogram[3], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_leve...
int num_levels[3], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT lower_level[3], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[3], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef unsigned char SampleT;
cudaError_t error = cudaSuccess;
NppiSize oSizeROI = {
num_row_pixels,
num_rows
};
if (d_temp_storage_bytes == NULL)
{
int nDeviceBufferSize;
nppiHistogramEvenGetBufferSize_8u_AC4R(oSizeROI, num_levels ,&nDeviceBufferSize);
temp_storage_bytes = nDeviceBufferSize;
}
else
{
for (int i = 0; i < timing_timing_iterations; ++i)
{
// compute the histogram
nppiHistogramEven_8u_AC4R(
d_samples,
row_stride_bytes,
oSizeROI,
d_histogram,
num_levels,
lower_level,
upper_level,
(Npp8u*) d_temp_storage);
}
}
return error;
}
#endif // #if defined(QUICK_TEST) || defined(QUICKER_TEST)
//---------------------------------------------------------------------
// Dispatch to different DeviceHistogram entrypoints
//---------------------------------------------------------------------
/**
* Dispatch to CUB single histogram-even entrypoint
*/
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
Int2Type<1> num_channels,
Int2Type<1> num_active_channels,
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
CounterT *d_histogram[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
int num_levels[1], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
LevelT lower_level[1], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[1], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceHistogram::HistogramEven(
d_temp_storage,
temp_storage_bytes,
(const SampleT *) d_samples,
d_histogram[0],
num_levels[0],
lower_level[0],
upper_level[0],
num_row_pixels,
num_rows,
row_stride_bytes,
stream,
debug_synchronous);
}
return error;
}
/**
* Dispatch to CUB multi histogram-even entrypoint
*/
template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
Int2Type<NUM_CHANNELS> num_channels,
Int2Type<NUM_ACTIVE_CHANNELS> num_active_channels,
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
CounterT *d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
(const SampleT *) d_samples,
d_histogram,
num_levels,
lower_level,
upper_level,
num_row_pixels,
num_rows,
row_stride_bytes,
stream,
debug_synchronous);
}
return error;
}
/**
* Dispatch to CUB single histogram-range entrypoint
*/
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchRange(
Int2Type<1> num_channels,
Int2Type<1> num_active_channels,
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
CounterT *d_histogram[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
int num_levels[1], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
LevelT *d_levels[1], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceHistogram::HistogramRange(
d_temp_storage,
temp_storage_bytes,
(const SampleT *) d_samples,
d_histogram[0],
num_levels[0],
d_levels[0],
num_row_pixels,
num_rows,
row_stride_bytes,
stream,
debug_synchronous);
}
return error;
}
/**
* Dispatch to CUB multi histogram-range entrypoint
*/
template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchRange(
Int2Type<NUM_CHANNELS> num_channels,
Int2Type<NUM_ACTIVE_CHANNELS> num_active_channels,
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
CounterT *d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
(const SampleT *) d_samples,
d_histogram,
num_levels,
d_levels,
num_row_pixels,
num_rows,
row_stride_bytes,
stream,
debug_synchronous);
}
return error;
}
//---------------------------------------------------------------------
// CUDA nested-parallelism test kernel
//---------------------------------------------------------------------
/**
* Simple wrapper kernel to invoke DeviceHistogram
* /
template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
__global__ void CnpDispatchKernel(
Int2Type<ALGORITHM> algorithm,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t temp_storage_bytes,
SampleT *d_samples,
SampleIteratorT d_sample_itr,
ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_out_histograms,
int num_samples,
bool debug_synchronous)
{
#ifndef CUB_CDP
*d_cdp_error = cudaErrorNotSupported;
#else
*d_cdp_error = Dispatch<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(algorithm, Int2Type<false>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_out_histograms.array, num_s...
*d_temp_storage_bytes = temp_storage_bytes;
#endif
}
/ **
* Dispatch to CDP kernel
* /
template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
cudaError_t Dispatch(
Int2Type<ALGORITHM> algorithm,
Int2Type<true> use_cdp,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleT *d_samples,
SampleIteratorT d_sample_itr,
CounterT *d_histograms[NUM_ACTIVE_CHANNELS],
int num_samples,
cudaStream_t stream,
bool debug_synchronous)
{
// Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_histo_wrapper;
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
// Invoke kernel to invoke device-side dispatch
CnpDispatchKernel<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, ALGORITHM><<<1,1>>>(algorithm, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_histo...
// Copy out temp_storage_bytes
CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
// Copy out error
cudaError_t retval;
CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
return retval;
}
*/
//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------
// Searches for bin given a list of bin-boundary levels
template <typename LevelT>
struct SearchTransform
{
LevelT *levels; // Pointer to levels array
int num_levels; // Number of levels in array
// Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
template <typename SampleT>
int operator()(SampleT sample)
{
int bin = int(std::upper_bound(levels, levels + num_levels, (LevelT) sample) - levels - 1);
if (bin < 0)
{
// Sample out of range
return num_levels;
}
return bin;
}
};
// Scales samples to evenly-spaced bins
template <typename LevelT>
struct ScaleTransform
{
int num_levels; // Number of levels in array
LevelT max; // Max sample level (exclusive)
LevelT min; // Min sample level (inclusive)
LevelT scale; // Bin scaling factor
void Init(
int num_levels, // Number of levels in array
LevelT max, // Max sample level (exclusive)
LevelT min, // Min sample level (inclusive)
LevelT scale) // Bin scaling factor
{
this->num_levels = num_levels;
this->max = max;
this->min = min;
this->scale = scale;
}
// Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
// Allocate CDP device arrays
size_t *d_temp_storage_bytes = NULL;
cudaError_t *d_cdp_error = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
DispatchEven(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Allocate temporary storage with "canary" zones
int canary_bytes = 256;
char canary_token = 8;
char* canary_zone = new char[canary_bytes];
memset(canary_zone, canary_token, canary_bytes);
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
// Run warmup/correctness iteration
DispatchEven(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Check canary zones
int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
AssertEquals(0, error);
error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
AssertEquals(0, error);
// Flush any stdout/stderr
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
fflush(stdout);
fflush(stderr);
// Check for correctness (and display results, if specified)
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
error |= channel_error;
}
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
DispatchEven(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
0, false);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * sizeof(SampleT);
printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
avg_millis,
giga_rate,
giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
giga_rate / NUM_CHANNELS,
giga_bandwidth);
}
printf("\n\n");
// Cleanup
if (h_samples) delete[] h_samples;
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
if (h_histogram[channel])
delete[] h_histogram[channel];
if (d_histogram[channel])
CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
}
if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Correctness asserts
AssertEquals(0, error);
}
/**
* Test histogram-range
*/
template <
Backend BACKEND,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleT,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestRange(
LevelT max_level,
int entropy_reduction,
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[...
LevelT* levels[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
{
OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
printf("\n----------------------------\n");
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
// Allocate CDP device arrays
size_t *d_temp_storage_bytes = NULL;
cudaError_t *d_cdp_error = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
DispatchRange(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, d_levels,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Allocate temporary storage with "canary" zones
int canary_bytes = 256;
char canary_token = 9;
char* canary_zone = new char[canary_bytes];
memset(canary_zone, canary_token, canary_bytes);
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
// Run warmup/correctness iteration
DispatchRange(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
d_samples, d_histogram, num_levels, d_levels,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Check canary zones
int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
AssertEquals(0, error);
error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
AssertEquals(0, error);
// Flush any stdout/stderr
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
fflush(stdout);
fflush(stderr);
// Check for correctness (and display results, if specified)
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
error |= channel_error;
}
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
DispatchRange(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, d_levels,
num_row_pixels, num_rows, row_stride_bytes,
0, false);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * sizeof(SampleT);
printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
avg_millis,
giga_rate,
giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
giga_rate / NUM_CHANNELS,
giga_bandwidth);
}
printf("\n\n");
// Cleanup
if (h_samples) delete[] h_samples;
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
if (h_histogram[channel])
delete[] h_histogram[channel];
if (d_histogram[channel])
CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
if (d_levels[channel])
CubDebugExit(g_allocator.DeviceFree(d_levels[channel]));
}
if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Correctness asserts
AssertEquals(0, error);
}
/**
* Test histogram-even
*/
template <
Backend BACKEND,
typename SampleT,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestEven(
OffsetT num_row_pixels,
OffsetT num_rows,
OffsetT row_stride_bytes,
int entropy_reduction,
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT max_level,
int max_num_levels)
{
LevelT lower_level[NUM_ACTIVE_CHANNELS];
LevelT upper_level[NUM_ACTIVE_CHANNELS];
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
*/
template <
typename SampleT,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestChannels(
LevelT max_level,
int max_num_levels,
Int2Type<true> is_valid_tag)
{
Test<SampleT, 1, 1, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
Test<SampleT, 4, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
Test<SampleT, 3, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
Test<SampleT, 4, 4, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
}
/**
* Test different channel interleavings (invalid specialiation)
*/
template <
typename SampleT,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestChannels(
LevelT max_level,
int max_num_levels,
Int2Type<false> is_valid_tag)
{}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
int main(int argc, char** argv)
{
int num_row_pixels = -1;
int entropy_reduction = 0;
int num_rows = 1;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
g_verbose_input = args.CheckCmdLineFlag("v2");
args.GetCmdLineArgument("n", num_row_pixels);
int row_stride_pixels = num_row_pixels;
args.GetCmdLineArgument("rows", num_rows);
args.GetCmdLineArgument("stride", row_stride_pixels);
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("repeat", g_repeat);
args.GetCmdLineArgument("entropy", entropy_reduction);
bool compare_npp = args.CheckCmdLineFlag("npp");
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--n=<pixels per row> "
"[--rows=<number of rows> "
"[--stride=<row stride in pixels> "
"[--i=<timing iterations> "
"[--device=<device-id>] "
"[--repeat=<repetitions of entire test suite>]"
"[--entropy=<entropy-reduction factor (default 0)>]"
"[--v] "
"[--cdp]"
"[--npp]"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// Get ptx version
int ptx_version;
CubDebugExit(PtxVersion(ptx_version));
if (num_row_pixels < 0)
{
num_row_pixels = 1920 * 1080;
row_stride_pixels = num_row_pixels;
}
#if defined(QUICKER_TEST)
// Compile/run quick tests
{
// HistogramEven: unsigned char 256 bins
typedef unsigned char SampleT;
typedef int LevelT;
LevelT max_level = 256;
int num_levels[1] = {257};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
if (compare_npp)
TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
#elif defined(QUICK_TEST)
// Compile/run quick tests
{
// HistogramEven: unsigned char 256 bins
typedef unsigned char SampleT;
typedef int LevelT;
LevelT max_level = 256;
int num_levels[1] = {257};
int row_stride_bytes = sizeof(SampleT) * row_stride_pixels * 1;
TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
if (compare_npp)
TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
}
{
// HistogramEven: 4/4 multichannel Unsigned char 256 bins
( run in 0.742 second using v1.01-cache-2.11-cpan-71847e10f99 )