Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
if (g_verbose_input) printf(" (%d)", bin); fflush(stdout);
if ((bin >= 0) && (bin < num_levels[channel] - 1))
{
// valid bin
h_histogram[channel][bin]++;
}
}
if (g_verbose_input) printf("]");
}
if (g_verbose_input) printf("\n\n");
}
printf("Done\n"); fflush(stdout);
}
/**
* Test histogram-even
*/
template <
Backend BACKEND,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleT,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestEven(
LevelT max_level,
int entropy_reduction,
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[...
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
{
OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
printf("\n----------------------------\n");
printf("%s cub::DeviceHistogramEven %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
(BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
(int) (num_row_pixels * num_rows),
(int) num_rows,
(int) num_row_pixels,
(int) row_stride_bytes,
(int) total_samples,
(int) sizeof(SampleT),
typeid(SampleT).name(),
entropy_reduction,
typeid(CounterT).name(),
NUM_ACTIVE_CHANNELS,
NUM_CHANNELS);
std::cout << CoutCast(max_level) << "\n";
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
std::cout << "\n\tChannel " << channel << ": " << num_levels[channel] - 1 << " bins [" << lower_level[channel] << ", " << upper_level[channel] << ")\n";
fflush(stdout);
// Allocate and initialize host and device data
typedef SampleT Foo; // rename type to quelch gcc warnings (bug?)
SampleT* h_samples = new Foo[total_samples];
CounterT* h_histogram[NUM_ACTIVE_CHANNELS];
ScaleTransform<LevelT> transform_op[NUM_ACTIVE_CHANNELS];
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
int bins = num_levels[channel] - 1;
h_histogram[channel] = new CounterT[bins];
transform_op[channel].Init(
num_levels[channel],
upper_level[channel],
lower_level[channel],
((upper_level[channel] - lower_level[channel]) / bins));
}
Initialize<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
max_level, entropy_reduction, h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
// Allocate and initialize device data
SampleT* d_samples = NULL;
CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1)));
CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1)));
}
// Allocate CDP device arrays
size_t *d_temp_storage_bytes = NULL;
cudaError_t *d_cdp_error = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
DispatchEven(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Allocate temporary storage with "canary" zones
int canary_bytes = 256;
char canary_token = 8;
char* canary_zone = new char[canary_bytes];
memset(canary_zone, canary_token, canary_bytes);
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
// Run warmup/correctness iteration
DispatchEven(
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Correctness asserts
AssertEquals(0, error);
}
/**
* Test histogram-range
*/
template <
Backend BACKEND,
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleT,
typename CounterT,
typename LevelT,
typename OffsetT>
void TestRange(
LevelT max_level,
int entropy_reduction,
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[...
LevelT* levels[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest
{
OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
printf("\n----------------------------\n");
printf("%s cub::DeviceHistogramRange %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
(BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
(int) (num_row_pixels * num_rows),
(int) num_rows,
(int) num_row_pixels,
(int) row_stride_bytes,
(int) total_samples,
(int) sizeof(SampleT),
typeid(SampleT).name(),
entropy_reduction,
typeid(CounterT).name(),
NUM_ACTIVE_CHANNELS,
NUM_CHANNELS);
std::cout << CoutCast(max_level) << "\n";
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
printf("Channel %d: %d bins [", channel, num_levels[channel] - 1);
std::cout << levels[channel][0];
for (int level = 1; level < num_levels[channel]; ++level)
std::cout << ", " << levels[channel][level];
printf("]\n");
}
fflush(stdout);
// Allocate and initialize host and device data
typedef SampleT Foo; // rename type to quelch gcc warnings (bug?)
SampleT* h_samples = new Foo[total_samples];
CounterT* h_histogram[NUM_ACTIVE_CHANNELS];
SearchTransform<LevelT> transform_op[NUM_ACTIVE_CHANNELS];
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
transform_op[channel].levels = levels[channel];
transform_op[channel].num_levels = num_levels[channel];
int bins = num_levels[channel] - 1;
h_histogram[channel] = new CounterT[bins];
}
Initialize<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
max_level, entropy_reduction, h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
// Allocate and initialize device data
SampleT* d_samples = NULL;
LevelT* d_levels[NUM_ACTIVE_CHANNELS];
CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
{
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel]));
CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel], sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice));
int bins = num_levels[channel] - 1;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * bins));
CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * bins));
}
// Allocate CDP device arrays
size_t *d_temp_storage_bytes = NULL;
cudaError_t *d_cdp_error = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
DispatchRange(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, d_levels,
num_row_pixels, num_rows, row_stride_bytes,
0, true);
// Allocate temporary storage with "canary" zones
int canary_bytes = 256;
char canary_token = 9;
char* canary_zone = new char[canary_bytes];
memset(canary_zone, canary_token, canary_bytes);
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
( run in 0.878 second using v1.01-cache-2.11-cpan-df04353d9ac )