Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/experimental/histogram_compare.cu view on Meta::CPAN
{
unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);
for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
}
// Decode uchar1 pixel into bins
template <int NUM_BINS, int ACTIVE_CHANNELS>
void DecodePixelGold(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
{
bins[0] = (unsigned int) pixel.x;
}
// Compute reference histogram. Specialized for uchar4
template <
int ACTIVE_CHANNELS,
int NUM_BINS,
typename PixelType>
void HistogramGold(PixelType *image, int width, int height, unsigned int* hist)
{
memset(hist, 0, ACTIVE_CHANNELS * NUM_BINS * sizeof(unsigned int));
for (int i = 0; i < width; i++)
{
for (int j = 0; j < height; j++)
{
PixelType pixel = image[i + j * width];
unsigned int bins[ACTIVE_CHANNELS];
DecodePixelGold<NUM_BINS>(pixel, bins);
for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
{
hist[(NUM_BINS * CHANNEL) + bins[CHANNEL]]++;
}
}
}
}
//---------------------------------------------------------------------
// Test execution
//---------------------------------------------------------------------
/**
* Run a specific histogram implementation
*/
template <
int ACTIVE_CHANNELS,
int NUM_BINS,
typename PixelType>
void RunTest(
std::vector<std::pair<std::string, double> >& timings,
PixelType* d_pixels,
const int width,
const int height,
unsigned int * d_hist,
unsigned int * h_hist,
int timing_iterations,
const char * long_name,
const char * short_name,
double (*f)(PixelType*, int, int, unsigned int*, bool))
{
if (!g_report) printf("%s ", long_name); fflush(stdout);
// Run single test to verify (and code cache)
(*f)(d_pixels, width, height, d_hist, !g_report);
int compare = CompareDeviceResults(h_hist, d_hist, ACTIVE_CHANNELS * NUM_BINS, true, g_verbose);
if (!g_report) printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
double elapsed_ms = 0;
for (int i = 0; i < timing_iterations; i++)
{
elapsed_ms += (*f)(d_pixels, width, height, d_hist, false);
}
double avg_us = (elapsed_ms / timing_iterations) * 1000; // average in us
timings.push_back(std::pair<std::string, double>(short_name, avg_us));
if (!g_report)
{
printf("Avg time %.3f us (%d iterations)\n", avg_us, timing_iterations); fflush(stdout);
}
else
{
printf("%.3f, ", avg_us); fflush(stdout);
}
AssertEquals(0, compare);
}
/**
* Evaluate corpus of histogram implementations
*/
template <
int NUM_CHANNELS,
int ACTIVE_CHANNELS,
int NUM_BINS,
typename PixelType>
void TestMethods(
PixelType* h_pixels,
int height,
int width,
int timing_iterations,
double bandwidth_GBs)
{
// Copy data to gpu
PixelType* d_pixels;
size_t pixel_bytes = width * height * sizeof(PixelType);
CubDebugExit(g_allocator.DeviceAllocate((void**) &d_pixels, pixel_bytes));
CubDebugExit(cudaMemcpy(d_pixels, h_pixels, pixel_bytes, cudaMemcpyHostToDevice));
if (g_report) printf("%.3f, ", double(pixel_bytes) / bandwidth_GBs / 1000);
// Allocate results arrays on cpu/gpu
unsigned int *h_hist;
unsigned int *d_hist;
size_t histogram_bytes = NUM_BINS * ACTIVE_CHANNELS * sizeof(unsigned int);
h_hist = (unsigned int *) malloc(histogram_bytes);
g_allocator.DeviceAllocate((void **) &d_hist, histogram_bytes);
// Compute reference cpu histogram
HistogramGold<ACTIVE_CHANNELS, NUM_BINS>(h_pixels, width, height, h_hist);
// Store timings
std::vector<std::pair<std::string, double> > timings;
// Run experiments
RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
"CUB", "CUB", run_cub_histogram<NUM_CHANNELS, ACTIVE_CHANNELS, NUM_BINS, PixelType>);
RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
"Shared memory atomics", "smem atomics", run_smem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
"Global memory atomics", "gmem atomics", run_gmem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
// Report timings
if (!g_report)
{
std::sort(timings.begin(), timings.end(), less_than_value());
printf("Timings (us):\n");
for (int i = 0; i < timings.size(); i++)
{
double bandwidth = height * width * sizeof(PixelType) / timings[i].second / 1000;
printf("\t %.3f %s (%.3f GB/s, %.3f%% peak)\n", timings[i].second, timings[i].first.c_str(), bandwidth, bandwidth / bandwidth_GBs * 100);
}
printf("\n");
}
// Free data
CubDebugExit(g_allocator.DeviceFree(d_pixels));
CubDebugExit(g_allocator.DeviceFree(d_hist));
free(h_hist);
}
/**
* Test different problem genres
*/
void TestGenres(
uchar4* uchar4_pixels,
int height,
int width,
int timing_iterations,
double bandwidth_GBs)
{
int num_pixels = width * height;
{
if (!g_report) printf("1 channel uchar1 tests (256-bin):\n\n"); fflush(stdout);
size_t image_bytes = num_pixels * sizeof(uchar1);
uchar1* uchar1_pixels = (uchar1*) malloc(image_bytes);
// Convert to 1-channel (averaging first 3 channels)
for (int i = 0; i < num_pixels; ++i)
{
uchar1_pixels[i].x = (unsigned char)
(((unsigned int) uchar4_pixels[i].x +
(unsigned int) uchar4_pixels[i].y +
(unsigned int) uchar4_pixels[i].z) / 3);
}
TestMethods<1, 1, 256>(uchar1_pixels, width, height, timing_iterations, bandwidth_GBs);
free(uchar1_pixels);
if (g_report) printf(", ");
}
{
if (!g_report) printf("3/4 channel uchar4 tests (256-bin):\n\n"); fflush(stdout);
TestMethods<4, 3, 256>(uchar4_pixels, width, height, timing_iterations, bandwidth_GBs);
if (g_report) printf(", ");
}
{
if (!g_report) printf("3/4 channel float4 tests (256-bin):\n\n"); fflush(stdout);
size_t image_bytes = num_pixels * sizeof(float4);
float4* float4_pixels = (float4*) malloc(image_bytes);
// Convert to float4 with range [0.0, 1.0)
for (int i = 0; i < num_pixels; ++i)
{
float4_pixels[i].x = float(uchar4_pixels[i].x) / 256;
float4_pixels[i].y = float(uchar4_pixels[i].y) / 256;
float4_pixels[i].z = float(uchar4_pixels[i].z) / 256;
float4_pixels[i].w = float(uchar4_pixels[i].w) / 256;
}
TestMethods<4, 3, 256>(float4_pixels, width, height, timing_iterations, bandwidth_GBs);
free(float4_pixels);
if (g_report) printf("\n");
}
}
/**
* Main
*/
int main(int argc, char **argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
if (args.CheckCmdLineFlag("help"))
{
printf(
"%s "
"[--device=<device-id>] "
"[--v] "
"[--i=<timing iterations>] "
"\n\t"
"--file=<.tga filename> "
"\n\t"
"--entropy=<-1 (0%), 0 (100%), 1 (81%), 2 (54%), 3 (34%), 4 (20%), ..."
"[--height=<default: 1080>] "
"[--width=<default: 1920>] "
"\n", argv[0]);
exit(0);
}
std::string filename;
int timing_iterations = 100;
int entropy_reduction = 0;
int height = 1080;
int width = 1920;
g_verbose = args.CheckCmdLineFlag("v");
g_report = args.CheckCmdLineFlag("report");
args.GetCmdLineArgument("i", timing_iterations);
args.GetCmdLineArgument("file", filename);
args.GetCmdLineArgument("height", height);
args.GetCmdLineArgument("width", width);
args.GetCmdLineArgument("entropy", entropy_reduction);
// Initialize device
CubDebugExit(args.DeviceInit());
// Get GPU device bandwidth (GB/s)
int device_ordinal, bus_width, mem_clock_khz;
CubDebugExit(cudaGetDevice(&device_ordinal));
CubDebugExit(cudaDeviceGetAttribute(&bus_width, cudaDevAttrGlobalMemoryBusWidth, device_ordinal));
CubDebugExit(cudaDeviceGetAttribute(&mem_clock_khz, cudaDevAttrMemoryClockRate, device_ordinal));
double bandwidth_GBs = double(bus_width) * mem_clock_khz * 2 / 8 / 1000 / 1000;
// Run test(s)
uchar4* uchar4_pixels = NULL;
if (!g_report)
{
if (!filename.empty())
{
// Parse targa file
ReadTga(uchar4_pixels, width, height, filename.c_str());
printf("File %s: width(%d) height(%d)\n\n", filename.c_str(), width, height); fflush(stdout);
}
else
{
// Generate image
GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
printf("Random image: entropy-reduction(%d) width(%d) height(%d)\n\n", entropy_reduction, width, height); fflush(stdout);
}
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
}
else
{
// Run test suite
printf("Test, MIN, RLE CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM\n");
// Entropy reduction tests
for (entropy_reduction = 0; entropy_reduction < 5; ++entropy_reduction)
{
printf("entropy reduction %d, ", entropy_reduction);
GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
}
printf("entropy reduction -1, ");
GenerateRandomImage(uchar4_pixels, width, height, -1);
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
printf("\n");
// File image tests
std::vector<std::string> file_tests;
file_tests.push_back("animals");
file_tests.push_back("apples");
file_tests.push_back("sunset");
file_tests.push_back("cheetah");
file_tests.push_back("nature");
file_tests.push_back("operahouse");
file_tests.push_back("austin");
file_tests.push_back("cityscape");
for (int i = 0; i < file_tests.size(); ++i)
{
printf("%s, ", file_tests[i].c_str());
std::string filename = std::string("histogram/benchmark/") + file_tests[i] + ".tga";
ReadTga(uchar4_pixels, width, height, filename.c_str());
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
}
}
free(uchar4_pixels);
CubDebugExit(cudaDeviceSynchronize());
printf("\n\n");
return 0;
}
( run in 0.489 second using v1.01-cache-2.11-cpan-71847e10f99 )