Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/experimental/histogram_compare.cu  view on Meta::CPAN

{
    unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);

    for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
        bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
}

// Decode uchar1 pixel into bins
template <int NUM_BINS, int ACTIVE_CHANNELS>
void DecodePixelGold(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
{
    bins[0] = (unsigned int) pixel.x;
}


// Compute reference histogram.  Specialized for uchar4
template <
    int         ACTIVE_CHANNELS,
    int         NUM_BINS,
    typename    PixelType>
void HistogramGold(PixelType *image, int width, int height, unsigned int* hist)
{
    memset(hist, 0, ACTIVE_CHANNELS * NUM_BINS * sizeof(unsigned int));

    for (int i = 0; i < width; i++)
    {
        for (int j = 0; j < height; j++)
        {
            PixelType pixel = image[i + j * width];

            unsigned int bins[ACTIVE_CHANNELS];
            DecodePixelGold<NUM_BINS>(pixel, bins);

            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
            {
                hist[(NUM_BINS * CHANNEL) + bins[CHANNEL]]++;
            }
        }
    }
}


//---------------------------------------------------------------------
// Test execution
//---------------------------------------------------------------------

/**
 * Run a specific histogram implementation
 */
template <
    int         ACTIVE_CHANNELS,
    int         NUM_BINS,
    typename    PixelType>
void RunTest(
    std::vector<std::pair<std::string, double> >&   timings,
    PixelType*                                      d_pixels,
    const int                                       width,
    const int                                       height,
    unsigned int *                                  d_hist,
    unsigned int *                                  h_hist,
    int                                             timing_iterations,
    const char *                                    long_name,
    const char *                                    short_name,
    double (*f)(PixelType*, int, int, unsigned int*, bool))
{
    if (!g_report) printf("%s ", long_name); fflush(stdout);

    // Run single test to verify (and code cache)
    (*f)(d_pixels, width, height, d_hist, !g_report);

    int compare = CompareDeviceResults(h_hist, d_hist, ACTIVE_CHANNELS * NUM_BINS, true, g_verbose);
    if (!g_report) printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);

    double elapsed_ms = 0;
    for (int i = 0; i < timing_iterations; i++)
    {
        elapsed_ms += (*f)(d_pixels, width, height, d_hist, false);
    }
    double avg_us = (elapsed_ms / timing_iterations) * 1000;    // average in us
    timings.push_back(std::pair<std::string, double>(short_name, avg_us));

    if (!g_report)
    {
        printf("Avg time %.3f us (%d iterations)\n", avg_us, timing_iterations); fflush(stdout);
    }
    else
    {
        printf("%.3f, ", avg_us); fflush(stdout);
    }

    AssertEquals(0, compare);
}


/**
 * Evaluate corpus of histogram implementations
 */
template <
    int         NUM_CHANNELS,
    int         ACTIVE_CHANNELS,
    int         NUM_BINS,
    typename    PixelType>
void TestMethods(
    PixelType*  h_pixels,
    int         height,
    int         width,
    int         timing_iterations,
    double      bandwidth_GBs)
{
    // Copy data to gpu
    PixelType* d_pixels;
    size_t pixel_bytes = width * height * sizeof(PixelType);
    CubDebugExit(g_allocator.DeviceAllocate((void**) &d_pixels, pixel_bytes));
    CubDebugExit(cudaMemcpy(d_pixels, h_pixels, pixel_bytes, cudaMemcpyHostToDevice));

    if (g_report) printf("%.3f, ", double(pixel_bytes) / bandwidth_GBs / 1000);

    // Allocate results arrays on cpu/gpu
    unsigned int *h_hist;
    unsigned int *d_hist;
    size_t histogram_bytes = NUM_BINS * ACTIVE_CHANNELS * sizeof(unsigned int);
    h_hist = (unsigned int *) malloc(histogram_bytes);
    g_allocator.DeviceAllocate((void **) &d_hist, histogram_bytes);

    // Compute reference cpu histogram
    HistogramGold<ACTIVE_CHANNELS, NUM_BINS>(h_pixels, width, height, h_hist);

    // Store timings
    std::vector<std::pair<std::string, double> > timings;

    // Run experiments
    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
        "CUB", "CUB", run_cub_histogram<NUM_CHANNELS, ACTIVE_CHANNELS, NUM_BINS, PixelType>);
    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
        "Shared memory atomics", "smem atomics", run_smem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
        "Global memory atomics", "gmem atomics", run_gmem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);

    // Report timings
    if (!g_report)
    {
        std::sort(timings.begin(), timings.end(), less_than_value());
        printf("Timings (us):\n");
        for (int i = 0; i < timings.size(); i++)
        {
            double bandwidth = height * width * sizeof(PixelType) / timings[i].second / 1000;
            printf("\t %.3f %s (%.3f GB/s, %.3f%% peak)\n", timings[i].second, timings[i].first.c_str(), bandwidth, bandwidth / bandwidth_GBs * 100);
        }
        printf("\n");
    }

    // Free data
    CubDebugExit(g_allocator.DeviceFree(d_pixels));
    CubDebugExit(g_allocator.DeviceFree(d_hist));
    free(h_hist);
}


/**
 * Test different problem genres
 */
void TestGenres(
    uchar4*     uchar4_pixels,
    int         height,
    int         width,
    int         timing_iterations,
    double      bandwidth_GBs)
{
    int num_pixels = width * height;

    {
        if (!g_report) printf("1 channel uchar1 tests (256-bin):\n\n"); fflush(stdout);

        size_t      image_bytes     = num_pixels * sizeof(uchar1);
        uchar1*     uchar1_pixels   = (uchar1*) malloc(image_bytes);

        // Convert to 1-channel (averaging first 3 channels)
        for (int i = 0; i < num_pixels; ++i)
        {
            uchar1_pixels[i].x = (unsigned char)
                (((unsigned int) uchar4_pixels[i].x +
                  (unsigned int) uchar4_pixels[i].y +
                  (unsigned int) uchar4_pixels[i].z) / 3);
        }

        TestMethods<1, 1, 256>(uchar1_pixels, width, height, timing_iterations, bandwidth_GBs);
        free(uchar1_pixels);
        if (g_report) printf(", ");
    }

    {
        if (!g_report) printf("3/4 channel uchar4 tests (256-bin):\n\n"); fflush(stdout);
        TestMethods<4, 3, 256>(uchar4_pixels, width, height, timing_iterations, bandwidth_GBs);
        if (g_report) printf(", ");
    }

    {
        if (!g_report) printf("3/4 channel float4 tests (256-bin):\n\n"); fflush(stdout);
        size_t      image_bytes     = num_pixels * sizeof(float4);
        float4*     float4_pixels   = (float4*) malloc(image_bytes);

        // Convert to float4 with range [0.0, 1.0)
        for (int i = 0; i < num_pixels; ++i)
        {
            float4_pixels[i].x = float(uchar4_pixels[i].x) / 256;
            float4_pixels[i].y = float(uchar4_pixels[i].y) / 256;
            float4_pixels[i].z = float(uchar4_pixels[i].z) / 256;
            float4_pixels[i].w = float(uchar4_pixels[i].w) / 256;
        }
        TestMethods<4, 3, 256>(float4_pixels, width, height, timing_iterations, bandwidth_GBs);
        free(float4_pixels);
        if (g_report) printf("\n");
    }
}


/**
 * Main
 */
int main(int argc, char **argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    if (args.CheckCmdLineFlag("help"))
    {
        printf(
            "%s "
            "[--device=<device-id>] "
            "[--v] "
            "[--i=<timing iterations>] "
            "\n\t"
                "--file=<.tga filename> "
            "\n\t"
                "--entropy=<-1 (0%), 0 (100%), 1 (81%), 2 (54%), 3 (34%), 4 (20%), ..."
                "[--height=<default: 1080>] "
                "[--width=<default: 1920>] "
            "\n", argv[0]);
        exit(0);
    }

    std::string         filename;
    int                 timing_iterations   = 100;
    int                 entropy_reduction   = 0;
    int                 height              = 1080;
    int                 width               = 1920;

    g_verbose = args.CheckCmdLineFlag("v");
    g_report = args.CheckCmdLineFlag("report");
    args.GetCmdLineArgument("i", timing_iterations);
    args.GetCmdLineArgument("file", filename);
    args.GetCmdLineArgument("height", height);
    args.GetCmdLineArgument("width", width);
    args.GetCmdLineArgument("entropy", entropy_reduction);

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Get GPU device bandwidth (GB/s)
    int device_ordinal, bus_width, mem_clock_khz;
    CubDebugExit(cudaGetDevice(&device_ordinal));
    CubDebugExit(cudaDeviceGetAttribute(&bus_width, cudaDevAttrGlobalMemoryBusWidth, device_ordinal));
    CubDebugExit(cudaDeviceGetAttribute(&mem_clock_khz, cudaDevAttrMemoryClockRate, device_ordinal));
    double bandwidth_GBs = double(bus_width) * mem_clock_khz * 2 / 8 / 1000 / 1000;

    // Run test(s)
    uchar4* uchar4_pixels = NULL;
    if (!g_report)
    {
        if (!filename.empty())
        {
            // Parse targa file
            ReadTga(uchar4_pixels, width, height, filename.c_str());
            printf("File %s: width(%d) height(%d)\n\n", filename.c_str(), width, height); fflush(stdout);
        }
        else
        {
            // Generate image
            GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
            printf("Random image: entropy-reduction(%d) width(%d) height(%d)\n\n", entropy_reduction, width, height); fflush(stdout);
        }

        TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
    }
    else
    {
        // Run test suite
        printf("Test, MIN, RLE CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM\n");

        // Entropy reduction tests
        for (entropy_reduction = 0; entropy_reduction < 5; ++entropy_reduction)
        {
            printf("entropy reduction %d, ", entropy_reduction);
            GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
            TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
        }
        printf("entropy reduction -1, ");
        GenerateRandomImage(uchar4_pixels, width, height, -1);
        TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
        printf("\n");

        // File image tests
        std::vector<std::string> file_tests;
        file_tests.push_back("animals");
        file_tests.push_back("apples");
        file_tests.push_back("sunset");
        file_tests.push_back("cheetah");
        file_tests.push_back("nature");
        file_tests.push_back("operahouse");
        file_tests.push_back("austin");
        file_tests.push_back("cityscape");

        for (int i = 0; i < file_tests.size(); ++i)
        {
            printf("%s, ", file_tests[i].c_str());
            std::string filename = std::string("histogram/benchmark/") + file_tests[i] + ".tga";
            ReadTga(uchar4_pixels, width, height, filename.c_str());
            TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
        }
    }

    free(uchar4_pixels);

    CubDebugExit(cudaDeviceSynchronize());
    printf("\n\n");

    return 0;
}



( run in 0.489 second using v1.01-cache-2.11-cpan-71847e10f99 )