Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/test/test_device_histogram.cu  view on Meta::CPAN

                if (g_verbose_input) printf(" (%d)", bin); fflush(stdout);
                if ((bin >= 0) && (bin < num_levels[channel] - 1))
                {
                    // valid bin
                    h_histogram[channel][bin]++;
                }
            }
            if (g_verbose_input) printf("]");
        }
        if (g_verbose_input) printf("\n\n");
    }

    printf("Done\n"); fflush(stdout);
}


/**
 * Test histogram-even
 */
template <
    Backend         BACKEND,
    int             NUM_CHANNELS,
    int             NUM_ACTIVE_CHANNELS,
    typename        SampleT,
    typename        CounterT,
    typename        LevelT,
    typename        OffsetT>
void TestEven(
    LevelT          max_level,
    int             entropy_reduction,
    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[...
    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
{
    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));

    printf("\n----------------------------\n");
    printf("%s cub::DeviceHistogramEven %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
        (int) (num_row_pixels * num_rows),
        (int) num_rows,
        (int) num_row_pixels,
        (int) row_stride_bytes,
        (int) total_samples,
        (int) sizeof(SampleT),
        typeid(SampleT).name(),
        entropy_reduction,
        typeid(CounterT).name(),
        NUM_ACTIVE_CHANNELS,
        NUM_CHANNELS);
    std::cout << CoutCast(max_level) << "\n";
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
        std::cout << "\n\tChannel " << channel << ": " << num_levels[channel] - 1 << " bins [" << lower_level[channel] << ", " << upper_level[channel] << ")\n";
    fflush(stdout);

    // Allocate and initialize host and device data

    typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
    SampleT*                    h_samples = new Foo[total_samples];
    CounterT*                   h_histogram[NUM_ACTIVE_CHANNELS];
    ScaleTransform<LevelT>      transform_op[NUM_ACTIVE_CHANNELS];

    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        int bins = num_levels[channel] - 1;
        h_histogram[channel] = new CounterT[bins];

        transform_op[channel].Init(
            num_levels[channel],
            upper_level[channel],
            lower_level[channel],
            ((upper_level[channel] - lower_level[channel]) / bins));
    }

    Initialize<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
        max_level, entropy_reduction, h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);

    // Allocate and initialize device data

    SampleT*        d_samples = NULL;
    CounterT*       d_histogram[NUM_ACTIVE_CHANNELS];

    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1)));
        CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1)));
    }

    // Allocate CDP device arrays
    size_t          *d_temp_storage_bytes = NULL;
    cudaError_t     *d_cdp_error = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));

    // Allocate temporary storage
    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;

    DispatchEven(
        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes,
        0, true);

    // Allocate temporary storage with "canary" zones
    int 	canary_bytes 	= 256;
    char 	canary_token 	= 8;
    char* 	canary_zone 	= new char[canary_bytes];

    memset(canary_zone, canary_token, canary_bytes);
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));

    // Run warmup/correctness iteration
    DispatchEven(

xgboost/cub/test/test_device_histogram.cu  view on Meta::CPAN

    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, error);
}





/**
 * Test histogram-range
 */
template <
    Backend         BACKEND,
    int             NUM_CHANNELS,
    int             NUM_ACTIVE_CHANNELS,
    typename        SampleT,
    typename        CounterT,
    typename        LevelT,
    typename        OffsetT>
void TestRange(
    LevelT          max_level,
    int             entropy_reduction,
    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[...
    LevelT*         levels[NUM_ACTIVE_CHANNELS],                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
{
    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));

    printf("\n----------------------------\n");
    printf("%s cub::DeviceHistogramRange %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
        (int) (num_row_pixels * num_rows),
        (int) num_rows,
        (int) num_row_pixels,
        (int) row_stride_bytes,
        (int) total_samples,
        (int) sizeof(SampleT),
        typeid(SampleT).name(),
        entropy_reduction,
        typeid(CounterT).name(),
        NUM_ACTIVE_CHANNELS,
        NUM_CHANNELS);
    std::cout << CoutCast(max_level) << "\n";
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        printf("Channel %d: %d bins [", channel, num_levels[channel] - 1);
        std::cout << levels[channel][0];
        for (int level = 1; level < num_levels[channel]; ++level)
            std::cout << ", " << levels[channel][level];
        printf("]\n");
    }
    fflush(stdout);

    // Allocate and initialize host and device data
    typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
    SampleT*                    h_samples = new Foo[total_samples];
    CounterT*                   h_histogram[NUM_ACTIVE_CHANNELS];
    SearchTransform<LevelT>     transform_op[NUM_ACTIVE_CHANNELS];

    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        transform_op[channel].levels = levels[channel];
        transform_op[channel].num_levels = num_levels[channel];

        int bins = num_levels[channel] - 1;
        h_histogram[channel] = new CounterT[bins];
    }

    Initialize<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
        max_level, entropy_reduction, h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);

    // Allocate and initialize device data
    SampleT*        d_samples = NULL;
    LevelT*         d_levels[NUM_ACTIVE_CHANNELS];
    CounterT*       d_histogram[NUM_ACTIVE_CHANNELS];

    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));

    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel]));
        CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel],         sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice));

        int bins = num_levels[channel] - 1;
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel],  sizeof(CounterT) * bins));
        CubDebugExit(cudaMemset(d_histogram[channel], 0,                        sizeof(CounterT) * bins));
    }

    // Allocate CDP device arrays
    size_t          *d_temp_storage_bytes = NULL;
    cudaError_t     *d_cdp_error = NULL;

    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));

    // Allocate temporary storage
    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;

    DispatchRange(
        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, d_levels,
        num_row_pixels, num_rows, row_stride_bytes,
        0, true);

    // Allocate temporary storage with "canary" zones
    int     canary_bytes    = 256;
    char    canary_token    = 9;
    char*   canary_zone     = new char[canary_bytes];

    memset(canary_zone, canary_token, canary_bytes);
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));



( run in 0.878 second using v1.01-cache-2.11-cpan-df04353d9ac )