Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/test/test_device_run_length_encode.cu  view on Meta::CPAN

    if (RLE_METHOD == NON_TRIVIAL)
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offsets_out, sizeof(OffsetT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(LengthT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));

    // Allocate CDP device arrays
    size_t*          d_temp_storage_bytes = NULL;
    cudaError_t*     d_cdp_error = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));

    // Allocate temporary storage
    void*           d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;
    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Clear device output arrays
    if (RLE_METHOD == RLE)
        CubDebugExit(cudaMemset(d_unique_out,   0, sizeof(T) * num_items));
    if (RLE_METHOD == NON_TRIVIAL)
        CubDebugExit(cudaMemset(d_offsets_out,  0, sizeof(OffsetT) * num_items));
    CubDebugExit(cudaMemset(d_lengths_out,  0, sizeof(LengthT) * num_items));
    CubDebugExit(cudaMemset(d_num_runs,     0, sizeof(int)));

    // Run warmup/correctness iteration
    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));

    // Check for correctness (and display results, if specified)
    int compare0 = 0;
    int compare1 = 0;
    int compare2 = 0;
    int compare3 = 0;

    if (RLE_METHOD == RLE)
    {
        compare0 = CompareDeviceResults(h_unique_reference, d_unique_out, num_runs, true, g_verbose);
        printf("\t Keys %s\n", compare0 ? "FAIL" : "PASS");
    }

    if (RLE_METHOD != RLE)
    {
        compare1 = CompareDeviceResults(h_offsets_reference, d_offsets_out, num_runs, true, g_verbose);
        printf("\t Offsets %s\n", compare1 ? "FAIL" : "PASS");
    }

    if (RLE_METHOD != CSR)
    {
        compare2 = CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose);
        printf("\t Lengths %s\n", compare2 ? "FAIL" : "PASS");
    }

    compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
    printf("\t Count %s\n", compare3 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0...
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        int bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT)));
        float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }
    printf("\n\n");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Cleanup
    if (d_unique_out) CubDebugExit(g_allocator.DeviceFree(d_unique_out));
    if (d_offsets_out) CubDebugExit(g_allocator.DeviceFree(d_offsets_out));
    if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
    if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, compare0 | compare1 | compare2 | compare3);
}


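/**
 * Test DeviceRunLengthEncode on pointer type.
 *
 * Allocates host input and reference arrays sized for num_items items,
 * fills the input via Initialize() (parameterized by entropy_reduction and
 * max_segment), and computes the reference solution with Solve<RLE_METHOD>().
 */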
template <
    RleMethod       RLE_METHOD,
    Backend         BACKEND,
    typename        T,
    typename        OffsetT,
    typename        LengthT>
void TestPointer(
    int             num_items,
    int             entropy_reduction,
    int             max_segment)
{
    // Allocate host arrays
    T*      h_in                    = new T[num_items];
    T*      h_unique_reference      = new T[num_items];
    OffsetT* h_offsets_reference     = new OffsetT[num_items];
    LengthT* h_lengths_reference     = new LengthT[num_items];

    for (int i = 0; i < num_items; ++i)
        InitValue(INTEGER_SEED, h_offsets_reference[i], 1);

    // Initialize problem and solution
    Equality equality_op;
    Initialize(entropy_reduction, h_in, num_items, max_segment);

    int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);

    printf("\nPointer %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}, max_segment %d, entropy_reduction %d\n",



( run in 1.148 second using v1.01-cache-2.11-cpan-d7f47b0818f )