timer results from the CPAN

timer

Alien-XGBoost

view release on metacpan or search on metacpan

xgboost/dmlc-core/include/dmlc/memory.h
xgboost/dmlc-core/include/dmlc/memory_io.h
xgboost/dmlc-core/include/dmlc/omp.h
xgboost/dmlc-core/include/dmlc/optional.h
xgboost/dmlc-core/include/dmlc/parameter.h
xgboost/dmlc-core/include/dmlc/recordio.h
xgboost/dmlc-core/include/dmlc/registry.h
xgboost/dmlc-core/include/dmlc/serializer.h
xgboost/dmlc-core/include/dmlc/thread_local.h
xgboost/dmlc-core/include/dmlc/threadediter.h
xgboost/dmlc-core/include/dmlc/timer.h
xgboost/dmlc-core/include/dmlc/type_traits.h
xgboost/dmlc-core/make/config.mk
xgboost/dmlc-core/make/dmlc.mk
xgboost/dmlc-core/scripts/lint.py
xgboost/dmlc-core/scripts/packages.mk
xgboost/dmlc-core/scripts/setup_nvcc.sh
xgboost/dmlc-core/scripts/travis/travis_before_cache.sh
xgboost/dmlc-core/scripts/travis/travis_osx_install.sh
xgboost/dmlc-core/scripts/travis/travis_script.sh
xgboost/dmlc-core/scripts/travis/travis_setup_env.sh

MANIFEST view on Meta::CPAN

xgboost/rabit/include/dmlc/README.md
xgboost/rabit/include/dmlc/base.h
xgboost/rabit/include/dmlc/io.h
xgboost/rabit/include/dmlc/logging.h
xgboost/rabit/include/dmlc/serializer.h
xgboost/rabit/include/dmlc/type_traits.h
xgboost/rabit/include/rabit/c_api.h
xgboost/rabit/include/rabit/internal/engine.h
xgboost/rabit/include/rabit/internal/io.h
xgboost/rabit/include/rabit/internal/rabit-inl.h
xgboost/rabit/include/rabit/internal/timer.h
xgboost/rabit/include/rabit/internal/utils.h
xgboost/rabit/include/rabit/rabit.h
xgboost/rabit/include/rabit/serializable.h
xgboost/rabit/lib/README.md
xgboost/rabit/python/rabit.py
xgboost/rabit/scripts/travis_runtest.sh
xgboost/rabit/scripts/travis_script.sh
xgboost/rabit/src/README.md
xgboost/rabit/src/allreduce_base.cc
xgboost/rabit/src/allreduce_base.h

xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN


    // Our current block's offset
    int block_offset = blockIdx.x * TILE_SIZE;

    // Load items into a blocked arrangement
    BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);

    // Barrier for smem reuse
    __syncthreads();

    // Start cycle timer
    clock_t start = clock();

    // Sort keys
    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items);

    // Stop cycle timer
    clock_t stop = clock();

    // Store output in striped fashion
    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);

    // Store elapsed clocks
    if (threadIdx.x == 0)
    {
        d_elapsed[blockIdx.x] = (start > stop) ? start - stop : stop - start;
    }

xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN

    CubDebugExit(cudaDeviceSynchronize());

    // Check results
    printf("\tOutput items: ");
    int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);
    fflush(stdout);

    // Run this several times and average the performance results
    GpuTimer            timer;
    float               elapsed_millis          = 0.0;
    unsigned long long  elapsed_clocks          = 0;

    for (int i = 0; i < g_timing_iterations; ++i)
    {
        timer.Start();

        // Run kernel
        BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
            d_in,
            d_out,
            d_elapsed);

        timer.Stop();
        elapsed_millis += timer.ElapsedMillis();

        // Copy clocks from device
        CubDebugExit(cudaMemcpy(h_elapsed, d_elapsed, sizeof(clock_t) * g_grid_size, cudaMemcpyDeviceToHost));
        for (int i = 0; i < g_grid_size; i++)
            elapsed_clocks += h_elapsed[i];
    }

    // Check for kernel errors and STDIO from the kernel, if any
    CubDebugExit(cudaDeviceSynchronize());

xgboost/cub/examples/block/example_block_reduce.cu view on Meta::CPAN

    // Specialize BlockReduce type for our thread block
    typedef BlockReduce<int, BLOCK_THREADS, ALGORITHM> BlockReduceT;

    // Shared memory
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    // Per-thread tile data
    int data[ITEMS_PER_THREAD];
    LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, data);

    // Start cycle timer
    clock_t start = clock();

    // Compute sum
    int aggregate = BlockReduceT(temp_storage).Sum(data);

    // Stop cycle timer
    clock_t stop = clock();

    // Store aggregate and elapsed clocks
    if (threadIdx.x == 0)
    {
        *d_elapsed = (start > stop) ? start - stop : stop - start;
        *d_out = aggregate;
    }
}

xgboost/cub/examples/block/example_block_reduce.cu view on Meta::CPAN

        d_out,
        d_elapsed);

    // Check total aggregate
    printf("\tAggregate: ");
    int compare = CompareDeviceResults(&h_aggregate, d_out, 1, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    // Run this several times and average the performance results
    GpuTimer    timer;
    float       elapsed_millis          = 0.0;
    clock_t     elapsed_clocks          = 0;

    for (int i = 0; i < g_timing_iterations; ++i)
    {
        // Copy problem to device
        cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);

        timer.Start();

        // Run aggregate/prefix kernel
        BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
            d_in,
            d_out,
            d_elapsed);

        timer.Stop();
        elapsed_millis += timer.ElapsedMillis();

        // Copy clocks from device
        clock_t clocks;
        CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
        elapsed_clocks += clocks;

    }

    // Check for kernel errors and STDIO from the kernel, if any
    CubDebugExit(cudaPeekAtLastError());

xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN


    // Per-thread tile data
    int data[ITEMS_PER_THREAD];

    // Load items into a blocked arrangement
    BlockLoadT(temp_storage.load).Load(d_in, data);

    // Barrier for smem reuse
    __syncthreads();

    // Start cycle timer
    clock_t start = clock();

    // Compute exclusive prefix sum
    int aggregate;
    BlockScanT(temp_storage.scan).ExclusiveSum(data, data, aggregate);

    // Stop cycle timer
    clock_t stop = clock();

    // Barrier for smem reuse
    __syncthreads();

    // Store items from a blocked arrangement
    BlockStoreT(temp_storage.store).Store(d_out, data);

    // Store aggregate and elapsed clocks
    if (threadIdx.x == 0)

xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN

    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    // Check total aggregate
    printf("\tAggregate: ");
    compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    // Run this several times and average the performance results
    GpuTimer    timer;
    float       elapsed_millis          = 0.0;
    clock_t     elapsed_clocks          = 0;

    for (int i = 0; i < g_timing_iterations; ++i)
    {
        // Copy problem to device
        cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);

        timer.Start();

        // Run aggregate/prefix kernel
        BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
            d_in,
            d_out,
            d_elapsed);

        timer.Stop();
        elapsed_millis += timer.ElapsedMillis();

        // Copy clocks from device
        clock_t clocks;
        CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
        elapsed_clocks += clocks;

    }

    // Check for kernel errors and STDIO from the kernel, if any
    CubDebugExit(cudaPeekAtLastError());

xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN

    printf("Computing reference solution on CPU for %d items (max key %d)\n", num_items, max_key);
    fflush(stdout);

    Initialize(h_keys, h_values, num_items, max_key);
    int num_runs = Solve(h_keys, h_values, num_items, h_offsets_reference, h_lengths_reference);

    printf("%d non-trivial runs\n", num_runs);
    fflush(stdout);

    // Repeat for performance timing
    GpuTimer gpu_timer;
    GpuTimer gpu_rle_timer;
    float elapsed_millis = 0.0;
    float elapsed_rle_millis = 0.0;
    for (int i = 0; i <= timing_iterations; ++i)
    {

        // Allocate and initialize device arrays for sorting
        DoubleBuffer<Key>       d_keys;
        DoubleBuffer<Value>     d_values;
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(Key) * num_items));
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(Key) * num_items));
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(Value) * num_items));
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(Value) * num_items));

        CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice));
        CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice));

        // Start timer
        gpu_timer.Start();

        // Allocate temporary storage for sorting
        size_t  temp_storage_bytes  = 0;
        void    *d_temp_storage     = NULL;
        CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

        // Do the sort
        CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));

        // Free unused buffers and sorting temporary storage
        if (d_keys.d_buffers[d_keys.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector ^ 1]));
        if (d_values.d_buffers[d_values.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector ^ 1]));
        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

        // Start timer
        gpu_rle_timer.Start();

        // Allocate device arrays for enumerating non-trivial runs
        int     *d_offests_out   = NULL;
        int     *d_lengths_out   = NULL;
        int     *d_num_runs      = NULL;
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offests_out, sizeof(int) * num_items));
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(int) * num_items));
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int) * 1));

        // Allocate temporary storage for isolating non-trivial runs

xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN

            d_num_runs,
            num_items));

        // Free keys buffer
        if (d_keys.d_buffers[d_keys.selector]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector]));

        //
        // Hypothetically do stuff with the original key-indices corresponding to non-trivial runs of identical keys
        //

        // Stop sort timer
        gpu_timer.Stop();
        gpu_rle_timer.Stop();

        if (i == 0)
        {
            // First iteration is a warmup: // Check for correctness (and display results, if specified)

            printf("\nRUN OFFSETS: \n");
            int compare = CompareDeviceResults(h_offsets_reference, d_offests_out, num_runs, true, g_verbose);
            printf("\t\t %s ", compare ? "FAIL" : "PASS");

            printf("\nRUN LENGTHS: \n");

xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN

            printf("\t\t %s ", compare ? "FAIL" : "PASS");

            printf("\nNUM RUNS: \n");
            compare |= CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
            printf("\t\t %s ", compare ? "FAIL" : "PASS");

            AssertEquals(0, compare);
        }
        else
        {
            elapsed_millis += gpu_timer.ElapsedMillis();
            elapsed_rle_millis += gpu_rle_timer.ElapsedMillis();
        }

        // GPU cleanup

        if (d_values.d_buffers[d_values.selector]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector]));
        if (d_offests_out) CubDebugExit(g_allocator.DeviceFree(d_offests_out));
        if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
        if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
    }

xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN

        COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_sm_occupancy);
    if (coo_grid_size > 1)
    {
        printf("CooFinalizeKernel<<<1, %d>>>(...)\n", FINALIZE_BLOCK_THREADS);
    }
    fflush(stdout);

    CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));

    // Run kernel (always run one iteration without timing)
    GpuTimer gpu_timer;
    float elapsed_millis = 0.0;
    for (int i = 0; i <= g_timing_iterations; i++)
    {
        gpu_timer.Start();

        // Initialize output
        CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));

        // Run the COO kernel
        CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
            even_share,
            d_block_partials,
            d_rows,
            d_columns,

xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN


        if (coo_grid_size > 1)
        {
            // Run the COO finalize kernel
            CooFinalizeKernel<FINALIZE_BLOCK_THREADS, FINALIZE_ITEMS_PER_THREAD><<<1, FINALIZE_BLOCK_THREADS>>>(
                d_block_partials,
                num_partials,
                d_result);
        }

        gpu_timer.Stop();

        if (i > 0)
            elapsed_millis += gpu_timer.ElapsedMillis();
    }

    // Force any kernel stdio to screen
    CubDebugExit(cudaThreadSynchronize());
    fflush(stdout);

    // Display timing
    if (g_timing_iterations > 0)
    {
        float avg_elapsed = elapsed_millis / g_timing_iterations;

xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN


    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Get graph type
    string type;
    args.GetCmdLineArgument("type", type);

    // Generate graph structure

    CpuTimer timer;
    timer.Start();
    CooGraph<VertexId, Value> coo_graph;
    if (type == string("grid2d"))
    {
        VertexId width;
        args.GetCmdLineArgument("width", width);
        bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
        printf("Generating %s grid2d width(%d)... ", (self_loops) ? "5-pt" : "4-pt", width); fflush(stdout);
        if (coo_graph.InitGrid2d(width, self_loops)) exit(1);
    } else if (type == string("grid3d"))
    {

xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN

        string filename;
        args.GetCmdLineArgument("file", filename);
        printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout);
        if (coo_graph.InitMarket(filename)) exit(1);
    }
    else
    {
        printf("Unsupported graph type\n");
        exit(1);
    }
    timer.Stop();
    printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n",
        timer.ElapsedMillis() / 1000.0,
        coo_graph.coo_tuples.size(),
        coo_graph.row_dim,
        coo_graph.col_dim);
    fflush(stdout);

    if (g_verbose)
    {
        cout << coo_graph << "\n";
    }

xgboost/cub/experimental/defunct/test_device_seg_reduce.cu view on Meta::CPAN


    // Check for correctness (and display results, if specified)
    int compare = CompareDeviceResults(h_reference, d_output, num_segments, true, g_verbose);
    printf("\t%s", compare ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    for (int i = 0; i < g_timing_iterations; ++i)
    {
        CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
    }
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_values) / avg_millis / 1000.0 / 1000.0;
        float giga_bandwidth = giga_rate *
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }

xgboost/cub/experimental/histogram/histogram_cub.h view on Meta::CPAN

        d_histogram,
        num_levels,
        lower_level,
        upper_level,
        width * height, 
        (cudaStream_t) 0,
        is_warmup);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    GpuTimer gpu_timer;
    gpu_timer.Start();

    // Compute histogram
    DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, ACTIVE_CHANNELS>(
        d_temp_storage,
        temp_storage_bytes,
        d_image_samples,
        d_histogram,
        num_levels,
        lower_level,
        upper_level,
        width * height, 
        (cudaStream_t) 0,
        is_warmup);

    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    cudaFree(d_temp_storage);

    return elapsed_millis;
}

xgboost/cub/experimental/histogram/histogram_gmem_atomics.h view on Meta::CPAN

    dim3 grid(16, 16);
    int total_blocks = grid.x * grid.y;

    // allocate partial histogram
    unsigned int *d_part_hist;
    cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));

    dim3 block2(128);
    dim3 grid2((3 * NUM_BINS + block.x - 1) / block.x);

    GpuTimer gpu_timer;
    gpu_timer.Start();

    histogram_gmem_atomics::histogram_gmem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
        d_image,
        width,
        height,
        d_part_hist);

    histogram_gmem_atomics::histogram_gmem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
        d_part_hist,
        total_blocks,
        d_hist);

    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    cudaFree(d_part_hist);

    return elapsed_millis;
}

xgboost/cub/experimental/histogram/histogram_smem_atomics.h view on Meta::CPAN

    dim3 grid(16, 16);
    int total_blocks = grid.x * grid.y;

    // allocate partial histogram
    unsigned int *d_part_hist;
    cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));

    dim3 block2(128);
    dim3 grid2((ACTIVE_CHANNELS * NUM_BINS + block.x - 1) / block.x);

    GpuTimer gpu_timer;
    gpu_timer.Start();

    histogram_smem_atomics::histogram_smem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
        d_image,
        width,
        height,
        d_part_hist);

    histogram_smem_atomics::histogram_smem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
        d_part_hist,
        total_blocks,
        d_hist);

    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    cudaFree(d_part_hist);

    return elapsed_millis;
}

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

        printf("NonZeroIoKernel<%d,%d><<<%d, %d>>>, sm occupancy %d\n", BLOCK_THREADS, ITEMS_PER_THREAD, blocks, BLOCK_THREADS, spmv_sm_occupancy);

    // Warmup
    NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);

    // Check for failures
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(SyncStream(0));

    // Timing
    GpuTimer timer;
    float elapsed_millis = 0.0;
    timer.Start();
    for (int it = 0; it < timing_iterations; ++it)
    {
        NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    CubDebugExit(x_itr.UnbindTexture());

    return elapsed_millis / timing_iterations;
}



//---------------------------------------------------------------------
// cuSparse HybMV

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

 */
template <
    typename OffsetT>
float TestCusparseHybmv(
    float*                          vector_y_in,
    float*                          reference_vector_y_out,
    SpmvParams<float, OffsetT>&     params,
    int                             timing_iterations,
    cusparseHandle_t                cusparse)
{
    CpuTimer cpu_timer;
    cpu_timer.Start();

    // Construct Hyb matrix
    cusparseMatDescr_t mat_desc;
    cusparseHybMat_t hyb_desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
    cusparseStatus_t status = cusparseScsr2hyb(
        cusparse,
        params.num_rows, params.num_cols,
        mat_desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        hyb_desc,
        0,
        CUSPARSE_HYB_PARTITION_AUTO);
    AssertEquals(CUSPARSE_STATUS_SUCCESS, status);

    cudaDeviceSynchronize();
    cpu_timer.Stop();
    float elapsed_millis = cpu_timer.ElapsedMillis();
    printf("HYB setup ms, %.5f, ", elapsed_millis);

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
        cusparse,
        CUSPARSE_OPERATION_NON_TRANSPOSE,
        &params.alpha, mat_desc,

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

        params.d_vector_x, &params.beta, params.d_vector_y));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    elapsed_millis    = 0.0;
    GpuTimer timer;

    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
            cusparse,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            &params.alpha, mat_desc,
            hyb_desc,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    // Cleanup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));

    return elapsed_millis / timing_iterations;
}


/**

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

 */
template <
    typename OffsetT>
float TestCusparseHybmv(
    double*                         vector_y_in,
    double*                         reference_vector_y_out,
    SpmvParams<double, OffsetT>&    params,
    int                             timing_iterations,
    cusparseHandle_t                cusparse)
{
    CpuTimer cpu_timer;
    cpu_timer.Start();

    // Construct Hyb matrix
    cusparseMatDescr_t mat_desc;
    cusparseHybMat_t hyb_desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsr2hyb(
        cusparse,
        params.num_rows, params.num_cols,
        mat_desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        hyb_desc,
        0,
        CUSPARSE_HYB_PARTITION_AUTO));

    cudaDeviceSynchronize();
    cpu_timer.Stop();
    float elapsed_millis = cpu_timer.ElapsedMillis();
    printf("HYB setup ms, %.5f, ", elapsed_millis);

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
        cusparse,
        CUSPARSE_OPERATION_NON_TRANSPOSE,
        &params.alpha, mat_desc,

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

        params.d_vector_x, &params.beta, params.d_vector_y));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    elapsed_millis    = 0.0;
    GpuTimer timer;

    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
            cusparse,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            &params.alpha, mat_desc,
            hyb_desc,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    // Cleanup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));

    return elapsed_millis / timing_iterations;
}

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

        params.d_vector_x, &params.beta, params.d_vector_y));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    float elapsed_millis    = 0.0;
    GpuTimer timer;

    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
    return elapsed_millis / timing_iterations;
}


/**
 * Run cuSparse SpMV (specialized for fp64)
 */
template <

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

        params.d_vector_x, &params.beta, params.d_vector_y));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    float elapsed_millis = 0.0;
    GpuTimer timer;
    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, &params.beta, params.d_vector_y));

    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
    return elapsed_millis / timing_iterations;
}

//---------------------------------------------------------------------
// GPU Merge-based SpMV
//---------------------------------------------------------------------

/**

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

// params.alpha, params.beta,
        (cudaStream_t) 0, !g_quiet));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    GpuTimer timer;
    float elapsed_millis = 0.0;

    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        CubDebugExit(DeviceSpmv::CsrMV(
            d_temp_storage, temp_storage_bytes,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, params.d_vector_y,
            params.num_rows, params.num_cols, params.num_nonzeros, 
// params.alpha, params.beta,
            (cudaStream_t) 0, false));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    return elapsed_millis / timing_iterations;
}

//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------

/**
 * Display perf

xgboost/cub/test/test_allocator.cu view on Meta::CPAN

#endif  // CUB_CDP

    //
    // Performance
    //

    printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
    fflush(stdout); fflush(stderr);

    // CPU performance comparisons vs cached.  Allocate and free a 1MB block 2000 times
    CpuTimer    cpu_timer;
    char        *d_1024MB                       = NULL;
    allocator.debug                             = false;

    // Prime the caching allocator and the kernel
    CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
    CubDebugExit(allocator.DeviceFree(d_1024MB));
    cub::EmptyKernel<void><<<1, 32>>>();

    // CUDA
    cpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
        CubDebugExit(cudaFree(d_1024MB));
    }
    cpu_timer.Stop();
    float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();

    // CUB
    cpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
        CubDebugExit(allocator.DeviceFree(d_1024MB));
    }
    cpu_timer.Stop();
    float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();

    printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
        cuda_malloc_elapsed_millis / timing_iterations,
        cub_calloc_elapsed_millis / timing_iterations);

    // GPU performance comparisons.  Allocate and free a 1MB block 2000 times
    GpuTimer gpu_timer;

    printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
    fflush(stdout); fflush(stderr);

    // Kernel-only
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        cub::EmptyKernel<void><<<1, 32>>>();
    }
    gpu_timer.Stop();
    float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();

    // CUDA
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
        cub::EmptyKernel<void><<<1, 32>>>();
        CubDebugExit(cudaFree(d_1024MB));
    }
    gpu_timer.Stop();
    cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;

    // CUB
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
        cub::EmptyKernel<void><<<1, 32>>>();
        CubDebugExit(allocator.DeviceFree(d_1024MB));
    }
    gpu_timer.Stop();
    cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;

    printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
        cuda_malloc_elapsed_millis / timing_iterations,
        cub_calloc_elapsed_millis / timing_iterations);


#endif

    printf("Success\n");

xgboost/cub/test/test_block_radix_sort.cu view on Meta::CPAN

    // Allocate temp storage in shared memory
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    // Items per thread
    Key     keys[ITEMS_PER_THREAD];
    Value   values[ITEMS_PER_THREAD];

    LoadDirectBlocked(threadIdx.x, d_keys, keys);
    LoadDirectBlocked(threadIdx.x, d_values, values);

    // Start cycle timer
    clock_t stop;
    clock_t start = clock();

    TestBlockSort<BLOCK_THREADS, BlockRadixSortT>(
        temp_storage, keys, values, d_keys, d_values, begin_bit, end_bit, stop, Int2Type<DESCENDING>(), Int2Type<BLOCKED_OUTPUT>());

    // Store time
    if (threadIdx.x == 0)
        *d_elapsed = (start > stop) ? start - stop : stop - start;
}

xgboost/cub/test/test_block_reduce.cu view on Meta::CPAN

    T data[ITEMS_PER_THREAD];

    // Load first tile of data
    int block_offset = 0;

    if (block_offset < TILE_SIZE * tiles)
    {
        LoadDirectBlocked(linear_tid, d_in + block_offset, data);
        block_offset += TILE_SIZE;

        // Start cycle timer
        clock_t start = clock();

        // Cooperative reduce first tile
        BlockReduceT block_reduce(temp_storage) ;
        T block_aggregate = DeviceTest(block_reduce, data, reduction_op);

        // Stop cycle timer
 #if CUB_PTX_ARCH == 100
        // Bug: recording stop clock causes mis-write of running prefix value
        clock_t stop = 0;
#else
        clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100
        clock_t elapsed = (start > stop) ? start - stop : stop - start;

        // Loop over input tiles
        while (block_offset < TILE_SIZE * tiles)
        {
            // TestBarrier between threadblock reductions
            __syncthreads();
    
            // Load tile of data
            LoadDirectBlocked(linear_tid, d_in + block_offset, data);
            block_offset += TILE_SIZE;

            // Start cycle timer
            clock_t start = clock();

            // Cooperatively reduce the tile's aggregate
            BlockReduceT block_reduce(temp_storage) ;
            T tile_aggregate = DeviceTest(block_reduce, data, reduction_op);

            // Stop cycle timer
#if CUB_PTX_ARCH == 100
            // Bug: recording stop clock causes mis-write of running prefix value
            clock_t stop = 0;
#else
            clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100
            elapsed += (start > stop) ? start - stop : stop - start;

            // Reduce threadblock aggregate
            block_aggregate = reduction_op(block_aggregate, tile_aggregate);

xgboost/cub/test/test_block_reduce.cu view on Meta::CPAN


    // Per-thread tile data
    T partial;

    // Load partial tile data
    if (linear_tid < num_items)
    {
        partial = d_in[linear_tid];
    }

    // Start cycle timer
    clock_t start = clock();

    // Cooperatively reduce the tile's aggregate
    BlockReduceT block_reduce(temp_storage) ;
    T tile_aggregate = DeviceTest(block_reduce, partial, reduction_op, num_items);

    // Stop cycle timer
#if CUB_PTX_ARCH == 100
    // Bug: recording stop clock causes mis-write of running prefix value
    clock_t stop = 0;
#else
    clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100

    clock_t elapsed = (start > stop) ? start - stop : stop - start;

    // Store data

xgboost/cub/test/test_block_scan.cu view on Meta::CPAN

    __threadfence_block();      // workaround to prevent clock hoisting

    // Test scan
    T                                   block_aggregate;
    BlockScanT                          block_scan(temp_storage);
    BlockPrefixCallbackOp<T, ScanOpT>   prefix_op(linear_tid, initial_value, scan_op);

    DeviceTest(block_scan, data, initial_value, scan_op, block_aggregate, prefix_op,
        Int2Type<SCAN_MODE>(), Int2Type<TEST_MODE>(), Int2Type<Traits<T>::PRIMITIVE>());

    // Stop cycle timer
    __threadfence_block();      // workaround to prevent clock hoisting
    clock_t stop = clock();
    __threadfence_block();      // workaround to prevent clock hoisting

    // Store output
    StoreDirectBlocked(linear_tid, d_out, data);

    // Store block_aggregate
    if (TEST_MODE != BASIC)
        d_aggregate[linear_tid] = block_aggregate;

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN


    // Check for correctness (and display results, if specified)
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
        error |= channel_error;
    }

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();

    DispatchEven(
        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes,
        0, false);

    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(SampleT);
        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
            avg_millis,
            giga_rate,

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN


    // Check for correctness (and display results, if specified)
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
        error |= channel_error;
    }

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();

    DispatchRange(
        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, d_levels,
        num_row_pixels, num_rows, row_stride_bytes,
        0, false);

    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(SampleT);
        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
            avg_millis,
            giga_rate,

xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN

        // Check that input isn't overwritten
        int input_compare = CompareDeviceResults(h_keys, d_keys.d_buffers[0], num_items, true, g_verbose);
        compare |= input_compare;
        printf("\t Compare input keys: %s ", input_compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Performance
    if (g_timing_iterations)
        printf("\nPerforming timing iterations:\n"); fflush(stdout);

    GpuTimer gpu_timer;
    float elapsed_millis = 0.0f;
    for (int i = 0; i < g_timing_iterations; ++i)
    {
        // Initialize/clear device arrays
        CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
        CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
        if (!KEYS_ONLY)
        {
            CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
            CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
        }

        gpu_timer.Start();
        CubDebugExit(Dispatch(
            Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
            mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
            num_items, num_segments, d_segment_offsets,
            begin_bit, end_bit, 0, false));
        gpu_timer.Stop();
        elapsed_millis += gpu_timer.ElapsedMillis();
    }

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = (KEYS_ONLY) ?
            giga_rate * sizeof(KeyT) * 2 :
            giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;

xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN

    int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose);
    printf("\t%s", compare ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    if (g_timing_iterations > 0)
    {
        GpuTimer gpu_timer;
        gpu_timer.Start();

        CubDebugExit(Dispatch(backend, g_timing_iterations,
            d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, num_segments, d_segment_offsets,
            reduction_op, 0, false));

        gpu_timer.Stop();
        float elapsed_millis = gpu_timer.ElapsedMillis();

        // Display performance
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(InputT);
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }

    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));

xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN

    printf("\t Values %s ", compare2 ? "FAIL" : "PASS");

    int compare3 = CompareDeviceResults(&num_segments, d_num_runs, 1, true, g_verbose);
    printf("\t Count %s ", compare3 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, false));
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float   avg_millis  = elapsed_millis / g_timing_iterations;
        float   giga_rate   = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        int     bytes_moved = ((num_items + num_segments) * sizeof(KeyT)) + ((num_items + num_segments) * sizeof(ValueT));
        float   giga_bandwidth  = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }

xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN

    }

    compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
    printf("\t Count %s\n", compare3 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0...
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        int bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT)));
        float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }

xgboost/cub/test/test_device_scan.cu view on Meta::CPAN


    // Check for correctness (and display results, if specified)
    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
    printf("\t%s", compare ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<BACKEND>(),
        Int2Type<Traits<OutputT>::PRIMITIVE>(),
        g_timing_iterations,
        d_temp_storage_bytes,
        d_cdp_error,
        d_temp_storage,
        temp_storage_bytes,
        d_in,
        d_out,
        scan_op,
        initial_value,
        num_items,
        0,
        false));
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT));
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
    }

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN

    printf("\t Data %s\n", compare1 ? "FAIL" : "PASS");

    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
    printf("\t Count %s\n", compare2 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, false));
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float   avg_millis          = elapsed_millis / g_timing_iterations;
        float   giga_rate           = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        int     num_output_items    = (IS_PARTITION) ? num_items : num_selected;
        int     num_flag_items      = (IS_FLAGGED) ? num_items : 0;
        size_t  num_bytes           = sizeof(T) * (num_items + num_output_items) + sizeof(FlagT) * num_flag_items;
        float   giga_bandwidth      = float(num_bytes) / avg_millis / 1000.0f / 1000.0f;

xgboost/cub/test/test_device_select_unique.cu view on Meta::CPAN

    printf("\t Data %s ", compare1 ? "FAIL" : "PASS");

    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
    printf("\t Count %s ", compare2 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, false));
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis        = elapsed_millis / g_timing_iterations;
        float giga_rate         = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth    = float((num_items + num_selected) * sizeof(T)) / avg_millis / 1000.0f / 1000.0f;
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
    }
    printf("\n\n");

xgboost/cub/test/test_grid_barrier.cu view on Meta::CPAN


    printf("Initializing software global barrier for Kernel<<<%d,%d>>> with %d occupancy\n",
        grid_size, block_size, occupancy);
    fflush(stdout);

    // Init global barrier
    GridBarrierLifetime global_barrier;
    global_barrier.Setup(grid_size);

    // Time kernel
    GpuTimer gpu_timer;
    gpu_timer.Start();
    Kernel<<<grid_size, block_size>>>(global_barrier, iterations);
    gpu_timer.Stop();

    retval = CubDebug(cudaThreadSynchronize());

    // Output timing results
    float avg_elapsed = gpu_timer.ElapsedMillis() / float(iterations);
    printf("%d iterations, %f total elapsed millis, %f avg elapsed millis\n",
        iterations,
        gpu_timer.ElapsedMillis(),
        avg_elapsed);

    return retval;
}

xgboost/cub/test/test_warp_scan.cu view on Meta::CPAN

{
    // Cooperative warp-scan utility type (1 warp)
    typedef WarpScan<T, LOGICAL_WARP_THREADS> WarpScanT;

    // Allocate temp storage in shared memory
    __shared__ typename WarpScanT::TempStorage temp_storage;

    // Per-thread tile data
    T data = d_in[threadIdx.x];

    // Start cycle timer
    __threadfence_block();      // workaround to prevent clock hoisting
    clock_t start = clock();
    __threadfence_block();      // workaround to prevent clock hoisting

    T aggregate;

    // Test scan
    WarpScanT warp_scan(temp_storage);
    DeviceTest(
        warp_scan,
        data,
        initial_value,
        scan_op,
        aggregate,
        Int2Type<TEST_MODE>(),
        Int2Type<Traits<T>::PRIMITIVE>());

    // Stop cycle timer
    __threadfence_block();      // workaround to prevent clock hoisting
    clock_t stop = clock();
    __threadfence_block();      // workaround to prevent clock hoisting

    // Store data
    d_out[threadIdx.x] = data;

    if (TEST_MODE != BASIC)
    {
        // Store aggregate

xgboost/cub/tune/tune_device_reduce.cu view on Meta::CPAN

            reduction_op));

        if (g_verify) CubDebugExit(cudaDeviceSynchronize());

        // Copy out and display results
        int compare = (g_verify) ?
            CompareDeviceResults(h_reference, d_out, 1, true, false) :
            0;

        // Performance
        GpuTimer gpu_timer;
        float elapsed_millis = 0.0;
        for (int i = 0; i < g_timing_iterations; i++)
        {
            gpu_timer.Start();

            CubDebugExit(DeviceReduce::Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                multi_dispatch.kernel_ptr,
                single_dispatch.kernel_ptr,
                FillAndResetDrainKernel<OffsetT>,
                multi_dispatch.params,
                single_dispatch.params,
                d_in,
                d_out,
                num_items,
                reduction_op));

            gpu_timer.Stop();
            elapsed_millis += gpu_timer.ElapsedMillis();
        }

        // Mooch
        CubDebugExit(cudaDeviceSynchronize());

        float avg_elapsed = elapsed_millis / g_timing_iterations;
        float avg_throughput = float(num_items) / avg_elapsed / 1000.0 / 1000.0;
        float avg_bandwidth = avg_throughput * sizeof(T);

        multi_dispatch.avg_throughput = CUB_MAX(avg_throughput, multi_dispatch.avg_throughput);

xgboost/dmlc-core/include/dmlc/timer.h view on Meta::CPAN

/*!
 *  Copyright (c) 2015 by Contributors
 * \file timer.h
 * \brief cross platform timer for timing
 * \author Tianqi Chen
 */
#ifndef DMLC_TIMER_H_
#define DMLC_TIMER_H_

#include "base.h"

#if DMLC_USE_CXX11
#include <chrono>
#endif

xgboost/dmlc-core/scripts/lint.py view on Meta::CPAN

# singleton helper for lint check
_HELPER = LintHelper()

def get_header_guard_dmlc(filename):
    """Get Header Guard Convention for DMLC Projects.

    For headers in include, directly use the path
    For headers in src, use project name plus path

    Examples: with project-name = dmlc
        include/dmlc/timer.h -> DMLC_TIMTER_H_
        src/io/libsvm_parser.h -> DMLC_IO_LIBSVM_PARSER_H_
    """
    fileinfo = cpplint.FileInfo(filename)
    file_path_from_root = fileinfo.RepositoryName()
    inc_list = ['include', 'api', 'wrapper']

    if file_path_from_root.find('src/') != -1 and _HELPER.project_name is not None:
        idx = file_path_from_root.find('src/')
        file_path_from_root = _HELPER.project_name +  file_path_from_root[idx + 3:]
    else:

xgboost/dmlc-core/src/data/basic_row_iter.h view on Meta::CPAN

 * \file basic_row_iter.h
 * \brief row based iterator that
 *   loads in everything into memory and returns
 * \author Tianqi Chen
 */
#ifndef DMLC_DATA_BASIC_ROW_ITER_H_
#define DMLC_DATA_BASIC_ROW_ITER_H_
#include <dmlc/io.h>
#include <dmlc/logging.h>
#include <dmlc/data.h>
#include <dmlc/timer.h>
#include "./row_block.h"
#include "./parser.h"

namespace dmlc {
namespace data {
/*!
 * \brief basic set of row iterators that provides
 * \tparam IndexType the type of index we are using
 */
template<typename IndexType>

xgboost/dmlc-core/src/data/disk_row_iter.h view on Meta::CPAN

 * \brief row based iterator that
 *   caches things into disk and then load segments
 * \author Tianqi Chen
 */
#ifndef DMLC_DATA_DISK_ROW_ITER_H_
#define DMLC_DATA_DISK_ROW_ITER_H_

#include <dmlc/io.h>
#include <dmlc/logging.h>
#include <dmlc/data.h>
#include <dmlc/timer.h>
#include <dmlc/threadediter.h>
#include <algorithm>
#include <string>
#include "./row_block.h"
#include "./libsvm_parser.h"

#if DMLC_ENABLE_STD_THREAD
namespace dmlc {
namespace data {
/*!

( run in 1.545 second using v1.01-cache-2.11-cpan-49f99fa48dc )