timer results from the CPAN

Alien-XGBoost


    //---------------------------------------------------------------------
    // Test methods
    //---------------------------------------------------------------------

    /**
     * Test a configuration
     */
    void TestConfiguration(
        MultiDispatchTuple      &multi_dispatch,
        SingleDispatchTuple     &single_dispatch,
        T*                      d_in,
        T*                      d_out,
        T*                      h_reference,
        OffsetT                  num_items,
        ReductionOp             reduction_op)
    {
        // Clear output
        if (g_verify) CubDebugExit(cudaMemset(d_out, 0, sizeof(T)));

        // Allocate temporary storage
        void            *d_temp_storage = NULL;
        size_t          temp_storage_bytes = 0;
        CubDebugExit(DeviceReduce::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            multi_dispatch.kernel_ptr,
            single_dispatch.kernel_ptr,
            FillAndResetDrainKernel<OffsetT>,
            multi_dispatch.params,
            single_dispatch.params,
            d_in,
            d_out,
            num_items,
            reduction_op));
        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

        // Warmup/correctness iteration
        CubDebugExit(DeviceReduce::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            multi_dispatch.kernel_ptr,
            single_dispatch.kernel_ptr,
            FillAndResetDrainKernel<OffsetT>,
            multi_dispatch.params,
            single_dispatch.params,
            d_in,
            d_out,
            num_items,
            reduction_op));

        if (g_verify) CubDebugExit(cudaDeviceSynchronize());

        // Copy out and display results
        int compare = (g_verify) ?
            CompareDeviceResults(h_reference, d_out, 1, true, false) :
            0;

        // Performance
        GpuTimer gpu_timer;
        float elapsed_millis = 0.0;
        for (int i = 0; i < g_timing_iterations; i++)
        {
            gpu_timer.Start();

            CubDebugExit(DeviceReduce::Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                multi_dispatch.kernel_ptr,
                single_dispatch.kernel_ptr,
                FillAndResetDrainKernel<OffsetT>,
                multi_dispatch.params,
                single_dispatch.params,
                d_in,
                d_out,
                num_items,
                reduction_op));

            gpu_timer.Stop();
            elapsed_millis += gpu_timer.ElapsedMillis();
        }

        // Mooch
        CubDebugExit(cudaDeviceSynchronize());

        float avg_elapsed = elapsed_millis / g_timing_iterations;
        float avg_throughput = float(num_items) / avg_elapsed / 1000.0 / 1000.0;
        float avg_bandwidth = avg_throughput * sizeof(T);

        multi_dispatch.avg_throughput = CUB_MAX(avg_throughput, multi_dispatch.avg_throughput);
        if (avg_throughput > multi_dispatch.best_avg_throughput)
        {
            multi_dispatch.best_avg_throughput = avg_throughput;
            multi_dispatch.best_size = num_items;
        }

        single_dispatch.avg_throughput = CUB_MAX(avg_throughput, single_dispatch.avg_throughput);
        if (avg_throughput > single_dispatch.best_avg_throughput)
        {
            single_dispatch.best_avg_throughput = avg_throughput;
            single_dispatch.best_size = num_items;
        }

        if (g_verbose)
        {
            printf("\t%.2f GB/s, multi_dispatch( ", avg_bandwidth);
            multi_dispatch.params.Print();
            printf(" ), single_dispatch( ");
            single_dispatch.params.Print();
            printf(" )\n");
            fflush(stdout);
        }

        AssertEquals(0, compare);

        // Cleanup temporaries
        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
    }


    /**
     * Evaluate multi-block configurations
     */
    void TestMulti(
        T*                      h_in,
        T*                      d_in,
        T*                      d_out,
        ReductionOp             reduction_op)
    {
        // Simple single kernel tuple for use with multi kernel sweep
        typedef typename DeviceReduce::TunedPolicies<T, OffsetT, TUNE_ARCH>::SinglePolicy SimpleSinglePolicy;
        SingleDispatchTuple simple_single_tuple;
        simple_single_tuple.params.template Init<SimpleSinglePolicy>();
        simple_single_tuple.kernel_ptr = ReduceSingleKernel<SimpleSinglePolicy, T*, T*, OffsetT, ReductionOp>;

        double max_exponent      = log2(double(g_max_items));
        double min_exponent      = log2(double(simple_single_tuple.params.tile_size));
        unsigned int max_int     = (unsigned int) -1;

        for (int sample = 0; sample < g_samples; ++sample)
( run in 0.576 second using v1.01-cache-2.11-cpan-39bf76dae61 )