Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/tune/tune_device_reduce.cu  view on Meta::CPAN

 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Evaluates different tuning configurations of DeviceReduce.
 *
 * The best way to use this program:
 * (1) Find the best all-around single-block tune for a given arch.
 *     For example, 100 samples [1 ..512], 100 timing iterations per config per sample:
 *         ./bin/tune_device_reduce_sm200_nvvm_5.0_abi_i386 --i=100 --s=100 --n=512 --single --device=0
 * (2) Update the single tune in device_reduce.cuh
 * (3) Find the best all-around multi-block tune for a given arch.
 *     For example, 100 samples [single-block tile-size ..  50,331,648], 100 timing iterations per config per sample:
 *         ./bin/tune_device_reduce_sm200_nvvm_5.0_abi_i386 --i=100 --s=100 --device=0
 * (4) Update the multi-block tune in device_reduce.cuh
 *
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <vector>
#include <algorithm>
#include <stdio.h>
#include <cub/cub.cuh>
#include "../test/test_util.h"

using namespace cub;
using namespace std;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

#ifndef TUNE_ARCH
#define TUNE_ARCH 100
#endif

int     g_max_items         = 48 * 1024 * 1024;
int     g_samples           = 100;
int     g_timing_iterations        = 2;
bool    g_verbose           = false;
bool    g_single            = false;
bool    g_verify            = true;
CachingDeviceAllocator  g_allocator;


//---------------------------------------------------------------------
// Host utility subroutines
//---------------------------------------------------------------------

/**
 * Initialize problem
 */
template <typename T>
void Initialize(
    GenMode         gen_mode,
    T               *h_in,
    int             num_items)
{
    for (int i = 0; i < num_items; ++i)
    {
        InitValue(gen_mode, h_in[i], i);
    }
}

/**
 * Sequential reduction
 */
template <typename T, typename ReductionOp>
T Reduce(
    T               *h_in,
    ReductionOp     reduction_op,
    int             num_items)
{
    T retval = h_in[0];
    for (int i = 1; i < num_items; ++i)
        retval = reduction_op(retval, h_in[i]);

    return retval;
}



//---------------------------------------------------------------------
// Full tile test generation
//---------------------------------------------------------------------



/**
 * Wrapper structure for generating and running different tuning configurations
 */
template <
    typename T,
    typename OffsetT,
    typename ReductionOp>
struct Schmoo
{
    //---------------------------------------------------------------------
    // Types
    //---------------------------------------------------------------------

    /// Pairing of kernel function pointer and corresponding dispatch params

xgboost/cub/tune/tune_device_reduce.cu  view on Meta::CPAN


    /// Enumerate thread-granularity variations
    template <int BLOCK_THREADS>
    void Enumerate()
    {
        Enumerate<BLOCK_THREADS, 7>();
        Enumerate<BLOCK_THREADS, 8>();
        Enumerate<BLOCK_THREADS, 9>();

        Enumerate<BLOCK_THREADS, 11>();
        Enumerate<BLOCK_THREADS, 12>();
        Enumerate<BLOCK_THREADS, 13>();

        Enumerate<BLOCK_THREADS, 15>();
        Enumerate<BLOCK_THREADS, 16>();
        Enumerate<BLOCK_THREADS, 17>();

        Enumerate<BLOCK_THREADS, 19>();
        Enumerate<BLOCK_THREADS, 20>();
        Enumerate<BLOCK_THREADS, 21>();

        Enumerate<BLOCK_THREADS, 23>();
        Enumerate<BLOCK_THREADS, 24>();
        Enumerate<BLOCK_THREADS, 25>();
    }


    /// Enumerate block size variations
    void Enumerate()
    {
        printf("\nEnumerating kernels\n"); fflush(stdout);

        Enumerate<32>();
        Enumerate<64>();
        Enumerate<96>();
        Enumerate<128>();
        Enumerate<160>();
        Enumerate<192>();
        Enumerate<256>();
        Enumerate<512>();
    }


    //---------------------------------------------------------------------
    // Test methods
    //---------------------------------------------------------------------

    /**
     * Test a configuration
     */
    void TestConfiguration(
        MultiDispatchTuple      &multi_dispatch,
        SingleDispatchTuple     &single_dispatch,
        T*                      d_in,
        T*                      d_out,
        T*                      h_reference,
        OffsetT                  num_items,
        ReductionOp             reduction_op)
    {
        // Clear output
        if (g_verify) CubDebugExit(cudaMemset(d_out, 0, sizeof(T)));

        // Allocate temporary storage
        void            *d_temp_storage = NULL;
        size_t          temp_storage_bytes = 0;
        CubDebugExit(DeviceReduce::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            multi_dispatch.kernel_ptr,
            single_dispatch.kernel_ptr,
            FillAndResetDrainKernel<OffsetT>,
            multi_dispatch.params,
            single_dispatch.params,
            d_in,
            d_out,
            num_items,
            reduction_op));
        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

        // Warmup/correctness iteration
        CubDebugExit(DeviceReduce::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            multi_dispatch.kernel_ptr,
            single_dispatch.kernel_ptr,
            FillAndResetDrainKernel<OffsetT>,
            multi_dispatch.params,
            single_dispatch.params,
            d_in,
            d_out,
            num_items,
            reduction_op));

        if (g_verify) CubDebugExit(cudaDeviceSynchronize());

        // Copy out and display results
        int compare = (g_verify) ?
            CompareDeviceResults(h_reference, d_out, 1, true, false) :
            0;

        // Performance
        GpuTimer gpu_timer;
        float elapsed_millis = 0.0;
        for (int i = 0; i < g_timing_iterations; i++)
        {
            gpu_timer.Start();

            CubDebugExit(DeviceReduce::Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                multi_dispatch.kernel_ptr,
                single_dispatch.kernel_ptr,
                FillAndResetDrainKernel<OffsetT>,
                multi_dispatch.params,
                single_dispatch.params,
                d_in,
                d_out,
                num_items,
                reduction_op));

            gpu_timer.Stop();
            elapsed_millis += gpu_timer.ElapsedMillis();
        }

        // Mooch
        CubDebugExit(cudaDeviceSynchronize());

        float avg_elapsed = elapsed_millis / g_timing_iterations;
        float avg_throughput = float(num_items) / avg_elapsed / 1000.0 / 1000.0;
        float avg_bandwidth = avg_throughput * sizeof(T);

        multi_dispatch.avg_throughput = CUB_MAX(avg_throughput, multi_dispatch.avg_throughput);
        if (avg_throughput > multi_dispatch.best_avg_throughput)
        {
            multi_dispatch.best_avg_throughput = avg_throughput;
            multi_dispatch.best_size = num_items;
        }

        single_dispatch.avg_throughput = CUB_MAX(avg_throughput, single_dispatch.avg_throughput);
        if (avg_throughput > single_dispatch.best_avg_throughput)
        {
            single_dispatch.best_avg_throughput = avg_throughput;
            single_dispatch.best_size = num_items;
        }

        if (g_verbose)
        {
            printf("\t%.2f GB/s, multi_dispatch( ", avg_bandwidth);
            multi_dispatch.params.Print();
            printf(" ), single_dispatch( ");
            single_dispatch.params.Print();
            printf(" )\n");
            fflush(stdout);
        }

        AssertEquals(0, compare);

xgboost/cub/tune/tune_device_reduce.cu  view on Meta::CPAN


                best_avg_throughput = CUB_MAX(best_avg_throughput, single_kernels[j].avg_throughput);
            }

            // Print best throughput for this problem size
            printf("Best: %.2fe9 items/s (%.2f GB/s)\n", best_avg_throughput, best_avg_throughput * sizeof(T));

            // Accumulate speedup (inverse for harmonic mean)
            for (int j = 0; j < single_kernels.size(); ++j)
                single_kernels[j].hmean_speedup += best_avg_throughput / single_kernels[j].avg_throughput;
        }

        // Find max overall throughput and compute hmean speedups
        float overall_max_throughput = 0.0;
        for (int j = 0; j < single_kernels.size(); ++j)
        {
            overall_max_throughput = CUB_MAX(overall_max_throughput, single_kernels[j].best_avg_throughput);
            single_kernels[j].hmean_speedup = float(g_samples) / single_kernels[j].hmean_speedup;
        }

        // Sort by cumulative speedup
        sort(single_kernels.begin(), single_kernels.end(), MinSpeedup<SingleDispatchTuple>);

        // Print ranked single configurations
        printf("\nRanked single_kernels:\n");
        for (int j = 0; j < single_kernels.size(); ++j)
        {
            printf("\t (%d) params( ", single_kernels.size() - j);
            single_kernels[j].params.Print();
            printf(" ) hmean speedup: %.3f, best throughput %.2f @ %d elements (%.2f GB/s, %.2f%%)\n",
                single_kernels[j].hmean_speedup,
                single_kernels[j].best_avg_throughput,
                (int) single_kernels[j].best_size,
                single_kernels[j].best_avg_throughput * sizeof(T),
                single_kernels[j].best_avg_throughput / overall_max_throughput);
        }

        printf("\nMax single-block throughput %.2f (%.2f GB/s)\n", overall_max_throughput, overall_max_throughput * sizeof(T));
    }

};



//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------

/**
 * Main
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    args.GetCmdLineArgument("n", g_max_items);
    args.GetCmdLineArgument("s", g_samples);
    args.GetCmdLineArgument("i", g_timing_iterations);
    g_verbose = args.CheckCmdLineFlag("v");
    g_single = args.CheckCmdLineFlag("single");
    g_verify = !args.CheckCmdLineFlag("noverify");

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>] "
            "[--n=<max items>]"
            "[--s=<samples>]"
            "[--i=<timing iterations>]"
            "[--single]"
            "[--v]"
            "[--noverify]"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

#if (TUNE_SIZE == 1)
    typedef unsigned char T;
#elif (TUNE_SIZE == 2)
    typedef unsigned short T;
#elif (TUNE_SIZE == 4)
    typedef unsigned int T;
#elif (TUNE_SIZE == 8)
    typedef unsigned long long T;
#else
    // Default
    typedef unsigned int T;
#endif

    typedef unsigned int OffsetT;
    Sum reduction_op;

    // Enumerate kernels
    Schmoo<T, OffsetT, Sum > schmoo;
    schmoo.Enumerate();

    // Allocate host arrays
    T *h_in = new T[g_max_items];

    // Initialize problem
    Initialize(UNIFORM, h_in, g_max_items);

    // Initialize device arrays
    T *d_in = NULL;
    T *d_out = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * g_max_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * g_max_items, cudaMemcpyHostToDevice));

    // Test kernels
    if (g_single)
        schmoo.TestSingle(h_in, d_in, d_out, reduction_op);
    else
        schmoo.TestMulti(h_in, d_in, d_out, reduction_op);

    // Cleanup
    if (h_in) delete[] h_in;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));

    return 0;
}





( run in 0.641 second using v1.01-cache-2.11-cpan-13bb782fe5a )