bind results from the CPAN

Alien-XGBoost

#include <iterator>
#include <vector>
#include <string>
#include <algorithm>
#include <stdio.h>

#include <cub/cub.cuh>

#include "coo_graph.cuh"
#include "../test/test_util.h"

using namespace cub;
using namespace std;


/******************************************************************************
 * Globals, constants, and typedefs
 ******************************************************************************/

typedef int         VertexId;   // uint32s as vertex ids
typedef double      Value;      // double-precision floating point values

bool                    g_verbose       = false;
int                     g_timing_iterations    = 1;
CachingDeviceAllocator  g_allocator;


/******************************************************************************
 * Texture referencing
 ******************************************************************************/

/**
 * Templated texture reference type for multiplicand vector
 */
template <typename Value>
struct TexVector
{
    // Texture type to actually use (e.g., because CUDA doesn't load doubles as texture items)
    typedef typename If<(Equals<Value, double>::VALUE), uint2, Value>::Type CastType;

    // Texture reference type
    typedef texture<CastType, cudaTextureType1D, cudaReadModeElementType> TexRef;

    static TexRef ref;

    /**
     * Bind textures
     */
    static void BindTexture(void *d_in, int elements)
    {
        cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<CastType>();
        if (d_in)
        {
            size_t offset;
            size_t bytes = sizeof(CastType) * elements;
            CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes));
        }
    }

    /**
     * Unbind textures
     */
    static void UnbindTexture()
    {
        CubDebugExit(cudaUnbindTexture(ref));
    }

    /**
     * Load
     */
    static __device__ __forceinline__ Value Load(int offset)
    {
        Value output;
        reinterpret_cast<typename TexVector<Value>::CastType &>(output) = tex1Dfetch(TexVector<Value>::ref, offset);
        return output;
    }
};

// Texture reference definitions
template <typename Value>
typename TexVector<Value>::TexRef TexVector<Value>::ref = 0;


/******************************************************************************
 * Utility types
 ******************************************************************************/


/**
 * A partial dot-product sum paired with a corresponding row-id
 */
template <typename VertexId, typename Value>
struct PartialProduct
{
    VertexId    row;            /// Row-id
    Value       partial;        /// PartialProduct sum
};


/**
 * A partial dot-product sum paired with a corresponding row-id (specialized for double-int pairings)
 */
template <>
struct PartialProduct<int, double>
{
    long long   row;            /// Row-id
    double      partial;        /// PartialProduct sum
};


/**
 * Reduce-value-by-row scan operator
 */
struct ReduceByKeyOp
{
    template <typename PartialProduct>
    __device__ __forceinline__ PartialProduct operator()(
        const PartialProduct &first,
        const PartialProduct &second)
    {
        PartialProduct retval;

        retval.partial = (second.row != first.row) ?
                second.partial :
                first.partial + second.partial;

xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN


    CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));

    // Run kernel (always run one iteration without timing)
    GpuTimer gpu_timer;
    float elapsed_millis = 0.0;
    for (int i = 0; i <= g_timing_iterations; i++)
    {
        gpu_timer.Start();

        // Initialize output
        CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));

        // Run the COO kernel
        CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
            even_share,
            d_block_partials,
            d_rows,
            d_columns,
            d_values,
            d_vector,
            d_result);

        if (coo_grid_size > 1)
        {
            // Run the COO finalize kernel
            CooFinalizeKernel<FINALIZE_BLOCK_THREADS, FINALIZE_ITEMS_PER_THREAD><<<1, FINALIZE_BLOCK_THREADS>>>(
                d_block_partials,
                num_partials,
                d_result);
        }

        gpu_timer.Stop();

        if (i > 0)
            elapsed_millis += gpu_timer.ElapsedMillis();
    }

    // Force any kernel stdio to screen
    CubDebugExit(cudaThreadSynchronize());
    fflush(stdout);

    // Display timing
    if (g_timing_iterations > 0)
    {
        float avg_elapsed = elapsed_millis / g_timing_iterations;
        int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim);
        printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n",
            g_timing_iterations,
            avg_elapsed,
            total_bytes / avg_elapsed / 1000.0 / 1000.0,
            num_edges * 2 / avg_elapsed / 1000.0 / 1000.0);
    }

    // Check results
    int compare = CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, true, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    // Cleanup
    TexVector<Value>::UnbindTexture();
    CubDebugExit(g_allocator.DeviceFree(d_block_partials));
    CubDebugExit(g_allocator.DeviceFree(d_rows));
    CubDebugExit(g_allocator.DeviceFree(d_columns));
    CubDebugExit(g_allocator.DeviceFree(d_values));
    CubDebugExit(g_allocator.DeviceFree(d_vector));
    CubDebugExit(g_allocator.DeviceFree(d_result));
    delete[] h_rows;
    delete[] h_columns;
    delete[] h_values;
}


/**
 * Compute reference answer on CPU
 */
template <typename VertexId, typename Value>
void ComputeReference(
    CooGraph<VertexId, Value>&  coo_graph,
    Value*                      h_vector,
    Value*                      h_reference)
{
    for (VertexId i = 0; i < coo_graph.row_dim; i++)
    {
        h_reference[i] = 0.0;
    }

    for (VertexId i = 0; i < coo_graph.coo_tuples.size(); i++)
    {
        h_reference[coo_graph.coo_tuples[i].row] +=
            coo_graph.coo_tuples[i].val *
            h_vector[coo_graph.coo_tuples[i].col];
    }
}


/**
 * Assign arbitrary values to vector items
 */
template <typename Value>
void AssignVectorValues(Value *vector, int col_dim)
{
    for (int i = 0; i < col_dim; i++)
    {
        vector[i] = 1.0;
    }
}


/**
 * Main
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("i", g_timing_iterations);

    // Print usage
    if (args.CheckCmdLineFlag("help"))

( run in 0.676 second using v1.01-cache-2.11-cpan-2398b32b56e )