Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/experimental/defunct/test_device_seg_reduce.cu view on Meta::CPAN
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* An implementation of segmented reduction using a load-balanced parallelization
* strategy based on the MergePath decision path.
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <iterator>
#include <vector>
#include <string>
#include <algorithm>
#include <stdio.h>
#include <cub/cub.cuh>
#include "test_util.h"
using namespace cub;
using namespace std;
/******************************************************************************
* Globals, constants, and typedefs
******************************************************************************/
bool g_verbose = false;
int g_timing_iterations = 1;
CachingDeviceAllocator g_allocator(true);
/******************************************************************************
* Utility routines
******************************************************************************/
/**
* An pair of index offsets
*/
template <typename OffsetT>
struct IndexPair
{
OffsetT a_idx;
OffsetT b_idx;
};
/**
* Computes the begin offsets into A and B for the specified
* location (diagonal) along the merge decision path
*/
template <
int BLOCK_THREADS,
typename IteratorA,
typename IteratorB,
typename OffsetT>
__device__ __forceinline__ void ParallelMergePathSearch(
OffsetT diagonal,
IteratorA a,
IteratorB b,
IndexPair<OffsetT> begin, // Begin offsets into a and b
IndexPair<OffsetT> end, // End offsets into a and b
IndexPair<OffsetT> &intersection) // [out] Intersection offsets into a and b
{
OffsetT a_split_min = CUB_MAX(diagonal - end.b_idx, begin.a_idx);
OffsetT a_split_max = CUB_MIN(diagonal, end.a_idx);
while (a_split_min < a_split_max)
{
OffsetT a_distance = a_split_max - a_split_min;
OffsetT a_slice = (a_distance + BLOCK_THREADS - 1) >> Log2<BLOCK_THREADS>::VALUE;
OffsetT a_split_pivot = CUB_MIN(a_split_min + (threadIdx.x * a_slice), end.a_idx - 1);
int move_up = (a[a_split_pivot] <= b[diagonal - a_split_pivot - 1]);
int num_up = __syncthreads_count(move_up);
/*
_CubLog("a_split_min(%d), a_split_max(%d) a_distance(%d), a_slice(%d), a_split_pivot(%d), move_up(%d), num_up(%d), a_begin(%d), a_end(%d)\n",
a_split_min, a_split_max, a_distance, a_slice, a_split_pivot, move_up, num_up, a_begin, a_end);
*/
a_split_max = CUB_MIN(num_up * a_slice, end.a_idx);
a_split_min = CUB_MAX(a_split_max - a_slice, begin.a_idx) + 1;
}
intersection.a_idx = CUB_MIN(a_split_min, end.a_idx);
intersection.b_idx = CUB_MIN(diagonal - a_split_min, end.b_idx);
}
/**
xgboost/cub/experimental/defunct/test_device_seg_reduce.cu view on Meta::CPAN
Initialize(UNIFORM, h_values, segment_offsets, num_values, avg_segment_size);
// Allocate simple offsets array and copy STL vector into it
h_segment_offsets = new OffsetT[segment_offsets.size()];
for (int i = 0; i < segment_offsets.size(); ++i)
h_segment_offsets[i] = segment_offsets[i];
OffsetT num_segments = segment_offsets.size() - 1;
if (g_verbose)
{
printf("%d segment offsets: ", num_segments);
for (int i = 0; i < num_segments; ++i)
std::cout << h_segment_offsets[i] << "(" << h_segment_offsets[i + 1] - h_segment_offsets[i] << "), ";
if (g_verbose) std::cout << std::endl << std::endl;
}
// Solve problem on host
h_reference = new Value[num_segments];
ComputeReference(h_values, h_segment_offsets, h_reference, num_segments, identity);
printf("\n\n%s cub::DeviceSegReduce::%s %d items (%d-byte %s), %d segments (%d-byte offset indices)\n",
(CDP) ? "CDP device invoked" : "Host-invoked",
(Equals<ReductionOp, Sum>::VALUE) ? "Sum" : "Reduce",
num_values, (int) sizeof(Value), type_string,
num_segments, (int) sizeof(OffsetT));
fflush(stdout);
// Allocate and initialize problem on device
Value *d_values = NULL;
OffsetT *d_segment_offsets = NULL;
Value *d_output = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * num_values));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_output, sizeof(Value) * num_segments));
CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_values, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
// Request and allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Clear device output
CubDebugExit(cudaMemset(d_output, 0, sizeof(Value) * num_segments));
// Run warmup/correctness iteration
CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, true));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_output, num_segments, true, g_verbose);
printf("\t%s", compare ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
for (int i = 0; i < g_timing_iterations; ++i)
{
CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
}
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_values) / avg_millis / 1000.0 / 1000.0;
float giga_bandwidth = giga_rate *
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
}
// Device cleanup
if (d_values) CubDebugExit(g_allocator.DeviceFree(d_values));
if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
if (d_output) CubDebugExit(g_allocator.DeviceFree(d_output));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Host cleanup
if (h_values) delete[] h_values;
if (h_segment_offsets) delete[] h_segment_offsets;
if (h_reference) delete[] h_reference;
}
/**
* Main
*/
int main(int argc, char** argv)
{
int num_values = 32 * 1024 * 1024;
int avg_segment_size = 500;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("n", num_values);
args.GetCmdLineArgument("ss", avg_segment_size);
args.GetCmdLineArgument("i", g_timing_iterations);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--v] "
"[--i=<timing iterations>] "
"[--n=<input samples>]\n"
"[--ss=<average segment size>]\n"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
Test<false>((int) num_values, avg_segment_size, Sum(), (long long) 0, CUB_TYPE_STRING(long long));
return 0;
}
( run in 0.849 second using v1.01-cache-2.11-cpan-71847e10f99 )