Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/tune/tune_device_reduce.cu view on Meta::CPAN
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Evaluates different tuning configurations of DeviceReduce.
*
* The best way to use this program:
* (1) Find the best all-around single-block tune for a given arch.
* For example, 100 samples [1 ..512], 100 timing iterations per config per sample:
* ./bin/tune_device_reduce_sm200_nvvm_5.0_abi_i386 --i=100 --s=100 --n=512 --single --device=0
* (2) Update the single tune in device_reduce.cuh
* (3) Find the best all-around multi-block tune for a given arch.
* For example, 100 samples [single-block tile-size .. 50,331,648], 100 timing iterations per config per sample:
* ./bin/tune_device_reduce_sm200_nvvm_5.0_abi_i386 --i=100 --s=100 --device=0
* (4) Update the multi-block tune in device_reduce.cuh
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <vector>
#include <algorithm>
#include <stdio.h>
#include <cub/cub.cuh>
#include "../test/test_util.h"
using namespace cub;
using namespace std;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
#ifndef TUNE_ARCH
#define TUNE_ARCH 100
#endif
int g_max_items = 48 * 1024 * 1024;
int g_samples = 100;
int g_timing_iterations = 2;
bool g_verbose = false;
bool g_single = false;
bool g_verify = true;
CachingDeviceAllocator g_allocator;
//---------------------------------------------------------------------
// Host utility subroutines
//---------------------------------------------------------------------
/**
* Initialize problem
*/
template <typename T>
void Initialize(
GenMode gen_mode,
T *h_in,
int num_items)
{
for (int i = 0; i < num_items; ++i)
{
InitValue(gen_mode, h_in[i], i);
}
}
/**
* Sequential reduction
*/
template <typename T, typename ReductionOp>
T Reduce(
T *h_in,
ReductionOp reduction_op,
int num_items)
{
T retval = h_in[0];
for (int i = 1; i < num_items; ++i)
retval = reduction_op(retval, h_in[i]);
return retval;
}
//---------------------------------------------------------------------
// Full tile test generation
//---------------------------------------------------------------------
/**
* Wrapper structure for generating and running different tuning configurations
*/
template <
typename T,
typename OffsetT,
typename ReductionOp>
struct Schmoo
{
//---------------------------------------------------------------------
// Types
//---------------------------------------------------------------------
/// Pairing of kernel function pointer and corresponding dispatch params
xgboost/cub/tune/tune_device_reduce.cu view on Meta::CPAN
/// Enumerate thread-granularity variations
template <int BLOCK_THREADS>
void Enumerate()
{
Enumerate<BLOCK_THREADS, 7>();
Enumerate<BLOCK_THREADS, 8>();
Enumerate<BLOCK_THREADS, 9>();
Enumerate<BLOCK_THREADS, 11>();
Enumerate<BLOCK_THREADS, 12>();
Enumerate<BLOCK_THREADS, 13>();
Enumerate<BLOCK_THREADS, 15>();
Enumerate<BLOCK_THREADS, 16>();
Enumerate<BLOCK_THREADS, 17>();
Enumerate<BLOCK_THREADS, 19>();
Enumerate<BLOCK_THREADS, 20>();
Enumerate<BLOCK_THREADS, 21>();
Enumerate<BLOCK_THREADS, 23>();
Enumerate<BLOCK_THREADS, 24>();
Enumerate<BLOCK_THREADS, 25>();
}
/// Enumerate block size variations
void Enumerate()
{
printf("\nEnumerating kernels\n"); fflush(stdout);
Enumerate<32>();
Enumerate<64>();
Enumerate<96>();
Enumerate<128>();
Enumerate<160>();
Enumerate<192>();
Enumerate<256>();
Enumerate<512>();
}
//---------------------------------------------------------------------
// Test methods
//---------------------------------------------------------------------
/**
* Test a configuration
*/
void TestConfiguration(
MultiDispatchTuple &multi_dispatch,
SingleDispatchTuple &single_dispatch,
T* d_in,
T* d_out,
T* h_reference,
OffsetT num_items,
ReductionOp reduction_op)
{
// Clear output
if (g_verify) CubDebugExit(cudaMemset(d_out, 0, sizeof(T)));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceReduce::Dispatch(
d_temp_storage,
temp_storage_bytes,
multi_dispatch.kernel_ptr,
single_dispatch.kernel_ptr,
FillAndResetDrainKernel<OffsetT>,
multi_dispatch.params,
single_dispatch.params,
d_in,
d_out,
num_items,
reduction_op));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Warmup/correctness iteration
CubDebugExit(DeviceReduce::Dispatch(
d_temp_storage,
temp_storage_bytes,
multi_dispatch.kernel_ptr,
single_dispatch.kernel_ptr,
FillAndResetDrainKernel<OffsetT>,
multi_dispatch.params,
single_dispatch.params,
d_in,
d_out,
num_items,
reduction_op));
if (g_verify) CubDebugExit(cudaDeviceSynchronize());
// Copy out and display results
int compare = (g_verify) ?
CompareDeviceResults(h_reference, d_out, 1, true, false) :
0;
// Performance
GpuTimer gpu_timer;
float elapsed_millis = 0.0;
for (int i = 0; i < g_timing_iterations; i++)
{
gpu_timer.Start();
CubDebugExit(DeviceReduce::Dispatch(
d_temp_storage,
temp_storage_bytes,
multi_dispatch.kernel_ptr,
single_dispatch.kernel_ptr,
FillAndResetDrainKernel<OffsetT>,
multi_dispatch.params,
single_dispatch.params,
d_in,
d_out,
num_items,
reduction_op));
gpu_timer.Stop();
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Mooch
CubDebugExit(cudaDeviceSynchronize());
float avg_elapsed = elapsed_millis / g_timing_iterations;
float avg_throughput = float(num_items) / avg_elapsed / 1000.0 / 1000.0;
float avg_bandwidth = avg_throughput * sizeof(T);
multi_dispatch.avg_throughput = CUB_MAX(avg_throughput, multi_dispatch.avg_throughput);
if (avg_throughput > multi_dispatch.best_avg_throughput)
{
multi_dispatch.best_avg_throughput = avg_throughput;
multi_dispatch.best_size = num_items;
}
single_dispatch.avg_throughput = CUB_MAX(avg_throughput, single_dispatch.avg_throughput);
if (avg_throughput > single_dispatch.best_avg_throughput)
{
single_dispatch.best_avg_throughput = avg_throughput;
single_dispatch.best_size = num_items;
}
if (g_verbose)
{
printf("\t%.2f GB/s, multi_dispatch( ", avg_bandwidth);
multi_dispatch.params.Print();
printf(" ), single_dispatch( ");
single_dispatch.params.Print();
printf(" )\n");
fflush(stdout);
}
AssertEquals(0, compare);
xgboost/cub/tune/tune_device_reduce.cu view on Meta::CPAN
best_avg_throughput = CUB_MAX(best_avg_throughput, single_kernels[j].avg_throughput);
}
// Print best throughput for this problem size
printf("Best: %.2fe9 items/s (%.2f GB/s)\n", best_avg_throughput, best_avg_throughput * sizeof(T));
// Accumulate speedup (inverse for harmonic mean)
for (int j = 0; j < single_kernels.size(); ++j)
single_kernels[j].hmean_speedup += best_avg_throughput / single_kernels[j].avg_throughput;
}
// Find max overall throughput and compute hmean speedups
float overall_max_throughput = 0.0;
for (int j = 0; j < single_kernels.size(); ++j)
{
overall_max_throughput = CUB_MAX(overall_max_throughput, single_kernels[j].best_avg_throughput);
single_kernels[j].hmean_speedup = float(g_samples) / single_kernels[j].hmean_speedup;
}
// Sort by cumulative speedup
sort(single_kernels.begin(), single_kernels.end(), MinSpeedup<SingleDispatchTuple>);
// Print ranked single configurations
printf("\nRanked single_kernels:\n");
for (int j = 0; j < single_kernels.size(); ++j)
{
printf("\t (%d) params( ", single_kernels.size() - j);
single_kernels[j].params.Print();
printf(" ) hmean speedup: %.3f, best throughput %.2f @ %d elements (%.2f GB/s, %.2f%%)\n",
single_kernels[j].hmean_speedup,
single_kernels[j].best_avg_throughput,
(int) single_kernels[j].best_size,
single_kernels[j].best_avg_throughput * sizeof(T),
single_kernels[j].best_avg_throughput / overall_max_throughput);
}
printf("\nMax single-block throughput %.2f (%.2f GB/s)\n", overall_max_throughput, overall_max_throughput * sizeof(T));
}
};
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
args.GetCmdLineArgument("n", g_max_items);
args.GetCmdLineArgument("s", g_samples);
args.GetCmdLineArgument("i", g_timing_iterations);
g_verbose = args.CheckCmdLineFlag("v");
g_single = args.CheckCmdLineFlag("single");
g_verify = !args.CheckCmdLineFlag("noverify");
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--n=<max items>]"
"[--s=<samples>]"
"[--i=<timing iterations>]"
"[--single]"
"[--v]"
"[--noverify]"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
#if (TUNE_SIZE == 1)
typedef unsigned char T;
#elif (TUNE_SIZE == 2)
typedef unsigned short T;
#elif (TUNE_SIZE == 4)
typedef unsigned int T;
#elif (TUNE_SIZE == 8)
typedef unsigned long long T;
#else
// Default
typedef unsigned int T;
#endif
typedef unsigned int OffsetT;
Sum reduction_op;
// Enumerate kernels
Schmoo<T, OffsetT, Sum > schmoo;
schmoo.Enumerate();
// Allocate host arrays
T *h_in = new T[g_max_items];
// Initialize problem
Initialize(UNIFORM, h_in, g_max_items);
// Initialize device arrays
T *d_in = NULL;
T *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * g_max_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * g_max_items, cudaMemcpyHostToDevice));
// Test kernels
if (g_single)
schmoo.TestSingle(h_in, d_in, d_out, reduction_op);
else
schmoo.TestMulti(h_in, d_in, d_out, reduction_op);
// Cleanup
if (h_in) delete[] h_in;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
return 0;
}
( run in 0.641 second using v1.01-cache-2.11-cpan-13bb782fe5a )