Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
stream(stream),
debug_synchronous(debug_synchronous),
ptx_version(ptx_version),
is_overwrite_okay(is_overwrite_okay)
{}
//------------------------------------------------------------------------------
// Small-problem (single tile) invocation
//------------------------------------------------------------------------------
/// Invoke a single block to sort in-core
template <
typename ActivePolicyT, ///< Umbrella policy active for the target device
typename SingleTileKernelT> ///< Function type of cub::DeviceRadixSortSingleTileKernel
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t InvokeSingleTile(
SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
{
#ifndef CUB_RUNTIME_ENABLED
(void)single_tile_kernel;
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported );
#else
cudaError error = cudaSuccess;
do
{
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == NULL)
{
temp_storage_bytes = 1;
break;
}
// Return if empty problem
if (num_items == 0)
break;
// Log single_tile_kernel configuration
if (debug_synchronous)
_CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);
// Invoke upsweep_kernel with same grid size as downsweep_kernel
single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
d_keys.Current(),
d_keys.Alternate(),
d_values.Current(),
d_values.Alternate(),
num_items,
begin_bit,
end_bit);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Update selector
d_keys.selector ^= 1;
d_values.selector ^= 1;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
//------------------------------------------------------------------------------
// Normal problem size invocation
//------------------------------------------------------------------------------
/**
* Invoke a three-kernel sorting pass at the current bit.
*/
template <typename PassConfigT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t InvokePass(
const KeyT *d_keys_in,
KeyT *d_keys_out,
const ValueT *d_values_in,
ValueT *d_values_out,
OffsetT *d_spine,
int spine_length,
int ¤t_bit,
PassConfigT &pass_config)
{
cudaError error = cudaSuccess;
do
{
int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
// Log upsweep_kernel configuration
if (debug_synchronous)
_CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);
// Invoke upsweep_kernel with same grid size as downsweep_kernel
pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
d_keys_in,
d_spine,
num_items,
current_bit,
pass_bits,
pass_config.even_share);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Log scan_kernel configuration
if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);
// Invoke scan_kernel
pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
typename ActivePolicyT::ScanPolicy,
typename ActivePolicyT::DownsweepPolicy>(
upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break;
if ((error = alt_pass_config.template InitPassConfig<
typename ActivePolicyT::AltUpsweepPolicy,
typename ActivePolicyT::ScanPolicy,
typename ActivePolicyT::AltDownsweepPolicy>(
alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break;
// Get maximum spine length
int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;
// Temporary storage allocation requirements
void* allocations[3];
size_t allocation_sizes[3] =
{
spine_length * sizeof(OffsetT), // bytes needed for privatized block digit histograms
(is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer
(is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer
};
// Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == NULL)
return cudaSuccess;
// Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
int num_bits = end_bit - begin_bit;
int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
bool is_num_passes_odd = num_passes & 1;
int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits;
int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
// Alias the temporary storage allocations
OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);
DoubleBuffer<KeyT> d_keys_remaining_passes(
(is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
(is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());
DoubleBuffer<ValueT> d_values_remaining_passes(
(is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
(is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());
// Run first pass, consuming from the input's current buffers
int current_bit = begin_bit;
if (CubDebug(error = InvokePass(
d_keys.Current(), d_keys_remaining_passes.Current(),
d_values.Current(), d_values_remaining_passes.Current(),
d_spine, spine_length, current_bit,
(current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
// Run remaining passes
while (current_bit < end_bit)
{
if (CubDebug(error = InvokePass(
d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
d_spine, spine_length, current_bit,
(current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;;
// Invert selectors
d_keys_remaining_passes.selector ^= 1;
d_values_remaining_passes.selector ^= 1;
}
// Update selector
if (!is_overwrite_okay) {
num_passes = 1; // Sorted data always ends up in the other vector
}
d_keys.selector = (d_keys.selector + num_passes) & 1;
d_values.selector = (d_values.selector + num_passes) & 1;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
//------------------------------------------------------------------------------
// Chained policy invocation
//------------------------------------------------------------------------------
/// Invocation
template <typename ActivePolicyT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Invoke()
{
typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT;
// Force kernel code-generation in all compiler passes
if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
{
// Small, single tile size
return InvokeSingleTile<ActivePolicyT>(
DeviceRadixSortSingleTileKernel<MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>);
}
else
{
// Regular size
return InvokePasses<ActivePolicyT>(
DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>,
DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>,
RadixSortScanBinsKernel< MaxPolicyT, OffsetT>,
DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>,
DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>);
}
}
//------------------------------------------------------------------------------
// Dispatch entrypoints
//------------------------------------------------------------------------------
/**
* Internal dispatch routine
*/
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
OffsetT num_items, ///< [in] Number of items to sort
int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison
int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison
bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
return CubDebug(cudaErrorNotSupported );
#else
cudaError error = cudaSuccess;
do
{
// Init regular and alternate kernel configurations
PassConfig<SegmentedKernelT> pass_config, alt_pass_config;
if ((error = pass_config.template InitPassConfig<typename ActivePolicyT::SegmentedPolicy>(segmented_kernel))) break;
if ((error = alt_pass_config.template InitPassConfig<typename ActivePolicyT::AltSegmentedPolicy>(alt_segmented_kernel))) break;
// Temporary storage allocation requirements
void* allocations[2];
size_t allocation_sizes[2] =
{
(is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer
(is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer
};
// Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == NULL)
{
if (temp_storage_bytes == 0)
temp_storage_bytes = 1;
return cudaSuccess;
}
// Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
int num_bits = end_bit - begin_bit;
int num_passes = (num_bits + radix_bits - 1) / radix_bits;
bool is_num_passes_odd = num_passes & 1;
int max_alt_passes = (num_passes * radix_bits) - num_bits;
int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
DoubleBuffer<KeyT> d_keys_remaining_passes(
(is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
(is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[0]) : d_keys.Alternate());
DoubleBuffer<ValueT> d_values_remaining_passes(
(is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[1]),
(is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[1]) : d_values.Alternate());
// Run first pass, consuming from the input's current buffers
int current_bit = begin_bit;
if (CubDebug(error = InvokePass(
d_keys.Current(), d_keys_remaining_passes.Current(),
d_values.Current(), d_values_remaining_passes.Current(),
current_bit,
(current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
// Run remaining passes
while (current_bit < end_bit)
{
if (CubDebug(error = InvokePass(
d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
current_bit,
(current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
// Invert selectors and update current bit
d_keys_remaining_passes.selector ^= 1;
d_values_remaining_passes.selector ^= 1;
}
// Update selector
if (!is_overwrite_okay) {
num_passes = 1; // Sorted data always ends up in the other vector
}
d_keys.selector = (d_keys.selector + num_passes) & 1;
d_values.selector = (d_values.selector + num_passes) & 1;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
//------------------------------------------------------------------------------
// Chained policy invocation
//------------------------------------------------------------------------------
/// Invocation
template <typename ActivePolicyT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Invoke()
{
typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
// Force kernel code-generation in all compiler passes
return InvokePasses<ActivePolicyT>(
DeviceSegmentedRadixSortKernel<MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>,
DeviceSegmentedRadixSortKernel<MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>);
}
//------------------------------------------------------------------------------
// Dispatch entrypoints
//------------------------------------------------------------------------------
/// Internal dispatch routine
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
int num_items, ///< [in] Number of items to sort
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> dat...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data se...
int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison
int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison
bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
cudaError_t error;
do {
// Get PTX version
int ptx_version;
if (CubDebug(error = PtxVersion(ptx_version))) break;
// Create dispatch functor
DispatchSegmentedRadixSort dispatch(
d_temp_storage, temp_storage_bytes,
( run in 1.647 second using v1.01-cache-2.11-cpan-13bb782fe5a )