view release on metacpan or search on metacpan
xgboost/cub/cub/iterator/tex_ref_input_iterator.cuh
xgboost/cub/cub/iterator/transform_input_iterator.cuh
xgboost/cub/cub/thread/thread_load.cuh
xgboost/cub/cub/thread/thread_operators.cuh
xgboost/cub/cub/thread/thread_reduce.cuh
xgboost/cub/cub/thread/thread_scan.cuh
xgboost/cub/cub/thread/thread_search.cuh
xgboost/cub/cub/thread/thread_store.cuh
xgboost/cub/cub/util_allocator.cuh
xgboost/cub/cub/util_arch.cuh
xgboost/cub/cub/util_debug.cuh
xgboost/cub/cub/util_device.cuh
xgboost/cub/cub/util_macro.cuh
xgboost/cub/cub/util_namespace.cuh
xgboost/cub/cub/util_ptx.cuh
xgboost/cub/cub/util_type.cuh
xgboost/cub/cub/warp/specializations/warp_reduce_shfl.cuh
xgboost/cub/cub/warp/specializations/warp_reduce_smem.cuh
xgboost/cub/cub/warp/specializations/warp_scan_shfl.cuh
xgboost/cub/cub/warp/specializations/warp_scan_smem.cuh
xgboost/cub/cub/warp/warp_reduce.cuh
xgboost/R-package/configure view on Meta::CPAN
fi
ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work"
ac_abs_confdir=`(
cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg"
pwd)`
# When building in place, set srcdir=.
if test "$ac_abs_confdir" = "$ac_pwd"; then
srcdir=.
fi
# Remove unnecessary trailing slashes from srcdir.
# Double slashes in file names in object file debugging info
# mess up M-x gdb in Emacs.
case $srcdir in
*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;;
esac
for ac_var in $ac_precious_vars; do
eval ac_env_${ac_var}_set=\${${ac_var}+set}
eval ac_env_${ac_var}_value=\$${ac_var}
eval ac_cv_env_${ac_var}_set=\${${ac_var}+set}
eval ac_cv_env_${ac_var}_value=\$${ac_var}
done
xgboost/R-package/configure view on Meta::CPAN
gives unlimited permission to copy, distribute and modify it.
_ACEOF
exit
fi
## ------------------------ ##
## Autoconf initialization. ##
## ------------------------ ##
cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by xgboost $as_me 0.6-3, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
_ACEOF
exec 5>>config.log
{
cat <<_ASUNAME
xgboost/R-package/configure view on Meta::CPAN
done
{ ac_configure_args0=; unset ac_configure_args0;}
{ ac_configure_args1=; unset ac_configure_args1;}
# When interrupted or exit'd, cleanup temporary files, and complete
# config.log. We remove comments because anyway the quotes in there
# would cause problems or look ugly.
# WARNING: Use '\'' to represent an apostrophe within the trap.
# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug.
trap 'exit_status=$?
# Save into config.log some information that might help in debugging.
{
echo
$as_echo "## ---------------- ##
## Cache variables. ##
## ---------------- ##"
echo
# The following way of writing the cache mishandles newlines in values,
(
for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do
xgboost/R-package/configure view on Meta::CPAN
ac_write_fail=0
ac_clean_files_save=$ac_clean_files
ac_clean_files="$ac_clean_files $CONFIG_STATUS"
{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5
$as_echo "$as_me: creating $CONFIG_STATUS" >&6;}
as_write_fail=0
cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1
#! $SHELL
# Generated by $as_me.
# Run this file to recreate the current configuration.
# Compiler output produced by configure, useful for debugging
# configure, is in config.log if it exists.
debug=false
ac_cs_recheck=false
ac_cs_silent=false
SHELL=\${CONFIG_SHELL-$SHELL}
export SHELL
_ASEOF
cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1
## -------------------- ##
## M4sh Initialization. ##
## -------------------- ##
xgboost/R-package/configure view on Meta::CPAN
from templates according to the current configuration. Unless the files
and actions are specified as TAGs, all are instantiated by default.
Usage: $0 [OPTION]... [TAG]...
-h, --help print this help, then exit
-V, --version print version number and configuration settings, then exit
--config print configuration, then exit
-q, --quiet, --silent
do not print progress messages
-d, --debug don't remove temporary files
--recheck update $as_me by reconfiguring in the same conditions
--file=FILE[:TEMPLATE]
instantiate the configuration file FILE
Configuration files:
$config_files
Report bugs to the package provider."
_ACEOF
xgboost/R-package/configure view on Meta::CPAN
esac
case $ac_option in
# Handling of the options.
-recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
ac_cs_recheck=: ;;
--version | --versio | --versi | --vers | --ver | --ve | --v | -V )
$as_echo "$ac_cs_version"; exit ;;
--config | --confi | --conf | --con | --co | --c )
$as_echo "$ac_cs_config"; exit ;;
--debug | --debu | --deb | --de | --d | -d )
debug=: ;;
--file | --fil | --fi | --f )
$ac_shift
case $ac_optarg in
*\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
'') as_fn_error $? "missing file argument" ;;
esac
as_fn_append CONFIG_FILES " '$ac_optarg'"
ac_need_defaults=false;;
--he | --h | --help | --hel | -h )
$as_echo "$ac_cs_usage"; exit ;;
xgboost/R-package/configure view on Meta::CPAN
# then the envvar interface is used. Set only those that are not.
# We use the long form for the default assignment because of an extremely
# bizarre bug on SunOS 4.1.3.
if $ac_need_defaults; then
test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files
fi
# Have a temporary directory for convenience. Make it in the build tree
# simply because there is no reason against having it here, and in addition,
# creating and moving files from /tmp can sometimes cause problems.
# Hook for its removal unless debugging.
# Note that there is a small window in which the directory will not be cleaned:
# after its creation but before its name has been assigned to `$tmp'.
$debug ||
{
tmp= ac_tmp=
trap 'exit_status=$?
: "${ac_tmp:=$tmp}"
{ test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status
' 0
trap 'as_fn_exit 1' 1 2 13 15
}
# Create a (secure) tmp directory for tmp files.
xgboost/cmake/Utils.cmake view on Meta::CPAN
)
foreach(variable ${variables})
if(${variable} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${variable} "${${variable}}")
set(${variable} "${${variable}}" PARENT_SCOPE)
endif()
endforeach()
endif()
endfunction(msvc_use_static_runtime)
# Set output directory of target, ignoring debug or release
function(set_output_directory target dir)
set_target_properties(${target} PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${dir}
RUNTIME_OUTPUT_DIRECTORY_DEBUG ${dir}
RUNTIME_OUTPUT_DIRECTORY_RELEASE ${dir}
LIBRARY_OUTPUT_DIRECTORY ${dir}
LIBRARY_OUTPUT_DIRECTORY_DEBUG ${dir}
LIBRARY_OUTPUT_DIRECTORY_RELEASE ${dir}
)
endfunction(set_output_directory)
xgboost/cub/CHANGE_LOG.TXT view on Meta::CPAN
different from output sequence types (as long as they are coercible)
value") for seeding the computation with an arbitrary prefix
- Reduce repository size (move doxygen binary to doc repository)
- Minor reductions in block-scan instruction count
- Bug fixes:
- Issue #55: warning in cub/device/dispatch/dispatch_reduce_by_key.cuh
- Issue #59: cub::DeviceScan::ExclusiveSum can't prefix sum of float into double
- Issue #58: Infinite loop in cub::CachingDeviceAllocator::NearestPowerOf
- Issue #47: Caching allocator needs to clean up cuda error upon successful retry
- Issue #46: Very high amount of needed memory from the cub::DeviceHistogram::HistogramEven routine
- Issue #45: Caching Device Allocator fails with debug output enabled
- Fix for generic-type reduce-by-key warpscan (sm3.x and newer)
//-----------------------------------------------------------------------------
1.5.2 03/21/2016
- Improved medium-size scan performance for sm5x (Maxwell)
- Refactored caching allocator for device memory
- Spends less time locked
- Failure to allocate a block from the runtime will retry once after
freeing cached allocations
xgboost/cub/CHANGE_LOG.TXT view on Meta::CPAN
allocated dynamically by the host at the kernel call site.
- Most CUB programs having typical 1D usage should not require any
changes to accomodate these updates.
- Added new "combination" WarpScan methods for efficiently computing
both inclusive and exclusive prefix scans (and sums).
- Bug fixes:
- Fixed bug in cub::WarpScan (which affected cub::BlockScan and
cub::DeviceScan) where incorrect results (e.g., NAN) would often be
returned when parameterized for floating-point types (fp32, fp64).
- Workaround-fix for ptxas error when compiling with with -G flag on Linux
(for debug instrumentation)
- Misc. workaround-fixes for certain scan scenarios (using custom
scan operators) where code compiled for SM1x is run on newer
GPUs of higher compute-capability: the compiler could not tell
which memory space was being used collective operations and was
mistakenly using global ops instead of shared ops.
//-----------------------------------------------------------------------------
1.2.3 04/01/2014
- Bug fixes:
xgboost/cub/CHANGE_LOG.TXT view on Meta::CPAN
- Added descending sorting to DeviceRadixSort and BlockRadixSort
- Added min, max, arg-min, and arg-max to DeviceReduce
- Added DeviceSelect (select-unique, select-if, and select-flagged)
- Added DevicePartition (partition-if, partition-flagged)
- Added generic cub::ShuffleUp(), cub::ShuffleDown(), and cub::ShuffleIndex() for warp-wide communication of arbitrary data types (SM3x+)
- Added cub::MaxSmOccupancy() for accurately determining SM occupancy for any given kernel function pointer
- Performance
- Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
- Interface changes:
- Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces. The CacheModifiedInputIteratorTand CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
- Rename device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
- Documentation and testing:
- Added simple examples of device-wide methods
- Improved doxygen documentation and example snippets
- Improved test coverege to include up to 21,000 kernel variants and 851,000 unit tests (per architecture, per platform)
- Bug fixes
- Fixed misc DeviceScan, BlockScan, DeviceReduce, and BlockReduce bugs when operating on non-primitive types for older architectures SM10-SM13
- Fixed DeviceScan / WarpReduction bug: SHFL-based segmented reduction producting incorrect results for multi-word types (size > 4B) on Linux
- Fixed BlockScan bug: For warpscan-based scans, not all threads in the first warp were entering the prefix callback functor
- Fixed DeviceRadixSort bug: race condition with key-value pairs for pre-SM35 architectures
- Fixed DeviceRadixSort bug: incorrect bitfield-extract behavior with long keys on 64bit Linux
xgboost/cub/common.mk view on Meta::CPAN
ifeq ($(verbose), 1)
NVCCFLAGS += -v
endif
# [keep=<0|1>] Keep intermediate compilation artifacts option
ifeq ($(keep), 1)
NVCCFLAGS += -keep
endif
# [debug=<0|1>] Generate debug mode code
ifeq ($(debug), 1)
NVCCFLAGS += -G
endif
#-------------------------------------------------------------------------------
# Compiler and compilation platform
#-------------------------------------------------------------------------------
CUB_DIR = $(dir $(lastword $(MAKEFILE_LIST)))
xgboost/cub/cub/cub.cuh view on Meta::CPAN
#include "iterator/cache_modified_output_iterator.cuh"
#include "iterator/constant_input_iterator.cuh"
#include "iterator/counting_input_iterator.cuh"
#include "iterator/tex_obj_input_iterator.cuh"
#include "iterator/tex_ref_input_iterator.cuh"
#include "iterator/transform_input_iterator.cuh"
// Util
#include "util_allocator.cuh"
#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_device.cuh"
#include "util_macro.cuh"
#include "util_ptx.cuh"
#include "util_type.cuh"
xgboost/cub/cub/device/device_histogram.cuh view on Meta::CPAN
static cudaError_t HistogramEven(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples.
CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is <tt>num_levels</tt> - 1.
LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
OffsetT num_samples, ///< [in] The number of input samples (i.e., the length of \p d_samples)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
/// The sample value type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
CounterT* d_histogram1[1] = {d_histogram};
int num_levels1[1] = {num_levels};
LevelT lower_level1[1] = {lower_level};
LevelT upper_level1[1] = {upper_level};
return MultiHistogramEven<1, 1>(
xgboost/cub/cub/device/device_histogram.cuh view on Meta::CPAN
temp_storage_bytes,
d_samples,
d_histogram1,
num_levels1,
lower_level1,
upper_level1,
num_samples,
1,
sizeof(SampleT) * num_samples,
stream,
debug_synchronous);
}
/**
* \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
*
* \par
* - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
* using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
* - The row stride must be a whole multiple of the sample data type
xgboost/cub/cub/device/device_histogram.cuh view on Meta::CPAN
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples.
CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is <tt>num_levels</tt> - 1.
LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
CounterT* d_histogram1[1] = {d_histogram};
int num_levels1[1] = {num_levels};
LevelT lower_level1[1] = {lower_level};
LevelT upper_level1[1] = {upper_level};
return MultiHistogramEven<1, 1>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram1,
num_levels1,
lower_level1,
upper_level1,
num_row_samples,
num_rows,
row_stride_bytes,
stream,
debug_synchronous);
}
/**
* \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
*
* \par
* - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
* a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
* - Of the \p NUM_CHANNELS specified, the function will only compute histograms
* for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
xgboost/cub/cub/device/device_histogram.cuh view on Meta::CPAN
static cudaError_t MultiHistogramEven(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels wher...
CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> s...
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num...
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
/// The sample value type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram,
num_levels,
lower_level,
upper_level,
num_pixels,
1,
sizeof(SampleT) * NUM_CHANNELS * num_pixels,
stream,
debug_synchronous);
}
/**
* \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
*
* \par
* - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
* a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
* - Of the \p NUM_CHANNELS specified, the function will only compute histograms
xgboost/cub/cub/device/device_histogram.cuh view on Meta::CPAN
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels wher...
CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> s...
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num...
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
/// The sample value type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
Int2Type<sizeof(SampleT) == 1> is_byte_sample;
if ((sizeof(OffsetT) > sizeof(int)) &&
((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
{
// Down-convert OffsetT data type
return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven(
d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
(int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
stream, debug_synchronous, is_byte_sample);
}
return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven(
d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
stream, debug_synchronous, is_byte_sample);
}
//@} end member group
/******************************************************************//**
* \name Custom bin ranges
*********************************************************************/
//@{
/**
xgboost/cub/cub/device/device_histogram.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t HistogramRange(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples.
CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is <tt>num_levels</tt> - 1.
LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample valu...
OffsetT num_samples, ///< [in] The number of data samples per row in the region of interest
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
/// The sample value type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
CounterT* d_histogram1[1] = {d_histogram};
int num_levels1[1] = {num_levels};
LevelT* d_levels1[1] = {d_levels};
return MultiHistogramRange<1, 1>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram1,
num_levels1,
d_levels1,
num_samples,
1,
sizeof(SampleT) * num_samples,
stream,
debug_synchronous);
}
/**
* \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
*
* \par
* - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
* using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
* - The row stride must be a whole multiple of the sample data type
xgboost/cub/cub/device/device_histogram.cuh view on Meta::CPAN
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples.
CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is <tt>num_levels</tt> - 1.
LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample valu...
OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
CounterT* d_histogram1[1] = {d_histogram};
int num_levels1[1] = {num_levels};
LevelT* d_levels1[1] = {d_levels};
return MultiHistogramRange<1, 1>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram1,
num_levels1,
d_levels1,
num_row_samples,
num_rows,
row_stride_bytes,
stream,
debug_synchronous);
}
/**
* \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
*
* \par
* - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
* a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
* - Of the \p NUM_CHANNELS specified, the function will only compute histograms
* for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
xgboost/cub/cub/device/device_histogram.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t MultiHistogramRange(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> shoul...
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
/// The sample value type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram,
num_levels,
d_levels,
num_pixels,
1,
sizeof(SampleT) * NUM_CHANNELS * num_pixels,
stream,
debug_synchronous);
}
/**
* \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
*
* \par
* - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
* a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
* - Of the \p NUM_CHANNELS specified, the function will only compute histograms
xgboost/cub/cub/device/device_histogram.cuh view on Meta::CPAN
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> shoul...
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
/// The sample value type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
Int2Type<sizeof(SampleT) == 1> is_byte_sample;
if ((sizeof(OffsetT) > sizeof(int)) &&
((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
{
// Down-convert OffsetT data type
return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
(int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
stream, debug_synchronous, is_byte_sample);
}
return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
stream, debug_synchronous, is_byte_sample);
}
//@} end member group
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/device_partition.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Flagged(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
int num_items, ///< [in] Total number of items to select from
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef NullType SelectOp; // Selection op (not used)
typedef NullType EqualityOp; // Equality operator (not used)
return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_flags,
d_out,
d_num_selected_out,
SelectOp(),
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
/**
* \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. 
*
* \par
* - Copies of the selected items are compacted into \p d_out and maintain their original
* relative ordering, however copies of the unselected items are compacted into the
* rear of \p d_out in reverse order.
xgboost/cub/cub/device/device_partition.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t If(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
int num_items, ///< [in] Total number of items to select from
SelectOp select_op, ///< [in] Unary selection operator
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef NullType* FlagIterator; // FlagT iterator type (not used)
typedef NullType EqualityOp; // Equality operator (not used)
return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
NULL,
d_out,
d_num_selected_out,
select_op,
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
};
/**
* \example example_device_partition_flagged.cu
* \example example_device_partition_if.cu
*/
} // CUB namespace
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort
KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data
const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items
ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
/**
* \brief Sorts key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
*
* \par
* - The sorting operation is given a pair of key buffers and a corresponding
* pair of associated value buffers. Each pair is managed by a DoubleBuffer
* structure that indicates which of the two buffers is "current" (and thus
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t SortPairs(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
/**
* \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
*
* \par
* - The contents of the input data are not altered by the sorting operation
* - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
* - \devicestorageNP For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort
KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data
const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items
ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
/**
* \brief Sorts key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
*
* \par
* - The sorting operation is given a pair of key buffers and a corresponding
* pair of associated value buffers. Each pair is managed by a DoubleBuffer
* structure that indicates which of the two buffers is "current" (and thus
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t SortPairsDescending(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
//@} end member group
/******************************************************************//**
* \name Keys-only
*********************************************************************/
//@{
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t SortKeys(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort
KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
// Null value type
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<NullType> d_values;
return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
/**
* \brief Sorts keys into ascending order. (~<em>N </em>auxiliary storage required).
*
* \par
* - The sorting operation is given a pair of key buffers managed by a
* DoubleBuffer structure that indicates which of the two buffers is
* "current" (and thus contains the input data to be sorted).
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
template <typename KeyT>
CUB_RUNTIME_FUNCTION
static cudaError_t SortKeys(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted out...
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
// Null value type
DoubleBuffer<NullType> d_values;
return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
/**
* \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required).
*
* \par
* - The contents of the input data are not altered by the sorting operation
* - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
* - \devicestorageNP For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
* - \devicestorage
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t SortKeysDescending(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort
KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<NullType> d_values;
return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
/**
* \brief Sorts keys into descending order. (~<em>N </em>auxiliary storage required).
*
* \par
* - The sorting operation is given a pair of key buffers managed by a
* DoubleBuffer structure that indicates which of the two buffers is
* "current" (and thus contains the input data to be sorted).
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
template <typename KeyT>
CUB_RUNTIME_FUNCTION
static cudaError_t SortKeysDescending(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted out...
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
// Null value type
DoubleBuffer<NullType> d_values;
return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
//@} end member group
};
/**
* \example example_device_radix_sort.cu
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t Reduce(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
ReductionOpT reduction_op, ///< [in] Binary reduction functor
T init, ///< [in] Initial value of the reduction
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_items,
reduction_op,
init,
stream,
debug_synchronous);
}
/**
* \brief Computes a device-wide sum using the addition (\p +) operator.
*
* \par
* - Uses \p 0 as the initial value of the reduction.
* - Does not support \p + operators that are non-commutative..
* - \devicestorage
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t Sum(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_items,
cub::Sum(),
OutputT(), // zero-initialize
stream,
debug_synchronous);
}
/**
* \brief Computes a device-wide minimum using the less-than ('<') operator.
*
* \par
* - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction.
* - Does not support \p < operators that are non-commutative.
* - \devicestorage
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t Min(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_items,
cub::Min(),
Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
stream,
debug_synchronous);
}
/**
* \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
*
* \par
* - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
* - The minimum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
* - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t ArgMin(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
// The output tuple type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_indexed_in,
d_out,
num_items,
cub::ArgMin(),
initial_value,
stream,
debug_synchronous);
}
/**
* \brief Computes a device-wide maximum using the greater-than ('>') operator.
*
* \par
* - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
* - Does not support \p > operators that are non-commutative.
* - \devicestorage
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t Max(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_items,
cub::Max(),
Traits<InputT>::Lowest(), // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
stream,
debug_synchronous);
}
/**
* \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item
*
* \par
* - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
* - The maximum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
* - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t ArgMax(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
// The output tuple type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_indexed_in,
d_out,
num_items,
cub::ArgMax(),
initial_value,
stream,
debug_synchronous);
}
/**
* \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
*
* \par
* This operation computes segmented reductions within \p d_values_in using
* the specified binary \p reduction_op functor. The segments are identified by
* "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys
UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run)
ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values
AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
ReductionOpT reduction_op, ///< [in] Binary reduction functor
int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
// FlagT iterator type (not used)
// Selection op (not used)
// Default == operator
typedef Equality EqualityOp;
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
temp_storage_bytes,
d_keys_in,
d_unique_out,
d_values_in,
d_aggregates_out,
d_num_runs_out,
EqualityOp(),
reduction_op,
num_items,
stream,
debug_synchronous);
}
};
/**
* \example example_device_reduce.cu
*/
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/device_run_length_encode.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Encode(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of keys
UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run)
LengthsOutputIteratorT d_counts_out, ///< [out] Pointer to the output sequence of run-lengths (one count per run)
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs
int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef NullType* FlagIterator; // FlagT iterator type (not used)
typedef NullType SelectOp; // Selection op (not used)
typedef Equality EqualityOp; // Default == operator
typedef cub::Sum ReductionOp; // Value reduction operator
// The lengths output value type
typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ?
OffsetT, // ... then the OffsetT type,
xgboost/cub/cub/device/device_run_length_encode.cuh view on Meta::CPAN
temp_storage_bytes,
d_in,
d_unique_out,
LengthsInputIteratorT((LengthT) 1),
d_counts_out,
d_num_runs_out,
EqualityOp(),
ReductionOp(),
num_items,
stream,
debug_synchronous);
}
/**
* \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in.
*
* \par
* - For the <em>i</em><sup>th</sup> non-trivial run, the run's starting offset
* and its length are written to <tt>d_offsets_out[<em>i</em>]</tt> and
* <tt>d_lengths_out[<em>i</em>]</tt>, respectively.
xgboost/cub/cub/device/device_run_length_encode.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t NonTrivialRuns(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to input sequence of data items
OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run)
LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run)
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef Equality EqualityOp; // Default == operator
return DeviceRleDispatch<InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_offsets_out,
d_lengths_out,
d_num_runs_out,
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/device_scan.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t ExclusiveSum(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
// Initial value
xgboost/cub/cub/device/device_scan.cuh view on Meta::CPAN
return DispatchScan<InputIteratorT, OutputIteratorT, Sum, OutputT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
Sum(),
init_value,
num_items,
stream,
debug_synchronous);
}
/**
* \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. The \p init_value value is applied as the initial value, and is assigned to *d_out.
*
* \par
* - Supports non-commutative scan operators.
* - \devicestorage
*
xgboost/cub/cub/device/device_scan.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t ExclusiveScan(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
ScanOpT scan_op, ///< [in] Binary scan functor
InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out)
int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
init_value,
num_items,
stream,
debug_synchronous);
}
//@} end member group
/******************************************************************//**
* \name Inclusive scans
*********************************************************************/
//@{
xgboost/cub/cub/device/device_scan.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t InclusiveSum(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchScan<InputIteratorT, OutputIteratorT, Sum, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
Sum(),
NullType(),
num_items,
stream,
debug_synchronous);
}
/**
* \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
*
* \par
* - Supports non-commutative scan operators.
* - \devicestorage
*
xgboost/cub/cub/device/device_scan.cuh view on Meta::CPAN
typename ScanOpT>
CUB_RUNTIME_FUNCTION
static cudaError_t InclusiveScan(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
ScanOpT scan_op, ///< [in] Binary scan functor
int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
NullType(),
num_items,
stream,
debug_synchronous);
}
//@} end member group
};
/**
* \example example_device_scan.cu
*/
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data
const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
/**
* \brief Sorts segments of key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
*
* \par
* - The sorting operation is given a pair of key buffers and a corresponding
* pair of associated value buffers. Each pair is managed by a DoubleBuffer
* structure that indicates which of the two buffers is "current" (and thus
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>...
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
/**
* \brief Sorts segments of key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
*
* \par
* - The contents of the input data are not altered by the sorting operation
* - When input a contiguous sequence of segments, a single sequence
* \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data
const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
/**
* \brief Sorts segments of key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
*
* \par
* - The sorting operation is given a pair of key buffers and a corresponding
* pair of associated value buffers. Each pair is managed by a DoubleBuffer
* structure that indicates which of the two buffers is "current" (and thus
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>...
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
//@} end member group
/******************************************************************//**
* \name Keys-only
*********************************************************************/
//@{
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort
KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
// Null value type
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<NullType> d_values;
return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
/**
* \brief Sorts segments of keys into ascending order. (~<em>N </em>auxiliary storage required).
*
* \par
* - The sorting operation is given a pair of key buffers managed by a
* DoubleBuffer structure that indicates which of the two buffers is
* "current" (and thus contains the input data to be sorted).
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted out...
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
// Null value type
DoubleBuffer<NullType> d_values;
return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
/**
* \brief Sorts segments of keys into descending order. (~<em>2N</em> auxiliary storage required).
*
* \par
* - The contents of the input data are not altered by the sorting operation
* - When input a contiguous sequence of segments, a single sequence
* \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
* for both the \p d_begin_offsets and \p d_end_offsets parameters (where
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort
KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<NullType> d_values;
return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
/**
* \brief Sorts segments of keys into descending order. (~<em>N </em>auxiliary storage required).
*
* \par
* - The sorting operation is given a pair of key buffers managed by a
* DoubleBuffer structure that indicates which of the two buffers is
* "current" (and thus contains the input data to be sorted).
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted out...
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
// Null value type
DoubleBuffer<NullType> d_values;
return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
//@} end member group
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/device_segmented_reduce.cuh view on Meta::CPAN
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the sorting data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</...
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>...
ReductionOp reduction_op, ///< [in] Binary reduction functor
T initial_value, ///< [in] Initial value of the reduction for each segment
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Defa...
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOp>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
reduction_op,
initial_value,
stream,
debug_synchronous);
}
/**
* \brief Computes a device-wide segmented sum using the addition ('+') operator.
*
* \par
* - Uses \p 0 as the initial value of the reduction for each segment.
* - When input a contiguous sequence of segments, a single sequence
* \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
xgboost/cub/cub/device/device_segmented_reduce.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t Sum(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the sorting data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</...
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>...
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Defa...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::Sum(),
OutputT(), // zero-initialize
stream,
debug_synchronous);
}
/**
* \brief Computes a device-wide segmented minimum using the less-than ('<') operator.
*
* \par
* - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction for each segment.
* - When input a contiguous sequence of segments, a single sequence
* \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
xgboost/cub/cub/device/device_segmented_reduce.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t Min(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the sorting data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</...
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>...
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Defa...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::Min(),
Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
stream,
debug_synchronous);
}
/**
* \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item.
*
* \par
* - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
* - The minimum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
* - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
xgboost/cub/cub/device/device_segmented_reduce.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t ArgMin(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the sorting data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</...
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>...
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Defa...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
// The output tuple type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
xgboost/cub/cub/device/device_segmented_reduce.cuh view on Meta::CPAN
d_temp_storage,
temp_storage_bytes,
d_indexed_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::ArgMin(),
initial_value,
stream,
debug_synchronous);
}
/**
* \brief Computes a device-wide segmented maximum using the greater-than ('>') operator.
*
* \par
* - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
* - When input a contiguous sequence of segments, a single sequence
* \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
xgboost/cub/cub/device/device_segmented_reduce.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t Max(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the sorting data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</...
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>...
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Defa...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::Max(),
Traits<InputT>::Lowest(), // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
stream,
debug_synchronous);
}
/**
* \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item
*
* \par
* - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
* - The maximum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
* - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
xgboost/cub/cub/device/device_segmented_reduce.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t ArgMax(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the sorting data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</...
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>...
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Defa...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
// The output tuple type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
xgboost/cub/cub/device/device_segmented_reduce.cuh view on Meta::CPAN
d_temp_storage,
temp_storage_bytes,
d_indexed_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::ArgMax(),
initial_value,
stream,
debug_synchronous);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/device_select.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Flagged(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items
NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef NullType SelectOp; // Selection op (not used)
typedef NullType EqualityOp; // Equality operator (not used)
return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_flags,
d_out,
d_num_selected_out,
SelectOp(),
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
/**
* \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. 
*
* \par
* - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
* - \devicestorage
*
xgboost/cub/cub/device/device_select.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t If(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items
NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
SelectOp select_op, ///< [in] Unary selection operator
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef NullType* FlagIterator; // FlagT iterator type (not used)
typedef NullType EqualityOp; // Equality operator (not used)
return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
NULL,
d_out,
d_num_selected_out,
select_op,
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
/**
* \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out. 
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef NullType* FlagIterator; // FlagT iterator type (not used)
typedef NullType SelectOp; // Selection op (not used)
typedef Equality EqualityOp; // Default == operator
return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
NULL,
d_out,
d_num_selected_out,
SelectOp(),
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
};
/**
* \example example_device_select_flagged.cu
* \example example_device_select_if.cu
* \example example_device_select_unique.cu
*/
xgboost/cub/cub/device/device_spmv.cuh view on Meta::CPAN
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-valued.)
ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
int num_rows, ///< [in] number of rows of matrix <b>A</b>.
int num_cols, ///< [in] number of columns of matrix <b>A</b>.
int num_nonzeros, ///< [in] number of nonzero elements of matrix <b>A</b>.
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
SpmvParams<ValueT, int> spmv_params;
spmv_params.d_values = d_values;
spmv_params.d_row_end_offsets = d_row_offsets + 1;
spmv_params.d_column_indices = d_column_indices;
spmv_params.d_vector_x = d_vector_x;
spmv_params.d_vector_y = d_vector_y;
spmv_params.num_rows = num_rows;
spmv_params.num_cols = num_cols;
spmv_params.num_nonzeros = num_nonzeros;
spmv_params.alpha = 1.0;
spmv_params.beta = 0.0;
return DispatchSpmv<ValueT, int>::Dispatch(
d_temp_storage,
temp_storage_bytes,
spmv_params,
stream,
debug_synchronous);
}
//@} end member group
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
* cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include <limits>
#include "../../agent/agent_histogram.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i<...
OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel
int max_num_output_bins, ///< [in] Maximum number of output bins in any channel
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
DeviceHistogramInitKernelT histogram_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
DeviceHistogramSweepKernelT histogram_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
KernelConfig histogram_sweep_config, ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
#ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);
#else
cudaError error = cudaSuccess;
do
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
// Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
int histogram_init_block_threads = 256;
int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
// Log DeviceHistogramInitKernel configuration
if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
// Invoke histogram_init_kernel
histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
num_output_bins_wrapper,
d_output_histograms_wrapper,
tile_queue);
// Return if empty problem
if ((blocks_per_row == 0) || (blocks_per_col == 0))
break;
// Log histogram_sweep_kernel configuration
if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
// Invoke histogram_sweep_kernel
histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
d_samples,
num_output_bins_wrapper,
num_privatized_bins_wrapper,
d_output_histograms_wrapper,
d_privatized_histograms_wrapper,
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
num_row_pixels,
num_rows,
row_stride_samples,
tiles_per_row,
tile_queue);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels wher...
CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> s...
int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num...
LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries ...
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
Int2Type<false> is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
num_output_levels,
output_decode_op,
max_num_output_bins,
num_row_pixels,
num_rows,
row_stride_samples,
DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
histogram_sweep_config,
stream,
debug_synchronous))) break;
}
else
{
// Dispatch shared-privatized approach
const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
if (CubDebug(error = PrivatizedDispatch(
d_temp_storage,
temp_storage_bytes,
d_samples,
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
num_output_levels,
output_decode_op,
max_num_output_bins,
num_row_pixels,
num_rows,
row_stride_samples,
DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
histogram_sweep_config,
stream,
debug_synchronous))) break;
}
} while (0);
return error;
}
/**
* Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels wher...
CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> ...
int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num...
LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries ...
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
Int2Type<true> is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
num_output_levels,
output_decode_op,
max_num_output_bins,
num_row_pixels,
num_rows,
row_stride_samples,
DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
histogram_sweep_config,
stream,
debug_synchronous))) break;
} while (0);
return error;
}
/**
* Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
*/
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel c...
CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> s...
int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>nu...
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
Int2Type<false> is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
num_output_levels,
output_decode_op,
max_num_output_bins,
num_row_pixels,
num_rows,
row_stride_samples,
DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
histogram_sweep_config,
stream,
debug_synchronous))) break;
}
else
{
// Dispatch shared-privatized approach
const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
if (CubDebug(error = PrivatizedDispatch(
d_temp_storage,
temp_storage_bytes,
d_samples,
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
num_output_levels,
output_decode_op,
max_num_output_bins,
num_row_pixels,
num_rows,
row_stride_samples,
DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
histogram_sweep_config,
stream,
debug_synchronous))) break;
}
}
while (0);
return error;
}
/**
* Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel c...
CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> s...
int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>nu...
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
Int2Type<true> is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
num_output_levels,
output_decode_op,
max_num_output_bins,
num_row_pixels,
num_rows,
row_stride_samples,
DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
histogram_sweep_config,
stream,
debug_synchronous))) break;
}
while (0);
return error;
}
};
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
#include <stdio.h>
#include <iterator>
#include "../../agent/agent_radix_sort_upsweep.cuh"
#include "../../agent/agent_radix_sort_downsweep.cuh"
#include "../../agent/agent_scan.cuh"
#include "../../block/block_radix_sort.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
//------------------------------------------------------------------------------
void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<ValueT> &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
OffsetT num_items; ///< [in] Number of items to sort
int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison
int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison
cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
int ptx_version; ///< [in] PTX version
bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers
//------------------------------------------------------------------------------
// Constructor
//------------------------------------------------------------------------------
/// Constructor
CUB_RUNTIME_FUNCTION __forceinline__
DispatchRadixSort(
void* d_temp_storage,
size_t &temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
OffsetT num_items,
int begin_bit,
int end_bit,
bool is_overwrite_okay,
cudaStream_t stream,
bool debug_synchronous,
int ptx_version)
:
d_temp_storage(d_temp_storage),
temp_storage_bytes(temp_storage_bytes),
d_keys(d_keys),
d_values(d_values),
num_items(num_items),
begin_bit(begin_bit),
end_bit(end_bit),
stream(stream),
debug_synchronous(debug_synchronous),
ptx_version(ptx_version),
is_overwrite_okay(is_overwrite_okay)
{}
//------------------------------------------------------------------------------
// Small-problem (single tile) invocation
//------------------------------------------------------------------------------
/// Invoke a single block to sort in-core
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
{
temp_storage_bytes = 1;
break;
}
// Return if empty problem
if (num_items == 0)
break;
// Log single_tile_kernel configuration
if (debug_synchronous)
_CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);
// Invoke upsweep_kernel with same grid size as downsweep_kernel
single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
d_keys.Current(),
d_keys.Alternate(),
d_values.Current(),
d_values.Alternate(),
num_items,
begin_bit,
end_bit);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Update selector
d_keys.selector ^= 1;
d_values.selector ^= 1;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
int spine_length,
int ¤t_bit,
PassConfigT &pass_config)
{
cudaError error = cudaSuccess;
do
{
int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
// Log upsweep_kernel configuration
if (debug_synchronous)
_CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);
// Invoke upsweep_kernel with same grid size as downsweep_kernel
pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
d_keys_in,
d_spine,
num_items,
current_bit,
pass_bits,
pass_config.even_share);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Log scan_kernel configuration
if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);
// Invoke scan_kernel
pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
d_spine,
spine_length);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Log downsweep_kernel configuration
if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);
// Invoke downsweep_kernel
pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
d_keys_in,
d_keys_out,
d_values_in,
d_values_out,
d_spine,
num_items,
current_bit,
pass_bits,
pass_config.even_share);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Update current bit
current_bit += pass_bits;
}
while (0);
return error;
}
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
OffsetT num_items, ///< [in] Number of items to sort
int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison
int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison
bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
cudaError_t error;
do {
// Get PTX version
int ptx_version;
if (CubDebug(error = PtxVersion(ptx_version))) break;
// Create dispatch functor
DispatchRadixSort dispatch(
d_temp_storage, temp_storage_bytes,
d_keys, d_values,
num_items, begin_bit, end_bit, is_overwrite_okay,
stream, debug_synchronous, ptx_version);
// Dispatch to chained policy
if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
} while (0);
return error;
}
};
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<ValueT> &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
OffsetT num_items; ///< [in] Number of items to sort
OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data
const OffsetT *d_begin_offsets; ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data se...
const OffsetT *d_end_offsets; ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segmen...
int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison
int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison
cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
int ptx_version; ///< [in] PTX version
bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers
//------------------------------------------------------------------------------
// Constructors
//------------------------------------------------------------------------------
/// Constructor
CUB_RUNTIME_FUNCTION __forceinline__
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
OffsetT num_items,
OffsetT num_segments,
const OffsetT *d_begin_offsets,
const OffsetT *d_end_offsets,
int begin_bit,
int end_bit,
bool is_overwrite_okay,
cudaStream_t stream,
bool debug_synchronous,
int ptx_version)
:
d_temp_storage(d_temp_storage),
temp_storage_bytes(temp_storage_bytes),
d_keys(d_keys),
d_values(d_values),
num_items(num_items),
num_segments(num_segments),
d_begin_offsets(d_begin_offsets),
d_end_offsets(d_end_offsets),
begin_bit(begin_bit),
end_bit(end_bit),
is_overwrite_okay(is_overwrite_okay),
stream(stream),
debug_synchronous(debug_synchronous),
ptx_version(ptx_version)
{}
//------------------------------------------------------------------------------
// Multi-segment invocation
//------------------------------------------------------------------------------
/// Invoke a three-kernel sorting pass at the current bit.
template <typename PassConfigT>
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
ValueT *d_values_out,
int ¤t_bit,
PassConfigT &pass_config)
{
cudaError error = cudaSuccess;
do
{
int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
// Log kernel configuration
if (debug_synchronous)
_CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
num_segments, pass_config.segmented_config.block_threads, (long long) stream,
pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);
pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
d_keys_in, d_keys_out,
d_values_in, d_values_out,
d_begin_offsets, d_end_offsets, num_segments,
current_bit, pass_bits);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Update current bit
current_bit += pass_bits;
}
while (0);
return error;
}
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
int num_items, ///< [in] Number of items to sort
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> dat...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data se...
int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison
int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison
bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
cudaError_t error;
do {
// Get PTX version
int ptx_version;
if (CubDebug(error = PtxVersion(ptx_version))) break;
// Create dispatch functor
DispatchSegmentedRadixSort dispatch(
d_temp_storage, temp_storage_bytes,
d_keys, d_values,
num_items, num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit, is_overwrite_okay,
stream, debug_synchronous, ptx_version);
// Dispatch to chained policy
if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
} while (0);
return error;
}
};
xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh view on Meta::CPAN
#include <stdio.h>
#include <iterator>
#include "../../agent/agent_reduce.cuh"
#include "../../iterator/arg_index_input_iterator.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../iterator/arg_index_input_iterator.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh view on Meta::CPAN
//------------------------------------------------------------------------------
void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out; ///< [out] Pointer to the output aggregate
OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_in)
ReductionOpT reduction_op; ///< [in] Binary reduction functor
OutputT init; ///< [in] The initial value of the reduction
cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
int ptx_version; ///< [in] PTX version
//------------------------------------------------------------------------------
// Constructor
//------------------------------------------------------------------------------
/// Constructor
CUB_RUNTIME_FUNCTION __forceinline__
DispatchReduce(
void* d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
OffsetT num_items,
ReductionOpT reduction_op,
OutputT init,
cudaStream_t stream,
bool debug_synchronous,
int ptx_version)
:
d_temp_storage(d_temp_storage),
temp_storage_bytes(temp_storage_bytes),
d_in(d_in),
d_out(d_out),
num_items(num_items),
reduction_op(reduction_op),
init(init),
stream(stream),
debug_synchronous(debug_synchronous),
ptx_version(ptx_version)
{}
//------------------------------------------------------------------------------
// Small-problem (single tile) invocation
//------------------------------------------------------------------------------
/// Invoke a single block block to reduce in-core
template <
xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh view on Meta::CPAN
do
{
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == NULL)
{
temp_storage_bytes = 1;
break;
}
// Log single_reduce_sweep_kernel configuration
if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
(long long) stream,
ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
// Invoke single_reduce_sweep_kernel
single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
d_in,
d_out,
num_items,
reduction_op,
init);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
//------------------------------------------------------------------------------
xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh view on Meta::CPAN
}
else if (ActivePolicyT::ReducePolicy::GRID_MAPPING == GRID_MAPPING_DYNAMIC)
{
// Work is distributed dynamically
int num_tiles = (num_items + reduce_config.tile_size - 1) / reduce_config.tile_size;
reduce_grid_size = (num_tiles < reduce_device_occupancy) ?
num_tiles : // Not enough to fill the device with threadblocks
reduce_device_occupancy; // Fill the device with threadblocks
// Prepare the dynamic queue descriptor if necessary
if (debug_synchronous) _CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream);
// Invoke prepare_drain_kernel
prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
else
{
error = CubDebug(cudaErrorNotSupported ); break;
}
// Log device_reduce_sweep_kernel configuration
if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
reduce_grid_size,
ActivePolicyT::ReducePolicy::BLOCK_THREADS,
(long long) stream,
ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
reduce_config.sm_occupancy);
// Invoke DeviceReduceKernel
reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(
d_in,
d_block_reductions,
num_items,
even_share,
queue,
reduction_op);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Log single_reduce_sweep_kernel configuration
if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
(long long) stream,
ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
// Invoke DeviceReduceSingleTileKernel
single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
d_block_reductions,
d_out,
reduce_grid_size,
reduction_op,
init);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
ReductionOpT reduction_op, ///< [in] Binary reduction functor
OutputT init, ///< [in] The initial value of the reduction
cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default ...
{
typedef typename DispatchReduce::MaxPolicy MaxPolicyT;
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
if (CubDebug(error = PtxVersion(ptx_version))) break;
// Create dispatch functor
DispatchReduce dispatch(
d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items, reduction_op, init,
stream, debug_synchronous, ptx_version);
// Dispatch to chained policy
if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
}
while (0);
return error;
}
};
xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh view on Meta::CPAN
void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out; ///< [out] Pointer to the output aggregate
OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data
OffsetT *d_begin_offsets; ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segmen...
OffsetT *d_end_offsets; ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in...
ReductionOpT reduction_op; ///< [in] Binary reduction functor
OutputT init; ///< [in] The initial value of the reduction
cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
int ptx_version; ///< [in] PTX version
//------------------------------------------------------------------------------
// Constructor
//------------------------------------------------------------------------------
/// Constructor
CUB_RUNTIME_FUNCTION __forceinline__
DispatchSegmentedReduce(
void* d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
OffsetT num_segments,
OffsetT *d_begin_offsets,
OffsetT *d_end_offsets,
ReductionOpT reduction_op,
OutputT init,
cudaStream_t stream,
bool debug_synchronous,
int ptx_version)
:
d_temp_storage(d_temp_storage),
temp_storage_bytes(temp_storage_bytes),
d_in(d_in),
d_out(d_out),
num_segments(num_segments),
d_begin_offsets(d_begin_offsets),
d_end_offsets(d_end_offsets),
reduction_op(reduction_op),
init(init),
stream(stream),
debug_synchronous(debug_synchronous),
ptx_version(ptx_version)
{}
//------------------------------------------------------------------------------
// Chained policy invocation
//------------------------------------------------------------------------------
/// Invocation
xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh view on Meta::CPAN
{
temp_storage_bytes = 1;
return cudaSuccess;
}
// Init kernel configuration
KernelConfig segmented_reduce_config;
if (CubDebug(error = segmented_reduce_config.Init<typename ActivePolicyT::SegmentedReducePolicy>(segmented_reduce_kernel))) break;
// Log device_reduce_sweep_kernel configuration
if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
num_segments,
ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
(long long) stream,
ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
segmented_reduce_config.sm_occupancy);
// Invoke DeviceReduceKernel
segmented_reduce_kernel<<<num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream>>>(
d_in,
d_out,
d_begin_offsets,
d_end_offsets,
num_segments,
reduction_op,
init);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh view on Meta::CPAN
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the sorting data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup>...
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> dat...
ReductionOpT reduction_op, ///< [in] Binary reduction functor
OutputT init, ///< [in] The initial value of the reduction
cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default ...
{
typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;
if (num_segments <= 0)
return cudaSuccess;
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
if (CubDebug(error = PtxVersion(ptx_version))) break;
// Create dispatch functor
DispatchSegmentedReduce dispatch(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
reduction_op, init,
stream, debug_synchronous, ptx_version);
// Dispatch to chained policy
if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
}
while (0);
return error;
}
};
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh view on Meta::CPAN
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys
UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run)
ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values
AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
EqualityOpT equality_op, ///< [in] KeyT equality operator
ReductionOpT reduction_op, ///< [in] ValueT reduction operator
OffsetT num_items, ///< [in] Total number of items to select from
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
int /*ptx_version*/, ///< [in] PTX version of dispatch kernels
ScanInitKernelT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
ReduceByKeyKernelT reduce_by_key_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel
KernelConfig reduce_by_key_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for
{
#ifndef CUB_RUNTIME_ENABLED
(void)d_temp_storage;
(void)temp_storage_bytes;
(void)d_keys_in;
(void)d_unique_out;
(void)d_values_in;
(void)d_aggregates_out;
(void)d_num_runs_out;
(void)equality_op;
(void)reduction_op;
(void)num_items;
(void)stream;
(void)debug_synchronous;
(void)init_kernel;
(void)reduce_by_key_kernel;
(void)reduce_by_key_config;
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);
#else
cudaError error = cudaSuccess;
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh view on Meta::CPAN
// Return if the caller is simply requesting the size of the storage allocation
break;
}
// Construct the tile status interface
ScanTileStateT tile_state;
if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
// Log init_kernel configuration
int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
// Invoke init_kernel to initialize tile descriptors
init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
tile_state,
num_tiles,
d_num_runs_out);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Return if empty problem
if (num_items == 0)
break;
// Get SM occupancy for reduce_by_key_kernel
int reduce_by_key_sm_occupancy;
if (CubDebug(error = MaxSmOccupancy(
reduce_by_key_sm_occupancy, // out
reduce_by_key_kernel,
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh view on Meta::CPAN
// Get max x-dimension of grid
int max_dim_x;
if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
// Run grids in epochs (in case number of tiles exceeds max x-dimension
int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
{
// Log reduce_by_key_kernel configuration
if (debug_synchronous) _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);
// Invoke reduce_by_key_kernel
reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
d_keys_in,
d_unique_out,
d_values_in,
d_aggregates_out,
d_num_runs_out,
tile_state,
start_tile,
equality_op,
reduction_op,
num_items);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh view on Meta::CPAN
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys
UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run)
ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values
AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
EqualityOpT equality_op, ///< [in] KeyT equality operator
ReductionOpT reduction_op, ///< [in] ValueT reduction operator
OffsetT num_items, ///< [in] Total number of items to select from
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p fals...
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh view on Meta::CPAN
temp_storage_bytes,
d_keys_in,
d_unique_out,
d_values_in,
d_aggregates_out,
d_num_runs_out,
equality_op,
reduction_op,
num_items,
stream,
debug_synchronous,
ptx_version,
DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT>,
reduce_by_key_config))) break;
}
while (0);
return error;
}
};
xgboost/cub/cub/device/dispatch/dispatch_rle.cuh view on Meta::CPAN
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to the output sequence of run-offsets
LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to the output sequence of run-lengths
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out)
EqualityOpT equality_op, ///< [in] Equality operator for input items
OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p fals...
int ptx_version, ///< [in] PTX version of dispatch kernels
DeviceScanInitKernelPtr device_scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
DeviceRleSweepKernelPtr device_rle_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
KernelConfig device_rle_config) ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for
{
#ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);
xgboost/cub/cub/device/dispatch/dispatch_rle.cuh view on Meta::CPAN
// Return if the caller is simply requesting the size of the storage allocation
break;
}
// Construct the tile status interface
ScanTileStateT tile_status;
if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
// Log device_scan_init_kernel configuration
int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
// Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
tile_status,
num_tiles,
d_num_runs_out);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Return if empty problem
if (num_items == 0)
break;
// Get SM occupancy for device_rle_sweep_kernel
int device_rle_kernel_sm_occupancy;
if (CubDebug(error = MaxSmOccupancy(
device_rle_kernel_sm_occupancy, // out
device_rle_sweep_kernel,
xgboost/cub/cub/device/dispatch/dispatch_rle.cuh view on Meta::CPAN
int max_dim_x;
if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
// Get grid size for scanning tiles
dim3 scan_grid_size;
scan_grid_size.z = 1;
scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
// Log device_rle_sweep_kernel configuration
if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);
// Invoke device_rle_sweep_kernel
device_rle_sweep_kernel<<<scan_grid_size, device_rle_config.block_threads, 0, stream>>>(
d_in,
d_offsets_out,
d_lengths_out,
d_num_runs_out,
tile_status,
equality_op,
num_items,
num_tiles);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
xgboost/cub/cub/device/dispatch/dispatch_rle.cuh view on Meta::CPAN
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to input sequence of data items
OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets
LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
EqualityOpT equality_op, ///< [in] Equality operator for input items
OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
xgboost/cub/cub/device/dispatch/dispatch_rle.cuh view on Meta::CPAN
if (CubDebug(error = Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_offsets_out,
d_lengths_out,
d_num_runs_out,
equality_op,
num_items,
stream,
debug_synchronous,
ptx_version,
DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
DeviceRleSweepKernel<PtxRleSweepPolicy, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, OffsetT>,
device_rle_config))) break;
}
while (0);
return error;
}
};
xgboost/cub/cub/device/dispatch/dispatch_scan.cuh view on Meta::CPAN
#pragma once
#include <stdio.h>
#include <iterator>
#include "../../agent/agent_scan.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_arch.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
xgboost/cub/cub/device/dispatch/dispatch_scan.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
ScanOpT scan_op, ///< [in] Binary scan functor
InitValueT init_value, ///< [in] Initial value to seed the exclusive scan
OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
int /*ptx_version*/, ///< [in] PTX version of dispatch kernels
ScanInitKernelPtrT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
ScanSweepKernelPtrT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel
KernelConfig scan_kernel_config) ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
{
#ifndef CUB_RUNTIME_ENABLED
(void)d_temp_storage;
(void)temp_storage_bytes;
(void)d_in;
(void)d_out;
(void)scan_op;
(void)init_value;
(void)num_items;
(void)stream;
(void)debug_synchronous;
(void)init_kernel;
(void)scan_kernel;
(void)scan_kernel_config;
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);
#else
cudaError error = cudaSuccess;
do
xgboost/cub/cub/device/dispatch/dispatch_scan.cuh view on Meta::CPAN
// Return if empty problem
if (num_items == 0)
break;
// Construct the tile status interface
ScanTileStateT tile_state;
if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
// Log init_kernel configuration
int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
// Invoke init_kernel to initialize tile descriptors
init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
tile_state,
num_tiles);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Get SM occupancy for scan_kernel
int scan_sm_occupancy;
if (CubDebug(error = MaxSmOccupancy(
scan_sm_occupancy, // out
scan_kernel,
scan_kernel_config.block_threads))) break;
// Get max x-dimension of grid
int max_dim_x;
if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
// Run grids in epochs (in case number of tiles exceeds max x-dimension
int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
{
// Log scan_kernel configuration
if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy);
// Invoke scan_kernel
scan_kernel<<<scan_grid_size, scan_kernel_config.block_threads, 0, stream>>>(
d_in,
d_out,
tile_state,
start_tile,
scan_op,
init_value,
num_items);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
xgboost/cub/cub/device/dispatch/dispatch_scan.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
ScanOpT scan_op, ///< [in] Binary scan functor
InitValueT init_value, ///< [in] Initial value to seed the exclusive scan
OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
if (CubDebug(error = PtxVersion(ptx_version))) break;
// Get kernel kernel dispatch configurations
KernelConfig scan_kernel_config;
xgboost/cub/cub/device/dispatch/dispatch_scan.cuh view on Meta::CPAN
// Dispatch
if (CubDebug(error = Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
init_value,
num_items,
stream,
debug_synchronous,
ptx_version,
DeviceScanInitKernel<ScanTileStateT>,
DeviceScanKernel<PtxAgentScanPolicy, InputIteratorT, OutputIteratorT, ScanTileStateT, ScanOpT, InitValueT, OffsetT>,
scan_kernel_config))) break;
}
while (0);
return error;
}
};
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable)
SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items
NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
SelectOpT select_op, ///< [in] Selection operator
EqualityOpT equality_op, ///< [in] Equality operator
OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p fals...
int /*ptx_version*/, ///< [in] PTX version of dispatch kernels
ScanInitKernelPtrT scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
SelectIfKernelPtrT select_if_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
KernelConfig select_if_config) ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for
{
#ifndef CUB_RUNTIME_ENABLED
(void)d_temp_storage;
(void)temp_storage_bytes;
(void)d_in;
(void)d_flags;
(void)d_selected_out;
(void)d_num_selected_out;
(void)select_op;
(void)equality_op;
(void)num_items;
(void)stream;
(void)debug_synchronous;
(void)scan_init_kernel;
(void)select_if_kernel;
(void)select_if_config;
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);
#else
cudaError error = cudaSuccess;
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
// Return if the caller is simply requesting the size of the storage allocation
break;
}
// Construct the tile status interface
ScanTileStateT tile_status;
if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
// Log scan_init_kernel configuration
int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
// Invoke scan_init_kernel to initialize tile descriptors
scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
tile_status,
num_tiles,
d_num_selected_out);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Return if empty problem
if (num_items == 0)
break;
// Get SM occupancy for select_if_kernel
int range_select_sm_occupancy;
if (CubDebug(error = MaxSmOccupancy(
range_select_sm_occupancy, // out
select_if_kernel,
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
int max_dim_x;
if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
// Get grid size for scanning tiles
dim3 scan_grid_size;
scan_grid_size.z = 1;
scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
// Log select_if_kernel configuration
if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);
// Invoke select_if_kernel
select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
d_in,
d_flags,
d_selected_out,
d_num_selected_out,
tile_status,
select_op,
equality_op,
num_items,
num_tiles);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
/**
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable)
SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items
NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
SelectOpT select_op, ///< [in] Selection operator
EqualityOpT equality_op, ///< [in] Equality operator
OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
d_temp_storage,
temp_storage_bytes,
d_in,
d_flags,
d_selected_out,
d_num_selected_out,
select_op,
equality_op,
num_items,
stream,
debug_synchronous,
ptx_version,
DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
DeviceSelectSweepKernel<PtxSelectIfPolicyT, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, SelectOpT, EqualityOpT, OffsetT, KEEP_REJECTS>,
select_if_config))) break;
}
while (0);
return error;
}
};
xgboost/cub/cub/device/dispatch/dispatch_spmv_csrt.cuh view on Meta::CPAN
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include "dispatch_scan.cuh"
#include "../../agent/agent_spmv_orig.cuh"
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
xgboost/cub/cub/device/dispatch/dispatch_spmv_csrt.cuh view on Meta::CPAN
* kernel invocations.
*/
template <
typename SpmvKernelT> ///< Function type of cub::AgentSpmvKernel
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SpmvParamsT& spmv_params, ///< SpMV input parameter bundle
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p fals...
SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
KernelConfig spmv_config) ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
{
#ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported );
#else
cudaError error = cudaSuccess;
xgboost/cub/cub/device/dispatch/dispatch_spmv_csrt.cuh view on Meta::CPAN
void* allocations[1];
if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
if (d_temp_storage == NULL)
{
// Return if the caller is simply requesting the size of the storage allocation
return cudaSuccess;
}
KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[0]; // Agent carry-out pairs
// Log spmv_kernel configuration
if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
spmv_grid_size, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);
// Invoke spmv_kernel
spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
spmv_params,
merge_items_per_block,
d_tile_carry_pairs);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
/**
* Internal dispatch routine for computing a device-wide reduction
*/
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SpmvParamsT& spmv_params, ///< SpMV input parameter bundle
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
xgboost/cub/cub/device/dispatch/dispatch_spmv_csrt.cuh view on Meta::CPAN
// Get kernel kernel dispatch configurations
KernelConfig spmv_config;
InitConfigs(ptx_version, spmv_config);
if (CubDebug(error = Dispatch(
d_temp_storage,
temp_storage_bytes,
spmv_params,
stream,
debug_synchronous,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, false, false>,
spmv_config))) break;
}
while (0);
return error;
}
};
xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh view on Meta::CPAN
#pragma once
#include <stdio.h>
#include <iterator>
#include "../../agent/single_pass_scan_operators.cuh"
#include "../../agent/agent_segment_fixup.cuh"
#include "../../agent/agent_spmv_orig.cuh"
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh view on Meta::CPAN
typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel
typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel
typename SpmvKernelT, ///< Function type of cub::AgentSpmvKernel
typename SegmentFixupKernelT> ///< Function type of cub::DeviceSegmentFixupKernelT
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SpmvParamsT& spmv_params, ///< SpMV input parameter bundle
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p fals...
Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel
SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
SegmentFixupKernelT segment_fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for
{
#ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh view on Meta::CPAN
{
// Return if the caller is simply requesting the size of the storage allocation
temp_storage_bytes = 1;
break;
}
// Get search/init grid dims
int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;
if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);
// Invoke spmv_search_kernel
spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
spmv_params);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
break;
}
// Get device ordinal
int device_ordinal;
if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
// Get SM count
int sm_count;
xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh view on Meta::CPAN
// if (num_merge_tiles < spmv_sm_occupancy * sm_count)
{
// Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords
d_tile_coordinates = NULL;
}
else
{
// Use separate search kernel if we have enough spmv tiles to saturate the device
// Log spmv_search_kernel configuration
if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
search_grid_size, search_block_size, (long long) stream);
// Invoke spmv_search_kernel
spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
num_merge_tiles,
d_tile_coordinates,
spmv_params);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
// Log spmv_kernel configuration
if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);
// Invoke spmv_kernel
spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
spmv_params,
d_tile_coordinates,
d_tile_carry_pairs,
num_merge_tiles,
tile_state,
num_segment_fixup_tiles);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Run reduce-by-key fixup if necessary
if (num_merge_tiles > 1)
{
// Log segment_fixup_kernel configuration
if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);
// Invoke segment_fixup_kernel
segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>(
d_tile_carry_pairs,
spmv_params.d_vector_y,
num_merge_tiles,
num_segment_fixup_tiles,
tile_state);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
#if (CUB_PTX_ARCH == 0)
// Free textures
if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
#endif
}
while (0);
return error;
xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh view on Meta::CPAN
/**
* Internal dispatch routine for computing a device-wide reduction
*/
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SpmvParamsT& spmv_params, ///< SpMV input parameter bundle
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
#endif
// Get kernel kernel dispatch configurations
KernelConfig spmv_config, segment_fixup_config;
InitConfigs(ptx_version, spmv_config, segment_fixup_config);
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
spmv_config, segment_fixup_config))) break;
/*
// Dispatch
if (spmv_params.beta == 0.0)
{
if (spmv_params.alpha == 1.0)
{
// Dispatch y = A*x
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
spmv_config, segment_fixup_config))) break;
}
else
{
// Dispatch y = alpha*A*x
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true, false>,
DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
spmv_config, segment_fixup_config))) break;
}
}
else
{
if (spmv_params.alpha == 1.0)
{
// Dispatch y = A*x + beta*y
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, false, true>,
DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
spmv_config, segment_fixup_config))) break;
}
else
{
// Dispatch y = alpha*A*x + beta*y
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true, true>,
DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
spmv_config, segment_fixup_config))) break;
}
}
*/
}
while (0);
xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh view on Meta::CPAN
#pragma once
#include <stdio.h>
#include <iterator>
#include "../../agent/single_pass_scan_operators.cuh"
#include "../../agent/agent_segment_fixup.cuh"
#include "../../agent/agent_spmv_row_based.cuh"
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh view on Meta::CPAN
// typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel
// typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel
typename SpmvKernelT> ///< Function type of cub::AgentSpmvKernel
// typename SegmentFixupKernelT> ///< Function type of cub::DeviceSegmentFixupKernelT
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SpmvParamsT& spmv_params, ///< SpMV input parameter bundle
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p fals...
// Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
// SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel
SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
// SegmentFixupKernelT fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
KernelConfig fixup_config) ///< [in] Dispatch parameters that match the policy that \p fixup_kernel was compiled for
{
#ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh view on Meta::CPAN
{
// Return if the caller is simply requesting the size of the storage allocation
temp_storage_bytes = 1;
return cudaSuccess;
}
// Get search/init grid dims
int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;
if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);
// Invoke spmv_search_kernel
spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
spmv_params);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
break;
}
*/
// Get device ordinal
int device_ordinal;
if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
// Get SM count
int sm_count;
xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh view on Meta::CPAN
// Decrease rows per tile if needed to accomodate minimum parallelism
unsigned int spmv_device_occupancy = sm_count * 2;
// unsigned int spmv_device_occupancy = sm_count * ((spmv_sm_occupancy + 1) / 2);
if (spmv_params.num_rows < spmv_device_occupancy * rows_per_tile)
rows_per_tile = (spmv_params.num_rows) / spmv_device_occupancy;
}
rows_per_tile = CUB_MAX(rows_per_tile, 2);
if (debug_synchronous) _CubLog("Rows per tile: %d\n", rows_per_tile);
// Number of tiles for kernels
unsigned int num_spmv_tiles = (spmv_params.num_rows + rows_per_tile - 1) / rows_per_tile;
// unsigned int num_fixup_tiles = (num_spmv_tiles + fixup_tile_size - 1) / fixup_tile_size;
// Get grid dimensions
dim3 spmv_grid_size(
CUB_MIN(num_spmv_tiles, max_dim_x),
(num_spmv_tiles + max_dim_x - 1) / max_dim_x,
1);
xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh view on Meta::CPAN
if (search_grid_size < sm_count)
{
// Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords
d_tile_coordinates = NULL;
}
else
{
// Use separate search kernel if we have enough spmv tiles to saturate the device
// Log spmv_search_kernel configuration
if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
search_grid_size, search_block_size, (long long) stream);
// Invoke spmv_search_kernel
spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
num_spmv_tiles,
d_tile_coordinates,
spmv_params);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
*/
// Log spmv_kernel configuration
if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);
// Invoke spmv_kernel
spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
spmv_params,
// d_tile_coordinates,
// d_tile_carry_pairs,
// num_spmv_tiles,
// tile_state,
// num_fixup_tiles,
rows_per_tile);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
/*
// Run reduce-by-key fixup if necessary
if (num_spmv_tiles > 1)
{
// Log fixup_kernel configuration
if (debug_synchronous) _CubLog("Invoking fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
fixup_grid_size.x, fixup_grid_size.y, fixup_grid_size.z, fixup_config.block_threads, (long long) stream, fixup_config.items_per_thread, fixup_sm_occupancy);
// Invoke fixup_kernel
fixup_kernel<<<fixup_grid_size, fixup_config.block_threads, 0, stream>>>(
d_tile_carry_pairs,
spmv_params.d_vector_y,
num_spmv_tiles,
num_fixup_tiles,
tile_state);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
*/
#if (CUB_PTX_ARCH == 0)
// Free textures
// if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
#endif
}
while (0);
return error;
xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh view on Meta::CPAN
/**
* Internal dispatch routine for computing a device-wide reduction
*/
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SpmvParamsT& spmv_params, ///< SpMV input parameter bundle
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
#endif
// Get kernel kernel dispatch configurations
KernelConfig spmv_config, fixup_config;
InitConfigs(ptx_version, spmv_config, fixup_config);
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
// DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
// DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
// DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
spmv_config, fixup_config))) break;
/*
// Dispatch
if (spmv_params.beta == 0.0)
{
if (spmv_params.alpha == 1.0)
{
// Dispatch y = A*x
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
spmv_config, fixup_config))) break;
}
else
{
// Dispatch y = alpha*A*x
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true, false>,
DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
spmv_config, fixup_config))) break;
}
}
else
{
if (spmv_params.alpha == 1.0)
{
// Dispatch y = A*x + beta*y
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, false, true>,
DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
spmv_config, fixup_config))) break;
}
else
{
// Dispatch y = alpha*A*x + beta*y
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true, true>,
DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
spmv_config, fixup_config))) break;
}
}
*/
}
while (0);
xgboost/cub/cub/grid/grid_barrier.cuh view on Meta::CPAN
*
******************************************************************************/
/**
* \file
* cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
*/
#pragma once
#include "../util_debug.cuh"
#include "../util_namespace.cuh"
#include "../thread/thread_load.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
xgboost/cub/cub/grid/grid_queue.cuh view on Meta::CPAN
******************************************************************************/
/**
* \file
* cub::GridQueue is a descriptor utility for dynamic queue management.
*/
#pragma once
#include "../util_namespace.cuh"
#include "../util_debug.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup GridModule
xgboost/cub/cub/iterator/tex_obj_input_iterator.cuh view on Meta::CPAN
*/
#pragma once
#include <iterator>
#include <iostream>
#include "../thread/thread_load.cuh"
#include "../thread/thread_store.cuh"
#include "../util_device.cuh"
#include "../util_debug.cuh"
#include "../util_namespace.cuh"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
/// Optional outer namespace(s)
xgboost/cub/cub/iterator/tex_ref_input_iterator.cuh view on Meta::CPAN
*/
#pragma once
#include <iterator>
#include <iostream>
#include "../thread/thread_load.cuh"
#include "../thread/thread_store.cuh"
#include "../util_device.cuh"
#include "../util_debug.cuh"
#include "../util_namespace.cuh"
#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE) // This iterator is compatible with CUDA 5.5 and newer
#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION