xgboost/R-package/NAMESPACE
S3method("[",xgb.DMatrix)
S3method("dimnames<-",xgb.DMatrix)
S3method(dim,xgb.DMatrix)
S3method(dimnames,xgb.DMatrix)
S3method(getinfo,xgb.DMatrix)
S3method(predict,xgb.Booster)
S3method(predict,xgb.Booster.handle)
S3method(print,xgb.Booster)
S3method(print,xgb.DMatrix)
S3method(print,xgb.cv.synchronous)
S3method(setinfo,xgb.DMatrix)
S3method(slice,xgb.DMatrix)
export("xgb.attr<-")
export("xgb.attributes<-")
export("xgb.parameters<-")
export(cb.cv.predict)
export(cb.early.stop)
export(cb.evaluation.log)
export(cb.print.evaluation)
export(cb.reset.parameters)
xgboost/R-package/R/xgb.cv.R
#'
#' Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model, and the remaining \code{nfold - 1} subsamples are used as training data.
#'
#' The cross-validation process is then repeated \code{nfold} times, with each of the \code{nfold} subsamples used exactly once as the validation data.
#'
#' All observations are used for both training and validation.
#'
#' Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
#'
#' @return
#' An object of class \code{xgb.cv.synchronous} with the following elements:
#' \itemize{
#' \item \code{call} a function call.
#' \item \code{params} parameters that were passed to the xgboost library. Note that it does not
#' capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
#' \item \code{callbacks} callback functions that were either automatically assigned or
#' explicitly passed.
#' \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
#' first column corresponding to iteration number and the rest corresponding to the
#' CV-based evaluation means and standard deviations for the training and test CV-sets.
#' It is created by the \code{\link{cb.evaluation.log}} callback.
xgboost/R-package/R/xgb.cv.R
basket <- list()
# extract parameters that can affect the relationship b/w #trees and #iterations
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
# those are fixed for CV (no training continuation)
begin_iteration <- 1
end_iteration <- nrounds
# synchronous CV boosting: run CV folds' models within each iteration
for (iteration in begin_iteration:end_iteration) {
for (f in cb$pre_iter) f()
msg <- lapply(bst_folds, function(fd) {
xgb.iter.update(fd$bst, fd$dtrain, iteration - 1, obj)
xgb.iter.eval(fd$bst, fd$watchlist, iteration - 1, feval)
})
msg <- simplify2array(msg)
bst_evaluation <- rowMeans(msg)
xgboost/R-package/R/xgb.cv.R
ret <- list(
call = match.call(),
params = params,
callbacks = callbacks,
evaluation_log = evaluation_log,
niter = end_iteration,
folds = folds
)
ret <- c(ret, basket)
class(ret) <- 'xgb.cv.synchronous'
invisible(ret)
}
#' Print xgb.cv result
#'
#' Prints formatted results of \code{xgb.cv}.
#'
#' @param x an \code{xgb.cv.synchronous} object
#' @param verbose whether to print detailed data
#' @param ... passed to \code{data.table.print}
#'
#' @details
#' When not verbose, only the evaluation results are printed,
#' including the best iteration (when available).
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' print(cv)
#' print(cv, verbose=TRUE)
#'
#' @rdname print.xgb.cv
#' @method print xgb.cv.synchronous
#' @export
print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) {
cat('##### xgb.cv ', length(x$folds), '-folds\n', sep = '')
if (verbose) {
if (!is.null(x$call)) {
cat('call:\n ')
print(x$call)
}
if (!is.null(x$params)) {
cat('params (as set within xgb.cv):\n')
cat( ' ',
xgboost/R-package/man/print.xgb.cv.Rd
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.cv.R
\name{print.xgb.cv.synchronous}
\alias{print.xgb.cv.synchronous}
\title{Print xgb.cv result}
\usage{
\method{print}{xgb.cv.synchronous}(x, verbose = FALSE, ...)
}
\arguments{
\item{x}{an \code{xgb.cv.synchronous} object}
\item{verbose}{whether to print detailed data}
\item{...}{passed to \code{data.table.print}}
}
\description{
Prints formatted results of \code{xgb.cv}.
}
\details{
When not verbose, only the evaluation results are printed,
xgboost/R-package/man/xgb.cv.Rd
This parameter is passed to the \code{\link{cb.early.stop}} callback.}
\item{callbacks}{a list of callback functions to perform various tasks during boosting.
See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
parameters' values. Users can provide either existing or their own callback methods in order
to customize the training process.}
\item{...}{other parameters to pass to \code{params}.}
}
\value{
An object of class \code{xgb.cv.synchronous} with the following elements:
\itemize{
\item \code{call} a function call.
\item \code{params} parameters that were passed to the xgboost library. Note that it does not
capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
\item \code{callbacks} callback functions that were either automatically assigned or
explicitly passed.
\item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
first column corresponding to iteration number and the rest corresponding to the
CV-based evaluation means and standard deviations for the training and test CV-sets.
It is created by the \code{\link{cb.evaluation.log}} callback.
xgboost/R-package/tests/testthat/test_basic.R
})
test_that("xgb.cv works", {
set.seed(11)
expect_output(
cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5,
eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
verbose=TRUE)
, "train-error:")
expect_is(cv, 'xgb.cv.synchronous')
expect_false(is.null(cv$evaluation_log))
expect_lt(cv$evaluation_log[, min(test_error_mean)], 0.03)
expect_lt(cv$evaluation_log[, min(test_error_std)], 0.004)
expect_equal(cv$niter, 2)
expect_true(!is.null(cv$folds) && is.list(cv$folds))
expect_length(cv$folds, 5)
expect_true(!is.null(cv$params) && is.list(cv$params))
expect_false(is.null(cv$callbacks))
expect_false(is.null(cv$call))
})
xgboost/cub/CHANGE_LOG.TXT
- Added descending sorting to DeviceRadixSort and BlockRadixSort
- Added min, max, arg-min, and arg-max to DeviceReduce
- Added DeviceSelect (select-unique, select-if, and select-flagged)
- Added DevicePartition (partition-if, partition-flagged)
- Added generic cub::ShuffleUp(), cub::ShuffleDown(), and cub::ShuffleIndex() for warp-wide communication of arbitrary data types (SM3x+)
- Added cub::MaxSmOccupancy() for accurately determining SM occupancy for any given kernel function pointer
- Performance
- Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
- Interface changes:
- Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces. The CacheModifiedInputIterator and CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
- Rename device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
- Documentation and testing:
- Added simple examples of device-wide methods
- Improved doxygen documentation and example snippets
- Improved test coverage to include up to 21,000 kernel variants and 851,000 unit tests (per architecture, per platform)
- Bug fixes
- Fixed misc DeviceScan, BlockScan, DeviceReduce, and BlockReduce bugs when operating on non-primitive types for older architectures SM10-SM13
- Fixed DeviceScan / WarpReduction bug: SHFL-based segmented reduction producing incorrect results for multi-word types (size > 4B) on Linux
- Fixed BlockScan bug: For warpscan-based scans, not all threads in the first warp were entering the prefix callback functor
- Fixed DeviceRadixSort bug: race condition with key-value pairs for pre-SM35 architectures
- Fixed DeviceRadixSort bug: incorrect bitfield-extract behavior with long keys on 64-bit Linux
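The device-wide entry points referenced throughout this change log share the two-phase invocation pattern and the renamed debug_synchronous flag. A minimal sketch of that pattern, using cub::DeviceReduce::Sum (the wrapper name SumExample is hypothetical):

    #include <cub/cub.cuh>

    // Two-phase device-wide pattern: a first call with d_temp_storage == NULL
    // only writes the required scratch size; a second call does the work.
    // debug_synchronous (formerly stream_synchronous) syncs after each launch.
    void SumExample(const int *d_in, int *d_out, int num_items)
    {
        void   *d_temp_storage     = NULL;
        size_t  temp_storage_bytes = 0;
        // Phase 1: size query
        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        // Phase 2: run the reduction on stream 0 with verbose error checking
        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items,
                               0, true /* debug_synchronous */);
        cudaFree(d_temp_storage);
    }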
xgboost/cub/cub/agent/agent_rle.cuh
typedef LengthOffsetPair WarpAggregates[WARPS];
// Shared memory type for this threadblock
struct _TempStorage
{
union
{
struct
{
typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection
typename WarpScanPairs::TempStorage warp_scan[WARPS]; // Smem needed for warp-synchronous scans
Uninitialized<LengthOffsetPair[WARPS]> warp_aggregates; // Smem needed for sharing warp-wide aggregates
typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback
};
// Smem needed for input loading
typename BlockLoadT::TempStorage load;
// Smem needed for two-phase scatter
union
{
xgboost/cub/cub/block/block_exchange.cuh
/// CUB namespace
namespace cub {
/**
* \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
* \ingroup BlockModule
*
* \tparam T The data type to be exchanged.
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread.
* \tparam WARP_TIME_SLICING <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the ex...
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
*
* \par Overview
* - It is commonplace for blocks of threads to rearrange data items between
* threads. For example, the device-accessible memory subsystem prefers access patterns
* where data items are "striped" across threads (where consecutive threads access consecutive items),
* yet most block-wide operations prefer a "blocked" partitioning of items across threads
* (where consecutive items belong to a single thread).
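A minimal sketch of the striped-to-blocked conversion described above, following the usual CUB usage pattern (the kernel name is hypothetical): 128 threads each hold 4 ints loaded in a striped arrangement and exchange them into a blocked arrangement.

    #include <cub/cub.cuh>

    __global__ void StripedToBlockedKernel(int *d_data)
    {
        // Specialize for 128 threads owning 4 integer items each
        typedef cub::BlockExchange<int, 128, 4> BlockExchangeT;
        __shared__ typename BlockExchangeT::TempStorage temp_storage;

        // Striped load: thread i holds items i, i+128, i+256, i+384
        int items[4];
        cub::LoadDirectStriped<128>(threadIdx.x, d_data, items);

        // Rearrange so thread i holds consecutive items 4*i .. 4*i+3
        BlockExchangeT(temp_storage).StripedToBlocked(items);

        // Blocked store back to device memory
        cub::StoreDirectBlocked(threadIdx.x, d_data, items);
    }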
xgboost/cub/cub/block/block_raking_layout.cuh
{
//---------------------------------------------------------------------
// Constants and type definitions
//---------------------------------------------------------------------
enum
{
/// The total number of elements that need to be cooperatively reduced
SHARED_ELEMENTS = BLOCK_THREADS,
/// Maximum number of warp-synchronous raking threads
MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
/// Number of raking elements per warp-synchronous raking thread (rounded up)
SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
/// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
/// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
/// Degree of bank conflicts (e.g., 4-way)
CONFLICT_DEGREE = (HAS_CONFLICTS) ?
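To make the enum arithmetic concrete, a host-side sketch (illustrative only; it simply mirrors the formulas above) for the 62-thread example from the comment, assuming 32-lane warps:

    #include <cstdio>

    int main()
    {
        const int BLOCK_THREADS      = 62;
        const int WARP_THREADS       = 32;
        const int SHARED_ELEMENTS    = BLOCK_THREADS;
        const int MAX_RAKING_THREADS = (BLOCK_THREADS < WARP_THREADS) ? BLOCK_THREADS : WARP_THREADS;
        // Raking elements per thread, rounded up: ceil(62 / 32) = 2
        const int SEGMENT_LENGTH     = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS;
        // Only threads that will have valid data: ceil(62 / 2) = 31
        const int RAKING_THREADS     = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH;
        printf("SEGMENT_LENGTH = %d, RAKING_THREADS = %d\n", SEGMENT_LENGTH, RAKING_THREADS);
        return 0;
    }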
xgboost/cub/cub/block/block_reduce.cuh
* An efficient "raking" reduction algorithm that only supports commutative
* reduction operators (true for most operations, e.g., addition).
*
* \par
* Execution is comprised of three phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Threads in warps other than the first warp place
* their partial reductions into shared memory.
* -# Upsweep sequential reduction in shared memory. Threads within the first
* warp continue to accumulate by raking across segments of shared partial reductions
* -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
*
* \par
* \image html block_reduce.png
* <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
*
* \par Performance Considerations
* - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE
* and is preferable when the reduction operator is commutative. This variant
* applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
* throughput across the GPU when suitably occupied. However, turn-around latency may be
xgboost/cub/cub/block/block_reduce.cuh
* (e.g., addition) and non-commutative (e.g., string concatenation) reduction
* operators. \blocked.
*
* \par
* Execution is comprised of three phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Each thread then places the partial reduction
* of its item(s) into shared memory.
* -# Upsweep sequential reduction in shared memory. Threads within a
* single warp rake across segments of shared partial reductions.
* -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
*
* \par
* \image html block_reduce.png
* <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
*
* \par Performance Considerations
* - This variant performs more communication than BLOCK_REDUCE_RAKING
* and is only preferable when the reduction operator is non-commutative. This variant
* applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
* throughput across the GPU when suitably occupied. However, turn-around latency may be
xgboost/cub/cub/block/block_reduce.cuh
* \par Overview
* A quick "tiled warp-reductions" reduction algorithm that supports commutative
* (e.g., addition) and non-commutative (e.g., string concatenation) reduction
* operators.
*
* \par
* Execution is comprised of three phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Each thread then places the partial reduction
* of its item(s) into shared memory.
* -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
* reduction within each warp.
* -# A propagation phase where the warp reduction outputs in each warp are
* updated with the aggregate from each preceding warp.
*
* \par
* \image html block_scan_warpscans.png
* <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
*
* \par Performance Considerations
* - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
xgboost/cub/cub/block/block_reduce.cuh
* - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
* uses a binary combining operator to compute a single aggregate from a list of input elements.
* - \rowmajor
* - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
* -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
* -# <b>cub::BLOCK_REDUCE_RAKING</b>. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
* -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
*
* \par Performance Considerations
* - \granularity
* - Very efficient (only one synchronization barrier).
* - Incurs zero bank conflicts for most types
* - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
* - Summation (<b><em>vs.</em></b> generic reduction)
* - \p BLOCK_THREADS is a multiple of the architecture's warp size
* - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
* - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
*
* \par A Simple Example
* \blockcollective{BlockReduce}
* \par
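The simple-example section is truncated in this excerpt; as a stand-in, a minimal sketch of the usual BlockReduce pattern (128 threads, 4 items each, block-wide sum delivered to thread 0; the kernel name is hypothetical):

    #include <cub/cub.cuh>

    __global__ void BlockSumKernel(int *d_in, int *d_out)
    {
        // Specialize BlockReduce for 128 threads of int items
        typedef cub::BlockReduce<int, 128> BlockReduceT;
        __shared__ typename BlockReduceT::TempStorage temp_storage;

        // Each thread owns a blocked segment of 4 consecutive items
        int thread_data[4];
        cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data);

        // Cooperative sum; only thread 0 receives the valid aggregate
        int aggregate = BlockReduceT(temp_storage).Sum(thread_data);
        if (threadIdx.x == 0)
            *d_out = aggregate;
    }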
xgboost/cub/cub/block/block_scan.cuh
* \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
*/
enum BlockScanAlgorithm
{
/**
* \par Overview
* An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases:
* -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
* -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions.
* -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
* -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output.
* -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
*
* \par
* \image html block_scan_raking.png
* <div class="centercaption">\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
*
* \par Performance Considerations
* - Although this variant may suffer longer turnaround latencies when the
* GPU is under-occupied, it can often provide higher overall throughput
* across the GPU when suitably occupied.
*/
BLOCK_SCAN_RAKING,
/**
* \par Overview
* Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at
* the expense of higher register pressure. Raking threads preserve their
* "upsweep" segment of values in registers while performing warp-synchronous
* scan, allowing the "downsweep" not to re-read them from shared memory.
*/
BLOCK_SCAN_RAKING_MEMOIZE,
/**
* \par Overview
* A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases:
* -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
* -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
* -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp.
* -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
*
* \par
* \image html block_scan_warpscans.png
* <div class="centercaption">\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
*
* \par Performance Considerations
* - Although this variant may suffer lower overall throughput across the
* GPU due to a heavy reliance on inefficient warpscans, it can
xgboost/cub/cub/block/block_scan.cuh
* the <em>i</em><sup>th</sup> output reduction.
* - \rowmajor
* - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
* -# <b>cub::BLOCK_SCAN_RAKING</b>. An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
* -# <b>cub::BLOCK_SCAN_RAKING_MEMOIZE</b>. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
* -# <b>cub::BLOCK_SCAN_WARP_SCANS</b>. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
*
* \par Performance Considerations
* - \granularity
* - Uses special instructions when applicable (e.g., warp \p SHFL)
* - Uses synchronization-free communication between warp lanes when applicable
* - Invokes a minimal number of minimal block-wide synchronization barriers (only
* one or two depending on algorithm selection)
* - Incurs zero bank conflicts for most types
* - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
* - Prefix sum variants (<b><em>vs.</em></b> generic scan)
* - \blocksize
* - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
*
* \par A Simple Example
* \blockcollective{BlockScan}
* \par
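The simple-example section is likewise truncated here; a minimal sketch of the usual BlockScan pattern (an exclusive prefix sum over 128 threads, one item per thread; the kernel name is hypothetical):

    #include <cub/cub.cuh>

    __global__ void BlockPrefixSumKernel(int *d_data)
    {
        // Specialize BlockScan for 128 threads of int items
        typedef cub::BlockScan<int, 128> BlockScanT;
        __shared__ typename BlockScanT::TempStorage temp_storage;

        int thread_data = d_data[threadIdx.x];

        // Exclusive scan: thread i receives the sum of items 0 .. i-1
        BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data);

        d_data[threadIdx.x] = thread_data;
    }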
xgboost/cub/cub/block/specializations/block_reduce_raking.cuh
/// WarpReduce utility type
typedef typename WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce;
/// Constants
enum
{
/// Number of raking threads
RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
/// Number of raking elements per warp synchronous raking thread
SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
/// Cooperative work can be entirely warp synchronous
WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),
/// Whether or not warp-synchronous reduction should be unguarded (i.e., the number of warp-reduction elements is a power of two)
WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE,
/// Whether or not accesses into smem are unguarded
RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED,
};
/// Shared memory storage layout type
union _TempStorage
{
typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction
typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
// Thread fields
_TempStorage &temp_storage;
xgboost/cub/cub/block/specializations/block_reduce_raking.cuh
template <
bool IS_FULL_TILE,
typename ReductionOp>
__device__ __forceinline__ T Reduce(
T partial, ///< [in] Calling thread's input partial reductions
int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE, SEGMENT_LENGTH>(
partial,
num_valid,
reduction_op);
}
else
{
// Place partial into shared memory grid.
*BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
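For intuition about the WARP_SYNCHRONOUS short-circuit above, a free-standing sketch of a shuffle-based warp reduction (illustrative only; CUB's WarpReduce specializations are more general and also handle partial warps and non-power-of-two cases):

    // Each of the 32 lanes contributes one value; lane 0 ends with the sum.
    __device__ int WarpSum(int value)
    {
        for (int offset = 16; offset > 0; offset >>= 1)
            value += __shfl_down(value, offset);  // pre-CUDA-9 shuffle intrinsic, matching this era of CUB
        return value;  // valid in lane 0
    }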
xgboost/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
/// Whether or not to use fall-back
USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)),
/// Number of raking threads
RAKING_THREADS = WARP_THREADS,
/// Number of threads actually sharing items with the raking threads
SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
/// Number of raking elements per warp synchronous raking thread
SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
};
/// WarpReduce utility type
typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce;
/// Layout type for padded thread block raking grid
typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout;
/// Shared memory storage layout type
struct _TempStorage
{
union
{
struct
{
typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction
typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid
};
typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block reduction
};
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
xgboost/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
};
/// WarpReduce utility type
typedef typename WarpReduce<T, LOGICAL_WARP_SIZE, PTX_ARCH>::InternalWarpReduce WarpReduce;
/// Shared memory storage layout type
struct _TempStorage
{
typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan
T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan
T block_prefix; ///< Shared prefix for the entire threadblock
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
// Thread fields
_TempStorage &temp_storage;
unsigned int linear_tid;
xgboost/cub/cub/block/specializations/block_scan_raking.cuh
/// Layout type for padded threadblock raking grid
typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
/// Constants
enum
{
/// Number of raking threads
RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
/// Number of raking elements per warp synchronous raking thread
SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
/// Cooperative work can be entirely warp synchronous
WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
};
/// WarpScan utility type
typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;
/// Shared memory storage layout type
struct _TempStorage
{
typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan
typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid
T block_aggregate; ///< Block aggregate
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
xgboost/cub/cub/block/specializations/block_scan_raking.cuh
/// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item
T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op) ///< [in] Binary scan operator
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp-synchronous scan
WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op);
}
else
{
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
CTA_SYNC();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction across shared partials
T upsweep_partial = Upsweep(scan_op);
// Warp-synchronous scan
T exclusive_partial;
WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
// Exclusive raking downsweep scan
ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
}
CTA_SYNC();
// Grab thread prefix from shared memory
xgboost/cub/cub/block/specializations/block_scan_raking.cuh
/// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element.
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input items
T &output, ///< [out] Calling thread's output items (may be aliased to \p input)
const T &initial_value, ///< [in] Initial value to seed the exclusive scan
ScanOp scan_op) ///< [in] Binary scan operator
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp-synchronous scan
WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op);
}
else
{
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
CTA_SYNC();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction across shared partials
T upsweep_partial = Upsweep(scan_op);
// Exclusive Warp-synchronous scan
T exclusive_partial;
WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op);
// Exclusive raking downsweep scan
ExclusiveDownsweep(scan_op, exclusive_partial);
}
CTA_SYNC();
// Grab exclusive partial from shared memory
xgboost/cub/cub/block/specializations/block_scan_raking.cuh
/// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial v...
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp-synchronous scan
WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
}
else
{
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
CTA_SYNC();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction across shared partials
T upsweep_partial = Upsweep(scan_op);
// Warp-synchronous scan
T inclusive_partial;
T exclusive_partial;
WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
// Exclusive raking downsweep scan
ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
// Broadcast aggregate to all threads
if (linear_tid == RAKING_THREADS - 1)
temp_storage.block_aggregate = inclusive_partial;
xgboost/cub/cub/block/specializations/block_scan_raking.cuh
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input items
T &output, ///< [out] Calling thread's output items (may be aliased to \p input)
const T &initial_value, ///< [in] Initial value to seed the exclusive scan
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp-synchronous scan
WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
}
else
{
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
CTA_SYNC();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction across shared partials
T upsweep_partial = Upsweep(scan_op);
// Warp-synchronous scan
T exclusive_partial;
WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate);
// Exclusive raking downsweep scan
ExclusiveDownsweep(scan_op, exclusive_partial);
// Broadcast aggregate to other threads
if (linear_tid == 0)
temp_storage.block_aggregate = block_aggregate;
}
xgboost/cub/cub/block/specializations/block_scan_raking.cuh
typename ScanOp,
typename BlockPrefixCallbackOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp-synchronous scan
T block_aggregate;
WarpScan warp_scan(temp_storage.warp_scan);
warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate);
// Obtain warp-wide prefix in lane0, then broadcast to other lanes
T block_prefix = block_prefix_callback_op(block_aggregate);
block_prefix = warp_scan.Broadcast(block_prefix, 0);
output = scan_op(block_prefix, output);
if (linear_tid == 0)
xgboost/cub/cub/block/specializations/block_scan_raking.cuh
CTA_SYNC();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
WarpScan warp_scan(temp_storage.warp_scan);
// Raking upsweep reduction across shared partials
T upsweep_partial = Upsweep(scan_op);
// Warp-synchronous scan
T exclusive_partial, block_aggregate;
warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
// Obtain block-wide prefix in lane0, then broadcast to other lanes
T block_prefix = block_prefix_callback_op(block_aggregate);
block_prefix = warp_scan.Broadcast(block_prefix, 0);
// Update prefix with warpscan exclusive partial
T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
if (linear_tid == 0)
xgboost/cub/cub/block/specializations/block_scan_raking.cuh
/// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element.
template <typename ScanOp>
__device__ __forceinline__ void InclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op) ///< [in] Binary scan operator
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp-synchronous scan
WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
}
else
{
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
CTA_SYNC();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction across shared partials
T upsweep_partial = Upsweep(scan_op);
// Exclusive Warp-synchronous scan
T exclusive_partial;
WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
// Inclusive raking downsweep scan
InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
}
CTA_SYNC();
// Grab thread prefix from shared memory
xgboost/cub/cub/block/specializations/block_scan_raking.cuh
/// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <typename ScanOp>
__device__ __forceinline__ void InclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp-synchronous scan
WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
}
else
{
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
CTA_SYNC();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction across shared partials
T upsweep_partial = Upsweep(scan_op);
// Warp-synchronous scan
T inclusive_partial;
T exclusive_partial;
WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
// Inclusive raking downsweep scan
InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
// Broadcast aggregate to all threads
if (linear_tid == RAKING_THREADS - 1)
temp_storage.block_aggregate = inclusive_partial;
xgboost/cub/cub/block/specializations/block_scan_raking.cuh
typename ScanOp,
typename BlockPrefixCallbackOp>
__device__ __forceinline__ void InclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp-synchronous scan
T block_aggregate;
WarpScan warp_scan(temp_storage.warp_scan);
warp_scan.InclusiveScan(input, output, scan_op, block_aggregate);
// Obtain warp-wide prefix in lane0, then broadcast to other lanes
T block_prefix = block_prefix_callback_op(block_aggregate);
block_prefix = warp_scan.Broadcast(block_prefix, 0);
// Update prefix with exclusive warpscan partial
output = scan_op(block_prefix, output);
xgboost/cub/cub/block/specializations/block_scan_raking.cuh
CTA_SYNC();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
WarpScan warp_scan(temp_storage.warp_scan);
// Raking upsweep reduction across shared partials
T upsweep_partial = Upsweep(scan_op);
// Warp-synchronous scan
T exclusive_partial, block_aggregate;
warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
// Obtain block-wide prefix in lane0, then broadcast to other lanes
T block_prefix = block_prefix_callback_op(block_aggregate);
block_prefix = warp_scan.Broadcast(block_prefix, 0);
// Update prefix with warpscan exclusive partial
T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
if (linear_tid == 0)
xgboost/cub/cub/block/specializations/block_scan_warp_scans.cuh
typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
/// WarpScan utility type
typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScan;
/// Shared memory storage layout type
struct __align__(32) _TempStorage
{
T warp_aggregates[WARPS];
typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans
T block_prefix; ///< Shared prefix for the entire threadblock
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
xgboost/cub/cub/block/specializations/block_scan_warp_scans2.cuh
/// WarpScan utility type
typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
/// WarpScan utility type
typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScanT;
/// Shared memory storage layout type
struct _TempStorage
{
typename WarpAggregateScanT::TempStorage inner_scan[WARPS]; ///< Buffer for warp-synchronous scans
typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans
T warp_aggregates[WARPS];
T block_prefix; ///< Shared prefix for the entire threadblock
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
xgboost/cub/cub/block/specializations/block_scan_warp_scans3.cuh
typedef WarpScan<T, INNER_WARP_THREADS, PTX_ARCH> InnerWarpScanT;
typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS];
/// Shared memory storage layout type
struct _TempStorage
{
union
{
Uninitialized<OuterScanArray> outer_warp_scan; ///< Buffer for warp-synchronous outer scans
typename InnerWarpScanT::TempStorage inner_warp_scan; ///< Buffer for warp-synchronous inner scan
};
T warp_aggregates[OUTER_WARPS];
T block_aggregate; ///< Shared aggregate for the entire threadblock
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
xgboost/cub/cub/device/device_histogram.cuh
static cudaError_t HistogramEven(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples.
CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is <tt>num_levels</tt> - 1.
LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
OffsetT num_samples, ///< [in] The number of input samples (i.e., the length of \p d_samples)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
/// The sample value type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
CounterT* d_histogram1[1] = {d_histogram};
int num_levels1[1] = {num_levels};
LevelT lower_level1[1] = {lower_level};
LevelT upper_level1[1] = {upper_level};
return MultiHistogramEven<1, 1>(
xgboost/cub/cub/device/device_histogram.cuh
temp_storage_bytes,
d_samples,
d_histogram1,
num_levels1,
lower_level1,
upper_level1,
num_samples,
1,
sizeof(SampleT) * num_samples,
stream,
debug_synchronous);
}
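A minimal usage sketch for the overload above (the wrapper name HistogramExample is hypothetical): six equal-width bins over float samples in [0, 12).

    #include <cub/cub.cuh>

    void HistogramExample(const float *d_samples, int *d_histogram, int num_samples)
    {
        void   *d_temp_storage     = NULL;
        size_t  temp_storage_bytes = 0;
        int     num_levels  = 7;      // 7 boundaries delineate 6 bins
        float   lower_level = 0.0f;   // inclusive lower bound of bin 0
        float   upper_level = 12.0f;  // exclusive upper bound of bin 5

        // Size query, allocation, then the actual histogram pass
        cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
            d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
            d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
        cudaFree(d_temp_storage);
    }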
/**
* \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
*
* \par
* - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
* using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
* - The row stride must be a whole multiple of the sample data type
xgboost/cub/cub/device/device_histogram.cuh
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples.
CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is <tt>num_levels</tt> - 1.
LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
CounterT* d_histogram1[1] = {d_histogram};
int num_levels1[1] = {num_levels};
LevelT lower_level1[1] = {lower_level};
LevelT upper_level1[1] = {upper_level};
return MultiHistogramEven<1, 1>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram1,
num_levels1,
lower_level1,
upper_level1,
num_row_samples,
num_rows,
row_stride_bytes,
stream,
debug_synchronous);
}
/**
* \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
*
* \par
* - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
* a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
* - Of the \p NUM_CHANNELS specified, the function will only compute histograms
* for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
xgboost/cub/cub/device/device_histogram.cuh
static cudaError_t MultiHistogramEven(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels wher...
CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> s...
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num...
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
/// The sample value type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram,
num_levels,
lower_level,
upper_level,
num_pixels,
1,
sizeof(SampleT) * NUM_CHANNELS * num_pixels,
stream,
debug_synchronous);
}
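A minimal sketch of the multi-channel overload above (wrapper name hypothetical): per-channel RGB histograms from interleaved 8-bit RGBA pixels, with the alpha channel ignored.

    #include <cub/cub.cuh>

    void RgbHistogramExample(const unsigned char *d_rgba, int *d_histogram[3], int num_pixels)
    {
        void   *d_temp_storage     = NULL;
        size_t  temp_storage_bytes = 0;
        int     num_levels[3]  = {257, 257, 257};  // 256 one-value-wide bins per channel
        int     lower_level[3] = {0, 0, 0};
        int     upper_level[3] = {256, 256, 256};

        // <4 channels per pixel, first 3 active>: R, G, B binned; A skipped
        cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
            d_rgba, d_histogram, num_levels, lower_level, upper_level, num_pixels);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
            d_rgba, d_histogram, num_levels, lower_level, upper_level, num_pixels);
        cudaFree(d_temp_storage);
    }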
/**
* \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
*
* \par
* - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
* a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
* - Of the \p NUM_CHANNELS specified, the function will only compute histograms
xgboost/cub/cub/device/device_histogram.cuh
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels wher...
CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> s...
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num...
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
/// The sample value type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
Int2Type<sizeof(SampleT) == 1> is_byte_sample;
if ((sizeof(OffsetT) > sizeof(int)) &&
((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
{
// Down-convert OffsetT data type
return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven(
d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
(int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
stream, debug_synchronous, is_byte_sample);
}
return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven(
d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
stream, debug_synchronous, is_byte_sample);
}
//@} end member group
/******************************************************************//**
* \name Custom bin ranges
*********************************************************************/
//@{
/**
xgboost/cub/cub/device/device_histogram.cuh
CUB_RUNTIME_FUNCTION
static cudaError_t HistogramRange(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples.
CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is <tt>num_levels</tt> - 1.
LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample valu...
OffsetT num_samples, ///< [in] The number of data samples per row in the region of interest
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
/// The sample value type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
CounterT* d_histogram1[1] = {d_histogram};
int num_levels1[1] = {num_levels};
LevelT* d_levels1[1] = {d_levels};
return MultiHistogramRange<1, 1>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram1,
num_levels1,
d_levels1,
num_samples,
1,
sizeof(SampleT) * num_samples,
stream,
debug_synchronous);
}
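A minimal sketch for the custom-boundary overload above (wrapper name hypothetical): d_levels supplies num_levels boundaries, so bin i covers [d_levels[i], d_levels[i+1]).

    #include <cub/cub.cuh>

    void RangeHistogramExample(const float *d_samples, int *d_histogram,
                               float *d_levels, int num_levels, int num_samples)
    {
        void   *d_temp_storage     = NULL;
        size_t  temp_storage_bytes = 0;

        // Size query, allocation, then the actual histogram pass
        cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
            d_samples, d_histogram, num_levels, d_levels, num_samples);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
            d_samples, d_histogram, num_levels, d_levels, num_samples);
        cudaFree(d_temp_storage);
    }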
/**
* \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
*
* \par
* - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
* using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
* - The row stride must be a whole multiple of the sample data type
xgboost/cub/cub/device/device_histogram.cuh
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples.
CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is <tt>num_levels</tt> - 1.
LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample valu...
OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
CounterT* d_histogram1[1] = {d_histogram};
int num_levels1[1] = {num_levels};
LevelT* d_levels1[1] = {d_levels};
return MultiHistogramRange<1, 1>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram1,
num_levels1,
d_levels1,
num_row_samples,
num_rows,
row_stride_bytes,
stream,
debug_synchronous);
}
/**
* \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
*
* \par
* - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
* a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
* - Of the \p NUM_CHANNELS specified, the function will only compute histograms
* for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
xgboost/cub/cub/device/device_histogram.cuh
CUB_RUNTIME_FUNCTION
static cudaError_t MultiHistogramRange(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> shoul...
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
/// The sample value type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram,
num_levels,
d_levels,
num_pixels,
1,
sizeof(SampleT) * NUM_CHANNELS * num_pixels,
stream,
debug_synchronous);
}
/**
* \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
*
* \par
* - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
* a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
* - Of the \p NUM_CHANNELS specified, the function will only compute histograms
xgboost/cub/cub/device/device_histogram.cuh view on Meta::CPAN
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> shoul...
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
/// The sample value type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
Int2Type<sizeof(SampleT) == 1> is_byte_sample;
if ((sizeof(OffsetT) > sizeof(int)) &&
((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
{
// Down-convert OffsetT data type
return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
(int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
stream, debug_synchronous, is_byte_sample);
}
return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
stream, debug_synchronous, is_byte_sample);
}
//@} end member group
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/device_partition.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Flagged(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
int num_items, ///< [in] Total number of items to select from
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef NullType SelectOp; // Selection op (not used)
typedef NullType EqualityOp; // Equality operator (not used)
return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_flags,
d_out,
d_num_selected_out,
SelectOp(),
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
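
A minimal sketch of Flagged under the same two-phase idiom; names and sizes are illustrative, and the input-filling step is elided:

#include <cub/cub.cuh>

void partition_flagged_sketch()
{
    const int num_items = 8;
    int *d_in, *d_out, *d_num_selected_out;
    char *d_flags;                      // nonzero flag => item goes to the first partition
    cudaMalloc(&d_in, num_items * sizeof(int));
    cudaMalloc(&d_out, num_items * sizeof(int));
    cudaMalloc(&d_flags, num_items);
    cudaMalloc(&d_num_selected_out, sizeof(int));
    // ... fill d_in and d_flags ...

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out, num_items);
    // d_out: selected items first (stable); unselected items at the rear in reverse order.
}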
/**
* \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out.
*
* \par
* - Copies of the selected items are compacted into \p d_out and maintain their original
* relative ordering, however copies of the unselected items are compacted into the
* rear of \p d_out in reverse order.
xgboost/cub/cub/device/device_partition.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t If(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
int num_items, ///< [in] Total number of items to select from
SelectOp select_op, ///< [in] Unary selection operator
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef NullType* FlagIterator; // FlagT iterator type (not used)
typedef NullType EqualityOp; // Equality operator (not used)
return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
NULL,
d_out,
d_num_selected_out,
select_op,
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
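
A sketch of If with a hypothetical unary predicate; LessThan is not part of this header, and any functor callable on the device works:

#include <cub/cub.cuh>

// Hypothetical selection functor for the sketch below.
struct LessThan
{
    int compare;
    __host__ __device__ explicit LessThan(int c) : compare(c) {}
    __host__ __device__ bool operator()(const int &a) const { return a < compare; }
};

void partition_if_sketch()
{
    const int num_items = 8;
    int *d_in, *d_out, *d_num_selected_out;
    cudaMalloc(&d_in, num_items * sizeof(int));
    cudaMalloc(&d_out, num_items * sizeof(int));
    cudaMalloc(&d_num_selected_out, sizeof(int));
    // ... fill d_in ...

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DevicePartition::If(d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, num_items, LessThan(7));
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DevicePartition::If(d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, num_items, LessThan(7));
}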
};
/**
* \example example_device_partition_flagged.cu
* \example example_device_partition_if.cu
*/
} // CUB namespace
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort
KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data
const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items
ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
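
A sketch of the ~2N SortPairs overload above, which copies from distinct input buffers to distinct output buffers; the DoubleBuffer overload further below trades this for ping-pong buffers. Buffer names and sizes are illustrative:

#include <cub/cub.cuh>

void radix_sort_pairs_sketch()
{
    const int num_items = 7;
    int   *d_keys_in, *d_keys_out;
    float *d_values_in, *d_values_out;
    cudaMalloc(&d_keys_in,    num_items * sizeof(int));
    cudaMalloc(&d_keys_out,   num_items * sizeof(int));
    cudaMalloc(&d_values_in,  num_items * sizeof(float));
    cudaMalloc(&d_values_out, num_items * sizeof(float));
    // ... fill d_keys_in / d_values_in ...

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
}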
/**
* \brief Sorts key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
*
* \par
* - The sorting operation is given a pair of key buffers and a corresponding
* pair of associated value buffers. Each pair is managed by a DoubleBuffer
* structure that indicates which of the two buffers is "current" (and thus
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t SortPairs(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
/**
* \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
*
* \par
* - The contents of the input data are not altered by the sorting operation
* - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
* - \devicestorageNP For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort
KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data
const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items
ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
/**
* \brief Sorts key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
*
* \par
* - The sorting operation is given a pair of key buffers and a corresponding
* pair of associated value buffers. Each pair is managed by a DoubleBuffer
* structure that indicates which of the two buffers is "current" (and thus
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t SortPairsDescending(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
//@} end member group
/******************************************************************//**
* \name Keys-only
*********************************************************************/
//@{
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t SortKeys(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort
KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
// Null value type
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<NullType> d_values;
return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
/**
* \brief Sorts keys into ascending order. (~<em>N </em>auxiliary storage required).
*
* \par
* - The sorting operation is given a pair of key buffers managed by a
* DoubleBuffer structure that indicates which of the two buffers is
* "current" (and thus contains the input data to be sorted).
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
template <typename KeyT>
CUB_RUNTIME_FUNCTION
static cudaError_t SortKeys(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted out...
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
// Null value type
DoubleBuffer<NullType> d_values;
return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
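
A sketch of the DoubleBuffer keys-only interface; after the call, the buffer selector indicates which of the two ping-pong buffers holds the sorted output. Names and sizes are illustrative:

#include <cub/cub.cuh>

void radix_sort_keys_sketch()
{
    const int num_items = 7;
    int *d_key_buf, *d_key_alt_buf;     // ping-pong buffers
    cudaMalloc(&d_key_buf,     num_items * sizeof(int));
    cudaMalloc(&d_key_alt_buf, num_items * sizeof(int));
    // ... fill d_key_buf with the unsorted keys ...

    cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
    int *d_sorted_keys = d_keys.Current();   // selector points at the sorted buffer
    (void) d_sorted_keys;
}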
/**
* \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required).
*
* \par
* - The contents of the input data are not altered by the sorting operation
* - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
* - \devicestorageNP For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
* - \devicestorage
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t SortKeysDescending(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort
KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<NullType> d_values;
return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
/**
* \brief Sorts keys into descending order. (~<em>N </em>auxiliary storage required).
*
* \par
* - The sorting operation is given a pair of key buffers managed by a
* DoubleBuffer structure that indicates which of the two buffers is
* "current" (and thus contains the input data to be sorted).
xgboost/cub/cub/device/device_radix_sort.cuh view on Meta::CPAN
template <typename KeyT>
CUB_RUNTIME_FUNCTION
static cudaError_t SortKeysDescending(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted out...
int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
// Null value type
DoubleBuffer<NullType> d_values;
return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
//@} end member group
};
/**
* \example example_device_radix_sort.cu
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t Reduce(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
ReductionOpT reduction_op, ///< [in] Binary reduction functor
T init, ///< [in] Initial value of the reduction
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_items,
reduction_op,
init,
stream,
debug_synchronous);
}
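
A sketch of Reduce with an illustrative commutative functor (CustomMin is not part of this header) and an explicit initial value:

#include <cub/cub.cuh>
#include <limits>

// Illustrative commutative reduction functor.
struct CustomMin
{
    template <typename T>
    __host__ __device__ T operator()(const T &a, const T &b) const
    { return (b < a) ? b : a; }
};

void reduce_sketch()
{
    const int num_items = 7;
    float *d_in, *d_out;
    cudaMalloc(&d_in,  num_items * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    // ... fill d_in ...

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, CustomMin(), std::numeric_limits<float>::max());
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, CustomMin(), std::numeric_limits<float>::max());
}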
/**
* \brief Computes a device-wide sum using the addition (\p +) operator.
*
* \par
* - Uses \p 0 as the initial value of the reduction.
* - Does not support \p + operators that are non-commutative.
* - \devicestorage
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t Sum(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_items,
cub::Sum(),
OutputT(), // zero-initialize
stream,
debug_synchronous);
}
/**
* \brief Computes a device-wide minimum using the less-than ('<') operator.
*
* \par
* - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction.
* - Does not support \p < operators that are non-commutative.
* - \devicestorage
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t Min(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_items,
cub::Min(),
Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
stream,
debug_synchronous);
}
/**
* \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
*
* \par
* - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
* - The minimum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
* - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t ArgMin(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
// The output tuple type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_indexed_in,
d_out,
num_items,
cub::ArgMin(),
initial_value,
stream,
debug_synchronous);
}
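
A sketch of ArgMin; the result is a single cub::KeyValuePair<int, float> whose .key holds the offset and .value the minimum (buffer names illustrative):

#include <cub/cub.cuh>

void argmin_sketch()
{
    const int num_items = 7;
    float *d_in;
    cub::KeyValuePair<int, float> *d_argmin;
    cudaMalloc(&d_in, num_items * sizeof(float));
    cudaMalloc(&d_argmin, sizeof(cub::KeyValuePair<int, float>));
    // ... fill d_in ...

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
}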
/**
* \brief Computes a device-wide maximum using the greater-than ('>') operator.
*
* \par
* - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
* - Does not support \p > operators that are non-commutative.
* - \devicestorage
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t Max(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_items,
cub::Max(),
Traits<InputT>::Lowest(), // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
stream,
debug_synchronous);
}
/**
* \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item.
*
* \par
* - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
* - The maximum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
* - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t ArgMax(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
// The output tuple type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_indexed_in,
d_out,
num_items,
cub::ArgMax(),
initial_value,
stream,
debug_synchronous);
}
/**
* \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
*
* \par
* This operation computes segmented reductions within \p d_values_in using
* the specified binary \p reduction_op functor. The segments are identified by
* "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys
UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run)
ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values
AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
ReductionOpT reduction_op, ///< [in] Binary reduction functor
int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_keys_in and \p d_values_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
// FlagT iterator type (not used)
// Selection op (not used)
// Default == operator
typedef Equality EqualityOp;
xgboost/cub/cub/device/device_reduce.cuh view on Meta::CPAN
temp_storage_bytes,
d_keys_in,
d_unique_out,
d_values_in,
d_aggregates_out,
d_num_runs_out,
EqualityOp(),
reduction_op,
num_items,
stream,
debug_synchronous);
}
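
A sketch of ReduceByKey using the library's cub::Sum functor to produce one aggregate per run of identical keys; output buffers are sized for the worst case of num_items runs (sizes illustrative):

#include <cub/cub.cuh>

void reduce_by_key_sketch()
{
    const int num_items = 8;
    int   *d_keys_in, *d_unique_out, *d_num_runs_out;
    float *d_values_in, *d_aggregates_out;
    cudaMalloc(&d_keys_in,        num_items * sizeof(int));
    cudaMalloc(&d_values_in,      num_items * sizeof(float));
    cudaMalloc(&d_unique_out,     num_items * sizeof(int));
    cudaMalloc(&d_aggregates_out, num_items * sizeof(float));
    cudaMalloc(&d_num_runs_out,   sizeof(int));
    // ... fill d_keys_in / d_values_in ...

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out,
        d_num_runs_out, cub::Sum(), num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out,
        d_num_runs_out, cub::Sum(), num_items);
}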
};
/**
* \example example_device_reduce.cu
*/
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/device_run_length_encode.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Encode(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of keys
UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run)
LengthsOutputIteratorT d_counts_out, ///< [out] Pointer to the output sequence of run-lengths (one count per run)
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs
int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef NullType* FlagIterator; // FlagT iterator type (not used)
typedef NullType SelectOp; // Selection op (not used)
typedef Equality EqualityOp; // Default == operator
typedef cub::Sum ReductionOp; // Value reduction operator
// The lengths output value type
typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ?
OffsetT, // ... then the OffsetT type,
xgboost/cub/cub/device/device_run_length_encode.cuh view on Meta::CPAN
temp_storage_bytes,
d_in,
d_unique_out,
LengthsInputIteratorT((LengthT) 1),
d_counts_out,
d_num_runs_out,
EqualityOp(),
ReductionOp(),
num_items,
stream,
debug_synchronous);
}
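
A sketch of Encode on a concrete key sequence (values illustrative):

#include <cub/cub.cuh>

void run_length_encode_sketch()
{
    const int num_items = 8;
    int h_in[num_items] = {0, 2, 2, 9, 5, 5, 5, 8};
    int *d_in, *d_unique_out, *d_counts_out, *d_num_runs_out;
    cudaMalloc(&d_in,           num_items * sizeof(int));
    cudaMalloc(&d_unique_out,   num_items * sizeof(int));
    cudaMalloc(&d_counts_out,   num_items * sizeof(int));
    cudaMalloc(&d_num_runs_out, sizeof(int));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
        d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
        d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
    // -> uniques {0,2,9,5,8}, counts {1,2,1,3,1}, num_runs 5
}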
/**
* \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in.
*
* \par
* - For the <em>i</em><sup>th</sup> non-trivial run, the run's starting offset
* and its length are written to <tt>d_offsets_out[<em>i</em>]</tt> and
* <tt>d_lengths_out[<em>i</em>]</tt>, respectively.
xgboost/cub/cub/device/device_run_length_encode.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t NonTrivialRuns(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to input sequence of data items
OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run)
LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run)
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef Equality EqualityOp; // Default == operator
return DeviceRleDispatch<InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_offsets_out,
d_lengths_out,
d_num_runs_out,
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/device_scan.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t ExclusiveSum(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
// Initial value
xgboost/cub/cub/device/device_scan.cuh view on Meta::CPAN
return DispatchScan<InputIteratorT, OutputIteratorT, Sum, OutputT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
Sum(),
init_value,
num_items,
stream,
debug_synchronous);
}
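
A sketch of ExclusiveSum on concrete values (illustrative); the first output element is the zero-initialized seed:

#include <cub/cub.cuh>

void exclusive_sum_sketch()
{
    const int num_items = 7;
    int h_in[num_items] = {8, 6, 7, 5, 3, 0, 9};
    int *d_in, *d_out;
    cudaMalloc(&d_in,  num_items * sizeof(int));
    cudaMalloc(&d_out, num_items * sizeof(int));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    // -> d_out holds {0, 8, 14, 21, 26, 29, 29}
}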
/**
* \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. The \p init_value is applied as the initial value of the scan and is assigned to <tt>*d_out</tt>.
*
* \par
* - Supports non-commutative scan operators.
* - \devicestorage
*
xgboost/cub/cub/device/device_scan.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION
static cudaError_t ExclusiveScan(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
ScanOpT scan_op, ///< [in] Binary scan functor
InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out)
int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
init_value,
num_items,
stream,
debug_synchronous);
}
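
A sketch of ExclusiveScan with an illustrative running-maximum functor (CustomMax is not part of this header); INT_MIN serves as init_value and is assigned to *d_out:

#include <cub/cub.cuh>
#include <climits>

// Illustrative binary scan functor: running maximum.
struct CustomMax
{
    __host__ __device__ int operator()(const int &a, const int &b) const
    { return (b > a) ? b : a; }
};

void exclusive_scan_sketch()
{
    const int num_items = 7;
    int *d_in, *d_out;
    cudaMalloc(&d_in,  num_items * sizeof(int));
    cudaMalloc(&d_out, num_items * sizeof(int));
    // ... fill d_in ...

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
        d_in, d_out, CustomMax(), INT_MIN, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
        d_in, d_out, CustomMax(), INT_MIN, num_items);
}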
//@} end member group
/******************************************************************//**
* \name Inclusive scans
*********************************************************************/
//@{
xgboost/cub/cub/device/device_scan.cuh view on Meta::CPAN
typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t InclusiveSum(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchScan<InputIteratorT, OutputIteratorT, Sum, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
Sum(),
NullType(),
num_items,
stream,
debug_synchronous);
}
/**
* \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
*
* \par
* - Supports non-commutative scan operators.
* - \devicestorage
*
xgboost/cub/cub/device/device_scan.cuh view on Meta::CPAN
typename ScanOpT>
CUB_RUNTIME_FUNCTION
static cudaError_t InclusiveScan(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
ScanOpT scan_op, ///< [in] Binary scan functor
int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
NullType(),
num_items,
stream,
debug_synchronous);
}
//@} end member group
};
/**
* \example example_device_scan.cu
*/
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data
const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
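
A sketch of the segmented overload above, aliasing a single (num_segments + 1)-entry offsets array for both d_begin_offsets and d_end_offsets; offsets and sizes are illustrative:

#include <cub/cub.cuh>

void segmented_sort_pairs_sketch()
{
    const int num_items = 7, num_segments = 3;
    int h_offsets[num_segments + 1] = {0, 3, 3, 7};   // segment i spans [h_offsets[i], h_offsets[i+1])
    int *d_offsets, *d_keys_in, *d_keys_out;
    float *d_values_in, *d_values_out;
    cudaMalloc(&d_offsets,    (num_segments + 1) * sizeof(int));
    cudaMalloc(&d_keys_in,    num_items * sizeof(int));
    cudaMalloc(&d_keys_out,   num_items * sizeof(int));
    cudaMalloc(&d_values_in,  num_items * sizeof(float));
    cudaMalloc(&d_values_out, num_items * sizeof(float));
    cudaMemcpy(d_offsets, h_offsets, sizeof(h_offsets), cudaMemcpyHostToDevice);
    // ... fill d_keys_in / d_values_in ...

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out,
        num_items, num_segments, d_offsets, d_offsets + 1);   // aliased begin/end offsets
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out,
        num_items, num_segments, d_offsets, d_offsets + 1);
}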
/**
* \brief Sorts segments of key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
*
* \par
* - The sorting operation is given a pair of key buffers and a corresponding
* pair of associated value buffers. Each pair is managed by a DoubleBuffer
* structure that indicates which of the two buffers is "current" (and thus
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>...
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
/**
* \brief Sorts segments of key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
*
* \par
* - The contents of the input data are not altered by the sorting operation
* - When the input is a contiguous sequence of segments, a single sequence
* \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data
const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. ...
{
// Signed integer type for global offsets
typedef int OffsetT;
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
false,
stream,
debug_synchronous);
}
/**
* \brief Sorts segments of key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
*
* \par
* - The sorting operation is given a pair of key buffers and a corresponding
* pair of associated value buffers. Each pair is managed by a DoubleBuffer
* structure that indicates which of the two buffers is "current" (and thus
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><...
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>...
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the consol...
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
true,
stream,
debug_synchronous);
}
//@} end member group
/******************************************************************//**
* \name Keys-only
*********************************************************************/
//@{
xgboost/cub/cub/device/device_segmented_radix_sort.cuh view on Meta::CPAN
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort
KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt>
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt>. If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
// Null value type
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<NullType> d_values;
return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
false,              // is_overwrite_okay: the caller's const input buffers are not modified
stream,
debug_synchronous);
}
/**
* \brief Sorts segments of keys into ascending order. (~<em>N </em>auxiliary storage required).
*
* \par
* - The sorting operation is given a pair of key buffers managed by a
* DoubleBuffer structure that indicates which of the two buffers is
* "current" (and thus contains the input data to be sorted).
 */
    CUB_RUNTIME_FUNCTION
    static cudaError_t SortKeys(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt>
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt>. If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
// Null value type
DoubleBuffer<NullType> d_values;
return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
true,               // is_overwrite_okay: both DoubleBuffer buffers may be written during sorting
stream,
debug_synchronous);
}
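    /**
     * \par Snippet
     * A sketch of the optional \p begin_bit / \p end_bit arguments, which restrict
     * key comparison to a contiguous bit range and thereby reduce the number of
     * radix passes; here only the low-order 16 key bits are significant
     * (\p d_key_buf and \p d_key_alt_buf are assumed pre-allocated device arrays).
     * \code
     * cub::DoubleBuffer<unsigned int> d_keys(d_key_buf, d_key_alt_buf);
     * void   *d_temp_storage = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceSegmentedRadixSort::SortKeys(
     *     d_temp_storage, temp_storage_bytes, d_keys,
     *     num_items, num_segments, d_offsets, d_offsets + 1,
     *     0, 16);   // begin_bit = 0 (inclusive), end_bit = 16 (exclusive)
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceSegmentedRadixSort::SortKeys(
     *     d_temp_storage, temp_storage_bytes, d_keys,
     *     num_items, num_segments, d_offsets, d_offsets + 1,
     *     0, 16);
     * \endcode
     */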
/**
* \brief Sorts segments of keys into descending order. (~<em>2N</em> auxiliary storage required).
*
* \par
* - The contents of the input data are not altered by the sorting operation
 * - When the input is a contiguous sequence of segments, a single sequence
 *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
 *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
 *   the latter is specified as <tt>segment_offsets+1</tt>).
 */
    CUB_RUNTIME_FUNCTION
    static cudaError_t SortKeysDescending(
    void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input sequence of key data to sort
KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt>
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt>. If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
DoubleBuffer<NullType> d_values;
return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
false,              // is_overwrite_okay: the caller's const input buffers are not modified
stream,
debug_synchronous);
}
/**
* \brief Sorts segments of keys into descending order. (~<em>N </em>auxiliary storage required).
*
* \par
* - The sorting operation is given a pair of key buffers managed by a
* DoubleBuffer structure that indicates which of the two buffers is
* "current" (and thus contains the input data to be sorted).
 */
    CUB_RUNTIME_FUNCTION
    static cudaError_t SortKeysDescending(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
int num_items, ///< [in] The total number of items to sort (across all segments)
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt>
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt>. If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
int begin_bit = 0, ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison
int end_bit = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
// Null value type
DoubleBuffer<NullType> d_values;
return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_begin_offsets,
d_end_offsets,
begin_bit,
end_bit,
true,               // is_overwrite_okay: both DoubleBuffer buffers may be written during sorting
stream,
debug_synchronous);
}
//@} end member group
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/device_segmented_reduce.cuh view on Meta::CPAN
    CUB_RUNTIME_FUNCTION
    static cudaError_t Reduce(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the reduction data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>. If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
ReductionOp reduction_op, ///< [in] Binary reduction functor
T initial_value, ///< [in] Initial value of the reduction for each segment
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOp>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
reduction_op,
initial_value,
stream,
debug_synchronous);
}
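    /**
     * \par Snippet
     * A minimal sketch of a segmented reduction with a user-defined binary functor,
     * assuming device arrays \p d_in, \p d_out (one aggregate per segment), and
     * \p d_offsets of <tt>num_segments+1</tt> entries (illustrative names only).
     * \code
     * struct CustomMin
     * {
     *     template <typename T>
     *     __host__ __device__ __forceinline__
     *     T operator()(const T &a, const T &b) const { return (b < a) ? b : a; }
     * };
     * CustomMin min_op;
     * int      init = INT_MAX;    // initial value for each segment's reduction
     * void   *d_temp_storage = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceSegmentedReduce::Reduce(
     *     d_temp_storage, temp_storage_bytes, d_in, d_out,
     *     num_segments, d_offsets, d_offsets + 1, min_op, init);
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceSegmentedReduce::Reduce(
     *     d_temp_storage, temp_storage_bytes, d_in, d_out,
     *     num_segments, d_offsets, d_offsets + 1, min_op, init);
     * \endcode
     */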
/**
* \brief Computes a device-wide segmented sum using the addition ('+') operator.
*
* \par
* - Uses \p 0 as the initial value of the reduction for each segment.
 * - When the input is a contiguous sequence of segments, a single sequence
 *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
 *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
 *   the latter is specified as <tt>segment_offsets+1</tt>).
 */
CUB_RUNTIME_FUNCTION
static cudaError_t Sum(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the reduction data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>. If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::Sum(),
OutputT(), // zero-initialize
stream,
debug_synchronous);
}
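    /**
     * \par Snippet
     * A minimal sketch: one sum per segment, written to \p d_out (names are
     * illustrative; \p d_offsets holds <tt>num_segments+1</tt> offsets).
     * \code
     * void   *d_temp_storage = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceSegmentedReduce::Sum(
     *     d_temp_storage, temp_storage_bytes, d_in, d_out,
     *     num_segments, d_offsets, d_offsets + 1);
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceSegmentedReduce::Sum(
     *     d_temp_storage, temp_storage_bytes, d_in, d_out,
     *     num_segments, d_offsets, d_offsets + 1);
     * \endcode
     */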
/**
* \brief Computes a device-wide segmented minimum using the less-than ('<') operator.
*
* \par
* - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction for each segment.
 * - When the input is a contiguous sequence of segments, a single sequence
 *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
 *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
 *   the latter is specified as <tt>segment_offsets+1</tt>).
 */
CUB_RUNTIME_FUNCTION
static cudaError_t Min(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the reduction data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>. If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::Min(),
Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
stream,
debug_synchronous);
}
/**
* \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item.
*
* \par
* - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
* - The minimum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
* - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
 */
CUB_RUNTIME_FUNCTION
static cudaError_t ArgMin(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the reduction data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>. If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
// The output tuple type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
        // The output value type
        typedef typename OutputTupleT::Value OutputValueT;
        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
        ArgIndexInputIteratorT d_indexed_in(d_in);
        // Initial value
        OutputTupleT initial_value(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
        return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_indexed_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::ArgMin(),
initial_value,
stream,
debug_synchronous);
}
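    /**
     * \par Snippet
     * A minimal sketch, assuming \p d_in holds \p float samples and \p d_argmin is
     * a device array of \p num_segments <tt>cub::KeyValuePair<int, float></tt>
     * elements; afterwards <tt>d_argmin[i].key</tt> is the in-segment offset of the
     * minimum and <tt>d_argmin[i].value</tt> the minimum itself (illustrative names).
     * \code
     * void   *d_temp_storage = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceSegmentedReduce::ArgMin(
     *     d_temp_storage, temp_storage_bytes, d_in, d_argmin,
     *     num_segments, d_offsets, d_offsets + 1);
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceSegmentedReduce::ArgMin(
     *     d_temp_storage, temp_storage_bytes, d_in, d_argmin,
     *     num_segments, d_offsets, d_offsets + 1);
     * \endcode
     */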
/**
* \brief Computes a device-wide segmented maximum using the greater-than ('>') operator.
*
* \par
* - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
 * - When the input is a contiguous sequence of segments, a single sequence
 *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
 *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
 *   the latter is specified as <tt>segment_offsets+1</tt>).
 */
CUB_RUNTIME_FUNCTION
static cudaError_t Max(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the reduction data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>. If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::Max(),
Traits<InputT>::Lowest(), // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
stream,
debug_synchronous);
}
/**
* \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item
*
* \par
* - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
* - The maximum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
* - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
 */
CUB_RUNTIME_FUNCTION
static cudaError_t ArgMax(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_segments, ///< [in] The number of segments that comprise the reduction data
int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>
int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>. If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
// The input type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
// The output tuple type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
        // The output value type
        typedef typename OutputTupleT::Value OutputValueT;
        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
        ArgIndexInputIteratorT d_indexed_in(d_in);
        // Initial value
        OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
        return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_indexed_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
cub::ArgMax(),
initial_value,
stream,
debug_synchronous);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/device_select.cuh view on Meta::CPAN
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Flagged(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items
NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef NullType SelectOp; // Selection op (not used)
typedef NullType EqualityOp; // Equality operator (not used)
return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_flags,
d_out,
d_num_selected_out,
SelectOp(),
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
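    /**
     * \par Snippet
     * A minimal sketch: compact the items of \p d_in whose corresponding \p d_flags
     * entry is nonzero into \p d_out, with the selected count written to the
     * single-integer device allocation \p d_num_selected_out (illustrative names).
     * \code
     * void   *d_temp_storage = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceSelect::Flagged(
     *     d_temp_storage, temp_storage_bytes,
     *     d_in, d_flags, d_out, d_num_selected_out, num_items);
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceSelect::Flagged(
     *     d_temp_storage, temp_storage_bytes,
     *     d_in, d_flags, d_out, d_num_selected_out, num_items);
     * \endcode
     */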
/**
 * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out.
*
* \par
* - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
* - \devicestorage
*
 */
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t If(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items
NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
SelectOp select_op, ///< [in] Unary selection operator
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef NullType* FlagIterator; // FlagT iterator type (not used)
typedef NullType EqualityOp; // Equality operator (not used)
return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
NULL,
d_out,
d_num_selected_out,
select_op,
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
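    /**
     * \par Snippet
     * A minimal sketch with a simple selection functor (the functor and array names
     * are illustrative): copy the integers of \p d_in that are less than 7.
     * \code
     * struct LessThan
     * {
     *     int compare;
     *     __host__ __device__ __forceinline__ LessThan(int compare) : compare(compare) {}
     *     __host__ __device__ __forceinline__ bool operator()(const int &a) const { return a < compare; }
     * };
     * LessThan select_op(7);
     * void   *d_temp_storage = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
     *     d_in, d_out, d_num_selected_out, num_items, select_op);
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
     *     d_in, d_out, d_num_selected_out, num_items, select_op);
     * \endcode
     */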
/**
 * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out.
 */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Unique(
        void*                       d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,           ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                          ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                         ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT        d_num_selected_out,            ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef int OffsetT; // Signed integer type for global offsets
typedef NullType* FlagIterator; // FlagT iterator type (not used)
typedef NullType SelectOp; // Selection op (not used)
typedef Equality EqualityOp; // Default == operator
return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
NULL,
d_out,
d_num_selected_out,
SelectOp(),
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
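    /**
     * \par Snippet
     * A minimal sketch of run-compaction (illustrative names): an input of
     * <tt>[0, 2, 2, 9, 5, 5, 5, 8]</tt> yields <tt>[0, 2, 9, 5, 8]</tt> in \p d_out
     * with <tt>*d_num_selected_out == 5</tt>.
     * \code
     * void   *d_temp_storage = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
     *     d_in, d_out, d_num_selected_out, num_items);
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
     *     d_in, d_out, d_num_selected_out, num_items);
     * \endcode
     */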
};
/**
* \example example_device_select_flagged.cu
* \example example_device_select_if.cu
* \example example_device_select_unique.cu
*/
xgboost/cub/cub/device/device_spmv.cuh view on Meta::CPAN
    CUB_RUNTIME_FUNCTION
    static cudaError_t CsrMV(
        void*               d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-based.)
ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
int num_rows, ///< [in] number of rows of matrix <b>A</b>.
int num_cols, ///< [in] number of columns of matrix <b>A</b>.
int num_nonzeros, ///< [in] number of nonzero elements of matrix <b>A</b>.
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
SpmvParams<ValueT, int> spmv_params;
spmv_params.d_values = d_values;
spmv_params.d_row_end_offsets = d_row_offsets + 1;
spmv_params.d_column_indices = d_column_indices;
spmv_params.d_vector_x = d_vector_x;
spmv_params.d_vector_y = d_vector_y;
spmv_params.num_rows = num_rows;
spmv_params.num_cols = num_cols;
spmv_params.num_nonzeros = num_nonzeros;
spmv_params.alpha = 1.0;
spmv_params.beta = 0.0;
return DispatchSpmv<ValueT, int>::Dispatch(
d_temp_storage,
temp_storage_bytes,
spmv_params,
stream,
debug_synchronous);
}
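    /**
     * \par Snippet
     * A minimal sketch of a <em>y</em> = <b>A</b><em>x</em> product for a CSR matrix;
     * the device arrays follow the parameter descriptions above (names illustrative).
     * \code
     * void   *d_temp_storage = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
     *     d_values, d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
     *     num_rows, num_cols, num_nonzeros);
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
     *     d_values, d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
     *     num_rows, num_cols, num_nonzeros);
     * \endcode
     */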
//@} end member group
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh view on Meta::CPAN
int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]-1</tt>.
OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel
int max_num_output_bins, ///< [in] Maximum number of output bins in any channel
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
DeviceHistogramInitKernelT histogram_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
DeviceHistogramSweepKernelT histogram_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
KernelConfig histogram_sweep_config, ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
#ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);
#else
cudaError error = cudaSuccess;
do
        {
// Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
int histogram_init_block_threads = 256;
int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
// Log DeviceHistogramInitKernel configuration
if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
// Invoke histogram_init_kernel
histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
num_output_bins_wrapper,
d_output_histograms_wrapper,
tile_queue);
// Return if empty problem
if ((blocks_per_row == 0) || (blocks_per_col == 0))
break;
// Log histogram_sweep_kernel configuration
if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
// Invoke histogram_sweep_kernel
histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
d_samples,
num_output_bins_wrapper,
num_privatized_bins_wrapper,
d_output_histograms_wrapper,
d_privatized_histograms_wrapper,
num_row_pixels,
num_rows,
row_stride_samples,
tiles_per_row,
tile_queue);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
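    /**
     * An illustrative note on \p debug_synchronous (a sketch, not original header
     * text): the flag is threaded down from the public entry points, so enabling it
     * there makes each kernel launch above log its configuration and synchronize.
     * Assumes \p d_samples (num_samples floats) and \p d_histogram (six ints) are
     * pre-allocated device arrays.
     * \code
     * void   *d_temp_storage = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
     *     d_samples, d_histogram, 7, 0.0f, 12.0f, num_samples,
     *     (cudaStream_t) 0, true);    // debug_synchronous = true
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
     *     d_samples, d_histogram, 7, 0.0f, 12.0f, num_samples,
     *     (cudaStream_t) 0, true);
     * \endcode
     */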
    /**
     * Dispatch routine for HistogramRange, specialized for sample types larger than 8-bit
     */
    CUB_RUNTIME_FUNCTION
    static cudaError_t DispatchRange(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]-1</tt>.
int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]-1</tt>.
LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
Int2Type<false> is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
num_output_levels,
output_decode_op,
max_num_output_bins,
num_row_pixels,
num_rows,
row_stride_samples,
DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
histogram_sweep_config,
stream,
debug_synchronous))) break;
}
else
{
// Dispatch shared-privatized approach
const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
if (CubDebug(error = PrivatizedDispatch(
d_temp_storage,
temp_storage_bytes,
d_samples,
num_output_levels,
output_decode_op,
max_num_output_bins,
num_row_pixels,
num_rows,
row_stride_samples,
DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
histogram_sweep_config,
stream,
debug_synchronous))) break;
}
} while (0);
return error;
}
/**
* Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
 */
    CUB_RUNTIME_FUNCTION
    static cudaError_t DispatchRange(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]-1</tt>.
int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]-1</tt>.
LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
Int2Type<true> is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
num_output_levels,
output_decode_op,
max_num_output_bins,
num_row_pixels,
num_rows,
row_stride_samples,
DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
histogram_sweep_config,
stream,
debug_synchronous))) break;
} while (0);
return error;
}
/**
* Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
*/
    CUB_RUNTIME_FUNCTION
    static cudaError_t DispatchEven(
        void*               d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]-1</tt>.
int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]-1</tt>.
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
Int2Type<false> is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
num_output_levels,
output_decode_op,
max_num_output_bins,
num_row_pixels,
num_rows,
row_stride_samples,
DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
histogram_sweep_config,
stream,
debug_synchronous))) break;
}
else
{
// Dispatch shared-privatized approach
const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
if (CubDebug(error = PrivatizedDispatch(
d_temp_storage,
temp_storage_bytes,
d_samples,
num_output_levels,
output_decode_op,
max_num_output_bins,
num_row_pixels,
num_rows,
row_stride_samples,
DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
histogram_sweep_config,
stream,
debug_synchronous))) break;
}
}
while (0);
return error;
}
/**
* Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
 */
    CUB_RUNTIME_FUNCTION
    static cudaError_t DispatchEven(
        void*               d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]-1</tt>.
int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]-1</tt>.
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
Int2Type<true> is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
num_output_levels,
output_decode_op,
max_num_output_bins,
num_row_pixels,
num_rows,
row_stride_samples,
DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
histogram_sweep_config,
stream,
debug_synchronous))) break;
}
while (0);
return error;
}
};
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
//------------------------------------------------------------------------------
void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<ValueT> &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
OffsetT num_items; ///< [in] Number of items to sort
int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison
int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison
cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
int ptx_version; ///< [in] PTX version
bool is_overwrite_okay; ///< [in] Whether it is okay to overwrite source buffers
//------------------------------------------------------------------------------
// Constructor
//------------------------------------------------------------------------------
/// Constructor
CUB_RUNTIME_FUNCTION __forceinline__
DispatchRadixSort(
void* d_temp_storage,
size_t &temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
OffsetT num_items,
int begin_bit,
int end_bit,
bool is_overwrite_okay,
cudaStream_t stream,
bool debug_synchronous,
int ptx_version)
:
d_temp_storage(d_temp_storage),
temp_storage_bytes(temp_storage_bytes),
d_keys(d_keys),
d_values(d_values),
num_items(num_items),
begin_bit(begin_bit),
end_bit(end_bit),
stream(stream),
debug_synchronous(debug_synchronous),
ptx_version(ptx_version),
is_overwrite_okay(is_overwrite_okay)
{}
//------------------------------------------------------------------------------
// Small-problem (single tile) invocation
//------------------------------------------------------------------------------
/// Invoke a single block to sort in-core
    template <typename ActivePolicyT, typename SingleTileKernelT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokeSingleTile(
        SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else
        cudaError error = cudaSuccess;
        do
        {
            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
{
temp_storage_bytes = 1;
break;
}
// Return if empty problem
if (num_items == 0)
break;
// Log single_tile_kernel configuration
if (debug_synchronous)
_CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);
// Invoke single_tile_kernel
single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
d_keys.Current(),
d_keys.Alternate(),
d_values.Current(),
d_values.Alternate(),
num_items,
begin_bit,
end_bit);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Update selector
d_keys.selector ^= 1;
d_values.selector ^= 1;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
    }
    /// Invoke a three-kernel sorting pass at the current bit.
    template <typename PassConfigT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePass(
        const KeyT      *d_keys_in,
        KeyT            *d_keys_out,
        const ValueT    *d_values_in,
        ValueT          *d_values_out,
        OffsetT         *d_spine,
int spine_length,
int ¤t_bit,
PassConfigT &pass_config)
{
cudaError error = cudaSuccess;
do
{
int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
// Log upsweep_kernel configuration
if (debug_synchronous)
_CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);
// Invoke upsweep_kernel with same grid size as downsweep_kernel
pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
d_keys_in,
d_spine,
num_items,
current_bit,
pass_bits,
pass_config.even_share);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Log scan_kernel configuration
if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);
// Invoke scan_kernel
pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
d_spine,
spine_length);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Log downsweep_kernel configuration
if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);
// Invoke downsweep_kernel
pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
d_keys_in,
d_keys_out,
d_values_in,
d_values_out,
d_spine,
num_items,
current_bit,
pass_bits,
pass_config.even_share);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Update current bit
current_bit += pass_bits;
}
while (0);
return error;
}
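    /**
     * An illustrative sketch of how InvokePass is driven (a simplification, not the
     * original caller): the <tt>[begin_bit, end_bit)</tt> range is consumed in
     * \p radix_bits-sized chunks, ping-ponging between key/value buffers.
     * \code
     * int current_bit = begin_bit;
     * while (current_bit < end_bit)
     * {
     *     // Each pass sorts on CUB_MIN(radix_bits, end_bit - current_bit) bits
     *     // and advances current_bit by the number of bits actually sorted
     *     InvokePass(d_keys_in, d_keys_out, d_values_in, d_values_out,
     *                d_spine, spine_length, current_bit, pass_config);
     *     // ... then swap the input/output buffer roles for the next pass ...
     * }
     * \endcode
     */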
    /// Internal dispatch routine
    CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
OffsetT num_items, ///< [in] Number of items to sort
int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison
int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison
bool is_overwrite_okay, ///< [in] Whether it is okay to overwrite source buffers
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
cudaError_t error;
do {
// Get PTX version
int ptx_version;
if (CubDebug(error = PtxVersion(ptx_version))) break;
// Create dispatch functor
DispatchRadixSort dispatch(
d_temp_storage, temp_storage_bytes,
d_keys, d_values,
num_items, begin_bit, end_bit, is_overwrite_okay,
stream, debug_synchronous, ptx_version);
// Dispatch to chained policy
if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
} while (0);
return error;
}
};
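/**
 * An illustrative note (a sketch, not original header text): user code reaches this
 * dispatch layer through the public wrappers, e.g. cub::DeviceRadixSort::SortPairs,
 * which forwards its DoubleBuffer arguments to DispatchRadixSort::Dispatch.
 * \code
 * cub::DoubleBuffer<int>   d_keys(d_key_buf, d_key_alt_buf);
 * cub::DoubleBuffer<float> d_values(d_value_buf, d_value_alt_buf);
 * void   *d_temp_storage = NULL;
 * size_t temp_storage_bytes = 0;
 * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
 *     d_keys, d_values, num_items);
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
 *     d_keys, d_values, num_items);
 * \endcode
 */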
    //------------------------------------------------------------------------------
    // Parameter members (struct DispatchSegmentedRadixSort)
    //------------------------------------------------------------------------------
    void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<ValueT> &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
OffsetT num_items; ///< [in] Number of items to sort
OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data
const OffsetT *d_begin_offsets; ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
const OffsetT *d_end_offsets; ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>. If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison
int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison
cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
int ptx_version; ///< [in] PTX version
bool is_overwrite_okay; ///< [in] Whether it is okay to overwrite source buffers
//------------------------------------------------------------------------------
// Constructors
//------------------------------------------------------------------------------
/// Constructor
CUB_RUNTIME_FUNCTION __forceinline__
    DispatchSegmentedRadixSort(
        void*                   d_temp_storage,
        size_t                  &temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
OffsetT num_items,
OffsetT num_segments,
const OffsetT *d_begin_offsets,
const OffsetT *d_end_offsets,
int begin_bit,
int end_bit,
bool is_overwrite_okay,
cudaStream_t stream,
bool debug_synchronous,
int ptx_version)
:
d_temp_storage(d_temp_storage),
temp_storage_bytes(temp_storage_bytes),
d_keys(d_keys),
d_values(d_values),
num_items(num_items),
num_segments(num_segments),
d_begin_offsets(d_begin_offsets),
d_end_offsets(d_end_offsets),
begin_bit(begin_bit),
end_bit(end_bit),
is_overwrite_okay(is_overwrite_okay),
stream(stream),
debug_synchronous(debug_synchronous),
ptx_version(ptx_version)
{}
//------------------------------------------------------------------------------
// Multi-segment invocation
//------------------------------------------------------------------------------
/// Invoke a three-kernel sorting pass at the current bit.
template <typename PassConfigT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePass(
        const KeyT      *d_keys_in,
        KeyT            *d_keys_out,
        const ValueT    *d_values_in,
ValueT *d_values_out,
int ¤t_bit,
PassConfigT &pass_config)
{
cudaError error = cudaSuccess;
do
{
int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
// Log kernel configuration
if (debug_synchronous)
_CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
num_segments, pass_config.segmented_config.block_threads, (long long) stream,
pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);
pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
d_keys_in, d_keys_out,
d_values_in, d_values_out,
d_begin_offsets, d_end_offsets, num_segments,
current_bit, pass_bits);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Update current bit
current_bit += pass_bits;
}
while (0);
return error;
}
    /// Internal dispatch routine
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
DoubleBuffer<KeyT> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
int num_items, ///< [in] Number of items to sort
int num_segments, ///< [in] The number of segments that comprise the sorting data
const int *d_begin_offsets, ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
const int *d_end_offsets, ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>. If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison
int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison
bool is_overwrite_okay, ///< [in] Whether it is okay to overwrite source buffers
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
cudaError_t error;
do {
// Get PTX version
int ptx_version;
if (CubDebug(error = PtxVersion(ptx_version))) break;
// Create dispatch functor
DispatchSegmentedRadixSort dispatch(
d_temp_storage, temp_storage_bytes,
d_keys, d_values,
num_items, num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit, is_overwrite_okay,
stream, debug_synchronous, ptx_version);
// Dispatch to chained policy
if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
} while (0);
return error;
}
};