Alien-XGBoost


xgboost/R-package/NAMESPACE


S3method("[",xgb.DMatrix)
S3method("dimnames<-",xgb.DMatrix)
S3method(dim,xgb.DMatrix)
S3method(dimnames,xgb.DMatrix)
S3method(getinfo,xgb.DMatrix)
S3method(predict,xgb.Booster)
S3method(predict,xgb.Booster.handle)
S3method(print,xgb.Booster)
S3method(print,xgb.DMatrix)
S3method(print,xgb.cv.synchronous)
S3method(setinfo,xgb.DMatrix)
S3method(slice,xgb.DMatrix)
export("xgb.attr<-")
export("xgb.attributes<-")
export("xgb.parameters<-")
export(cb.cv.predict)
export(cb.early.stop)
export(cb.evaluation.log)
export(cb.print.evaluation)
export(cb.reset.parameters)

xgboost/R-package/R/xgb.cv.R

#' 
#' Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model, and the remaining \code{nfold - 1} subsamples are used as training data. 
#' 
#' The cross-validation process is then repeated \code{nrounds} times, with each of the \code{nfold} subsamples used exactly once as the validation data.
#' 
#' All observations are used for both training and validation.
#' 
#' Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
#'
#' @return 
#' An object of class \code{xgb.cv.synchronous} with the following elements:
#' \itemize{
#'   \item \code{call} a function call.
#'   \item \code{params} parameters that were passed to the xgboost library. Note that it does not 
#'         capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
#'   \item \code{callbacks} callback functions that were either automatically assigned or 
#'         explicitly passed.
#'   \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
#'         first column corresponding to iteration number and the rest corresponding to the 
#'         CV-based evaluation means and standard deviations for the training and test CV-sets.
#'         It is created by the \code{\link{cb.evaluation.log}} callback.
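
To make the fold mechanics described above concrete, here is a small standalone C++ sketch (hypothetical, not part of the package sources, which assign folds by sampling rather than round-robin) of the invariant behind nfold cross-validation: every row is held out in exactly one fold, so each fold serves as validation data once while the remaining nfold - 1 folds train.

#include <cstdio>

int main()
{
    const int n = 10, nfold = 5;
    for (int fold = 0; fold < nfold; ++fold) {
        printf("fold %d holds out rows:", fold);
        for (int i = 0; i < n; ++i)
            if (i % nfold == fold)      // row i validates in exactly one fold
                printf(" %d", i);
        printf("\n");
    }
    return 0;
}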

xgboost/R-package/R/xgb.cv.R

  basket <- list()

  # extract parameters that can affect the relationship b/w #trees and #iterations
  num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
  num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)

  # these are fixed for CV (no training continuation)
  begin_iteration <- 1
  end_iteration <- nrounds
  
  # synchronous CV boosting: run CV folds' models within each iteration
  for (iteration in begin_iteration:end_iteration) {
    
    for (f in cb$pre_iter) f()
    
    msg <- lapply(bst_folds, function(fd) {
      xgb.iter.update(fd$bst, fd$dtrain, iteration - 1, obj)
      xgb.iter.eval(fd$bst, fd$watchlist, iteration - 1, feval)
    })
    msg <- simplify2array(msg)
    bst_evaluation <- rowMeans(msg)

xgboost/R-package/R/xgb.cv.R

  ret <- list(
    call = match.call(),
    params = params,
    callbacks = callbacks,
    evaluation_log = evaluation_log,
    niter = end_iteration,
    folds = folds
  )
  ret <- c(ret, basket)

  class(ret) <- 'xgb.cv.synchronous'
  invisible(ret)
}



#' Print xgb.cv result
#' 
#' Prints formatted results of \code{xgb.cv}.
#' 
#' @param x an \code{xgb.cv.synchronous} object
#' @param verbose whether to print detailed data
#' @param ... passed to \code{data.table.print}
#' 
#' @details
#' When not verbose, only the evaluation results are printed, 
#' including the best iteration (when available).
#' 
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2,
#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' print(cv)
#' print(cv, verbose=TRUE)
#' 
#' @rdname print.xgb.cv
#' @method print xgb.cv.synchronous
#' @export
print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) {
  cat('##### xgb.cv ', length(x$folds), '-folds\n', sep = '')
  
  if (verbose) {
    if (!is.null(x$call)) {
      cat('call:\n  ')
      print(x$call)
    }
    if (!is.null(x$params)) {
      cat('params (as set within xgb.cv):\n')
      cat( '  ', 

xgboost/R-package/man/print.xgb.cv.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.cv.R
\name{print.xgb.cv.synchronous}
\alias{print.xgb.cv.synchronous}
\title{Print xgb.cv result}
\usage{
\method{print}{xgb.cv.synchronous}(x, verbose = FALSE, ...)
}
\arguments{
\item{x}{an \code{xgb.cv.synchronous} object}

\item{verbose}{whether to print detailed data}

\item{...}{passed to \code{data.table.print}}
}
\description{
Prints formatted results of \code{xgb.cv}.
}
\details{
When not verbose, only the evaluation results are printed, 

xgboost/R-package/man/xgb.cv.Rd

This parameter is passed to the \code{\link{cb.early.stop}} callback.}

\item{callbacks}{a list of callback functions to perform various tasks during boosting.
See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the 
parameters' values. Users can provide either existing or their own callback methods in order 
to customize the training process.}

\item{...}{other parameters to pass to \code{params}.}
}
\value{
An object of class \code{xgb.cv.synchronous} with the following elements:
\itemize{
  \item \code{call} a function call.
  \item \code{params} parameters that were passed to the xgboost library. Note that it does not 
        capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
  \item \code{callbacks} callback functions that were either automatically assigned or 
        explicitly passed.
  \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
        first column corresponding to iteration number and the rest corresponding to the 
        CV-based evaluation means and standard deviations for the training and test CV-sets.
        It is created by the \code{\link{cb.evaluation.log}} callback.

xgboost/R-package/tests/testthat/test_basic.R

})


test_that("xgb.cv works", {
  set.seed(11)
  expect_output(
    cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5,
                 eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
                 verbose=TRUE)
  , "train-error:")
  expect_is(cv, 'xgb.cv.synchronous')
  expect_false(is.null(cv$evaluation_log))
  expect_lt(cv$evaluation_log[, min(test_error_mean)], 0.03)
  expect_lt(cv$evaluation_log[, min(test_error_std)], 0.004)
  expect_equal(cv$niter, 2)
  expect_true(!is.null(cv$folds) && is.list(cv$folds))
  expect_length(cv$folds, 5)
  expect_true(!is.null(cv$params) && is.list(cv$params))
  expect_false(is.null(cv$callbacks))
  expect_false(is.null(cv$call))
})

xgboost/cub/CHANGE_LOG.TXT

        - Added descending sorting to DeviceRadixSort and BlockRadixSort
        - Added min, max, arg-min, and arg-max to DeviceReduce
        - Added DeviceSelect (select-unique, select-if, and select-flagged)
        - Added DevicePartition (partition-if, partition-flagged)
        - Added generic cub::ShuffleUp(), cub::ShuffleDown(), and cub::ShuffleIndex() for warp-wide communication of arbitrary data types (SM3x+)
        - Added cub::MaxSmOccupancy() for accurately determining SM occupancy for any given kernel function pointer
    - Performance
        - Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
    - Interface changes:
        - Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces.  The CacheModifiedInputIterator and CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
        - Renamed the device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
    - Documentation and testing:
        - Added simple examples of device-wide methods
        - Improved doxygen documentation and example snippets
        - Improved test coverage to include up to 21,000 kernel variants and 851,000 unit tests (per architecture, per platform)
    - Bug fixes 
        - Fixed misc DeviceScan, BlockScan, DeviceReduce, and BlockReduce bugs when operating on non-primitive types for older architectures SM10-SM13
        - Fixed DeviceScan / WarpReduction bug: SHFL-based segmented reduction producing incorrect results for multi-word types (size > 4B) on Linux 
        - Fixed BlockScan bug: For warpscan-based scans, not all threads in the first warp were entering the prefix callback functor
        - Fixed DeviceRadixSort bug: race condition with key-value pairs for pre-SM35 architectures
        - Fixed DeviceRadixSort bug: incorrect bitfield-extract behavior with long keys on 64bit Linux

xgboost/cub/cub/agent/agent_rle.cuh

    typedef LengthOffsetPair WarpAggregates[WARPS];

    // Shared memory type for this threadblock
    struct _TempStorage
    {
        union
        {
            struct
            {
                typename BlockDiscontinuityT::TempStorage       discontinuity;              // Smem needed for discontinuity detection
                typename WarpScanPairs::TempStorage             warp_scan[WARPS];           // Smem needed for warp-synchronous scans
                Uninitialized<LengthOffsetPair[WARPS]>          warp_aggregates;            // Smem needed for sharing warp-wide aggregates
                typename TilePrefixCallbackOpT::TempStorage     prefix;                     // Smem needed for cooperative prefix callback
            };

            // Smem needed for input loading
            typename BlockLoadT::TempStorage                    load;

            // Smem needed for two-phase scatter
            union
            {
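
The union inside _TempStorage above is how CUB time-shares one shared-memory allocation between phases that never overlap: the block's footprint becomes the maximum of the members rather than their sum. A minimal standalone sketch of the same idiom (illustrative names and sizes, not the actual agent code):

// Assumes a launch with 256 threads per block.
struct TempStorage
{
    union
    {
        int load_buffer[256];   // live during the loading phase
        int scan_buffer[128];   // live during the scan phase; reuses the same bytes
    };
};

__global__ void phased_kernel()
{
    __shared__ TempStorage smem;

    smem.load_buffer[threadIdx.x] = threadIdx.x;   // phase 1: load
    __syncthreads();                               // phase boundary
    if (threadIdx.x < 128)
        smem.scan_buffer[threadIdx.x] = 0;         // phase 2: aliases phase-1 bytes
}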

xgboost/cub/cub/block/block_exchange.cuh

/// CUB namespace
namespace cub {

/**
 * \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
 * \ingroup BlockModule
 *
 * \tparam T                    The data type to be exchanged.
 * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
 * \tparam ITEMS_PER_THREAD     The number of items partitioned onto each thread.
 * \tparam WARP_TIME_SLICING    <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds.  Yields a smaller memory footprint at the ex...
 * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
 * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
 * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
 *
 * \par Overview
 * - It is commonplace for blocks of threads to rearrange data items between
 *   threads.  For example, the device-accessible memory subsystem prefers access patterns
 *   where data items are "striped" across threads (where consecutive threads access consecutive items),
 *   yet most block-wide operations prefer a "blocked" partitioning of items across threads
 *   (where consecutive items belong to a single thread).
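
A minimal usage sketch of that striped-to-blocked conversion (the kernel name and launch shape are illustrative; a 128-thread block with 4 items per thread is assumed):

#include <cub/cub.cuh>

__global__ void striped_to_blocked(int *d_data)
{
    typedef cub::BlockExchange<int, 128, 4> BlockExchange;
    __shared__ typename BlockExchange::TempStorage temp_storage;

    // Striped load: consecutive threads read consecutive items
    int items[4];
    cub::LoadDirectStriped<128>(threadIdx.x, d_data, items);

    // Rearrange so each thread ends up holding 4 consecutive items
    BlockExchange(temp_storage).StripedToBlocked(items);

    // Blocked store: each thread writes its 4 consecutive items
    cub::StoreDirectBlocked(threadIdx.x, d_data, items);
}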

xgboost/cub/cub/block/block_raking_layout.cuh

{
    //---------------------------------------------------------------------
    // Constants and type definitions
    //---------------------------------------------------------------------

    enum
    {
        /// The total number of elements that need to be cooperatively reduced
        SHARED_ELEMENTS = BLOCK_THREADS,

        /// Maximum number of warp-synchronous raking threads
        MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),

        /// Number of raking elements per warp-synchronous raking thread (rounded up)
        SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,

        /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
        RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,

        /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
        HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),

        /// Degree of bank conflicts (e.g., 4-way)
        CONFLICT_DEGREE = (HAS_CONFLICTS) ?
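
Plugging the 62-thread example from the comments into these formulas (a standalone host-side check, assuming 32-thread warps):

#include <cstdio>

int main()
{
    const int BLOCK_THREADS      = 62;
    const int WARP_THREADS       = 32;
    const int SHARED_ELEMENTS    = BLOCK_THREADS;
    const int MAX_RAKING_THREADS = SHARED_ELEMENTS < WARP_THREADS ? SHARED_ELEMENTS : WARP_THREADS;
    const int SEGMENT_LENGTH     = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS;
    const int RAKING_THREADS     = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH;

    // Prints SEGMENT_LENGTH=2 RAKING_THREADS=31, matching the comment above
    printf("SEGMENT_LENGTH=%d RAKING_THREADS=%d\n", SEGMENT_LENGTH, RAKING_THREADS);
    return 0;
}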

xgboost/cub/cub/block/block_reduce.cuh

     * An efficient "raking" reduction algorithm that only supports commutative
     * reduction operators (true for most operations, e.g., addition).
     *
     * \par
     * Execution is comprised of three phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more
     *    than one input each).  Threads in warps other than the first warp place
     *    their partial reductions into shared memory.
     * -# Upsweep sequential reduction in shared memory.  Threads within the first
     *    warp continue to accumulate by raking across segments of shared partial reductions
     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
     *
     * \par
     * \image html block_reduce.png
     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
     *
     * \par Performance Considerations
     * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE
     *   and is preferable when the reduction operator is commutative.  This variant
     *   applies fewer reduction operators  than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be

xgboost/cub/cub/block/block_reduce.cuh

     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
     * operators. \blocked.
     *
     * \par
     * Execution is comprised of three phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more
     *    than one input each).  Each thread then places the partial reduction
     *    of its item(s) into shared memory.
     * -# Upsweep sequential reduction in shared memory.  Threads within a
     *    single warp rake across segments of shared partial reductions.
     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
     *
     * \par
     * \image html block_reduce.png
     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
     *
     * \par Performance Considerations
     * - This variant performs more communication than BLOCK_REDUCE_RAKING
     *   and is only preferable when the reduction operator is non-commutative.  This variant
     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be

xgboost/cub/cub/block/block_reduce.cuh

     * \par Overview
     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
     * operators.
     *
     * \par
     * Execution is comprised of three phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more
     *    than one input each).  Each thread then places the partial reduction
     *    of its item(s) into shared memory.
     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
     *    reduction within each warp.
     * -# A propagation phase where the warp reduction outputs in each warp are
     *    updated with the aggregate from each preceding warp.
     *
     * \par
     * \image html block_scan_warpscans.png
     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
     *
     * \par Performance Considerations
     * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING

xgboost/cub/cub/block/block_reduce.cuh

 * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
 *   uses a binary combining operator to compute a single aggregate from a list of input elements.
 * - \rowmajor
 * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
 *   -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>.  An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
 *   -# <b>cub::BLOCK_REDUCE_RAKING</b>.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
 *   -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
 *
 * \par Performance Considerations
 * - \granularity
 * - Very efficient (only one synchronization barrier).
 * - Incurs zero bank conflicts for most types
 * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
 *   - Summation (<b><em>vs.</em></b> generic reduction)
 *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
 *   - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
 * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
 *
 * \par A Simple Example
 * \blockcollective{BlockReduce}
 * \par
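
The overview above points at a simple example; a sketch in that spirit, summing one item per thread across a 128-thread block (kernel and buffer names are illustrative):

#include <cub/cub.cuh>

__global__ void block_sum(const int *d_in, int *d_out)
{
    // Default algorithm is BLOCK_REDUCE_WARP_REDUCTIONS; pass another
    // BlockReduceAlgorithm as the third template argument to switch.
    typedef cub::BlockReduce<int, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    int thread_data = d_in[blockIdx.x * blockDim.x + threadIdx.x];

    // Block-wide sum; the returned aggregate is only valid in thread0
    int aggregate = BlockReduce(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = aggregate;
}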

xgboost/cub/cub/block/block_scan.cuh

 * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
 */
enum BlockScanAlgorithm
{

    /**
     * \par Overview
     * An efficient "raking reduce-then-scan" prefix scan algorithm.  Execution is comprised of five phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
     * -# Upsweep sequential reduction in shared memory.  Threads within a single warp rake across segments of shared partial reductions.
     * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
     * -# Downsweep sequential exclusive scan in shared memory.  Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output.
     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
     *
     * \par
     * \image html block_scan_raking.png
     * <div class="centercaption">\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
     *
     * \par Performance Considerations
     * - Although this variant may suffer longer turnaround latencies when the
     *   GPU is under-occupied, it can often provide higher overall throughput
     *   across the GPU when suitably occupied.
     */
    BLOCK_SCAN_RAKING,


    /**
     * \par Overview
     * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at
     * the expense of higher register pressure.  Raking threads preserve their
     * "upsweep" segment of values in registers while performing warp-synchronous
     * scan, allowing the "downsweep" not to re-read them from shared memory.
     */
    BLOCK_SCAN_RAKING_MEMOIZE,


    /**
     * \par Overview
     * A quick "tiled warpscans" prefix scan algorithm.  Execution is comprised of four phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
     * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp.
     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
     *
     * \par
     * \image html block_scan_warpscans.png
     * <div class="centercaption">\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
     *
     * \par Performance Considerations
     * - Although this variant may suffer lower overall throughput across the
     *   GPU due to a heavy reliance on inefficient warpscans, it can

xgboost/cub/cub/block/block_scan.cuh

 *   the <em>i</em><sup>th</sup> output reduction.
 * - \rowmajor
 * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
 *   -# <b>cub::BLOCK_SCAN_RAKING</b>.  An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
 *   -# <b>cub::BLOCK_SCAN_RAKING_MEMOIZE</b>.  Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
 *   -# <b>cub::BLOCK_SCAN_WARP_SCANS</b>.  A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
 *
 * \par Performance Considerations
 * - \granularity
 * - Uses special instructions when applicable (e.g., warp \p SHFL)
 * - Uses synchronization-free communication between warp lanes when applicable
 * - Invokes a minimal number of block-wide synchronization barriers (only
 *   one or two depending on algorithm selection)
 * - Incurs zero bank conflicts for most types
 * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
 *   - Prefix sum variants (<b><em>vs.</em></b> generic scan)
 *   - \blocksize
 * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
 *
 * \par A Simple Example
 * \blockcollective{BlockScan}
 * \par
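
A sketch of the simple example referenced above: an exclusive prefix sum over one item per thread in a 128-thread block, specialized here to BLOCK_SCAN_RAKING (names are illustrative):

#include <cub/cub.cuh>

__global__ void block_prefix_sum(int *d_data)
{
    typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int x = d_data[threadIdx.x];
    BlockScan(temp_storage).ExclusiveSum(x, x);   // output may alias input
    d_data[threadIdx.x] = x;
}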

xgboost/cub/cub/block/specializations/block_reduce_raking.cuh


    ///  WarpReduce utility type
    typedef typename WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce;

    /// Constants
    enum
    {
        /// Number of raking threads
        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,

        /// Number of raking elements per warp synchronous raking thread
        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,

        /// Cooperative work can be entirely warp synchronous
        WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),

        /// Whether or not warp-synchronous reduction should be unguarded (i.e., the number of warp-reduction elements is a power of two)
        WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE,

        /// Whether or not accesses into smem are unguarded
        RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED,

    };


    /// Shared memory storage layout type
    union _TempStorage
    {
        typename WarpReduce::TempStorage            warp_storage;        ///< Storage for warp-synchronous reduction
        typename BlockRakingLayout::TempStorage     raking_grid;         ///< Padded threadblock raking grid
    };


    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    // Thread fields
    _TempStorage &temp_storage;

xgboost/cub/cub/block/specializations/block_reduce_raking.cuh

    template <
        bool                IS_FULL_TILE,
        typename            ReductionOp>
    __device__ __forceinline__ T Reduce(
        T                   partial,            ///< [in] Calling thread's input partial reductions
        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
            partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE, SEGMENT_LENGTH>(
                partial,
                num_valid,
                reduction_op);
        }
        else
        {
            // Place partial into shared memory grid.
            *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;

xgboost/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh


        /// Whether or not to use fall-back
        USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)),

        /// Number of raking threads
        RAKING_THREADS = WARP_THREADS,

        /// Number of threads actually sharing items with the raking threads
        SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),

        /// Number of raking elements per warp synchronous raking thread
        SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
    };

    ///  WarpReduce utility type
    typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce;

    /// Layout type for padded thread block raking grid
    typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout;

    /// Shared memory storage layout type
    struct _TempStorage
    {
        union
        {
            struct
            {
                typename WarpReduce::TempStorage        warp_storage;        ///< Storage for warp-synchronous reduction
                typename BlockRakingLayout::TempStorage raking_grid;         ///< Padded threadblock raking grid
            };
            typename FallBack::TempStorage              fallback_storage;    ///< Fall-back storage for non-commutative block reduction
        };
    };


    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};

xgboost/cub/cub/block/specializations/block_reduce_warp_reductions.cuh

    };


    ///  WarpReduce utility type
    typedef typename WarpReduce<T, LOGICAL_WARP_SIZE, PTX_ARCH>::InternalWarpReduce WarpReduce;


    /// Shared memory storage layout type
    struct _TempStorage
    {
        typename WarpReduce::TempStorage    warp_reduce[WARPS];         ///< Buffer for warp-synchronous reduction
        T                                   warp_aggregates[WARPS];     ///< Shared totals from each warp-synchronous reduction
        T                                   block_prefix;               ///< Shared prefix for the entire threadblock
    };

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    // Thread fields
    _TempStorage &temp_storage;
    unsigned int linear_tid;

xgboost/cub/cub/block/specializations/block_scan_raking.cuh


    /// Layout type for padded threadblock raking grid
    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;

    /// Constants
    enum
    {
        /// Number of raking threads
        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,

        /// Number of raking elements per warp synchronous raking thread
        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,

        /// Cooperative work can be entirely warp synchronous
        WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
    };

    ///  WarpScan utility type
    typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;

    /// Shared memory storage layout type
    struct _TempStorage
    {
        typename WarpScan::TempStorage              warp_scan;          ///< Buffer for warp-synchronous scan
        typename BlockRakingLayout::TempStorage     raking_grid;        ///< Padded threadblock raking grid
        T                                           block_aggregate;    ///< Block aggregate
    };


    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------

xgboost/cub/cub/block/specializations/block_scan_raking.cuh


    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T               input,                          ///< [in] Calling thread's input item
        T               &exclusive_output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
        ScanOp          scan_op)                        ///< [in] Binary scan operator
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory
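
When WARP_SYNCHRONOUS holds, the block-wide scan above collapses into a single cub::WarpScan call. For reference, a standalone WarpScan usage sketch (illustrative kernel, assuming a 128-thread launch, i.e., 4 logical warps of 32 lanes):

#include <climits>
#include <cub/cub.cuh>

__global__ void warp_prefix_max(int *d_data)
{
    typedef cub::WarpScan<int> WarpScan;   // 32-lane logical warp
    __shared__ typename WarpScan::TempStorage temp_storage[4];

    int warp_id = threadIdx.x / 32;
    int x = d_data[threadIdx.x];

    // Exclusive running maximum within each warp, seeded with INT_MIN
    WarpScan(temp_storage[warp_id]).ExclusiveScan(x, x, INT_MIN, cub::Max());
    d_data[threadIdx.x] = x;
}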

xgboost/cub/cub/block/specializations/block_scan_raking.cuh

    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T               input,              ///< [in] Calling thread's input items
        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
        ScanOp          scan_op)            ///< [in] Binary scan operator
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Exclusive Warp-synchronous scan
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op);

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, exclusive_partial);
            }

            CTA_SYNC();

            // Grab exclusive partial from shared memory

xgboost/cub/cub/block/specializations/block_scan_raking.cuh

    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial v...
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T               input,                          ///< [in] Calling thread's input item
        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
        ScanOp          scan_op,                        ///< [in] Binary scan operator
        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T inclusive_partial;
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));

                // Broadcast aggregate to all threads
                if (linear_tid == RAKING_THREADS - 1)
                    temp_storage.block_aggregate = inclusive_partial;

xgboost/cub/cub/block/specializations/block_scan_raking.cuh

    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T               input,              ///< [in] Calling thread's input items
        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
        ScanOp          scan_op,            ///< [in] Binary scan operator
        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate);

                // Exclusive raking downsweep scan
                ExclusiveDownsweep(scan_op, exclusive_partial);

                // Broadcast aggregate to other threads
                if (linear_tid == 0)
                    temp_storage.block_aggregate = block_aggregate;
            }

xgboost/cub/cub/block/specializations/block_scan_raking.cuh

        typename ScanOp,
        typename BlockPrefixCallbackOp>
    __device__ __forceinline__ void ExclusiveScan(
        T                       input,                          ///< [in] Calling thread's input item
        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
        ScanOp                  scan_op,                        ///< [in] Binary scan operator
        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            T block_aggregate;
            WarpScan warp_scan(temp_storage.warp_scan);
            warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate);

            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
            T block_prefix = block_prefix_callback_op(block_aggregate);
            block_prefix = warp_scan.Broadcast(block_prefix, 0);

            output = scan_op(block_prefix, output);
            if (linear_tid == 0)

xgboost/cub/cub/block/specializations/block_scan_raking.cuh

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                WarpScan warp_scan(temp_storage.warp_scan);

                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T exclusive_partial, block_aggregate;
                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);

                // Obtain block-wide prefix in lane0, then broadcast to other lanes
                T block_prefix = block_prefix_callback_op(block_aggregate);
                block_prefix = warp_scan.Broadcast(block_prefix, 0);

                // Update prefix with warpscan exclusive partial
                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
                if (linear_tid == 0)

xgboost/cub/cub/block/specializations/block_scan_raking.cuh


    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveScan(
        T               input,                          ///< [in] Calling thread's input item
        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
        ScanOp          scan_op)                        ///< [in] Binary scan operator
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Exclusive Warp-synchronous scan
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);

                // Inclusive raking downsweep scan
                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
            }

            CTA_SYNC();

            // Grab thread prefix from shared memory

xgboost/cub/cub/block/specializations/block_scan_raking.cuh

    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveScan(
        T               input,                          ///< [in] Calling thread's input item
        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
        ScanOp          scan_op,                        ///< [in] Binary scan operator
        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
        }
        else
        {
            // Place thread partial into shared memory raking grid
            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
            *placement_ptr = input;

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T inclusive_partial;
                T exclusive_partial;
                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);

                // Inclusive raking downsweep scan
                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));

                // Broadcast aggregate to all threads
                if (linear_tid == RAKING_THREADS - 1)
                    temp_storage.block_aggregate = inclusive_partial;

xgboost/cub/cub/block/specializations/block_scan_raking.cuh

        typename ScanOp,
        typename BlockPrefixCallbackOp>
    __device__ __forceinline__ void InclusiveScan(
        T                       input,                          ///< [in] Calling thread's input item
        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
        ScanOp                  scan_op,                        ///< [in] Binary scan operator
        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
    {
        if (WARP_SYNCHRONOUS)
        {
            // Short-circuit directly to warp-synchronous scan
            T block_aggregate;
            WarpScan warp_scan(temp_storage.warp_scan);
            warp_scan.InclusiveScan(input, output, scan_op, block_aggregate);

            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
            T block_prefix = block_prefix_callback_op(block_aggregate);
            block_prefix = warp_scan.Broadcast(block_prefix, 0);

            // Update prefix with exclusive warpscan partial
            output = scan_op(block_prefix, output);

xgboost/cub/cub/block/specializations/block_scan_raking.cuh

            CTA_SYNC();

            // Reduce parallelism down to just raking threads
            if (linear_tid < RAKING_THREADS)
            {
                WarpScan warp_scan(temp_storage.warp_scan);

                // Raking upsweep reduction across shared partials
                T upsweep_partial = Upsweep(scan_op);

                // Warp-synchronous scan
                T exclusive_partial, block_aggregate;
                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);

                // Obtain block-wide prefix in lane0, then broadcast to other lanes
                T block_prefix = block_prefix_callback_op(block_aggregate);
                block_prefix = warp_scan.Broadcast(block_prefix, 0);

                // Update prefix with warpscan exclusive partial
                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
                if (linear_tid == 0)

xgboost/cub/cub/block/specializations/block_scan_warp_scans.cuh

    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;

    ///  WarpScan utility type
    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScan;

    /// Shared memory storage layout type

    struct __align__(32) _TempStorage
    {
        T                               warp_aggregates[WARPS];
        typename WarpScanT::TempStorage warp_scan[WARPS];           ///< Buffer for warp-synchronous scans
        T                               block_prefix;               ///< Shared prefix for the entire threadblock
    };


    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields

xgboost/cub/cub/block/specializations/block_scan_warp_scans2.cuh


    ///  WarpScan utility type
    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;

    ///  WarpScan utility type
    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScanT;

    /// Shared memory storage layout type
    struct _TempStorage
    {
        typename WarpAggregateScanT::TempStorage    inner_scan[WARPS];          ///< Buffer for warp-synchronous scan of warp aggregates
        typename WarpScanT::TempStorage             warp_scan[WARPS];           ///< Buffer for warp-synchronous scans
        T                                           warp_aggregates[WARPS];
        T                                           block_prefix;               ///< Shared prefix for the entire threadblock
    };


    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------

xgboost/cub/cub/block/specializations/block_scan_warp_scans3.cuh

    typedef WarpScan<T, INNER_WARP_THREADS, PTX_ARCH> InnerWarpScanT;

    typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS];


    /// Shared memory storage layout type
    struct _TempStorage
    {
        union
        {
            Uninitialized<OuterScanArray>           outer_warp_scan;  ///< Buffer for warp-synchronous outer scans
            typename InnerWarpScanT::TempStorage    inner_warp_scan;  ///< Buffer for warp-synchronous inner scan
        };
        T                               warp_aggregates[OUTER_WARPS];
        T                               block_aggregate;                           ///< Shared aggregate for the entire threadblock
    };


    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


xgboost/cub/cub/device/device_histogram.cuh

    static cudaError_t HistogramEven(
        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
        OffsetT             num_samples,                                ///< [in] The number of input samples (i.e., the length of \p d_samples)
        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        /// The sample value type of the input iterator
        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

        CounterT*           d_histogram1[1]     = {d_histogram};
        int                 num_levels1[1]      = {num_levels};
        LevelT              lower_level1[1]     = {lower_level};
        LevelT              upper_level1[1]     = {upper_level};

        return MultiHistogramEven<1, 1>(

xgboost/cub/cub/device/device_histogram.cuh

            temp_storage_bytes,
            d_samples,
            d_histogram1,
            num_levels1,
            lower_level1,
            upper_level1,
            num_samples,
            1,
            sizeof(SampleT) * num_samples,
            stream,
            debug_synchronous);
    }
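
As the d_temp_storage comment above notes, device-wide calls follow a two-phase convention: a first call with a NULL workspace only sizes the allocation, and a second call does the work. A usage sketch (illustrative function and sizes; 7 levels give 6 evenly-spaced bins over [0, 12)):

#include <cub/cub.cuh>

// d_samples: device array of num_samples floats
// d_histogram: device array of 6 ints (num_levels - 1 bins)
void histogram_even_6bins(float *d_samples, int *d_histogram, int num_samples)
{
    void*  d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;

    // Phase 1: size the temporary workspace (no work is done)
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, 7, 0.0f, 12.0f, num_samples);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Phase 2: compute the histogram
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, 7, 0.0f, 12.0f, num_samples);

    cudaFree(d_temp_storage);
}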


    /**
     * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
     *
     * \par
     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
     * - The row stride must be a whole multiple of the sample data type

xgboost/cub/cub/device/device_histogram.cuh

        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
        OffsetT             num_row_samples,                            ///< [in] The number of data samples per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        CounterT*           d_histogram1[1]     = {d_histogram};
        int                 num_levels1[1]      = {num_levels};
        LevelT              lower_level1[1]     = {lower_level};
        LevelT              upper_level1[1]     = {upper_level};

        return MultiHistogramEven<1, 1>(
            d_temp_storage,
            temp_storage_bytes,
            d_samples,
            d_histogram1,
            num_levels1,
            lower_level1,
            upper_level1,
            num_row_samples,
            num_rows,
            row_stride_bytes,
            stream,
            debug_synchronous);
    }

    /**
     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
     *
     * \par
     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>

xgboost/cub/cub/device/device_histogram.cuh

    static cudaError_t MultiHistogramEven(
        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels wher...
        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> s...
        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num...
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_pixels,                                 ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        /// The sample value type of the input iterator
        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

        return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
            d_temp_storage,
            temp_storage_bytes,
            d_samples,
            d_histogram,
            num_levels,
            lower_level,
            upper_level,
            num_pixels,
            1,
            sizeof(SampleT) * NUM_CHANNELS * num_pixels,
            stream,
            debug_synchronous);
    }
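
A usage sketch for the interleaved multi-channel case described above: 256-bin histograms for the R, G, and B channels of an RGBA byte image, with the alpha channel ignored (function and variable names are illustrative):

#include <cub/cub.cuh>

// d_rgba: 4 * num_pixels interleaved bytes; d_histogram[c]: 256 ints each
void rgb_histograms(unsigned char *d_rgba, int *d_histogram[3], int num_pixels)
{
    int   num_levels[3]  = {257, 257, 257};             // 257 levels -> 256 bins
    float lower_level[3] = {0.0f, 0.0f, 0.0f};
    float upper_level[3] = {256.0f, 256.0f, 256.0f};

    void*  d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;

    // <4, 3>: 4 interleaved channels, histograms for the first 3 only
    cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
        d_rgba, d_histogram, num_levels, lower_level, upper_level, num_pixels);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
        d_rgba, d_histogram, num_levels, lower_level, upper_level, num_pixels);

    cudaFree(d_temp_storage);
}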


    /**
     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
     *
     * \par
     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels wher...
        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> s...
        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num...
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        /// The sample value type of the input iterator
        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
        Int2Type<sizeof(SampleT) == 1> is_byte_sample;

        if ((sizeof(OffsetT) > sizeof(int)) &&
            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
        {
            // Down-convert OffsetT data type
            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven(
                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
                stream, debug_synchronous, is_byte_sample);
        }

        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven(
            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
            stream, debug_synchronous, is_byte_sample);
    }
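
The region-of-interest overload changes only the tail of the argument list. A sketch, assuming a width x height 8-bit RGBA image whose rows start \p pitch_bytes apart (names illustrative):

#include <cub/cub.cuh>

// Histogram the RGB channels of a width x height ROI in a pitched RGBA image.
// pitch_bytes >= width * 4 for 8-bit RGBA; padding at the end of each row is skipped.
void rgb_histograms_roi(unsigned char *d_image, int *d_histogram[3],
                        int width, int height, size_t pitch_bytes)
{
    int          num_levels[3]  = {257, 257, 257};
    unsigned int lower_level[3] = {0, 0, 0};
    unsigned int upper_level[3] = {256, 256, 256};

    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceHistogram::MultiHistogramEven<4, 3>(
        d_temp_storage, temp_storage_bytes, d_image,
        d_histogram, num_levels, lower_level, upper_level,
        width, height, pitch_bytes);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceHistogram::MultiHistogramEven<4, 3>(
        d_temp_storage, temp_storage_bytes, d_image,
        d_histogram, num_levels, lower_level, upper_level,
        width, height, pitch_bytes);
    cudaFree(d_temp_storage);
}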


    //@}  end member group
    /******************************************************************//**
     * \name Custom bin ranges
     *********************************************************************/
    //@{

    /**

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t HistogramRange(
        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample valu...
        OffsetT             num_samples,                            ///< [in] The number of data samples per row in the region of interest
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        /// The sample value type of the input iterator
        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

        CounterT*           d_histogram1[1] = {d_histogram};
        int                 num_levels1[1]  = {num_levels};
        LevelT*             d_levels1[1]    = {d_levels};

        return MultiHistogramRange<1, 1>(
            d_temp_storage,
            temp_storage_bytes,
            d_samples,
            d_histogram1,
            num_levels1,
            d_levels1,
            num_samples,
            1,                                  // num_rows: treat the flat input as a single row
            sizeof(SampleT) * num_samples,      // row_stride_bytes
            stream,
            debug_synchronous);
    }
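
A sketch of the single-channel HistogramRange overload above; \p d_levels must already hold \p num_levels monotonically increasing boundaries in device memory (e.g., [0.0, 2.0, 4.0, 6.0, 8.0] for four bins). Names are illustrative:

#include <cub/cub.cuh>

// Bin num_samples floats into num_levels - 1 custom-width bins.
void range_histogram(float *d_samples, int num_samples,
                     int *d_histogram, float *d_levels, int num_levels)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceHistogram::HistogramRange(
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, d_levels, num_samples);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceHistogram::HistogramRange(
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, d_levels, num_samples);
    cudaFree(d_temp_storage);
}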


    /**
     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
     *
     * \par
     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
     *   using the \p num_row_samples, \p num_rows, and \p row_stride_bytes parameters.
     * - The row stride must be a whole multiple of the sample data type

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample valu...
        OffsetT             num_row_samples,                        ///< [in] The number of data samples per row in the region of interest
        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        CounterT*           d_histogram1[1]     = {d_histogram};
        int                 num_levels1[1]      = {num_levels};
        LevelT*             d_levels1[1]        = {d_levels};

        return MultiHistogramRange<1, 1>(
            d_temp_storage,
            temp_storage_bytes,
            d_samples,
            d_histogram1,
            num_levels1,
            d_levels1,
            num_row_samples,
            num_rows,
            row_stride_bytes,
            stream,
            debug_synchronous);
    }

    /**
     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
     *
     * \par
     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t MultiHistogramRange(
        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> shoul...
        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
        OffsetT             num_pixels,                             ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        /// The sample value type of the input iterator
        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

        return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
            d_temp_storage,
            temp_storage_bytes,
            d_samples,
            d_histogram,
            num_levels,
            d_levels,
            num_pixels,
            1,                                              // num_rows: treat the flat input as a single row
            sizeof(SampleT) * NUM_CHANNELS * num_pixels,    // row_stride_bytes
            stream,
            debug_synchronous);
    }
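
Combining the two ideas above (interleaved channels, caller-supplied boundaries), a sketch of the flat multi-channel range overload, with illustrative names:

#include <cub/cub.cuh>

// Custom-binned histograms for the first two channels of interleaved RGBA data.
// d_levels[i] points to num_levels[i] increasing boundaries in device memory.
void rg_range_histograms(unsigned char *d_samples, int num_pixels,
                         int *d_histogram[2], int num_levels[2],
                         unsigned int *d_levels[2])
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceHistogram::MultiHistogramRange<4, 2>(
        d_temp_storage, temp_storage_bytes, d_samples,
        d_histogram, num_levels, d_levels, num_pixels);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceHistogram::MultiHistogramRange<4, 2>(
        d_temp_storage, temp_storage_bytes, d_samples,
        d_histogram, num_levels, d_levels, num_pixels);
    cudaFree(d_temp_storage);
}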


    /**
     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
     *
     * \par
     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> shoul...
        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
        OffsetT             num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        /// The sample value type of the input iterator
        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
        Int2Type<sizeof(SampleT) == 1> is_byte_sample;

        if ((sizeof(OffsetT) > sizeof(int)) &&
            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
        {
            // Down-convert OffsetT data type
            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
                stream, debug_synchronous, is_byte_sample);
        }

        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
            stream, debug_synchronous, is_byte_sample);
    }



    //@}  end member group
};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

xgboost/cub/cub/device/device_partition.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int                         num_items,                      ///< [in] Total number of items to select from
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType                SelectOp;       // Selection op (not used)
        typedef NullType                EqualityOp;     // Equality operator (not used)

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
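
A sketch of Flagged with the same two-phase workspace pattern (names illustrative). As the \p If documentation below notes, selected items keep their relative order at the front of \p d_out while unselected items are packed to the rear in reverse order:

#include <cub/cub.cuh>

// Partition num_items ints by a parallel array of 0/1 flags.
// *d_num_selected_out receives the size of the selected (front) partition.
void partition_flagged(int *d_in, char *d_flags, int *d_out,
                       int *d_num_selected_out, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DevicePartition::Flagged(
        d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DevicePartition::Flagged(
        d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out, num_items);
    cudaFree(d_temp_storage);
}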


    /**
     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png)
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original
     *   relative ordering; however, copies of the unselected items are compacted into the
     *   rear of \p d_out in reverse order.

xgboost/cub/cub/device/device_partition.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t If(
        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int                         num_items,                      ///< [in] Total number of items to select from
        SelectOp                    select_op,                      ///< [in] Unary selection operator
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
        typedef NullType                EqualityOp;     // Equality operator (not used)

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            select_op,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
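
Because \p SelectOp is just a template parameter, any functor whose \p operator() is callable on the device will do. A sketch with a hypothetical threshold predicate (names illustrative):

#include <cub/cub.cuh>

// Unary predicate: selects items strictly below a threshold.
struct LessThan
{
    int compare;
    __host__ __device__ __forceinline__
    explicit LessThan(int compare) : compare(compare) {}
    __host__ __device__ __forceinline__
    bool operator()(const int &a) const { return a < compare; }
};

// Items with value < threshold land at the front of d_out in order;
// the remaining items fill the rear of d_out in reverse order.
void partition_if(int *d_in, int *d_out, int *d_num_selected_out,
                  int num_items, int threshold)
{
    LessThan select_op(threshold);
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DevicePartition::If(
        d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, num_items, select_op);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DevicePartition::If(
        d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, num_items, select_op);
    cudaFree(d_temp_storage);
}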

};

/**
 * \example example_device_partition_flagged.cu
 * \example example_device_partition_if.cu
 */

}               // CUB namespace

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
        int                 num_items,                              ///< [in] Number of items to sort
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);

        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            false,              // is_overwrite_okay: input buffers must stay intact
            stream,
            debug_synchronous);
    }
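
A sketch of the pointer-based ascending overload (names illustrative). The input buffers are left unmodified, at the cost of roughly 2N auxiliary storage:

#include <cub/cub.cuh>

// Sort num_items (key, value) pairs ascending into separate output buffers.
void sort_pairs(const int *d_keys_in, int *d_keys_out,
                const int *d_values_in, int *d_values_out, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
    cudaFree(d_temp_storage);
}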


    /**
     * \brief Sorts key-value pairs into ascending order. (~<em>N</em> auxiliary storage required)
     *
     * \par
     * - The sorting operation is given a pair of key buffers and a corresponding
     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
     *   structure that indicates which of the two buffers is "current" (and thus

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t SortPairs(
        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
        int                     num_items,                              ///< [in] Number of items to sort
        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            true,               // is_overwrite_okay: either buffer may receive output
            stream,
            debug_synchronous);
    }
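
A sketch of the DoubleBuffer overload (names illustrative). The sort may finish in either buffer of each pair, so the caller must consult \p Current() afterwards:

#include <cub/cub.cuh>

// ~N auxiliary storage: CUB ping-pongs between the two buffers of each pair.
void sort_pairs_db(int *d_key_buf, int *d_key_alt_buf,
                   int *d_value_buf, int *d_value_alt_buf, int num_items)
{
    cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
    cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);

    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);

    int *d_sorted_keys   = d_keys.Current();    // buffer now holding sorted keys
    int *d_sorted_values = d_values.Current();  // correspondingly reordered values
    (void)d_sorted_keys; (void)d_sorted_values;
    cudaFree(d_temp_storage);
}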


    /**
     * \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
     *
     * \par
     * - The contents of the input data are not altered by the sorting operation
     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
        int                 num_items,                              ///< [in] Number of items to sort
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);

        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            false,              // is_overwrite_okay: input buffers must stay intact
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts key-value pairs into descending order. (~<em>N</em> auxiliary storage required).
     *
     * \par
     * - The sorting operation is given a pair of key buffers and a corresponding
     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
     *   structure that indicates which of the two buffers is "current" (and thus

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t SortPairsDescending(
        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
        int                     num_items,                              ///< [in] Number of items to sort
        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            true,               // is_overwrite_okay: either buffer may receive output
            stream,
            debug_synchronous);
    }


    //@}  end member group
    /******************************************************************//**
     * \name Keys-only
     *********************************************************************/
    //@{


xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t SortKeys(
        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
        int                 num_items,                              ///< [in] Number of items to sort
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);

        // Null value type (keys-only sort)
        DoubleBuffer<NullType>  d_values;

        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            false,              // is_overwrite_okay: input buffers must stay intact
            stream,
            debug_synchronous);
    }
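
A keys-only sketch (names illustrative); the pattern is identical, minus the value buffers:

#include <cub/cub.cuh>

// Sort num_items float keys ascending; d_keys_in is not modified.
void sort_keys(const float *d_keys_in, float *d_keys_out, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortKeys(
        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRadixSort::SortKeys(
        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
    cudaFree(d_temp_storage);
}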


    /**
     * \brief Sorts keys into ascending order. (~<em>N</em> auxiliary storage required).
     *
     * \par
     * - The sorting operation is given a pair of key buffers managed by a
     *   DoubleBuffer structure that indicates which of the two buffers is
     *   "current" (and thus contains the input data to be sorted).

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

    template <typename KeyT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t SortKeys(
        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted out...
        int                 num_items,                              ///< [in] Number of items to sort
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // Null value type
        DoubleBuffer<NullType> d_values;

        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            true,               // is_overwrite_okay: either buffer may receive output
            stream,
            debug_synchronous);
    }

    /**
     * \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required).
     *
     * \par
     * - The contents of the input data are not altered by the sorting operation
     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
     * - \devicestorage

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t SortKeysDescending(
        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
        int                 num_items,                              ///< [in] Number of items to sort
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);

        // Null value type (keys-only sort)
        DoubleBuffer<NullType>  d_values;

        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            false,              // is_overwrite_okay: input buffers must stay intact
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts keys into descending order. (~<em>N</em> auxiliary storage required).
     *
     * \par
     * - The sorting operation is given a pair of key buffers managed by a
     *   DoubleBuffer structure that indicates which of the two buffers is
     *   "current" (and thus contains the input data to be sorted).

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

    template <typename KeyT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t SortKeysDescending(
        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted out...
        int                 num_items,                              ///< [in] Number of items to sort
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // Null value type
        DoubleBuffer<NullType> d_values;

        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            true,               // is_overwrite_okay: either buffer may receive output
            stream,
            debug_synchronous);
    }


    //@}  end member group


};

/**
 * \example example_device_radix_sort.cu

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t Reduce(
        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        ReductionOpT                reduction_op,                       ///< [in] Binary reduction functor
        T                           init,                               ///< [in] Initial value of the reduction
        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_items,
            reduction_op,
            init,
            stream,
            debug_synchronous);
    }
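
A sketch of the generic Reduce with a user-supplied commutative functor and explicit initial value (names illustrative):

#include <cub/cub.cuh>

// Commutative binary reduction functor.
struct CustomMin
{
    template <typename T>
    __host__ __device__ __forceinline__
    T operator()(const T &a, const T &b) const { return (b < a) ? b : a; }
};

// Reduce num_items floats into *d_out, seeding the reduction with init.
void custom_reduce(float *d_in, float *d_out, int num_items, float init)
{
    CustomMin min_op;
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceReduce::Reduce(
        d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::Reduce(
        d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
    cudaFree(d_temp_storage);
}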


    /**
     * \brief Computes a device-wide sum using the addition (\p +) operator.
     *
     * \par
     * - Uses \p 0 as the initial value of the reduction.
     * - Does not support \p + operators that are non-commutative.
     * - \devicestorage

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

        typename                    InputIteratorT,
        typename                    OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Sum(
        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The output value type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_items,
            cub::Sum(),
            OutputT(),            // zero-initialize
            stream,
            debug_synchronous);
    }
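
A sketch of Sum (names illustrative); because the reduction is zero-initialized internally, the caller passes no initial value:

#include <cub/cub.cuh>

// Device-wide sum of num_items ints into *d_out.
void device_sum(int *d_in, int *d_out, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaFree(d_temp_storage);
}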


    /**
     * \brief Computes a device-wide minimum using the less-than ('<') operator.
     *
     * \par
     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction.
     * - Does not support \p < operators that are non-commutative.
     * - \devicestorage

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

        typename                    InputIteratorT,
        typename                    OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Min(
        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input value type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_items,
            cub::Min(),
            Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
            stream,
            debug_synchronous);
    }


    /**
     * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
     *
     * \par
     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
     *   - The minimum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

        typename                    InputIteratorT,
        typename                    OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t ArgMin(
        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;

        // The output tuple type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN


        return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_indexed_in,
            d_out,
            num_items,
            cub::ArgMin(),
            initial_value,
            stream,
            debug_synchronous);
    }
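
A sketch of ArgMin (names illustrative). Per the documentation above, the single output element is a cub::KeyValuePair whose \p key is the offset of the first minimum and whose \p value is the minimum itself:

#include <cub/cub.cuh>

// Find the first minimum of num_items floats and its index.
void device_argmin(float *d_in, cub::KeyValuePair<int, float> *d_argmin,
                   int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceReduce::ArgMin(
        d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::ArgMin(
        d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
    cudaFree(d_temp_storage);
}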


    /**
     * \brief Computes a device-wide maximum using the greater-than ('>') operator.
     *
     * \par
     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
     * - Does not support \p > operators that are non-commutative.
     * - \devicestorage

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

        typename                    InputIteratorT,
        typename                    OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Max(
        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input value type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_items,
            cub::Max(),
            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
            stream,
            debug_synchronous);
    }


    /**
     * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item.
     *
     * \par
     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
     *   - The maximum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

        typename                    InputIteratorT,
        typename                    OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t ArgMax(
        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;

        // The output tuple type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN


        return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_indexed_in,
            d_out,
            num_items,
            cub::ArgMax(),
            initial_value,
            stream,
            debug_synchronous);
    }
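
A sketch of ArgMax under the same two-phase idiom, assuming d_in holds num_items floats on the device; d_argmax is an illustrative one-element device allocation for the result pair:

    // After completion, d_argmax->value holds the maximum and d_argmax->key its offset in d_in
    cub::KeyValuePair<int, float> *d_argmax;
    cudaMalloc(&d_argmax, sizeof(*d_argmax));
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);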


    /**
     * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
     *
     * \par
     * This operation computes segmented reductions within \p d_values_in using
     * the specified binary \p reduction_op functor.  The segments are identified by
     * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

        void                        *d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        KeysInputIteratorT          d_keys_in,                      ///< [in] Pointer to the input sequence of keys
        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
        ValuesInputIteratorT        d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
        ReductionOpT                reduction_op,                   ///< [in] Binary reduction functor
        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_keys_in and \p d_values_in)
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // FlagT iterator type (not used)

        // Selection op (not used)

        // Default == operator
        typedef Equality EqualityOp;

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

            temp_storage_bytes,
            d_keys_in,
            d_unique_out,
            d_values_in,
            d_aggregates_out,
            d_num_runs_out,
            EqualityOp(),
            reduction_op,
            num_items,
            stream,
            debug_synchronous);
    }
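
A sketch of ReduceByKey with cub::Sum, with all device buffers assumed pre-allocated; for keys [0 0 1 1 1 2] and values [1 2 3 4 5 6] it would write unique keys [0 1 2], aggregates [3 12 6], and a run count of 3:

    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out,
        d_num_runs_out, cub::Sum(), num_items);     // size query
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out,
        d_num_runs_out, cub::Sum(), num_items);     // actual reduction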

};

/**
 * \example example_device_reduce.cu
 */

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

xgboost/cub/cub/device/device_run_length_encode.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Encode(
        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of keys
        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
        LengthsOutputIteratorT      d_counts_out,                   ///< [out] Pointer to the output sequence of run-lengths (one count per run)
        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., the length of \p d_unique_out)
        int                         num_items,                      ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int         OffsetT;                    // Signed integer type for global offsets
        typedef NullType*   FlagIterator;               // FlagT iterator type (not used)
        typedef NullType    SelectOp;                   // Selection op (not used)
        typedef Equality    EqualityOp;                 // Default == operator
        typedef cub::Sum    ReductionOp;                // Value reduction operator

        // The lengths output value type
        typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
            OffsetT,                                                                                                    // ... then the OffsetT type,

xgboost/cub/cub/device/device_run_length_encode.cuh  view on Meta::CPAN

            temp_storage_bytes,
            d_in,
            d_unique_out,
            LengthsInputIteratorT((LengthT) 1),
            d_counts_out,
            d_num_runs_out,
            EqualityOp(),
            ReductionOp(),
            num_items,
            stream,
            debug_synchronous);
    }
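
A sketch of Encode (buffer names illustrative; two-phase idiom as above); input [8 8 8 5 5 9] would yield unique keys [8 5 9], counts [3 2 1], and a run count of 3:

    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
        d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);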


    /**
     * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in.
     *
     * \par
     * - For the <em>i</em><sup>th</sup> non-trivial run, the run's starting offset
     *   and its length are written to <tt>d_offsets_out[<em>i</em>]</tt> and
     *   <tt>d_lengths_out[<em>i</em>]</tt>, respectively.

xgboost/cub/cub/device/device_run_length_encode.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t NonTrivialRuns(
        void*                   d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT          d_in,                           ///< [in] Pointer to input sequence of data items
        OffsetsOutputIteratorT  d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run)
        LengthsOutputIteratorT  d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run)
        NumRunsOutputIteratorT  d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
        int                     num_items,                      ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t            stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int         OffsetT;                    // Signed integer type for global offsets
        typedef Equality    EqualityOp;                 // Default == operator

        return DeviceRleDispatch<InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_offsets_out,
            d_lengths_out,
            d_num_runs_out,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
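
A sketch of NonTrivialRuns; for input [1 1 2 3 3 3 4] the two runs of length > 1 begin at offsets 0 and 3 with lengths 2 and 3 (buffer names illustrative; two-phase idiom as above):

    cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
        d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);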


};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)


xgboost/cub/cub/device/device_scan.cuh  view on Meta::CPAN

        typename        InputIteratorT,
        typename        OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t ExclusiveSum(
        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The output value type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

        // Initial value

xgboost/cub/cub/device/device_scan.cuh  view on Meta::CPAN


        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, OutputT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            Sum(),
            init_value,
            num_items,
            stream,
            debug_synchronous);
    }
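
A sketch of ExclusiveSum; for input [8 6 7 5] the output is [0 8 14 21], each element being the sum of all inputs strictly before it (buffer names illustrative):

    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);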


    /**
     * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.  The \p init_value value is applied as the initial value, and is assigned to *d_out.
     *
     * \par
     * - Supports non-commutative scan operators.
     * - \devicestorage
     *

xgboost/cub/cub/device/device_scan.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t ExclusiveScan(
        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
        ScanOpT         scan_op,                            ///< [in] Binary scan functor
        InitValueT      init_value,                         ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out)
        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            scan_op,
            init_value,
            num_items,
            stream,
            debug_synchronous);
    }
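
A sketch of ExclusiveScan with a user-supplied functor: a hypothetical CustomMin seeded with INT_MAX (from <climits>) turns the scan into a running minimum, so d_out[i] receives the minimum of d_in[0..i-1]:

    struct CustomMin    // assumed user-defined; any associative binary operator works
    {
        template <typename T>
        __host__ __device__ __forceinline__
        T operator()(const T &a, const T &b) const { return (b < a) ? b : a; }
    };

    cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
        d_in, d_out, CustomMin(), INT_MAX, num_items);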


    //@}  end member group
    /******************************************************************//**
     * \name Inclusive scans
     *********************************************************************/
    //@{


xgboost/cub/cub/device/device_scan.cuh  view on Meta::CPAN

        typename            InputIteratorT,
        typename            OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t InclusiveSum(
        void*               d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                           ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                          ///< [out] Pointer to the output sequence of data items
        int                 num_items,                      ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t        stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            Sum(),
            NullType(),
            num_items,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
     *
     * \par
     * - Supports non-commutative scan operators.
     * - \devicestorage
     *

xgboost/cub/cub/device/device_scan.cuh  view on Meta::CPAN

        typename        ScanOpT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t InclusiveScan(
        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
        ScanOpT         scan_op,                            ///< [in] Binary scan functor
        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t    stream             = 0,             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool            debug_synchronous  = false)         ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            scan_op,
            NullType(),
            num_items,
            stream,
            debug_synchronous);
    }
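
The inclusive variants differ only in that each output element includes its own input: for input [8 6 7 5], InclusiveSum writes [8 14 21 26] where ExclusiveSum writes [0 8 14 21]. A sketch, using the same two-phase idiom:

    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    // or with an explicit functor (and no seed value), e.g. a running maximum:
    cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, cub::Max(), num_items);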

    //@}  end member group

};

/**
 * \example example_device_scan.cu
 */

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int           *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        const int           *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);

        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            false,
            stream,
            debug_synchronous);
    }
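
A sketch of this out-of-place overload, assuming a contiguous segment layout so that a single d_segment_offsets array of length num_segments+1 can be aliased for both d_begin_offsets and d_end_offsets (buffer names illustrative):

    // Three contiguous segments over seven items; the middle segment is empty:
    //   d_segment_offsets = { 0, 3, 3, 7 }
    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out,
        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1);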


    /**
     * \brief Sorts segments of key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
     *
     * \par
     * - The sorting operation is given a pair of key buffers and a corresponding
     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
     *   structure that indicates which of the two buffers is "current" (and thus

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int               *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        const int               *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            true,
            stream,
            debug_synchronous);
    }
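
A sketch of the ~N DoubleBuffer overload: the caller supplies two buffers per sequence, and after the call each DoubleBuffer's Current() selects whichever buffer ended up holding the sorted output (all pointers assumed pre-allocated):

    cub::DoubleBuffer<int>   d_keys(d_key_buf, d_key_alt_buf);
    cub::DoubleBuffer<float> d_values(d_value_buf, d_value_alt_buf);
    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys, d_values, num_items, num_segments,
        d_segment_offsets, d_segment_offsets + 1);
    int   *d_sorted_keys   = d_keys.Current();
    float *d_sorted_values = d_values.Current();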


    /**
     * \brief Sorts segments of key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
     *
     * \par
     * - The contents of the input data are not altered by the sorting operation
     * - When the input is a contiguous sequence of segments, a single sequence
     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int           *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        const int           *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);

        return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            false,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts segments of key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
     *
     * \par
     * - The sorting operation is given a pair of key buffers and a corresponding
     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
     *   structure that indicates which of the two buffers is "current" (and thus

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int               *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        const int               *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            true,
            stream,
            debug_synchronous);
    }


    //@}  end member group
    /******************************************************************//**
     * \name Keys-only
     *********************************************************************/
    //@{


xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int           *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        const int           *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);

        // Null value type
        DoubleBuffer<NullType>  d_values;

        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            false,
            stream,
            debug_synchronous);
    }
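
A keys-only sketch, identical to the pairs version minus the value buffers (names illustrative; two-phase idiom as above):

    cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, num_items, num_segments,
        d_segment_offsets, d_segment_offsets + 1);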


    /**
     * \brief Sorts segments of keys into ascending order. (~<em>N </em>auxiliary storage required).
     *
     * \par
     * - The sorting operation is given a pair of key buffers managed by a
     *   DoubleBuffer structure that indicates which of the two buffers is
     *   "current" (and thus contains the input data to be sorted).

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int           *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        const int           *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // Null value type
        DoubleBuffer<NullType> d_values;

        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            true,
            stream,
            debug_synchronous);
    }

    /**
     * \brief Sorts segments of keys into descending order. (~<em>2N</em> auxiliary storage required).
     *
     * \par
     * - The contents of the input data are not altered by the sorting operation
     * - When the input is a contiguous sequence of segments, a single sequence
     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int           *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        const int           *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<NullType>  d_values;

        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            false,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts segments of keys into descending order. (~<em>N </em>auxiliary storage required).
     *
     * \par
     * - The sorting operation is given a pair of key buffers managed by a
     *   DoubleBuffer structure that indicates which of the two buffers is
     *   "current" (and thus contains the input data to be sorted).

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int           *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        const int           *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // Null value type
        DoubleBuffer<NullType> d_values;

        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            true,
            stream,
            debug_synchronous);
    }


    //@}  end member group


};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
        int                 *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        int                 *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        ReductionOp         reduction_op,                       ///< [in] Binary reduction functor
        T                   initial_value,                      ///< [in] Initial value of the reduction for each segment
        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOp>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            reduction_op,
            initial_value,
            stream,
            debug_synchronous);
    }
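
A sketch of the generic segmented reduction: a hypothetical CustomMin functor is paired with INT_MAX as the initial value, which is also what an empty segment produces (buffer names illustrative; two-phase idiom as above):

    struct CustomMin    // assumed user-defined associative operator
    {
        template <typename T>
        __host__ __device__ __forceinline__
        T operator()(const T &a, const T &b) const { return (b < a) ? b : a; }
    };

    cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_segments, d_segment_offsets, d_segment_offsets + 1,
        CustomMin(), INT_MAX);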


    /**
     * \brief Computes a device-wide segmented sum using the addition ('+') operator.
     *
     * \par
     * - Uses \p 0 as the initial value of the reduction for each segment.
     * - When the input is a contiguous sequence of segments, a single sequence
     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t Sum(
        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
        int                 *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        int                 *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The output value type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            cub::Sum(),
            OutputT(),            // zero-initialize
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes a device-wide segmented minimum using the less-than ('<') operator.
     *
     * \par
     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction for each segment.
     * - When the input is a contiguous sequence of segments, a single sequence
     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t Min(
        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
        int                 *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        int                 *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input value type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            cub::Min(),
            Traits<InputT>::Max(),    // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
            stream,
            debug_synchronous);
    }


    /**
     * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item.
     *
     * \par
     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
     *   - The minimum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t ArgMin(
        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
        int                 *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        int                 *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;

        // The output tuple type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

            d_temp_storage,
            temp_storage_bytes,
            d_indexed_in,
            d_out,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            cub::ArgMin(),
            initial_value,
            stream,
            debug_synchronous);
    }
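
A sketch of segmented ArgMin, assuming d_in holds floats and d_argmin is an illustrative device array of num_segments result pairs; entry i receives the minimum of segment i in .value and its in-segment offset in .key:

    cub::KeyValuePair<int, float> *d_argmin;    // one pair per segment
    cudaMalloc(&d_argmin, num_segments * sizeof(*d_argmin));
    cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
        d_in, d_argmin, num_segments, d_segment_offsets, d_segment_offsets + 1);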


    /**
     * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator.
     *
     * \par
     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction for each segment.
     * - When the input is a contiguous sequence of segments, a single sequence
     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t Max(
        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
        int                 *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> segment
        int                 *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> segment.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input value type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            cub::Max(),
            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
            stream,
            debug_synchronous);
    }
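
For illustration, a minimal usage sketch of the Max entry point above, following CUB's usual two-phase pattern (a first call with d_temp_storage == NULL only computes the required temporary-storage size) and aliasing a single offsets array for both begin and end offsets, as the note above describes. All names and device allocations here are placeholders assumed to be set up by the caller:

    // Sketch: per-segment maxima over contiguous segments (illustrative names)
    int    num_segments = 3;
    int    *d_offsets;            // e.g. [0, 3, 3, 7]; segment 1 is empty
    float  *d_in;                 // 7 input items on the device
    float  *d_out;                // 3 output aggregates on the device
    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_segments, d_offsets, d_offsets + 1);   // size temp storage
    cudaMalloc(&d_temp_storage, temp_storage_bytes);            // allocate it
    cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_segments, d_offsets, d_offsets + 1);   // run the reduction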


    /**
     * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item
     *
     * \par
     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
     *   - The maximum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t ArgMax(
        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
        int                 num_segments,                       ///< [in] The number of segments that comprise the reduction data
        int                 *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment
        int                 *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment
        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;

        // The output tuple type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

            d_temp_storage,
            temp_storage_bytes,
            d_indexed_in,
            d_out,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            cub::ArgMax(),
            initial_value,
            stream,
            debug_synchronous);
    }
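
Correspondingly, a sketch of ArgMax: the output iterator points at cub::KeyValuePair<int, float> records, so after the call d_argmax[i].value holds the maximum of segment i and d_argmax[i].key its in-segment offset (pointer names are placeholders, reusing d_in/d_offsets from the sketch above):

    cub::KeyValuePair<int, float> *d_argmax;   // num_segments records on the device
    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
        d_in, d_argmax, num_segments, d_offsets, d_offsets + 1);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
        d_in, d_argmax, num_segments, d_offsets, d_offsets + 1);
    // Empty segments yield {1, std::numeric_limits<float>::lowest()}, per the note above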

};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)


xgboost/cub/cub/device/device_select.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType                SelectOp;       // Selection op (not used)
        typedef NullType                EqualityOp;     // Equality operator (not used)

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
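
A minimal sketch of Flagged (placeholder names and data): items of d_in whose corresponding flag is nonzero are compacted into d_out, and the count lands in *d_num_selected_out:

    int    num_items = 8;
    int    *d_in;                   // e.g. [1, 2, 3, 4, 5, 6, 7, 8]
    char   *d_flags;                // e.g. [1, 0, 0, 1, 0, 1, 1, 0]
    int    *d_out;                  // selected items, compacted
    int    *d_num_selected_out;     // single counter on the device
    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out, num_items);
    // d_out now holds [1, 4, 6, 7] and *d_num_selected_out == 4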


    /**
     * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png)
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
     * - \devicestorage
     *

xgboost/cub/cub/device/device_select.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t If(
        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
        SelectOp                    select_op,                      ///< [in] Unary selection operator
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
        typedef NullType                EqualityOp;     // Equality operator (not used)

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            select_op,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
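
Since If takes an arbitrary unary predicate, here is a sketch with a hypothetical LessThan functor; the __host__ __device__ call operator is what lets CUB evaluate it on the GPU (names and data are illustrative, reusing the device pointers from the Flagged sketch):

    struct LessThan
    {
        int compare;
        __host__ __device__ __forceinline__
        LessThan(int compare) : compare(compare) {}
        __host__ __device__ __forceinline__
        bool operator()(const int &a) const { return (a < compare); }
    };

    LessThan select_op(7);          // keep items strictly below 7
    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, num_items, select_op);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, num_items, select_op);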


    /**
     * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.p...
     *
     * \par
     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
     * - \devicestorage

xgboost/cub/cub/device/device_select.cuh  view on Meta::CPAN

        typename                    NumSelectedIteratorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Unique(
        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
        typedef NullType                SelectOp;       // Selection op (not used)
        typedef Equality                EqualityOp;     // Default == operator

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
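
And a sketch of Unique, which keeps only the first key of each run of consecutive equal keys rather than performing a global deduplication (illustrative data):

    // With d_in = [0, 2, 2, 9, 5, 5, 5, 8]:
    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, num_items);
    // d_out now holds [0, 2, 9, 5, 8] and *d_num_selected_out == 5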

};

/**
 * \example example_device_select_flagged.cu
 * \example example_device_select_if.cu
 * \example example_device_select_unique.cu
 */

xgboost/cub/cub/device/device_spmv.cuh  view on Meta::CPAN

        size_t&             temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        ValueT*             d_values,                           ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
        int*                d_row_offsets,                      ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
        int*                d_column_indices,                   ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-based.)
        ValueT*             d_vector_x,                         ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
        ValueT*             d_vector_y,                         ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
        int                 num_rows,                           ///< [in] number of rows of matrix <b>A</b>.
        int                 num_cols,                           ///< [in] number of columns of matrix <b>A</b>.
        int                 num_nonzeros,                       ///< [in] number of nonzero elements of matrix <b>A</b>.
        cudaStream_t        stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        SpmvParams<ValueT, int> spmv_params;
        spmv_params.d_values             = d_values;
        spmv_params.d_row_end_offsets    = d_row_offsets + 1;
        spmv_params.d_column_indices     = d_column_indices;
        spmv_params.d_vector_x           = d_vector_x;
        spmv_params.d_vector_y           = d_vector_y;
        spmv_params.num_rows             = num_rows;
        spmv_params.num_cols             = num_cols;
        spmv_params.num_nonzeros         = num_nonzeros;
        spmv_params.alpha                = 1.0;
        spmv_params.beta                 = 0.0;

        return DispatchSpmv<ValueT, int>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            spmv_params,
            stream,
            debug_synchronous);
    }

    //@}  end member group
};
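
Because the wrapper above hard-codes alpha = 1.0 and beta = 0.0, CsrMV computes plain <em>y</em> = <b>A</b><em>x</em> for a CSR matrix. A minimal calling sketch, with the CSR array names as placeholders:

    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
        d_values, d_row_offsets, d_column_indices,
        d_vector_x, d_vector_y, num_rows, num_cols, num_nonzeros);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
        d_values, d_row_offsets, d_column_indices,
        d_vector_x, d_vector_y, num_rows, num_cols, num_nonzeros);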



}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

        int                                 num_output_levels[NUM_ACTIVE_CHANNELS],         ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        OutputDecodeOpT                     output_decode_op[NUM_ACTIVE_CHANNELS],          ///< [in] Transform operators for determining bin-ids from samples, one for each channel
        int                                 max_num_output_bins,                            ///< [in] Maximum number of output bins in any channel
        OffsetT                             num_row_pixels,                                 ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT                             num_rows,                                       ///< [in] The number of rows in the region of interest
        OffsetT                             row_stride_samples,                             ///< [in] The number of samples between starts of consecutive rows in the region of interest
        DeviceHistogramInitKernelT          histogram_init_kernel,                          ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
        DeviceHistogramSweepKernelT         histogram_sweep_kernel,                         ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
        KernelConfig                        histogram_sweep_config,                         ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
        cudaStream_t                        stream,                                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                                debug_synchronous)                              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
    #ifndef CUB_RUNTIME_ENABLED

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

    #else

        cudaError error = cudaSuccess;
        do

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN


            // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;

            int histogram_init_block_threads    = 256;
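            // Ceiling division: enough init blocks so every one of the
            // max_num_output_bins histogram counters gets zero-initialized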
            int histogram_init_grid_dims        = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;

            // Log DeviceHistogramInitKernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
                histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);

            // Invoke histogram_init_kernel
            histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
                num_output_bins_wrapper,
                d_output_histograms_wrapper,
                tile_queue);

            // Return if empty problem
            if ((blocks_per_row == 0) || (blocks_per_col == 0))
                break;

            // Log histogram_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
                sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
                histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);

            // Invoke histogram_sweep_kernel
            histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
                d_samples,
                num_output_bins_wrapper,
                num_privatized_bins_wrapper,
                d_output_histograms_wrapper,
                d_privatized_histograms_wrapper,

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                num_row_pixels,
                num_rows,
                row_stride_samples,
                tiles_per_row,
                tile_queue);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

        }
        while (0);

        return error;

    #endif // CUB_RUNTIME_ENABLED
    }
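
All of these dispatch routines share the same control shape: a do { ... } while (0) block whose body breaks out at the first failing CubDebug(...) check, so a single return covers every exit path, and SyncStream is only consulted when debug_synchronous was requested. Schematically (a sketch of the idiom, not a verbatim excerpt):

    cudaError error = cudaSuccess;
    do
    {
        some_kernel<<<grid_dims, block_threads, 0, stream>>>(/* ... */);
        if (CubDebug(error = cudaPeekAtLastError())) break;     // launch failed?
        if (debug_synchronous &&
            (CubDebug(error = SyncStream(stream)))) break;      // surface runtime errors
        // ... further launches and checks ...
    } while (0);
    return error;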


xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is an 8-bit type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
            else
            {
                // Dispatch shared-privatized approach
                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }

        } while (0);

        return error;
    }


    /**
     * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is an 8-bit type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                num_output_levels,
                output_decode_op,
                max_num_output_bins,
                num_row_pixels,
                num_rows,
                row_stride_samples,
                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                histogram_sweep_config,
                stream,
                debug_synchronous))) break;

        } while (0);

        return error;
    }


    /**
     * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
     */

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is an 8-bit type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
            else
            {
                // Dispatch shared-privatized approach
                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
        }
        while (0);

        return error;
    }


    /**
     * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is an 8-bit type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                num_output_levels,
                output_decode_op,
                max_num_output_bins,
                num_row_pixels,
                num_rows,
                row_stride_samples,
                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                histogram_sweep_config,
                stream,
                debug_synchronous))) break;

        }
        while (0);

        return error;
    }

};


xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

    //------------------------------------------------------------------------------

    void                    *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
    OffsetT                 num_items;              ///< [in] Number of items to sort
    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    int                     ptx_version;            ///< [in] PTX version
    bool                    is_overwrite_okay;      ///< [in] Whether it is okay to overwrite source buffers


    //------------------------------------------------------------------------------
    // Constructor
    //------------------------------------------------------------------------------

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchRadixSort(
        void*                   d_temp_storage,
        size_t                  &temp_storage_bytes,
        DoubleBuffer<KeyT>      &d_keys,
        DoubleBuffer<ValueT>    &d_values,
        OffsetT                 num_items,
        int                     begin_bit,
        int                     end_bit,
        bool                    is_overwrite_okay,
        cudaStream_t            stream,
        bool                    debug_synchronous,
        int                     ptx_version)
    :
        d_temp_storage(d_temp_storage),
        temp_storage_bytes(temp_storage_bytes),
        d_keys(d_keys),
        d_values(d_values),
        num_items(num_items),
        begin_bit(begin_bit),
        end_bit(end_bit),
        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version),
        is_overwrite_okay(is_overwrite_okay)
    {}


    //------------------------------------------------------------------------------
    // Small-problem (single tile) invocation
    //------------------------------------------------------------------------------

    /// Invoke a single block to sort in-core

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

            {
                temp_storage_bytes = 1;
                break;
            }

            // Return if empty problem
            if (num_items == 0)
                break;

            // Log single_tile_kernel configuration
            if (debug_synchronous)
                _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                    1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
                    ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);

            // Invoke single_tile_kernel
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_keys.Current(),
                d_keys.Alternate(),
                d_values.Current(),
                d_values.Alternate(),
                num_items,
                begin_bit,
                end_bit);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Update selector
            d_keys.selector ^= 1;
            d_values.selector ^= 1;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

        int             spine_length,
        int             &current_bit,
        PassConfigT     &pass_config)
    {
        cudaError error = cudaSuccess;
        do
        {
            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));

            // Log upsweep_kernel configuration
            if (debug_synchronous)
                _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
                pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);

            // Invoke upsweep_kernel with same grid size as downsweep_kernel
            pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
                d_keys_in,
                d_spine,
                num_items,
                current_bit,
                pass_bits,
                pass_config.even_share);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log scan_kernel configuration
            if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
                1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);

            // Invoke scan_kernel
            pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
                d_spine,
                spine_length);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log downsweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
                pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);

            // Invoke downsweep_kernel
            pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
                d_keys_in,
                d_keys_out,
                d_values_in,
                d_values_out,
                d_spine,
                num_items,
                current_bit,
                pass_bits,
                pass_config.even_share);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Update current bit
            current_bit += pass_bits;
        }
        while (0);

        return error;
    }


xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

    static cudaError_t Dispatch(
        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
        OffsetT                 num_items,              ///< [in] Number of items to sort
        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
        bool                    is_overwrite_okay,      ///< [in] Whether it is okay to overwrite source buffers
        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;

        cudaError_t error;
        do {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchRadixSort dispatch(
                d_temp_storage, temp_storage_bytes,
                d_keys, d_values,
                num_items, begin_bit, end_bit, is_overwrite_okay,
                stream, debug_synchronous, ptx_version);

            // Dispatch to chained policy
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;

        } while (0);

        return error;
    }
};
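
The DoubleBuffer bookkeeping above (each pass flips the selector) is what makes the public DeviceRadixSort front end work: after sorting, Current() names the buffer holding the sorted output. A minimal sketch of driving it, with the raw buffer names as placeholders:

    cub::DoubleBuffer<int>   d_keys(d_key_buf, d_key_alt_buf);
    cub::DoubleBuffer<float> d_values(d_value_buf, d_value_alt_buf);
    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys, d_values, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys, d_values, num_items);
    // Sorted results are in d_keys.Current() and d_values.Current()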

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
    OffsetT                 num_items;              ///< [in] Number of items to sort
    OffsetT                 num_segments;           ///< [in] The number of segments that comprise the sorting data
    const OffsetT           *d_begin_offsets;       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment
    const OffsetT           *d_end_offsets;         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment
    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    int                     ptx_version;            ///< [in] PTX version
    bool                    is_overwrite_okay;      ///< [in] Whether it is okay to overwrite source buffers


    //------------------------------------------------------------------------------
    // Constructors
    //------------------------------------------------------------------------------

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

        DoubleBuffer<KeyT>      &d_keys,
        DoubleBuffer<ValueT>    &d_values,
        OffsetT                 num_items,
        OffsetT                 num_segments,
        const OffsetT           *d_begin_offsets,
        const OffsetT           *d_end_offsets,
        int                     begin_bit,
        int                     end_bit,
        bool                    is_overwrite_okay,
        cudaStream_t            stream,
        bool                    debug_synchronous,
        int                     ptx_version)
    :
        d_temp_storage(d_temp_storage),
        temp_storage_bytes(temp_storage_bytes),
        d_keys(d_keys),
        d_values(d_values),
        num_items(num_items),
        num_segments(num_segments),
        d_begin_offsets(d_begin_offsets),
        d_end_offsets(d_end_offsets),
        begin_bit(begin_bit),
        end_bit(end_bit),
        is_overwrite_okay(is_overwrite_okay),
        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version)
    {}


    //------------------------------------------------------------------------------
    // Multi-segment invocation
    //------------------------------------------------------------------------------

    /// Invoke a three-kernel sorting pass at the current bit.
    template <typename PassConfigT>

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

        ValueT          *d_values_out,
        int             &current_bit,
        PassConfigT     &pass_config)
    {
        cudaError error = cudaSuccess;
        do
        {
            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));

            // Log kernel configuration
            if (debug_synchronous)
                _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                    num_segments, pass_config.segmented_config.block_threads, (long long) stream,
                pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);

            pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
                d_keys_in, d_keys_out,
                d_values_in,  d_values_out,
                d_begin_offsets, d_end_offsets, num_segments,
                current_bit, pass_bits);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Update current bit
            current_bit += pass_bits;
        }
        while (0);

        return error;
    }


xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
        int                     num_items,              ///< [in] Number of items to sort
        int                     num_segments,           ///< [in] The number of segments that comprise the sorting data
        const int               *d_begin_offsets,       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment
        const int               *d_end_offsets,         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment
        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
        bool                    is_overwrite_okay,      ///< [in] Whether it is okay to overwrite source buffers
        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;

        cudaError_t error;
        do {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchSegmentedRadixSort dispatch(
                d_temp_storage, temp_storage_bytes,
                d_keys, d_values,
                num_items, num_segments, d_begin_offsets, d_end_offsets,
                begin_bit, end_bit, is_overwrite_okay,
                stream, debug_synchronous, ptx_version);

            // Dispatch to chained policy
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;

        } while (0);

        return error;
    }
};
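
A matching sketch for the segmented front end, again aliasing one offsets array of length num_segments + 1 for both begin and end offsets (names are placeholders):

    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, num_items, num_segments,
        d_offsets, d_offsets + 1);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, num_items, num_segments,
        d_offsets, d_offsets + 1);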


