Alien-XGBoost

 view release on metacpan or  search on metacpan

Changes  view on Meta::CPAN

0.04    2017-08-27
    Fix preserving executable flag for xgboost command
0.03    2017-08-26
    * Windows install working. Thanks again to Graham Ollis <plicease@cpan.org>
0.02    2017-08-24
    * System install working using ffi_name property. Thanks to Graham Ollis <plicease@cpan.org>
    * Switch from Module::Build to ExtUtils::MakeMaker
0.01    2017-08-20
    * Dynamic library and command support

MANIFEST  view on Meta::CPAN

xgboost/cub/cub/warp/specializations/warp_scan_smem.cuh
xgboost/cub/cub/warp/warp_reduce.cuh
xgboost/cub/cub/warp/warp_scan.cuh
'xgboost/cub/eclipse code style profile.xml'
xgboost/cub/examples/block/Makefile
xgboost/cub/examples/block/example_block_radix_sort.cu
xgboost/cub/examples/block/example_block_reduce.cu
xgboost/cub/examples/block/example_block_scan.cu
xgboost/cub/examples/block/reduce_by_key.cu
xgboost/cub/examples/device/Makefile
xgboost/cub/examples/device/example_device_partition_flagged.cu
xgboost/cub/examples/device/example_device_partition_if.cu
xgboost/cub/examples/device/example_device_radix_sort.cu
xgboost/cub/examples/device/example_device_reduce.cu
xgboost/cub/examples/device/example_device_scan.cu
xgboost/cub/examples/device/example_device_select_flagged.cu
xgboost/cub/examples/device/example_device_select_if.cu
xgboost/cub/examples/device/example_device_select_unique.cu
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu
xgboost/cub/experimental/Makefile
xgboost/cub/experimental/defunct/example_coo_spmv.cu
xgboost/cub/experimental/defunct/test_device_seg_reduce.cu
xgboost/cub/experimental/histogram/histogram_cub.h
xgboost/cub/experimental/histogram/histogram_gmem_atomics.h
xgboost/cub/experimental/histogram/histogram_smem_atomics.h
xgboost/cub/experimental/histogram_compare.cu

xgboost/CMakeLists.txt  view on Meta::CPAN

msvc_use_static_runtime()

# Options
option(PLUGIN_UPDATER_GPU "Build GPU accelerated tree construction plugin")
option(JVM_BINDINGS "Build JVM bindings" OFF)
option(GOOGLE_TEST "Build google tests" OFF)
set(GPU_COMPUTE_VER 35;50;52;60;61 CACHE STRING
  "Space separated list of compute versions to be built against")


# Compiler flags
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
if(MSVC)
  # Multithreaded compilation
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
else()
  # Correct error for GCC 5 and cuda
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")

xgboost/CMakeLists.txt  view on Meta::CPAN

  cmake_minimum_required(VERSION 3.5)

  add_definitions(-DXGBOOST_USE_CUDA)
  
  include_directories(
    nccl/src
    cub
  )

  set(GENCODE_FLAGS "")
  format_gencode_flags("${GPU_COMPUTE_VER}" GENCODE_FLAGS)
  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--expt-extended-lambda;${GENCODE_FLAGS};-lineinfo;")
  if(NOT MSVC)
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11")
  endif()

  add_subdirectory(nccl)
  cuda_add_library(gpuxgboost ${CUDA_SOURCES} STATIC)
  target_link_libraries(gpuxgboost nccl)
  list(APPEND LINK_LIBRARIES gpuxgboost) 
  list(APPEND SOURCES plugin/updater_gpu/src/register_updater_gpu.cc) 

xgboost/R-package/R/callbacks.R  view on Meta::CPAN

#'        stopping. If not set, the last column would be used.
#'        Let's say the test data in \code{watchlist} was labelled as \code{dtest}, 
#'        and one wants to use the AUC in test data for early stopping regardless of where 
#'        it is in the \code{watchlist}, then one of the following would need to be set:
#'        \code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
#'        All dash '-' characters in metric names are considered equivalent to '_'.
#' @param verbose whether to print the early stopping information.
#' 
#' @details
#' This callback function determines the condition for early stopping 
#' by setting the \code{stop_condition = TRUE} flag in its calling frame.
#' 
#' The following additional fields are assigned to the model's R object:
#' \itemize{
#' \item \code{best_score} the evaluation score at the best iteration
#' \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
#' \item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
#'      It differs from \code{best_iteration} in multiclass or random forest settings.
#' }
#' 
#' The Same values are also stored as xgb-attributes:

xgboost/R-package/R/callbacks.R  view on Meta::CPAN

      xgb.save(env$bst, sprintf(save_name, env$iteration))
  }
  attr(callback, 'call') <- match.call()
  attr(callback, 'name') <- 'cb.save.model'
  callback
}


#' Callback closure for returning cross-validation based predictions.
#' 
#' @param save_models a flag for whether to save the folds' models.
#' 
#' @details 
#' This callback function saves predictions for all of the test folds,
#' and also allows to save the folds' models.
#' 
#' It is a "finalizer" callback and it uses early stopping information whenever it is available,
#' thus it must be run after the early stopping callback if the early stopping is used.
#' 
#' Callback function expects the following values to be set in its calling frame:
#' \code{bst_folds},

xgboost/R-package/R/xgb.Booster.R  view on Meta::CPAN

}

#' Restore missing parts of an incomplete xgb.Booster object.
#'
#' It attempts to complete an \code{xgb.Booster} object by restoring either its missing 
#' raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
#' or its missing internal handle (when its \code{xgb.Booster.handle} is not valid 
#' but it has a raw Booster memory dump).
#' 
#' @param object object of class \code{xgb.Booster}
#' @param saveraw a flag indicating whether to append \code{raw} Booster memory dump data 
#'                when it doesn't already exist.
#' 
#' @details
#' 
#' While this method is primarily for internal use, it might be useful in some practical situations.
#' 
#' E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object,
#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods 
#' should still work for such a model object since those methods would be using 
#' \code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the  

xgboost/R-package/R/xgb.model.dt.tree.R  view on Meta::CPAN

#'          Non-null \code{feature_names} could be provided to override those in the model.
#' @param model object of class \code{xgb.Booster}
#' @param text \code{character} vector previously generated by the \code{xgb.dump} 
#'          function  (where parameter \code{with_stats = TRUE} should have been set).
#'          \code{text} takes precedence over \code{model}.
#' @param trees an integer vector of tree indices that should be parsed.
#'          If set to \code{NULL}, all trees of the model are parsed.
#'          It could be useful, e.g., in multiclass classification to get only
#'          the trees of one certain class. IMPORTANT: the tree index in xgboost models
#'          is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).
#' @param use_int_id a logical flag indicating whether nodes in columns "Yes", "No", "Missing" should be
#'          represented as integers (when FALSE) or as "Tree-Node" character strings (when FALSE).
#' @param ... currently not used.
#'
#' @return 
#' A \code{data.table} with detailed information about model trees' nodes.
#'
#' The columns of the \code{data.table} are:
#' 
#' \itemize{
#'  \item \code{Tree}: integer ID of a tree in a model (zero-based index)

xgboost/R-package/R/xgb.plot.tree.R  view on Meta::CPAN

#' Read a tree model text dump and plot the model. 
#' 
#' @param feature_names names of each feature as a \code{character} vector.
#' @param model produced by the \code{xgb.train} function.
#' @param trees an integer vector of tree indices that should be visualized.
#'          If set to \code{NULL}, all trees of the model are included.
#'          IMPORTANT: the tree index in xgboost model is zero-based
#'          (e.g., use \code{trees = 0:2} for the first 3 trees in a model).
#' @param plot_width  the width of the diagram in pixels.
#' @param plot_height	the height of the diagram in pixels.
#' @param render a logical flag for whether the graph should be rendered (see Value).
#' @param show_node_id a logical flag for whether to show node id's in the graph.
#' @param ... currently not used.
#'
#' @details 
#' 
#' The content of each node is organised that way:
#' 
#' \itemize{
#'  \item Feature name.
#'  \item \code{Cover}: The sum of second order gradient of training data classified to the leaf.
#'        If it is square loss, this simply corresponds to the number of instances seen by a split

xgboost/R-package/man/cb.cv.predict.Rd  view on Meta::CPAN

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/callbacks.R
\name{cb.cv.predict}
\alias{cb.cv.predict}
\title{Callback closure for returning cross-validation based predictions.}
\usage{
cb.cv.predict(save_models = FALSE)
}
\arguments{
\item{save_models}{a flag for whether to save the folds' models.}
}
\value{
Predictions are returned inside of the \code{pred} element, which is either a vector or a matrix,
depending on the number of prediction outputs per data row. The order of predictions corresponds 
to the order of rows in the original dataset. Note that when a custom \code{folds} list is 
provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a 
non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be 
meaningful when user-profided folds have overlapping indices as in, e.g., random sampling splits.
When some of the indices in the training dataset are not included into user-provided \code{folds},
their prediction value would be \code{NA}.

xgboost/R-package/man/cb.early.stop.Rd  view on Meta::CPAN

\code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
All dash '-' characters in metric names are considered equivalent to '_'.}

\item{verbose}{whether to print the early stopping information.}
}
\description{
Callback closure to activate the early stopping.
}
\details{
This callback function determines the condition for early stopping 
by setting the \code{stop_condition = TRUE} flag in its calling frame.

The following additional fields are assigned to the model's R object:
\itemize{
\item \code{best_score} the evaluation score at the best iteration
\item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
\item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
     It differs from \code{best_iteration} in multiclass or random forest settings.
}

The Same values are also stored as xgb-attributes:

xgboost/R-package/man/xgb.Booster.complete.Rd  view on Meta::CPAN

% Please edit documentation in R/xgb.Booster.R
\name{xgb.Booster.complete}
\alias{xgb.Booster.complete}
\title{Restore missing parts of an incomplete xgb.Booster object.}
\usage{
xgb.Booster.complete(object, saveraw = TRUE)
}
\arguments{
\item{object}{object of class \code{xgb.Booster}}

\item{saveraw}{a flag indicating whether to append \code{raw} Booster memory dump data 
when it doesn't already exist.}
}
\value{
An object of \code{xgb.Booster} class.
}
\description{
It attempts to complete an \code{xgb.Booster} object by restoring either its missing 
raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
or its missing internal handle (when its \code{xgb.Booster.handle} is not valid 
but it has a raw Booster memory dump).

xgboost/R-package/man/xgb.model.dt.tree.Rd  view on Meta::CPAN

\item{text}{\code{character} vector previously generated by the \code{xgb.dump} 
function  (where parameter \code{with_stats = TRUE} should have been set).
\code{text} takes precedence over \code{model}.}

\item{trees}{an integer vector of tree indices that should be parsed.
If set to \code{NULL}, all trees of the model are parsed.
It could be useful, e.g., in multiclass classification to get only
the trees of one certain class. IMPORTANT: the tree index in xgboost models
is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).}

\item{use_int_id}{a logical flag indicating whether nodes in columns "Yes", "No", "Missing" should be
represented as integers (when FALSE) or as "Tree-Node" character strings (when FALSE).}

\item{...}{currently not used.}
}
\value{
A \code{data.table} with detailed information about model trees' nodes.

The columns of the \code{data.table} are:

\itemize{

xgboost/R-package/man/xgb.plot.tree.Rd  view on Meta::CPAN


\item{trees}{an integer vector of tree indices that should be visualized.
If set to \code{NULL}, all trees of the model are included.
IMPORTANT: the tree index in xgboost model is zero-based
(e.g., use \code{trees = 0:2} for the first 3 trees in a model).}

\item{plot_width}{the width of the diagram in pixels.}

\item{plot_height}{the height of the diagram in pixels.}

\item{render}{a logical flag for whether the graph should be rendered (see Value).}

\item{show_node_id}{a logical flag for whether to show node id's in the graph.}

\item{...}{currently not used.}
}
\value{
When \code{render = TRUE}:
returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
Similar to ggplot objects, it needs to be printed to see it when not running from command line.

When \code{render = FALSE}:
silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.

xgboost/R-package/tests/testthat/test_basic.R  view on Meta::CPAN


context("basic functions")

data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
set.seed(1994)

# disable some tests for Win32
windows_flag = .Platform$OS.type == "windows" &&
               .Machine$sizeof.pointer != 8

test_that("train and predict binary classification", {
  nrounds = 2
  expect_output(
    bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
                  eta = 1, nthread = 2, nrounds = nrounds, objective = "binary:logistic")
  , "train-error")
  expect_equal(class(bst), "xgb.Booster")
  expect_equal(bst$niter, nrounds)

xgboost/R-package/tests/testthat/test_basic.R  view on Meta::CPAN

  param <- list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2)

  # for the reference, use 4 iterations at once:
  set.seed(11)
  bst <- xgb.train(param, dtrain, nrounds = 4, watchlist, verbose = 0)
  # first two iterations:
  set.seed(11)
  bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
  # continue for two more:
  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
  if (!windows_flag)
    expect_equal(bst$raw, bst2$raw)
  expect_false(is.null(bst2$evaluation_log))
  expect_equal(dim(bst2$evaluation_log), c(4, 2))
  expect_equal(bst2$evaluation_log, bst$evaluation_log)
  # test continuing from raw model data
  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw)
  if (!windows_flag)
    expect_equal(bst$raw, bst2$raw)
  expect_equal(dim(bst2$evaluation_log), c(2, 2))
  # test continuing from a model in file
  xgb.save(bst1, "xgboost.model")
  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.model")
  if (!windows_flag)
    expect_equal(bst$raw, bst2$raw)
  expect_equal(dim(bst2$evaluation_log), c(2, 2))
})


test_that("xgb.cv works", {
  set.seed(11)
  expect_output(
    cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5,
                 eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",

xgboost/cmake/Utils.cmake  view on Meta::CPAN

# Set a default build type to release if none was specified
function(set_default_configuration_release)
    if(CMAKE_CONFIGURATION_TYPES STREQUAL "Debug;Release;MinSizeRel;RelWithDebInfo") # multiconfig generator?
        set(CMAKE_CONFIGURATION_TYPES Release CACHE STRING "" FORCE) 
	elseif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
	  message(STATUS "Setting build type to 'Release' as none was specified.")
	  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE )
	endif()
endfunction(set_default_configuration_release)

function(format_gencode_flags flags out)
  foreach(ver ${flags})
    set(${out} "${${out}}-gencode arch=compute_${ver},code=sm_${ver};")
  endforeach()
  set(${out} "${${out}}" PARENT_SCOPE)
endfunction(format_gencode_flags flags)

xgboost/cub/CHANGE_LOG.TXT  view on Meta::CPAN

              aliasing collective storage to shared memory that has been 
              allocated dynamically by the host at the kernel call site.   
            - Most CUB programs having typical 1D usage should not require any 
              changes to accomodate these updates.
        - Added new "combination" WarpScan methods for efficiently computing 
          both inclusive and exclusive prefix scans (and sums).
    - Bug fixes: 
        - Fixed bug in cub::WarpScan (which affected cub::BlockScan and 
          cub::DeviceScan) where incorrect results (e.g., NAN) would often be 
          returned when parameterized for floating-point types (fp32, fp64).
        - Workaround-fix for ptxas error when compiling with with -G flag on Linux 
          (for debug instrumentation) 
        - Misc. workaround-fixes for certain scan scenarios (using custom 
          scan operators) where code compiled for SM1x is run on newer 
          GPUs of higher compute-capability: the compiler could not tell
          which memory space was being used collective operations and was 
          mistakenly using global ops instead of shared ops. 

//-----------------------------------------------------------------------------

1.2.3    04/01/2014

xgboost/cub/CHANGE_LOG.TXT  view on Meta::CPAN

        - Fixed BlockScan bug where certain exclusive scans on custom data types for the BLOCK_SCAN_WARP_SCANS variant would return incorrect results for the first thread in the block
        - Added workaround for TexRefInputIteratorTto work with CUDA 6.0
    
//-----------------------------------------------------------------------------

1.1.1    12/11/2013
    - New features:
        - Added TexObjInputIteratorT, TexRefInputIteratorT, CacheModifiedInputIteratorT, and CacheModifiedOutputIterator types for loading & storing arbitrary types through the cache hierarchy.  Compatible with Thrust API. 
        - Added descending sorting to DeviceRadixSort and BlockRadixSort
        - Added min, max, arg-min, and arg-max to DeviceReduce
        - Added DeviceSelect (select-unique, select-if, and select-flagged)
        - Added DevicePartition (partition-if, partition-flagged)
        - Added generic cub::ShuffleUp(), cub::ShuffleDown(), and cub::ShuffleIndex() for warp-wide communication of arbitrary data types (SM3x+)
        - Added cub::MaxSmOccupancy() for accurately determining SM occupancy for any given kernel function pointer
    - Performance
        - Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
    - Interface changes:
        - Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces.  The CacheModifiedInputIteratorTand CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
        - Rename device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
    - Documentation and testing:
        - Added simple examples of device-wide methods
        - Improved doxygen documentation and example snippets

xgboost/cub/common.mk  view on Meta::CPAN

NVCC = "$(shell which nvcc)"
ifdef nvccver
    NVCC_VERSION = $(nvccver)
else
    NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' |  sed 's/,.*//'))
endif

# detect OS
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])

# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases 
NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\# 

ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
    # For MSVC
    # Enable more warnings and treat as errors
    NVCCFLAGS += -Xcompiler /W3 -Xcompiler /WX
    # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
    NVCCFLAGS += -Xcompiler /fp:strict
    # Help the compiler/linker work with huge numbers of kernels on Windows
	NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500

xgboost/cub/cub/agent/agent_histogram.cuh  view on Meta::CPAN

        bool        is_valid[PIXELS_PER_THREAD];

        // Load tile
        LoadTile(
            block_offset,
            valid_samples,
            samples,
            Int2Type<IS_FULL_TILE>(),
            Int2Type<IS_ALIGNED>());

        // Set valid flags
        #pragma unroll
        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);

        // Accumulate samples
#if CUB_PTX_ARCH >= 120
        if (prefer_smem)
            AccumulateSmemPixels(samples, is_valid);
        else
            AccumulateGmemPixels(samples, is_valid);

xgboost/cub/cub/agent/agent_reduce_by_key.cuh  view on Meta::CPAN

        reduction_op(reduction_op),
        scan_op(reduction_op)
    {}


    //---------------------------------------------------------------------
    // Scatter utility methods
    //---------------------------------------------------------------------

    /**
     * Directly scatter flagged items to output offsets
     */
    __device__ __forceinline__ void ScatterDirect(
        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
        OffsetT         (&segment_indices)[ITEMS_PER_THREAD])
    {
        // Scatter flagged keys and values
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            if (segment_flags[ITEM])
            {
                d_unique_out[segment_indices[ITEM]]     = scatter_items[ITEM].key;
                d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value;
            }
        }
    }


    /**
     * 2-phase scatter flagged items to output offsets
     *
     * The exclusive scan causes each head flag to be paired with the previous
     * value aggregate: the scatter offsets must be decremented for value aggregates
     */
    __device__ __forceinline__ void ScatterTwoPhase(
        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
        OffsetT         num_tile_segments,
        OffsetT         num_tile_segments_prefix)
    {
        CTA_SYNC();

        // Compact and scatter pairs
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            if (segment_flags[ITEM])
            {
                temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM];
            }
        }

        CTA_SYNC();

        for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
        {
            KeyValuePairT pair                                  = temp_storage.raw_exchange.Alias()[item];
            d_unique_out[num_tile_segments_prefix + item]       = pair.key;
            d_aggregates_out[num_tile_segments_prefix + item]   = pair.value;
        }
    }


    /**
     * Scatter flagged items
     */
    __device__ __forceinline__ void Scatter(
        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
        OffsetT         num_tile_segments,
        OffsetT         num_tile_segments_prefix)
    {
        // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one
        if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS))
        {
            ScatterTwoPhase(
                scatter_items,
                segment_flags,
                segment_indices,
                num_tile_segments,
                num_tile_segments_prefix);
        }
        else
        {
            ScatterDirect(
                scatter_items,
                segment_flags,
                segment_indices);
        }
    }


    //---------------------------------------------------------------------
    // Cooperatively scan a device-wide sequence of tiles with other CTAs
    //---------------------------------------------------------------------

    /**

xgboost/cub/cub/agent/agent_reduce_by_key.cuh  view on Meta::CPAN

    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
    __device__ __forceinline__ void ConsumeTile(
        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
        int                 tile_idx,           ///< Tile index
        OffsetT             tile_offset,        ///< Tile offset
        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
    {
        KeyOutputT          keys[ITEMS_PER_THREAD];             // Tile keys
        KeyOutputT          prev_keys[ITEMS_PER_THREAD];        // Tile keys shuffled up
        ValueOutputT        values[ITEMS_PER_THREAD];           // Tile values
        OffsetT             head_flags[ITEMS_PER_THREAD];       // Segment head flags
        OffsetT             segment_indices[ITEMS_PER_THREAD];  // Segment indices
        OffsetValuePairT    scan_items[ITEMS_PER_THREAD];       // Zipped values and segment flags|indices
        KeyValuePairT       scatter_items[ITEMS_PER_THREAD];    // Zipped key value pairs for scattering

        // Load keys
        if (IS_LAST_TILE)
            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining);
        else
            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);

        // Load tile predecessor key in first thread
        KeyOutputT tile_predecessor;
        if (threadIdx.x == 0)
        {
            tile_predecessor = (tile_idx == 0) ?
                keys[0] :                       // First tile gets repeat of first item (thus first item will not be flagged as a head)
                d_keys_in[tile_offset - 1];     // Subsequent tiles get last key from previous tile
        }

        CTA_SYNC();

        // Load values
        if (IS_LAST_TILE)
            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining);
        else
            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values);

        CTA_SYNC();

        // Initialize head-flags and shuffle up the previous keys
        if (IS_LAST_TILE)
        {
            // Use custom flag operator to additionally flag the first out-of-bounds item
            GuardedInequalityWrapper<EqualityOpT> flag_op(equality_op, num_remaining);
            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
                head_flags, keys, prev_keys, flag_op, tile_predecessor);
        }
        else
        {
            InequalityWrapper<EqualityOpT> flag_op(equality_op);
            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
                head_flags, keys, prev_keys, flag_op, tile_predecessor);
        }

        // Zip values and head flags
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            scan_items[ITEM].value  = values[ITEM];
            scan_items[ITEM].key    = head_flags[ITEM];
        }

        // Perform exclusive tile scan
        OffsetValuePairT    block_aggregate;        // Inclusive block-wide scan aggregate
        OffsetT             num_segments_prefix;    // Number of segments prior to this tile
        ValueOutputT        total_aggregate;        // The tile prefix folded with block_aggregate
        if (tile_idx == 0)
        {
            // Scan first tile
            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate);

xgboost/cub/cub/agent/agent_reduce_by_key.cuh  view on Meta::CPAN


        // Rezip scatter items and segment indices
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            scatter_items[ITEM].key     = prev_keys[ITEM];
            scatter_items[ITEM].value   = scan_items[ITEM].value;
            segment_indices[ITEM]       = scan_items[ITEM].key;
        }

        // At this point, each flagged segment head has:
        //  - The key for the previous segment
        //  - The reduced value from the previous segment
        //  - The segment index for the reduced value

        // Scatter flagged keys and values
        OffsetT num_tile_segments = block_aggregate.key;
        Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix);

        // Last thread in last tile will output final count (and last pair, if necessary)
        if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1))
        {
            OffsetT num_segments = num_segments_prefix + num_tile_segments;

            // If the last tile is a whole tile, output the final_value
            if (num_remaining == TILE_ITEMS)
            {
                d_unique_out[num_segments]      = keys[ITEMS_PER_THREAD - 1];

xgboost/cub/cub/agent/agent_rle.cuh  view on Meta::CPAN

        SYNC_AFTER_LOAD         = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),

        /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
        STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING,
        ACTIVE_EXCHANGE_WARPS   = (STORE_WARP_TIME_SLICING) ? 1 : WARPS,
    };


    /**
     * Special operator that signals all out-of-bounds items are not equal to everything else,
     * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked
     * trivial.
     */
    template <bool LAST_TILE>
    struct OobInequalityOp
    {
        OffsetT         num_remaining;
        EqualityOpT      equality_op;

        __device__ __forceinline__ OobInequalityOp(
            OffsetT     num_remaining,

xgboost/cub/cub/agent/agent_rle.cuh  view on Meta::CPAN

    // Per-thread fields
    //---------------------------------------------------------------------

    _TempStorage&                   temp_storage;       ///< Reference to temp_storage

    WrappedInputIteratorT           d_in;               ///< Pointer to input sequence of data items
    OffsetsOutputIteratorT          d_offsets_out;      ///< Input run offsets
    LengthsOutputIteratorT          d_lengths_out;      ///< Output run lengths

    EqualityOpT                     equality_op;        ///< T equality operator
    ReduceBySegmentOpT              scan_op;            ///< Reduce-length-by-flag scan operator
    OffsetT                         num_items;          ///< Total number of input items


    //---------------------------------------------------------------------
    // Constructor
    //---------------------------------------------------------------------

    // Constructor
    __device__ __forceinline__
    AgentRle(

xgboost/cub/cub/agent/agent_rle.cuh  view on Meta::CPAN

    // Utility methods for initializing the selections
    //---------------------------------------------------------------------

    template <bool FIRST_TILE, bool LAST_TILE>
    __device__ __forceinline__ void InitializeSelections(
        OffsetT             tile_offset,
        OffsetT             num_remaining,
        T                   (&items)[ITEMS_PER_THREAD],
        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
    {
        bool                head_flags[ITEMS_PER_THREAD];
        bool                tail_flags[ITEMS_PER_THREAD];

        OobInequalityOp<LAST_TILE> inequality_op(num_remaining, equality_op);

        if (FIRST_TILE && LAST_TILE)
        {
            // First-and-last-tile always head-flags the first item and tail-flags the last item

            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeadsAndTails(
                head_flags, tail_flags, items, inequality_op);
        }
        else if (FIRST_TILE)
        {
            // First-tile always head-flags the first item

            // Get the first item from the next tile
            T tile_successor_item;
            if (threadIdx.x == BLOCK_THREADS - 1)
                tile_successor_item = d_in[tile_offset + TILE_ITEMS];

            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeadsAndTails(
                head_flags, tail_flags, tile_successor_item, items, inequality_op);
        }
        else if (LAST_TILE)
        {
            // Last-tile always flags the last item

            // Get the last item from the previous tile
            T tile_predecessor_item;
            if (threadIdx.x == 0)
                tile_predecessor_item = d_in[tile_offset - 1];

            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeadsAndTails(
                head_flags, tile_predecessor_item, tail_flags, items, inequality_op);
        }
        else
        {
            // Get the first item from the next tile
            T tile_successor_item;
            if (threadIdx.x == BLOCK_THREADS - 1)
                tile_successor_item = d_in[tile_offset + TILE_ITEMS];

            // Get the last item from the previous tile
            T tile_predecessor_item;
            if (threadIdx.x == 0)
                tile_predecessor_item = d_in[tile_offset - 1];

            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeadsAndTails(
                head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op);
        }

        // Zip counts and runs
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            lengths_and_num_runs[ITEM].key   = head_flags[ITEM] && (!tail_flags[ITEM]);
            lengths_and_num_runs[ITEM].value    = ((!head_flags[ITEM]) || (!tail_flags[ITEM]));
        }
    }

    //---------------------------------------------------------------------
    // Scan utility methods
    //---------------------------------------------------------------------

    /**
     * Scan of allocations
     */

xgboost/cub/cub/agent/agent_rle.cuh  view on Meta::CPAN

            // Load items
            T items[ITEMS_PER_THREAD];
            if (LAST_TILE)
                BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining, T());
            else
                BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);

            if (SYNC_AFTER_LOAD)
                CTA_SYNC();

            // Set flags
            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];

            InitializeSelections<true, LAST_TILE>(
                tile_offset,
                num_remaining,
                items,
                lengths_and_num_runs);

            // Exclusive scan of lengths and runs
            LengthOffsetPair tile_aggregate;

xgboost/cub/cub/agent/agent_rle.cuh  view on Meta::CPAN

            // Load items
            T items[ITEMS_PER_THREAD];
            if (LAST_TILE)
                BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining, T());
            else
                BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);

            if (SYNC_AFTER_LOAD)
                CTA_SYNC();

            // Set flags
            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];

            InitializeSelections<false, LAST_TILE>(
                tile_offset,
                num_remaining,
                items,
                lengths_and_num_runs);

            // Exclusive scan of lengths and runs
            LengthOffsetPair tile_aggregate;

xgboost/cub/cub/agent/agent_segment_fixup.cuh  view on Meta::CPAN

        if (IS_LAST_TILE)
            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
        else
            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);

        CTA_SYNC();

        KeyValuePairT tile_aggregate;
        if (tile_idx == 0)
        {
            // Exclusive scan of values and segment_flags
            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate);

            // Update tile status if this is not the last tile
            if (threadIdx.x == 0)
            {
                // Set first segment id to not trigger a flush (invalid from exclusive scan)
                scatter_pairs[0].key = pairs[0].key;

                if (!IS_LAST_TILE)
                    tile_state.SetInclusive(0, tile_aggregate);

            }
        }
        else
        {
            // Exclusive scan of values and segment_flags
            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op);
            tile_aggregate = prefix_op.GetBlockAggregate();
        }

        // Scatter updated values
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            if (scatter_pairs[ITEM].key != pairs[ITEM].key)

xgboost/cub/cub/agent/agent_select_if.cuh  view on Meta::CPAN


/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/


/**
 * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection
 *
 * Performs functor-based selection if SelectOpT functor type != NullType
 * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
 * Otherwise performs discontinuity selection (keep unique)
 */
template <
    typename    AgentSelectIfPolicyT,           ///< Parameterized AgentSelectIfPolicy tuning policy type
    typename    InputIteratorT,                 ///< Random-access input iterator type for selection items
    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
    typename    SelectedOutputIteratorT,        ///< Random-access input iterator type for selection_flags items
    typename    SelectOpT,                      ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
    typename    OffsetT,                        ///< Signed integer type for global offsets
    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
struct AgentSelectIf
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    // The flag value type
    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;

    // Tile status descriptor interface type
    typedef ScanTileState<OffsetT> ScanTileStateT;

    // Constants
    enum
    {
        USE_SELECT_OP,
        USE_SELECT_FLAGS,

xgboost/cub/cub/agent/agent_select_if.cuh  view on Meta::CPAN

        WrappedFlagsInputIteratorT;

    // Parameterized BlockLoad type for input data
    typedef BlockLoad<
            OutputT,
            BLOCK_THREADS,
            ITEMS_PER_THREAD,
            AgentSelectIfPolicyT::LOAD_ALGORITHM>
        BlockLoadT;

    // Parameterized BlockLoad type for flags
    typedef BlockLoad<
            FlagT,
            BLOCK_THREADS,
            ITEMS_PER_THREAD,
            AgentSelectIfPolicyT::LOAD_ALGORITHM>
        BlockLoadFlags;

    // Parameterized BlockDiscontinuity type for items
    typedef BlockDiscontinuity<
            OutputT,

xgboost/cub/cub/agent/agent_select_if.cuh  view on Meta::CPAN

        {
            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
            typename BlockDiscontinuityT::TempStorage       discontinuity;  // Smem needed for discontinuity detection
        };

        // Smem needed for loading items
        typename BlockLoadT::TempStorage load_items;

        // Smem needed for loading values
        typename BlockLoadFlags::TempStorage load_flags;

        // Smem needed for compacting items (allows non POD items in this union)
        Uninitialized<ItemExchangeT> raw_exchange;
    };

    // Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
    WrappedInputIteratorT           d_in;               ///< Input items
    SelectedOutputIteratorT         d_selected_out;     ///< Unique output items
    WrappedFlagsInputIteratorT      d_flags_in;         ///< Input selection flags (if applicable)
    InequalityWrapper<EqualityOpT>  inequality_op;      ///< T inequality operator
    SelectOpT                       select_op;          ///< Selection operator
    OffsetT                         num_items;          ///< Total number of input items


    //---------------------------------------------------------------------
    // Constructor
    //---------------------------------------------------------------------

    // Constructor
    __device__ __forceinline__
    AgentSelectIf(
        TempStorage                 &temp_storage,      ///< Reference to temp_storage
        InputIteratorT              d_in,               ///< Input data
        FlagsInputIteratorT         d_flags_in,         ///< Input selection flags (if applicable)
        SelectedOutputIteratorT     d_selected_out,     ///< Output data
        SelectOpT                   select_op,          ///< Selection operator
        EqualityOpT                 equality_op,        ///< Equality operator
        OffsetT                     num_items)          ///< Total number of input items
    :
        temp_storage(temp_storage.Alias()),
        d_in(d_in),
        d_flags_in(d_flags_in),
        d_selected_out(d_selected_out),
        select_op(select_op),
        inequality_op(equality_op),
        num_items(num_items)
    {}


    //---------------------------------------------------------------------
    // Utility methods for initializing the selections
    //---------------------------------------------------------------------

    /**
     * Initialize selections (specialized for selection operator)
     */
    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
    __device__ __forceinline__ void InitializeSelections(
        OffsetT                     /*tile_offset*/,
        OffsetT                     num_tile_items,
        OutputT                     (&items)[ITEMS_PER_THREAD],
        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
        Int2Type<USE_SELECT_OP>     /*select_method*/)
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            // Out-of-bounds items are selection_flags
            selection_flags[ITEM] = 1;

            if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
                selection_flags[ITEM] = select_op(items[ITEM]);
        }
    }


    /**
     * Initialize selections (specialized for valid flags)
     */
    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
    __device__ __forceinline__ void InitializeSelections(
        OffsetT                     tile_offset,
        OffsetT                     num_tile_items,
        OutputT                     (&/*items*/)[ITEMS_PER_THREAD],
        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
        Int2Type<USE_SELECT_FLAGS>  /*select_method*/)
    {
        CTA_SYNC();

        FlagT flags[ITEMS_PER_THREAD];

        if (IS_LAST_TILE)
        {
            // Out-of-bounds items are selection_flags
            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1);
        }
        else
        {
            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags);
        }

        // Convert flag type to selection_flags type
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            selection_flags[ITEM] = flags[ITEM];
        }
    }


    /**
     * Initialize selections (specialized for discontinuity detection)
     */
    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
    __device__ __forceinline__ void InitializeSelections(
        OffsetT                     tile_offset,
        OffsetT                     num_tile_items,
        OutputT                     (&items)[ITEMS_PER_THREAD],
        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
        Int2Type<USE_DISCONTINUITY> /*select_method*/)
    {
        if (IS_FIRST_TILE)
        {
            CTA_SYNC();

            // Set head selection_flags.  First tile sets the first flag for the first item
            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op);
        }
        else
        {
            OutputT tile_predecessor;
            if (threadIdx.x == 0)
                tile_predecessor = d_in[tile_offset - 1];

            CTA_SYNC();

            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor);
        }

        // Set selection flags for out-of-bounds items
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            // Set selection_flags for out-of-bounds items
            if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
                selection_flags[ITEM] = 1;
        }
    }


    //---------------------------------------------------------------------
    // Scatter utility methods
    //---------------------------------------------------------------------

    /**
     * Scatter flagged items to output offsets (specialized for direct scattering)
     */
    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
    __device__ __forceinline__ void ScatterDirect(
        OutputT (&items)[ITEMS_PER_THREAD],
        OffsetT (&selection_flags)[ITEMS_PER_THREAD],
        OffsetT (&selection_indices)[ITEMS_PER_THREAD],
        OffsetT num_selections)
    {
        // Scatter flagged items
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            if (selection_flags[ITEM])
            {
                if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections)
                {
                    d_selected_out[selection_indices[ITEM]] = items[ITEM];
                }
            }
        }
    }


    /**
     * Scatter flagged items to output offsets (specialized for two-phase scattering)
     */
    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
    __device__ __forceinline__ void ScatterTwoPhase(
        OutputT         (&items)[ITEMS_PER_THREAD],
        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
        int             /*num_tile_items*/,                         ///< Number of valid items in this tile
        int             num_tile_selections,                        ///< Number of selections in this tile
        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
        OffsetT         /*num_rejected_prefix*/,                    ///< Total number of rejections prior to this tile
        Int2Type<false> /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
    {
        CTA_SYNC();

        // Compact and scatter items
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix;
            if (selection_flags[ITEM])
            {
                temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
            }
        }

        CTA_SYNC();

        for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
        {
            d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
        }
    }


    /**
     * Scatter flagged items to output offsets (specialized for two-phase scattering)
     */
    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
    __device__ __forceinline__ void ScatterTwoPhase(
        OutputT         (&items)[ITEMS_PER_THREAD],
        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
        int             num_tile_items,                             ///< Number of valid items in this tile
        int             num_tile_selections,                        ///< Number of selections in this tile
        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
        Int2Type<true>  /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
    {
        CTA_SYNC();

        int tile_num_rejections = num_tile_items - num_tile_selections;

        // Scatter items to shared memory (rejections first)
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            int item_idx                = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
            int local_selection_idx     = selection_indices[ITEM] - num_selections_prefix;
            int local_rejection_idx     = item_idx - local_selection_idx;
            int local_scatter_offset    = (selection_flags[ITEM]) ?
                                            tile_num_rejections + local_selection_idx :
                                            local_rejection_idx;

            temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
        }

        CTA_SYNC();

        // Gather items from shared memory and scatter to global
        #pragma unroll

xgboost/cub/cub/agent/agent_select_if.cuh  view on Meta::CPAN


            if (!IS_LAST_TILE || (item_idx < num_tile_items))
            {
                d_selected_out[scatter_offset] = item;
            }
        }
    }


    /**
     * Scatter flagged items
     */
    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
    __device__ __forceinline__ void Scatter(
        OutputT         (&items)[ITEMS_PER_THREAD],
        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
        int             num_tile_items,                             ///< Number of valid items in this tile
        int             num_tile_selections,                        ///< Number of selections in this tile
        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
        OffsetT         num_selections)                             ///< Total number of selections including this tile
    {
        // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one
        if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS)))
        {
            ScatterTwoPhase<IS_LAST_TILE, IS_FIRST_TILE>(
                items,
                selection_flags,
                selection_indices,
                num_tile_items,
                num_tile_selections,
                num_selections_prefix,
                num_rejected_prefix,
                Int2Type<KEEP_REJECTS>());
        }
        else
        {
            ScatterDirect<IS_LAST_TILE, IS_FIRST_TILE>(
                items,
                selection_flags,
                selection_indices,
                num_selections);
        }
    }

    //---------------------------------------------------------------------
    // Cooperatively scan a device-wide sequence of tiles with other CTAs
    //---------------------------------------------------------------------


    /**
     * Process first tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
     */
    template <bool IS_LAST_TILE>
    __device__ __forceinline__ OffsetT ConsumeFirstTile(
        int                 num_tile_items,      ///< Number of input items comprising this tile
        OffsetT             tile_offset,        ///< Tile offset
        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
    {
        OutputT     items[ITEMS_PER_THREAD];
        OffsetT     selection_flags[ITEMS_PER_THREAD];
        OffsetT     selection_indices[ITEMS_PER_THREAD];

        // Load items
        if (IS_LAST_TILE)
            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
        else
            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);

        // Initialize selection_flags
        InitializeSelections<true, IS_LAST_TILE>(
            tile_offset,
            num_tile_items,
            items,
            selection_flags,
            Int2Type<SELECT_METHOD>());

        CTA_SYNC();

        // Exclusive scan of selection_flags
        OffsetT num_tile_selections;
        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections);

        if (threadIdx.x == 0)
        {
            // Update tile status if this is not the last tile
            if (!IS_LAST_TILE)
                tile_state.SetInclusive(0, num_tile_selections);
        }

        // Discount any out-of-bounds selections
        if (IS_LAST_TILE)
            num_tile_selections -= (TILE_ITEMS - num_tile_items);

        // Scatter flagged items
        Scatter<IS_LAST_TILE, true>(
            items,
            selection_flags,
            selection_indices,
            num_tile_items,
            num_tile_selections,
            0,
            0,
            num_tile_selections);

        return num_tile_selections;
    }

xgboost/cub/cub/agent/agent_select_if.cuh  view on Meta::CPAN

     * Process subsequent tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
     */
    template <bool IS_LAST_TILE>
    __device__ __forceinline__ OffsetT ConsumeSubsequentTile(
        int                 num_tile_items,      ///< Number of input items comprising this tile
        int                 tile_idx,           ///< Tile index
        OffsetT             tile_offset,        ///< Tile offset
        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
    {
        OutputT     items[ITEMS_PER_THREAD];
        OffsetT     selection_flags[ITEMS_PER_THREAD];
        OffsetT     selection_indices[ITEMS_PER_THREAD];

        // Load items
        if (IS_LAST_TILE)
            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
        else
            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);

        // Initialize selection_flags
        InitializeSelections<false, IS_LAST_TILE>(
            tile_offset,
            num_tile_items,
            items,
            selection_flags,
            Int2Type<SELECT_METHOD>());

        CTA_SYNC();

        // Exclusive scan of values and selection_flags
        TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx);
        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op);

        OffsetT num_tile_selections     = prefix_op.GetBlockAggregate();
        OffsetT num_selections          = prefix_op.GetInclusivePrefix();
        OffsetT num_selections_prefix   = prefix_op.GetExclusivePrefix();
        OffsetT num_rejected_prefix     = (tile_idx * TILE_ITEMS) - num_selections_prefix;

        // Discount any out-of-bounds selections
        if (IS_LAST_TILE)
        {
            int num_discount    = TILE_ITEMS - num_tile_items;
            num_selections      -= num_discount;
            num_tile_selections -= num_discount;
        }

        // Scatter flagged items
        Scatter<IS_LAST_TILE, false>(
            items,
            selection_flags,
            selection_indices,
            num_tile_items,
            num_tile_selections,
            num_selections_prefix,
            num_rejected_prefix,
            num_selections);

        return num_selections;
    }

xgboost/cub/cub/agent/agent_select_if.cuh  view on Meta::CPAN

            num_selections = ConsumeSubsequentTile<IS_LAST_TILE>(num_tile_items, tile_idx, tile_offset, tile_state);
        }

        return num_selections;
    }


    /**
     * Scan tiles of items as part of a dynamic chained scan
     */
    template <typename NumSelectedIteratorT>        ///< Output iterator type for recording number of items selection_flags
    __device__ __forceinline__ void ConsumeRange(
        int                     num_tiles,          ///< Total number of input tiles
        ScanTileStateT&         tile_state,         ///< Global tile state descriptor
        NumSelectedIteratorT    d_num_selected_out) ///< Output total number selection_flags
    {
        // Blocks are launched in increasing order, so just assign one tile per block
        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                    // Global offset for the current tile

        if (tile_idx < num_tiles - 1)
        {
            // Not the last tile (full)
            ConsumeTile<false>(TILE_ITEMS, tile_idx, tile_offset, tile_state);
        }
        else
        {
            // The last tile (possibly partially-full)
            OffsetT num_remaining   = num_items - tile_offset;
            OffsetT num_selections  = ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);

            if (threadIdx.x == 0)
            {
                // Output the total number of items selection_flags
                *d_num_selected_out = num_selections;
            }
        }
    }

};



}               // CUB namespace

xgboost/cub/cub/agent/agent_spmv_row_based.cuh  view on Meta::CPAN

            row_start = temp_storage.nonzeros[local_row_nonzero_idx];
            temp_storage.nonzeros[local_row_nonzero_idx] = NAN_TOKEN;
        }

        CTA_SYNC();

        //
        // Segmented scan
        //

        // Read strip of nonzeros into thread-blocked order, setup segment flags
        KeyValuePairT scan_items[NNZ_PER_THREAD];
        for (int ITEM = 0; ITEM < NNZ_PER_THREAD; ++ITEM)
        {
            int     local_nonzero_idx   = (threadIdx.x * NNZ_PER_THREAD) + ITEM;
            ValueT  value               = temp_storage.nonzeros[local_nonzero_idx];
            bool    is_nan              = (value != value);

            scan_items[ITEM].value  = (is_nan) ? 0.0 : value;
            scan_items[ITEM].key    = is_nan;
        }

xgboost/cub/cub/agent/single_pass_scan_operators.cuh  view on Meta::CPAN

        int         predecessor_idx,        ///< Preceding tile index to inspect
        StatusWord  &predecessor_status,    ///< [out] Preceding tile status
        T           &window_aggregate)      ///< [out] Relevant partial reduction from this window of preceding tiles
    {
        T value;
        tile_status.WaitForValid(predecessor_idx, predecessor_status, value);

        // Perform a segmented reduction to get the prefix for the current window.
        // Use the swizzled scan operator because we are now scanning *down* towards thread0.

        int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
        window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce(
            value,
            tail_flag,
            SwizzleScanOp<ScanOpT>(scan_op));
    }


    // BlockScan prefix callback functor (called by the first warp)
    __device__ __forceinline__
    T operator()(T block_aggregate)
    {

        // Update our status with our tile-aggregate

xgboost/cub/cub/block/block_adjacent_difference.cuh  view on Meta::CPAN

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
 */

#pragma once

#include "../util_type.cuh"
#include "../util_ptx.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

xgboost/cub/cub/block/block_adjacent_difference.cuh  view on Meta::CPAN

    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /// Specialization for when FlagOp has third index param
    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
    struct ApplyOp
    {
        // Apply flag operator
        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
        {
            return flag_op(b, a, idx);
        }
    };

    /// Specialization for when FlagOp does not have a third index param
    template <typename FlagOp>
    struct ApplyOp<FlagOp, false>
    {
        // Apply flag operator
        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
        {
            return flag_op(b, a);
        }
    };

    /// Templated unrolling of item comparison (inductive case)
    template <int ITERATION, int MAX_ITERATIONS>
    struct Iterate
    {
        // Head flags
        template <
            int             ITEMS_PER_THREAD,
            typename        FlagT,
            typename        FlagOp>
        static __device__ __forceinline__ void FlagHeads(
            int                     linear_tid,
            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
        {
            preds[ITERATION] = input[ITERATION - 1];

            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
                flag_op,
                preds[ITERATION],
                input[ITERATION],
                (linear_tid * ITEMS_PER_THREAD) + ITERATION);

            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
        }

        // Tail flags
        template <
            int             ITEMS_PER_THREAD,
            typename        FlagT,
            typename        FlagOp>
        static __device__ __forceinline__ void FlagTails(
            int                     linear_tid,
            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
        {
            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
                flag_op,
                input[ITERATION],
                input[ITERATION + 1],
                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);

            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
        }

    };

    /// Templated unrolling of item comparison (termination case)
    template <int MAX_ITERATIONS>
    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
    {
        // Head flags
        template <
            int             ITEMS_PER_THREAD,
            typename        FlagT,
            typename        FlagOp>
        static __device__ __forceinline__ void FlagHeads(
            int                     /*linear_tid*/,
            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
        {}

        // Tail flags
        template <
            int             ITEMS_PER_THREAD,
            typename        FlagT,
            typename        FlagOp>
        static __device__ __forceinline__ void FlagTails(
            int                     /*linear_tid*/,
            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
        {}
    };


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Shared storage reference
    _TempStorage &temp_storage;

xgboost/cub/cub/block/block_adjacent_difference.cuh  view on Meta::CPAN

    __device__ __forceinline__ BlockAdjacentDifference(
        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    //@}  end member group
    /******************************************************************//**
     * \name Head flag operations
     *********************************************************************/
    //@{


#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeads(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        // Share last item
        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        if (linear_tid == 0)
        {
            // Set flag for first thread-item (preds[0] is undefined)
            head_flags[0] = 1;
        }
        else
        {
            preds[0] = temp_storage.last_items[linear_tid - 1];
            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
        }

        // Set head_flags for remaining items
        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
    }

    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeads(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
    {
        // Share last item
        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        // Set flag for first thread-item
        preds[0] = (linear_tid == 0) ?
            tile_predecessor_item :              // First thread
            temp_storage.last_items[linear_tid - 1];

        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);

        // Set head_flags for remaining items
        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
    }

#endif // DOXYGEN_SHOULD_SKIP_THIS


    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeads(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        T preds[ITEMS_PER_THREAD];
        FlagHeads(head_flags, input, preds, flag_op);
    }


    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeads(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
    {
        T preds[ITEMS_PER_THREAD];
        FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
    }



    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagTails(
        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        // Share first item
        temp_storage.first_items[linear_tid] = input[0];

        CTA_SYNC();

        // Set flag for last thread-item
        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
            1 :                             // Last thread
            ApplyOp<FlagOp>::FlagT(
                flag_op,
                input[ITEMS_PER_THREAD - 1],
                temp_storage.first_items[linear_tid + 1],
                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);

        // Set tail_flags for remaining items
        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
    }


    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagTails(
        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
        T               tile_successor_item)                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread...
    {
        // Share first item
        temp_storage.first_items[linear_tid] = input[0];

        CTA_SYNC();

        // Set flag for last thread-item
        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
            tile_successor_item :              // Last thread
            temp_storage.first_items[linear_tid + 1];

        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
            flag_op,
            input[ITEMS_PER_THREAD - 1],
            successor_item,
            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);

        // Set tail_flags for remaining items
        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
    }


    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeadsAndTails(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        // Share first and last items
        temp_storage.first_items[linear_tid] = input[0];
        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        T preds[ITEMS_PER_THREAD];

        // Set flag for first thread-item
        preds[0] = temp_storage.last_items[linear_tid - 1];
        if (linear_tid == 0)
        {
            head_flags[0] = 1;
        }
        else
        {
            head_flags[0] = ApplyOp<FlagOp>::FlagT(
                flag_op,
                preds[0],
                input[0],
                linear_tid * ITEMS_PER_THREAD);
        }


        // Set flag for last thread-item
        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
            1 :                             // Last thread
            ApplyOp<FlagOp>::FlagT(
                flag_op,
                input[ITEMS_PER_THREAD - 1],
                temp_storage.first_items[linear_tid + 1],
                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);

        // Set head_flags for remaining items
        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);

        // Set tail_flags for remaining items
        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
    }


    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeadsAndTails(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread...
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        // Share first and last items
        temp_storage.first_items[linear_tid] = input[0];
        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        T preds[ITEMS_PER_THREAD];

        // Set flag for first thread-item
        if (linear_tid == 0)
        {
            head_flags[0] = 1;
        }
        else
        {
            preds[0] = temp_storage.last_items[linear_tid - 1];
            head_flags[0] = ApplyOp<FlagOp>::FlagT(
                flag_op,
                preds[0],
                input[0],
                linear_tid * ITEMS_PER_THREAD);
        }

        // Set flag for last thread-item
        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
            tile_successor_item :              // Last thread
            temp_storage.first_items[linear_tid + 1];

        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
            flag_op,
            input[ITEMS_PER_THREAD - 1],
            successor_item,
            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);

        // Set head_flags for remaining items
        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);

        // Set tail_flags for remaining items
        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
    }

    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeadsAndTails(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        // Share first and last items
        temp_storage.first_items[linear_tid] = input[0];
        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        T preds[ITEMS_PER_THREAD];

        // Set flag for first thread-item
        preds[0] = (linear_tid == 0) ?
            tile_predecessor_item :              // First thread
            temp_storage.last_items[linear_tid - 1];

        head_flags[0] = ApplyOp<FlagOp>::FlagT(
            flag_op,
            preds[0],
            input[0],
            linear_tid * ITEMS_PER_THREAD);

        // Set flag for last thread-item
        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
            1 :                             // Last thread
            ApplyOp<FlagOp>::FlagT(
                flag_op,
                input[ITEMS_PER_THREAD - 1],
                temp_storage.first_items[linear_tid + 1],
                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);

        // Set head_flags for remaining items
        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);

        // Set tail_flags for remaining items
        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
    }


    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeadsAndTails(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread...
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        // Share first and last items
        temp_storage.first_items[linear_tid] = input[0];
        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        T preds[ITEMS_PER_THREAD];

        // Set flag for first thread-item
        preds[0] = (linear_tid == 0) ?
            tile_predecessor_item :              // First thread
            temp_storage.last_items[linear_tid - 1];

        head_flags[0] = ApplyOp<FlagOp>::FlagT(
            flag_op,
            preds[0],
            input[0],
            linear_tid * ITEMS_PER_THREAD);

        // Set flag for last thread-item
        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
            tile_successor_item :              // Last thread
            temp_storage.first_items[linear_tid + 1];

        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
            flag_op,
            input[ITEMS_PER_THREAD - 1],
            successor_item,
            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);

        // Set head_flags for remaining items
        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);

        // Set tail_flags for remaining items
        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
    }



};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
 */

#pragma once

#include "../util_type.cuh"
#include "../util_ptx.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \brief The BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png)
 * \ingroup BlockModule
 *
 * \tparam T                The data type to be flagged.
 * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
 * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
 * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
 * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
 *
 * \par Overview
 * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
 *   that differ from their predecessors (or successors).  For example, head flags are convenient
 *   for demarcating disjoint data segments as part of a segmented scan or reduction.
 * - \blocked
 *
 * \par Performance Considerations
 * - \granularity
 *
 * \par A Simple Example
 * \blockcollective{BlockDiscontinuity}
 * \par
 * The code snippet below illustrates the head flagging of 512 integer items that
 * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
 * where each thread owns 4 consecutive items.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
 *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
 *
 *     // Allocate shared memory for BlockDiscontinuity
 *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
 *
 *     // Obtain a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     ...
 *
 *     // Collectively compute head flags for discontinuities in the segment
 *     int head_flags[4];
 *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
 *
 * \endcode
 * \par
 * Suppose the set of input \p thread_data across the block of threads is
 * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
 * The corresponding output \p head_flags in those threads will be
 * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
 *
 * \par Performance Considerations
 * - Incurs zero bank conflicts for most types
 *
 */
template <
    typename    T,
    int         BLOCK_DIM_X,
    int         BLOCK_DIM_Y     = 1,

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /// Specialization for when FlagOp has third index param
    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
    struct ApplyOp
    {
        // Apply flag operator
        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
        {
            return flag_op(a, b, idx);
        }
    };

    /// Specialization for when FlagOp does not have a third index param
    template <typename FlagOp>
    struct ApplyOp<FlagOp, false>
    {
        // Apply flag operator
        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
        {
            return flag_op(a, b);
        }
    };

    /// Templated unrolling of item comparison (inductive case)
    template <int ITERATION, int MAX_ITERATIONS>
    struct Iterate
    {
        // Head flags
        template <
            int             ITEMS_PER_THREAD,
            typename        FlagT,
            typename        FlagOp>
        static __device__ __forceinline__ void FlagHeads(
            int                     linear_tid,
            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
        {
            preds[ITERATION] = input[ITERATION - 1];

            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
                flag_op,
                preds[ITERATION],
                input[ITERATION],
                (linear_tid * ITEMS_PER_THREAD) + ITERATION);

            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
        }

        // Tail flags
        template <
            int             ITEMS_PER_THREAD,
            typename        FlagT,
            typename        FlagOp>
        static __device__ __forceinline__ void FlagTails(
            int                     linear_tid,
            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
        {
            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
                flag_op,
                input[ITERATION],
                input[ITERATION + 1],
                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);

            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
        }

    };

    /// Templated unrolling of item comparison (termination case)
    template <int MAX_ITERATIONS>
    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
    {
        // Head flags
        template <
            int             ITEMS_PER_THREAD,
            typename        FlagT,
            typename        FlagOp>
        static __device__ __forceinline__ void FlagHeads(
            int                     /*linear_tid*/,
            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
        {}

        // Tail flags
        template <
            int             ITEMS_PER_THREAD,
            typename        FlagT,
            typename        FlagOp>
        static __device__ __forceinline__ void FlagTails(
            int                     /*linear_tid*/,
            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
        {}
    };


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Shared storage reference
    _TempStorage &temp_storage;

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

    __device__ __forceinline__ BlockDiscontinuity(
        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    //@}  end member group
    /******************************************************************//**
     * \name Head flag operations
     *********************************************************************/
    //@{


#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeads(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        // Share last item
        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        if (linear_tid == 0)
        {
            // Set flag for first thread-item (preds[0] is undefined)
            head_flags[0] = 1;
        }
        else
        {
            preds[0] = temp_storage.last_items[linear_tid - 1];
            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
        }

        // Set head_flags for remaining items
        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
    }

    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeads(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
    {
        // Share last item
        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        // Set flag for first thread-item
        preds[0] = (linear_tid == 0) ?
            tile_predecessor_item :              // First thread
            temp_storage.last_items[linear_tid - 1];

        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);

        // Set head_flags for remaining items
        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
    }

#endif // DOXYGEN_SHOULD_SKIP_THIS


    /**
     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged.
     *
     * \par
     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
     *   <tt>input<sub><em>i</em></sub></tt> when
     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
     *   returns \p true (where <em>previous-item</em> is either the preceding item
     *   in the same thread or the last item in the previous thread).
     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
     * - \blocked
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the head-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
     *
     *     // Allocate shared memory for BlockDiscontinuity
     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Collectively compute head flags for discontinuities in the segment
     *     int head_flags[4];
     *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
     *
     * \endcode
     * \par
     * Suppose the set of input \p thread_data across the block of threads is
     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
     * The corresponding output \p head_flags in those threads will be
     * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
     *
     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
     */
    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeads(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        T preds[ITEMS_PER_THREAD];
        FlagHeads(head_flags, input, preds, flag_op);
    }


    /**
     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block.
     *
     * \par
     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
     *   <tt>input<sub><em>i</em></sub></tt> when
     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
     *   returns \p true (where <em>previous-item</em> is either the preceding item
     *   in the same thread or the last item in the previous thread).
     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
     *   against \p tile_predecessor_item.
     * - \blocked
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the head-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Have thread0 obtain the predecessor item for the entire tile
     *     int tile_predecessor_item;
     *     if (threadIdx.x == 0) tile_predecessor_item == ...
     *
     *     // Collectively compute head flags for discontinuities in the segment
     *     int head_flags[4];
     *     BlockDiscontinuity(temp_storage).FlagHeads(
     *         head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
     *
     * \endcode
     * \par
     * Suppose the set of input \p thread_data across the block of threads is
     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>,
     * and that \p tile_predecessor_item is \p 0.  The corresponding output \p head_flags in those threads will be
     * <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
     *
     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
     */
    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeads(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
    {
        T preds[ITEMS_PER_THREAD];
        FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
    }



    //@}  end member group
    /******************************************************************//**
     * \name Tail flag operations
     *********************************************************************/
    //@{


    /**
     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged.
     *
     * \par
     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
     *   <tt>input<sub><em>i</em></sub></tt> when
     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
     *   returns \p true (where <em>next-item</em> is either the next item
     *   in the same thread or the first item in the next thread).
     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
     * - \blocked
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the tail-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
     *
     *     // Allocate shared memory for BlockDiscontinuity
     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Collectively compute tail flags for discontinuities in the segment
     *     int tail_flags[4];
     *     BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
     *
     * \endcode
     * \par
     * Suppose the set of input \p thread_data across the block of threads is
     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>.
     * The corresponding output \p tail_flags in those threads will be
     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
     *
     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
     */
    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagTails(
        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        // Share first item
        temp_storage.first_items[linear_tid] = input[0];

        CTA_SYNC();

        // Set flag for last thread-item
        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
            1 :                             // Last thread
            ApplyOp<FlagOp>::FlagT(
                flag_op,
                input[ITEMS_PER_THREAD - 1],
                temp_storage.first_items[linear_tid + 1],
                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);

        // Set tail_flags for remaining items
        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
    }


    /**
     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block.
     *
     * \par
     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
     *   <tt>input<sub><em>i</em></sub></tt> when
     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
     *   returns \p true (where <em>next-item</em> is either the next item
     *   in the same thread or the first item in the next thread).
     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
     *   against \p tile_successor_item.
     * - \blocked
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the tail-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Have thread127 obtain the successor item for the entire tile
     *     int tile_successor_item;
     *     if (threadIdx.x == 127) tile_successor_item == ...
     *
     *     // Collectively compute tail flags for discontinuities in the segment
     *     int tail_flags[4];
     *     BlockDiscontinuity(temp_storage).FlagTails(
     *         tail_flags, thread_data, cub::Inequality(), tile_successor_item);
     *
     * \endcode
     * \par
     * Suppose the set of input \p thread_data across the block of threads is
     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
     * and that \p tile_successor_item is \p 125.  The corresponding output \p tail_flags in those threads will be
     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
     *
     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
     */
    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagTails(
        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
        T               tile_successor_item)                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread...
    {
        // Share first item
        temp_storage.first_items[linear_tid] = input[0];

        CTA_SYNC();

        // Set flag for last thread-item
        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
            tile_successor_item :              // Last thread
            temp_storage.first_items[linear_tid + 1];

        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
            flag_op,
            input[ITEMS_PER_THREAD - 1],
            successor_item,
            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);

        // Set tail_flags for remaining items
        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
    }


    //@}  end member group
    /******************************************************************//**
     * \name Head & tail flag operations
     *********************************************************************/
    //@{


    /**
     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
     *
     * \par
     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
     *   <tt>input<sub><em>i</em></sub></tt> when
     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
     *   returns \p true (where <em>previous-item</em> is either the preceding item
     *   in the same thread or the last item in the previous thread).
     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
     *   <tt>input<sub><em>i</em></sub></tt> when
     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
     *   returns \p true (where <em>next-item</em> is either the next item
     *   in the same thread or the first item in the next thread).
     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
     * - \blocked
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
     *
     *     // Allocate shared memory for BlockDiscontinuity
     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Collectively compute head and flags for discontinuities in the segment
     *     int head_flags[4];
     *     int tail_flags[4];
     *     BlockDiscontinuity(temp_storage).FlagTails(
     *         head_flags, tail_flags, thread_data, cub::Inequality());
     *
     * \endcode
     * \par
     * Suppose the set of input \p thread_data across the block of threads is
     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
     * and that the tile_successor_item is \p 125.  The corresponding output \p head_flags
     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
     * and the corresponding output \p tail_flags in those threads will be
     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
     *
     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
     */
    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeadsAndTails(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        // Share first and last items
        temp_storage.first_items[linear_tid] = input[0];
        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        T preds[ITEMS_PER_THREAD];

        // Set flag for first thread-item
        preds[0] = temp_storage.last_items[linear_tid - 1];
        if (linear_tid == 0)
        {
            head_flags[0] = 1;
        }
        else
        {
            head_flags[0] = ApplyOp<FlagOp>::FlagT(
                flag_op,
                preds[0],
                input[0],
                linear_tid * ITEMS_PER_THREAD);
        }


        // Set flag for last thread-item
        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
            1 :                             // Last thread
            ApplyOp<FlagOp>::FlagT(
                flag_op,
                input[ITEMS_PER_THREAD - 1],
                temp_storage.first_items[linear_tid + 1],
                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);

        // Set head_flags for remaining items
        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);

        // Set tail_flags for remaining items
        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
    }


    /**
     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
     *
     * \par
     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
     *   <tt>input<sub><em>i</em></sub></tt> when
     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
     *   returns \p true (where <em>previous-item</em> is either the preceding item
     *   in the same thread or the last item in the previous thread).
     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
     *   <tt>input<sub><em>i</em></sub></tt> when
     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
     *   returns \p true (where <em>next-item</em> is either the next item
     *   in the same thread or the first item in the next thread).
     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
     *   against \p tile_predecessor_item.
     * - \blocked
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
     *
     *     // Have thread127 obtain the successor item for the entire tile
     *     int tile_successor_item;
     *     if (threadIdx.x == 127) tile_successor_item == ...
     *
     *     // Collectively compute head and flags for discontinuities in the segment
     *     int head_flags[4];
     *     int tail_flags[4];
     *     BlockDiscontinuity(temp_storage).FlagTails(
     *         head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
     *
     * \endcode
     * \par
     * Suppose the set of input \p thread_data across the block of threads is
     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
     * and that the tile_successor_item is \p 125.  The corresponding output \p head_flags
     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
     * and the corresponding output \p tail_flags in those threads will be
     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
     *
     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
     */
    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeadsAndTails(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread...
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        // Share first and last items
        temp_storage.first_items[linear_tid] = input[0];
        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        T preds[ITEMS_PER_THREAD];

        // Set flag for first thread-item
        if (linear_tid == 0)
        {
            head_flags[0] = 1;
        }
        else
        {
            preds[0] = temp_storage.last_items[linear_tid - 1];
            head_flags[0] = ApplyOp<FlagOp>::FlagT(
                flag_op,
                preds[0],
                input[0],
                linear_tid * ITEMS_PER_THREAD);
        }

        // Set flag for last thread-item
        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
            tile_successor_item :              // Last thread
            temp_storage.first_items[linear_tid + 1];

        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
            flag_op,
            input[ITEMS_PER_THREAD - 1],
            successor_item,
            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);

        // Set head_flags for remaining items
        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);

        // Set tail_flags for remaining items
        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
    }


    /**
     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
     *
     * \par
     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
     *   <tt>input<sub><em>i</em></sub></tt> when
     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
     *   returns \p true (where <em>previous-item</em> is either the preceding item
     *   in the same thread or the last item in the previous thread).
     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
     *   against \p tile_predecessor_item.
     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
     *   <tt>input<sub><em>i</em></sub></tt> when
     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
     *   returns \p true (where <em>next-item</em> is either the next item
     *   in the same thread or the first item in the next thread).
     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
     * - \blocked
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     *     ...
     *
     *     // Have thread0 obtain the predecessor item for the entire tile
     *     int tile_predecessor_item;
     *     if (threadIdx.x == 0) tile_predecessor_item == ...
     *
     *     // Have thread127 obtain the successor item for the entire tile
     *     int tile_successor_item;
     *     if (threadIdx.x == 127) tile_successor_item == ...
     *
     *     // Collectively compute head and flags for discontinuities in the segment
     *     int head_flags[4];
     *     int tail_flags[4];
     *     BlockDiscontinuity(temp_storage).FlagTails(
     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
     *         thread_data, cub::Inequality());
     *
     * \endcode
     * \par
     * Suppose the set of input \p thread_data across the block of threads is
     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
     * that the \p tile_predecessor_item is \p 0, and that the
     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
     * and the corresponding output \p tail_flags in those threads will be
     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
     *
     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
     */
    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeadsAndTails(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        // Share first and last items
        temp_storage.first_items[linear_tid] = input[0];
        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        T preds[ITEMS_PER_THREAD];

        // Set flag for first thread-item
        preds[0] = (linear_tid == 0) ?
            tile_predecessor_item :              // First thread
            temp_storage.last_items[linear_tid - 1];

        head_flags[0] = ApplyOp<FlagOp>::FlagT(
            flag_op,
            preds[0],
            input[0],
            linear_tid * ITEMS_PER_THREAD);

        // Set flag for last thread-item
        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
            1 :                             // Last thread
            ApplyOp<FlagOp>::FlagT(
                flag_op,
                input[ITEMS_PER_THREAD - 1],
                temp_storage.first_items[linear_tid + 1],
                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);

        // Set head_flags for remaining items
        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);

        // Set tail_flags for remaining items
        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
    }


    /**
     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
     *
     * \par
     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
     *   <tt>input<sub><em>i</em></sub></tt> when
     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
     *   returns \p true (where <em>previous-item</em> is either the preceding item
     *   in the same thread or the last item in the previous thread).
     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
     *   against \p tile_predecessor_item.
     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
     *   <tt>input<sub><em>i</em></sub></tt> when
     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
     *   returns \p true (where <em>next-item</em> is either the next item
     *   in the same thread or the first item in the next thread).
     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
     *   against \p tile_successor_item.
     * - \blocked
     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     *     ...
     *
     *     // Have thread0 obtain the predecessor item for the entire tile
     *     int tile_predecessor_item;
     *     if (threadIdx.x == 0) tile_predecessor_item == ...
     *
     *     // Have thread127 obtain the successor item for the entire tile
     *     int tile_successor_item;
     *     if (threadIdx.x == 127) tile_successor_item == ...
     *
     *     // Collectively compute head and flags for discontinuities in the segment
     *     int head_flags[4];
     *     int tail_flags[4];
     *     BlockDiscontinuity(temp_storage).FlagTails(
     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
     *         thread_data, cub::Inequality());
     *
     * \endcode
     * \par
     * Suppose the set of input \p thread_data across the block of threads is
     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
     * that the \p tile_predecessor_item is \p 0, and that the
     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
     * and the corresponding output \p tail_flags in those threads will be
     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
     *
     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
     */
    template <
        int             ITEMS_PER_THREAD,
        typename        FlagT,
        typename        FlagOp>
    __device__ __forceinline__ void FlagHeadsAndTails(
        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread...
        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
    {
        // Share first and last items
        temp_storage.first_items[linear_tid] = input[0];
        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        T preds[ITEMS_PER_THREAD];

        // Set flag for first thread-item
        preds[0] = (linear_tid == 0) ?
            tile_predecessor_item :              // First thread
            temp_storage.last_items[linear_tid - 1];

        head_flags[0] = ApplyOp<FlagOp>::FlagT(
            flag_op,
            preds[0],
            input[0],
            linear_tid * ITEMS_PER_THREAD);

        // Set flag for last thread-item
        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
            tile_successor_item :              // Last thread
            temp_storage.first_items[linear_tid + 1];

        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
            flag_op,
            input[ITEMS_PER_THREAD - 1],
            successor_item,
            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);

        // Set head_flags for remaining items
        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);

        // Set tail_flags for remaining items
        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
    }




    //@}  end member group

};


xgboost/cub/cub/block/block_exchange.cuh  view on Meta::CPAN

     * - \smemreuse
     *
     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
     * \tparam ValidFlag                            <b>[inferred]</b> FlagT type denoting which items are valid
     */
    template <typename OutputT, typename OffsetT, typename ValidFlag>
    __device__ __forceinline__ void ScatterToStripedFlagged(
        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
        OffsetT     ranks[ITEMS_PER_THREAD],            ///< [in] Corresponding scatter ranks
        ValidFlag   is_valid[ITEMS_PER_THREAD])         ///< [in] Corresponding flag denoting item validity
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = ranks[ITEM];
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            if (is_valid[ITEM])
                temp_storage.buff[item_offset] = input_items[ITEM];
        }

xgboost/cub/cub/block/block_exchange.cuh  view on Meta::CPAN

        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
    {
        ScatterToStripedGuarded(items, items, ranks);
    }

    template <typename OffsetT, typename ValidFlag>
    __device__ __forceinline__ void ScatterToStripedFlagged(
        InputT      items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
        OffsetT     ranks[ITEMS_PER_THREAD],        ///< [in] Corresponding scatter ranks
        ValidFlag   is_valid[ITEMS_PER_THREAD])     ///< [in] Corresponding flag denoting item validity
    {
        ScatterToStriped(items, items, ranks, is_valid);
    }

#endif // DOXYGEN_SHOULD_SKIP_THIS


};


xgboost/cub/cub/block/specializations/block_histogram_sort.cuh  view on Meta::CPAN


    /// Shared memory
    union _TempStorage
    {
        // Storage for sorting bin values
        typename BlockRadixSortT::TempStorage sort;

        struct
        {
            // Storage for detecting discontinuities in the tile of sorted bin values
            typename BlockDiscontinuityT::TempStorage flag;

            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
            unsigned int run_begin[BINS];
            unsigned int run_end[BINS];
        };
    };


    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};

xgboost/cub/cub/block/specializations/block_histogram_sort.cuh  view on Meta::CPAN

        }
        // Finish up with guarded initialization if necessary
        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
        {
            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
        }

        CTA_SYNC();

        int flags[ITEMS_PER_THREAD];    // unused

        // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile
        DiscontinuityOp flag_op(temp_storage);
        BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);

        // Update begin for first item
        if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;

        CTA_SYNC();

        // Composite into histogram
        histo_offset = 0;

        #pragma unroll

xgboost/cub/cub/device/device_partition.cuh  view on Meta::CPAN

 * performance across different CUDA architectures for \p int32 items,
 * where 50% of the items are randomly selected for the first partition.
 * \plots_below
 *
 * \image html partition_if_int32_50_percent.png
 *
 */
struct DevicePartition
{
    /**
     * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.pn...
     *
     * \par
     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
     * - Copies of the selected items are compacted into \p d_out and maintain their original
     *   relative ordering, however copies of the unselected items are compacted into the
     *   rear of \p d_out in reverse order.
     * - \devicestorage
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_partition.cuh>
     *
     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // d_out                 <-- [1, 4, 6, 7, 8, 5, 3, 2]
     * // d_num_selected_out    <-- [4]
     *
     * \endcode
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     */
    template <
        typename                    InputIteratorT,
        typename                    FlagIterator,
        typename                    OutputIteratorT,
        typename                    NumSelectedIteratorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int                         num_items,                      ///< [in] Total number of items to select from
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType                SelectOp;       // Selection op (not used)
        typedef NullType                EqualityOp;     // Equality operator (not used)

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }


xgboost/cub/cub/device/device_partition.cuh  view on Meta::CPAN

            select_op,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }

};

/**
 * \example example_device_partition_flagged.cu
 * \example example_device_partition_if.cu
 */

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)


xgboost/cub/cub/device/device_select.cuh  view on Meta::CPAN

 * \ingroup SingleModule
 *
 * \par Overview
 * These operations apply a selection criterion to selectively copy
 * items from a specified input sequence to a compact output sequence.
 *
 * \par Usage Considerations
 * \cdp_class{DeviceSelect}
 *
 * \par Performance
 * \linear_performance{select-flagged, select-if, and select-unique}
 *
 * \par
 * The following chart illustrates DeviceSelect::If
 * performance across different CUDA architectures for \p int32 items,
 * where 50% of the items are randomly selected.
 *
 * \image html select_if_int32_50_percent.png
 *
 * \par
 * The following chart illustrates DeviceSelect::Unique

xgboost/cub/cub/device/device_select.cuh  view on Meta::CPAN

 *
 * \image html select_unique_int32_len_500.png
 *
 * \par
 * \plots_below
 *
 */
struct DeviceSelect
{
    /**
     * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png)
     *
     * \par
     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
     * - \devicestorage
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
     *
     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // d_out                 <-- [1, 4, 6, 7]
     * // d_num_selected_out    <-- [4]
     *
     * \endcode
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     */
    template <
        typename                    InputIteratorT,
        typename                    FlagIterator,
        typename                    OutputIteratorT,
        typename                    NumSelectedIteratorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType                SelectOp;       // Selection op (not used)
        typedef NullType                EqualityOp;     // Equality operator (not used)

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }


xgboost/cub/cub/device/device_select.cuh  view on Meta::CPAN

            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }

};

/**
 * \example example_device_select_flagged.cu
 * \example example_device_select_if.cu
 * \example example_device_select_unique.cu
 */

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)


xgboost/cub/cub/device/dispatch/dispatch_rle.cuh  view on Meta::CPAN



/******************************************************************************
 * Kernel entry points
 *****************************************************************************/

/**
 * Select kernel entry point (multi-block)
 *
 * Performs functor-based selection if SelectOp functor type != NullType
 * Otherwise performs flag-based selection if FlagIterator's value type != NullType
 * Otherwise performs discontinuity selection (keep unique)
 */
template <
    typename            AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
    typename            OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values \iterator
    typename            LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values \iterator
    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered \iterator
    typename            ScanTileStateT,              ///< Tile status interface type
    typename            EqualityOpT,                 ///< T equality operator type



( run in 0.934 second using v1.01-cache-2.11-cpan-94b05bcf43c )