view release on metacpan or search on metacpan
0.04 2017-08-27
Fix preserving executable flag for xgboost command
0.03 2017-08-26
* Windows install working. Thanks again to Graham Ollis <plicease@cpan.org>
0.02 2017-08-24
* System install working using ffi_name property. Thanks to Graham Ollis <plicease@cpan.org>
* Switch from Module::Build to ExtUtils::MakeMaker
0.01 2017-08-20
* Dynamic library and command support
xgboost/cub/cub/warp/specializations/warp_scan_smem.cuh
xgboost/cub/cub/warp/warp_reduce.cuh
xgboost/cub/cub/warp/warp_scan.cuh
'xgboost/cub/eclipse code style profile.xml'
xgboost/cub/examples/block/Makefile
xgboost/cub/examples/block/example_block_radix_sort.cu
xgboost/cub/examples/block/example_block_reduce.cu
xgboost/cub/examples/block/example_block_scan.cu
xgboost/cub/examples/block/reduce_by_key.cu
xgboost/cub/examples/device/Makefile
xgboost/cub/examples/device/example_device_partition_flagged.cu
xgboost/cub/examples/device/example_device_partition_if.cu
xgboost/cub/examples/device/example_device_radix_sort.cu
xgboost/cub/examples/device/example_device_reduce.cu
xgboost/cub/examples/device/example_device_scan.cu
xgboost/cub/examples/device/example_device_select_flagged.cu
xgboost/cub/examples/device/example_device_select_if.cu
xgboost/cub/examples/device/example_device_select_unique.cu
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu
xgboost/cub/experimental/Makefile
xgboost/cub/experimental/defunct/example_coo_spmv.cu
xgboost/cub/experimental/defunct/test_device_seg_reduce.cu
xgboost/cub/experimental/histogram/histogram_cub.h
xgboost/cub/experimental/histogram/histogram_gmem_atomics.h
xgboost/cub/experimental/histogram/histogram_smem_atomics.h
xgboost/cub/experimental/histogram_compare.cu
xgboost/CMakeLists.txt view on Meta::CPAN
msvc_use_static_runtime()
# Options
option(PLUGIN_UPDATER_GPU "Build GPU accelerated tree construction plugin")
option(JVM_BINDINGS "Build JVM bindings" OFF)
option(GOOGLE_TEST "Build google tests" OFF)
set(GPU_COMPUTE_VER 35;50;52;60;61 CACHE STRING
"Space separated list of compute versions to be built against")
# Compiler flags
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
if(MSVC)
# Multithreaded compilation
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
else()
# Correct error for GCC 5 and cuda
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
xgboost/CMakeLists.txt view on Meta::CPAN
cmake_minimum_required(VERSION 3.5)
add_definitions(-DXGBOOST_USE_CUDA)
include_directories(
nccl/src
cub
)
set(GENCODE_FLAGS "")
format_gencode_flags("${GPU_COMPUTE_VER}" GENCODE_FLAGS)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--expt-extended-lambda;${GENCODE_FLAGS};-lineinfo;")
if(NOT MSVC)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11")
endif()
add_subdirectory(nccl)
cuda_add_library(gpuxgboost ${CUDA_SOURCES} STATIC)
target_link_libraries(gpuxgboost nccl)
list(APPEND LINK_LIBRARIES gpuxgboost)
list(APPEND SOURCES plugin/updater_gpu/src/register_updater_gpu.cc)
xgboost/R-package/R/callbacks.R view on Meta::CPAN
#' stopping. If not set, the last column would be used.
#' Let's say the test data in \code{watchlist} was labelled as \code{dtest},
#' and one wants to use the AUC in test data for early stopping regardless of where
#' it is in the \code{watchlist}, then one of the following would need to be set:
#' \code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
#' All dash '-' characters in metric names are considered equivalent to '_'.
#' @param verbose whether to print the early stopping information.
#'
#' @details
#' This callback function determines the condition for early stopping
#' by setting the \code{stop_condition = TRUE} flag in its calling frame.
#'
#' The following additional fields are assigned to the model's R object:
#' \itemize{
#' \item \code{best_score} the evaluation score at the best iteration
#' \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
#' \item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
#' It differs from \code{best_iteration} in multiclass or random forest settings.
#' }
#'
#' The same values are also stored as xgb-attributes:
xgboost/R-package/R/callbacks.R view on Meta::CPAN
xgb.save(env$bst, sprintf(save_name, env$iteration))
}
attr(callback, 'call') <- match.call()
attr(callback, 'name') <- 'cb.save.model'
callback
}
#' Callback closure for returning cross-validation based predictions.
#'
#' @param save_models a flag for whether to save the folds' models.
#'
#' @details
#' This callback function saves predictions for all of the test folds,
#' and also allows to save the folds' models.
#'
#' It is a "finalizer" callback and it uses early stopping information whenever it is available,
#' thus it must be run after the early stopping callback if the early stopping is used.
#'
#' Callback function expects the following values to be set in its calling frame:
#' \code{bst_folds},
xgboost/R-package/R/xgb.Booster.R view on Meta::CPAN
}
#' Restore missing parts of an incomplete xgb.Booster object.
#'
#' It attempts to complete an \code{xgb.Booster} object by restoring either its missing
#' raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
#' or its missing internal handle (when its \code{xgb.Booster.handle} is not valid
#' but it has a raw Booster memory dump).
#'
#' @param object object of class \code{xgb.Booster}
#' @param saveraw a flag indicating whether to append \code{raw} Booster memory dump data
#' when it doesn't already exist.
#'
#' @details
#'
#' While this method is primarily for internal use, it might be useful in some practical situations.
#'
#' E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object,
#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
#' should still work for such a model object since those methods would be using
#' \code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the
xgboost/R-package/R/xgb.model.dt.tree.R view on Meta::CPAN
#' Non-null \code{feature_names} could be provided to override those in the model.
#' @param model object of class \code{xgb.Booster}
#' @param text \code{character} vector previously generated by the \code{xgb.dump}
#' function (where parameter \code{with_stats = TRUE} should have been set).
#' \code{text} takes precedence over \code{model}.
#' @param trees an integer vector of tree indices that should be parsed.
#' If set to \code{NULL}, all trees of the model are parsed.
#' It could be useful, e.g., in multiclass classification to get only
#' the trees of one certain class. IMPORTANT: the tree index in xgboost models
#' is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).
#' @param use_int_id a logical flag indicating whether nodes in columns "Yes", "No", "Missing" should be
#' represented as integers (when FALSE) or as "Tree-Node" character strings (when TRUE).
#' @param ... currently not used.
#'
#' @return
#' A \code{data.table} with detailed information about model trees' nodes.
#'
#' The columns of the \code{data.table} are:
#'
#' \itemize{
#' \item \code{Tree}: integer ID of a tree in a model (zero-based index)
xgboost/R-package/R/xgb.plot.tree.R view on Meta::CPAN
#' Read a tree model text dump and plot the model.
#'
#' @param feature_names names of each feature as a \code{character} vector.
#' @param model produced by the \code{xgb.train} function.
#' @param trees an integer vector of tree indices that should be visualized.
#' If set to \code{NULL}, all trees of the model are included.
#' IMPORTANT: the tree index in xgboost model is zero-based
#' (e.g., use \code{trees = 0:2} for the first 3 trees in a model).
#' @param plot_width the width of the diagram in pixels.
#' @param plot_height the height of the diagram in pixels.
#' @param render a logical flag for whether the graph should be rendered (see Value).
#' @param show_node_id a logical flag for whether to show node id's in the graph.
#' @param ... currently not used.
#'
#' @details
#'
#' The content of each node is organised that way:
#'
#' \itemize{
#' \item Feature name.
#' \item \code{Cover}: The sum of second order gradient of training data classified to the leaf.
#' If it is square loss, this simply corresponds to the number of instances seen by a split
xgboost/R-package/man/cb.cv.predict.Rd view on Meta::CPAN
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/callbacks.R
\name{cb.cv.predict}
\alias{cb.cv.predict}
\title{Callback closure for returning cross-validation based predictions.}
\usage{
cb.cv.predict(save_models = FALSE)
}
\arguments{
\item{save_models}{a flag for whether to save the folds' models.}
}
\value{
Predictions are returned inside of the \code{pred} element, which is either a vector or a matrix,
depending on the number of prediction outputs per data row. The order of predictions corresponds
to the order of rows in the original dataset. Note that when a custom \code{folds} list is
provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a
non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be
meaningful when user-provided folds have overlapping indices as in, e.g., random sampling splits.
When some of the indices in the training dataset are not included into user-provided \code{folds},
their prediction value would be \code{NA}.
xgboost/R-package/man/cb.early.stop.Rd view on Meta::CPAN
\code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
All dash '-' characters in metric names are considered equivalent to '_'.}
\item{verbose}{whether to print the early stopping information.}
}
\description{
Callback closure to activate the early stopping.
}
\details{
This callback function determines the condition for early stopping
by setting the \code{stop_condition = TRUE} flag in its calling frame.
The following additional fields are assigned to the model's R object:
\itemize{
\item \code{best_score} the evaluation score at the best iteration
\item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
\item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
It differs from \code{best_iteration} in multiclass or random forest settings.
}
The same values are also stored as xgb-attributes:
xgboost/R-package/man/xgb.Booster.complete.Rd view on Meta::CPAN
% Please edit documentation in R/xgb.Booster.R
\name{xgb.Booster.complete}
\alias{xgb.Booster.complete}
\title{Restore missing parts of an incomplete xgb.Booster object.}
\usage{
xgb.Booster.complete(object, saveraw = TRUE)
}
\arguments{
\item{object}{object of class \code{xgb.Booster}}
\item{saveraw}{a flag indicating whether to append \code{raw} Booster memory dump data
when it doesn't already exist.}
}
\value{
An object of \code{xgb.Booster} class.
}
\description{
It attempts to complete an \code{xgb.Booster} object by restoring either its missing
raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
or its missing internal handle (when its \code{xgb.Booster.handle} is not valid
but it has a raw Booster memory dump).
xgboost/R-package/man/xgb.model.dt.tree.Rd view on Meta::CPAN
\item{text}{\code{character} vector previously generated by the \code{xgb.dump}
function (where parameter \code{with_stats = TRUE} should have been set).
\code{text} takes precedence over \code{model}.}
\item{trees}{an integer vector of tree indices that should be parsed.
If set to \code{NULL}, all trees of the model are parsed.
It could be useful, e.g., in multiclass classification to get only
the trees of one certain class. IMPORTANT: the tree index in xgboost models
is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).}
\item{use_int_id}{a logical flag indicating whether nodes in columns "Yes", "No", "Missing" should be
represented as integers (when FALSE) or as "Tree-Node" character strings (when TRUE).}
\item{...}{currently not used.}
}
\value{
A \code{data.table} with detailed information about model trees' nodes.
The columns of the \code{data.table} are:
\itemize{
xgboost/R-package/man/xgb.plot.tree.Rd view on Meta::CPAN
\item{trees}{an integer vector of tree indices that should be visualized.
If set to \code{NULL}, all trees of the model are included.
IMPORTANT: the tree index in xgboost model is zero-based
(e.g., use \code{trees = 0:2} for the first 3 trees in a model).}
\item{plot_width}{the width of the diagram in pixels.}
\item{plot_height}{the height of the diagram in pixels.}
\item{render}{a logical flag for whether the graph should be rendered (see Value).}
\item{show_node_id}{a logical flag for whether to show node id's in the graph.}
\item{...}{currently not used.}
}
\value{
When \code{render = TRUE}:
returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
Similar to ggplot objects, it needs to be printed to see it when not running from command line.
When \code{render = FALSE}:
silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
xgboost/R-package/tests/testthat/test_basic.R view on Meta::CPAN
context("basic functions")

# Load the bundled agaricus (mushroom) datasets used by all tests below.
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
train <- agaricus.train
test <- agaricus.test

# Fix the RNG so test results are reproducible across runs.
set.seed(1994)

# disable some tests for Win32
# TRUE only on 32-bit Windows (pointer size != 8 bytes); used later to
# skip raw-model byte-for-byte comparisons on that platform.
windows_flag <- .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8
test_that("train and predict binary classification", {
nrounds = 2
expect_output(
bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = nrounds, objective = "binary:logistic")
, "train-error")
expect_equal(class(bst), "xgb.Booster")
expect_equal(bst$niter, nrounds)
xgboost/R-package/tests/testthat/test_basic.R view on Meta::CPAN
param <- list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2)
# for the reference, use 4 iterations at once:
set.seed(11)
bst <- xgb.train(param, dtrain, nrounds = 4, watchlist, verbose = 0)
# first two iterations:
set.seed(11)
bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
# continue for two more:
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
if (!windows_flag)
expect_equal(bst$raw, bst2$raw)
expect_false(is.null(bst2$evaluation_log))
expect_equal(dim(bst2$evaluation_log), c(4, 2))
expect_equal(bst2$evaluation_log, bst$evaluation_log)
# test continuing from raw model data
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw)
if (!windows_flag)
expect_equal(bst$raw, bst2$raw)
expect_equal(dim(bst2$evaluation_log), c(2, 2))
# test continuing from a model in file
xgb.save(bst1, "xgboost.model")
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.model")
if (!windows_flag)
expect_equal(bst$raw, bst2$raw)
expect_equal(dim(bst2$evaluation_log), c(2, 2))
})
test_that("xgb.cv works", {
set.seed(11)
expect_output(
cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5,
eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
xgboost/cmake/Utils.cmake view on Meta::CPAN
# Set a default build type to release if none was specified
#
# Handles both generator families:
#  - Multi-config generators (e.g. Visual Studio): if the configuration list
#    is still the stock default, collapse it to Release only.
#  - Single-config generators (e.g. Makefiles): if neither CMAKE_BUILD_TYPE
#    nor CMAKE_CONFIGURATION_TYPES was set by the user, default to Release.
# Both branches write CACHE variables with FORCE, so the choice persists
# across re-configures unless the user overrides it explicitly.
function(set_default_configuration_release)
if(CMAKE_CONFIGURATION_TYPES STREQUAL "Debug;Release;MinSizeRel;RelWithDebInfo") # multiconfig generator?
set(CMAKE_CONFIGURATION_TYPES Release CACHE STRING "" FORCE)
elseif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "Setting build type to 'Release' as none was specified.")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE )
endif()
endfunction(set_default_configuration_release)
# format_gencode_flags(<flags> <out>)
#
# Expand a list of CUDA compute capabilities (e.g. "35;50;52;60;61") into a
# semicolon-terminated string of NVCC options of the form
#   -gencode arch=compute_XX,code=sm_XX;
# and store the accumulated string in the caller's <out> variable.
#
#   flags - list of compute-capability version numbers
#   out   - name of the variable to receive the formatted option string
function(format_gencode_flags flags out)
  foreach(ver ${flags})
    # Append one -gencode clause per requested architecture.
    set(${out} "${${out}}-gencode arch=compute_${ver},code=sm_${ver};")
  endforeach()
  # Propagate the result to the calling scope.
  set(${out} "${${out}}" PARENT_SCOPE)
endfunction()
xgboost/cub/CHANGE_LOG.TXT view on Meta::CPAN
aliasing collective storage to shared memory that has been
allocated dynamically by the host at the kernel call site.
- Most CUB programs having typical 1D usage should not require any
changes to accommodate these updates.
- Added new "combination" WarpScan methods for efficiently computing
both inclusive and exclusive prefix scans (and sums).
- Bug fixes:
- Fixed bug in cub::WarpScan (which affected cub::BlockScan and
cub::DeviceScan) where incorrect results (e.g., NAN) would often be
returned when parameterized for floating-point types (fp32, fp64).
- Workaround-fix for ptxas error when compiling with the -G flag on Linux
(for debug instrumentation)
- Misc. workaround-fixes for certain scan scenarios (using custom
scan operators) where code compiled for SM1x is run on newer
GPUs of higher compute-capability: the compiler could not tell
which memory space was being used for collective operations and was
mistakenly using global ops instead of shared ops.
//-----------------------------------------------------------------------------
1.2.3 04/01/2014
xgboost/cub/CHANGE_LOG.TXT view on Meta::CPAN
- Fixed BlockScan bug where certain exclusive scans on custom data types for the BLOCK_SCAN_WARP_SCANS variant would return incorrect results for the first thread in the block
- Added workaround for TexRefInputIteratorT to work with CUDA 6.0
//-----------------------------------------------------------------------------
1.1.1 12/11/2013
- New features:
- Added TexObjInputIteratorT, TexRefInputIteratorT, CacheModifiedInputIteratorT, and CacheModifiedOutputIterator types for loading & storing arbitrary types through the cache hierarchy. Compatible with Thrust API.
- Added descending sorting to DeviceRadixSort and BlockRadixSort
- Added min, max, arg-min, and arg-max to DeviceReduce
- Added DeviceSelect (select-unique, select-if, and select-flagged)
- Added DevicePartition (partition-if, partition-flagged)
- Added generic cub::ShuffleUp(), cub::ShuffleDown(), and cub::ShuffleIndex() for warp-wide communication of arbitrary data types (SM3x+)
- Added cub::MaxSmOccupancy() for accurately determining SM occupancy for any given kernel function pointer
- Performance
- Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
- Interface changes:
- Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces. The CacheModifiedInputIteratorT and CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
- Rename device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
- Documentation and testing:
- Added simple examples of device-wide methods
- Improved doxygen documentation and example snippets
xgboost/cub/common.mk view on Meta::CPAN
NVCC = "$(shell which nvcc)"
ifdef nvccver
NVCC_VERSION = $(nvccver)
else
NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'))
endif
# detect OS
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases
NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\#
ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
# For MSVC
# Enable more warnings and treat as errors
NVCCFLAGS += -Xcompiler /W3 -Xcompiler /WX
# Disable excess x86 floating point precision that can lead to results being labeled incorrectly
NVCCFLAGS += -Xcompiler /fp:strict
# Help the compiler/linker work with huge numbers of kernels on Windows
NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
xgboost/cub/cub/agent/agent_histogram.cuh view on Meta::CPAN
bool is_valid[PIXELS_PER_THREAD];
// Load tile
LoadTile(
block_offset,
valid_samples,
samples,
Int2Type<IS_FULL_TILE>(),
Int2Type<IS_ALIGNED>());
// Set valid flags
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
// Accumulate samples
#if CUB_PTX_ARCH >= 120
if (prefer_smem)
AccumulateSmemPixels(samples, is_valid);
else
AccumulateGmemPixels(samples, is_valid);
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
reduction_op(reduction_op),
scan_op(reduction_op)
{}
//---------------------------------------------------------------------
// Scatter utility methods
//---------------------------------------------------------------------
/**
 * Directly scatter flagged items to output offsets
 *
 * Each thread walks its own ITEMS_PER_THREAD items; for every item whose
 * segment_flags entry is non-zero it writes the zipped key to d_unique_out
 * and the zipped value to d_aggregates_out at the global position given by
 * segment_indices.  No block-wide synchronization is performed here.
 *
 * scatter_items   - zipped key/value pairs to be written out
 * segment_flags   - non-zero marks an item that must be scattered
 * segment_indices - global output offset for each flagged item
 */
__device__ __forceinline__ void ScatterDirect(
KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD],
OffsetT (&segment_flags)[ITEMS_PER_THREAD],
OffsetT (&segment_indices)[ITEMS_PER_THREAD])
{
// Scatter flagged keys and values straight to global output
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
if (segment_flags[ITEM])
{
d_unique_out[segment_indices[ITEM]] = scatter_items[ITEM].key;
d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value;
}
}
}
/**
 * 2-phase scatter flagged items to output offsets
 *
 * Phase 1: each thread compacts its flagged items into the tile-local
 * exchange buffer (temp_storage.raw_exchange) at tile-relative offsets
 * (segment_indices minus the tile's segment prefix).
 * Phase 2: all threads cooperatively copy the num_tile_segments compacted
 * pairs to the global outputs in a block-strided loop, yielding coalesced
 * writes.  CTA_SYNC() guards the buffer before reuse and between phases.
 *
 * The exclusive scan causes each head flag to be paired with the previous
 * value aggregate: the scatter offsets must be decremented for value aggregates
 *
 * num_tile_segments        - number of segments produced by this tile
 * num_tile_segments_prefix - number of segments produced by preceding tiles
 */
__device__ __forceinline__ void ScatterTwoPhase(
KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD],
OffsetT (&segment_flags)[ITEMS_PER_THREAD],
OffsetT (&segment_indices)[ITEMS_PER_THREAD],
OffsetT num_tile_segments,
OffsetT num_tile_segments_prefix)
{
// Ensure any prior use of the exchange buffer has completed
CTA_SYNC();
// Phase 1: compact flagged pairs into the exchange buffer at tile-local offsets
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
if (segment_flags[ITEM])
{
temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM];
}
}
// Make the compacted pairs visible to all threads before phase 2
CTA_SYNC();
// Phase 2: block-strided cooperative copy of compacted pairs to global output
for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
{
KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item];
d_unique_out[num_tile_segments_prefix + item] = pair.key;
d_aggregates_out[num_tile_segments_prefix + item] = pair.value;
}
}
/**
 * Scatter flagged items
 *
 * Dispatches to one of two scattering strategies: the two-phase
 * (compact-then-copy) path is taken only when it is enabled at compile
 * time AND this tile produced more segments than there are threads in
 * the block; otherwise every thread scatters its own flagged items
 * directly to global memory.
 *
 * num_tile_segments        - number of segments produced by this tile
 * num_tile_segments_prefix - number of segments produced by preceding tiles
 */
__device__ __forceinline__ void Scatter(
    KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
    OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
    OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
    OffsetT         num_tile_segments,
    OffsetT         num_tile_segments_prefix)
{
    // Use one-phase scatter when (a) two-phase is disabled or (b) the
    // average number of selected items per thread is not more than one
    const bool use_two_phase =
        TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS);

    if (!use_two_phase)
    {
        ScatterDirect(
            scatter_items,
            segment_flags,
            segment_indices);
    }
    else
    {
        ScatterTwoPhase(
            scatter_items,
            segment_flags,
            segment_indices,
            num_tile_segments,
            num_tile_segments_prefix);
    }
}
//---------------------------------------------------------------------
// Cooperatively scan a device-wide sequence of tiles with other CTAs
//---------------------------------------------------------------------
/**
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
template <bool IS_LAST_TILE> ///< Whether the current tile is the last tile
__device__ __forceinline__ void ConsumeTile(
OffsetT num_remaining, ///< Number of global input items remaining (including this tile)
int tile_idx, ///< Tile index
OffsetT tile_offset, ///< Tile offset
ScanTileStateT& tile_state) ///< Global tile state descriptor
{
KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys
KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile keys shuffled up
ValueOutputT values[ITEMS_PER_THREAD]; // Tile values
OffsetT head_flags[ITEMS_PER_THREAD]; // Segment head flags
OffsetT segment_indices[ITEMS_PER_THREAD]; // Segment indices
OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices
KeyValuePairT scatter_items[ITEMS_PER_THREAD]; // Zipped key value pairs for scattering
// Load keys
if (IS_LAST_TILE)
BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining);
else
BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);
// Load tile predecessor key in first thread
KeyOutputT tile_predecessor;
if (threadIdx.x == 0)
{
tile_predecessor = (tile_idx == 0) ?
keys[0] : // First tile gets repeat of first item (thus first item will not be flagged as a head)
d_keys_in[tile_offset - 1]; // Subsequent tiles get last key from previous tile
}
CTA_SYNC();
// Load values
if (IS_LAST_TILE)
BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining);
else
BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values);
CTA_SYNC();
// Initialize head-flags and shuffle up the previous keys
if (IS_LAST_TILE)
{
// Use custom flag operator to additionally flag the first out-of-bounds item
GuardedInequalityWrapper<EqualityOpT> flag_op(equality_op, num_remaining);
BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
head_flags, keys, prev_keys, flag_op, tile_predecessor);
}
else
{
InequalityWrapper<EqualityOpT> flag_op(equality_op);
BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
head_flags, keys, prev_keys, flag_op, tile_predecessor);
}
// Zip values and head flags
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
scan_items[ITEM].value = values[ITEM];
scan_items[ITEM].key = head_flags[ITEM];
}
// Perform exclusive tile scan
OffsetValuePairT block_aggregate; // Inclusive block-wide scan aggregate
OffsetT num_segments_prefix; // Number of segments prior to this tile
ValueOutputT total_aggregate; // The tile prefix folded with block_aggregate
if (tile_idx == 0)
{
// Scan first tile
BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate);
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
// Rezip scatter items and segment indices
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
scatter_items[ITEM].key = prev_keys[ITEM];
scatter_items[ITEM].value = scan_items[ITEM].value;
segment_indices[ITEM] = scan_items[ITEM].key;
}
// At this point, each flagged segment head has:
// - The key for the previous segment
// - The reduced value from the previous segment
// - The segment index for the reduced value
// Scatter flagged keys and values
OffsetT num_tile_segments = block_aggregate.key;
Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix);
// Last thread in last tile will output final count (and last pair, if necessary)
if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1))
{
OffsetT num_segments = num_segments_prefix + num_tile_segments;
// If the last tile is a whole tile, output the final_value
if (num_remaining == TILE_ITEMS)
{
d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1];
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
/// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING,
ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS,
};
/**
* Special operator that signals all out-of-bounds items are not equal to everything else,
* forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked
* trivial.
*/
template <bool LAST_TILE>
struct OobInequalityOp
{
OffsetT num_remaining;
EqualityOpT equality_op;
__device__ __forceinline__ OobInequalityOp(
OffsetT num_remaining,
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
// Per-thread fields
//---------------------------------------------------------------------
_TempStorage& temp_storage; ///< Reference to temp_storage
WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items
OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets
LengthsOutputIteratorT d_lengths_out; ///< Output run lengths
EqualityOpT equality_op; ///< T equality operator
ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator
OffsetT num_items; ///< Total number of input items
//---------------------------------------------------------------------
// Constructor
//---------------------------------------------------------------------
// Constructor
__device__ __forceinline__
AgentRle(
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
// Utility methods for initializing the selections
//---------------------------------------------------------------------
/**
 * Compute per-item run selections for one tile.
 *
 * Flags run heads and tails across the tile's items with BlockDiscontinuityT,
 * feeding in the neighboring tile's boundary items where they exist, then
 * zips the flags into lengths_and_num_runs:
 *   .key   = 1 iff the item is a head but not a tail (starts a run of
 *            length > 1, i.e. counts one non-trivial run)
 *   .value = 1 unless the item is both head and tail (a trivial
 *            single-item run contributes no length)
 *
 * tile_offset          - global offset of this tile's first item
 * num_remaining        - items remaining in the input including this tile
 * items                - this thread's loaded items
 * lengths_and_num_runs - output zipped (run-count, run-length) pairs
 */
template <bool FIRST_TILE, bool LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
OffsetT tile_offset,
OffsetT num_remaining,
T (&items)[ITEMS_PER_THREAD],
LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD])
{
bool head_flags[ITEMS_PER_THREAD];
bool tail_flags[ITEMS_PER_THREAD];
// Treats out-of-bounds items as unequal to everything (see OobInequalityOp)
OobInequalityOp<LAST_TILE> inequality_op(num_remaining, equality_op);
if (FIRST_TILE && LAST_TILE)
{
// First-and-last-tile always head-flags the first item and tail-flags the last item
BlockDiscontinuityT(temp_storage.discontinuity).FlagHeadsAndTails(
head_flags, tail_flags, items, inequality_op);
}
else if (FIRST_TILE)
{
// First-tile always head-flags the first item
// Get the first item from the next tile (tail flags need the successor)
T tile_successor_item;
if (threadIdx.x == BLOCK_THREADS - 1)
tile_successor_item = d_in[tile_offset + TILE_ITEMS];
BlockDiscontinuityT(temp_storage.discontinuity).FlagHeadsAndTails(
head_flags, tail_flags, tile_successor_item, items, inequality_op);
}
else if (LAST_TILE)
{
// Last-tile always flags the last item
// Get the last item from the previous tile (head flags need the predecessor)
T tile_predecessor_item;
if (threadIdx.x == 0)
tile_predecessor_item = d_in[tile_offset - 1];
BlockDiscontinuityT(temp_storage.discontinuity).FlagHeadsAndTails(
head_flags, tile_predecessor_item, tail_flags, items, inequality_op);
}
else
{
// Interior tile: both boundary items are needed
// Get the first item from the next tile
T tile_successor_item;
if (threadIdx.x == BLOCK_THREADS - 1)
tile_successor_item = d_in[tile_offset + TILE_ITEMS];
// Get the last item from the previous tile
T tile_predecessor_item;
if (threadIdx.x == 0)
tile_predecessor_item = d_in[tile_offset - 1];
BlockDiscontinuityT(temp_storage.discontinuity).FlagHeadsAndTails(
head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op);
}
// Zip counts and runs: key counts non-trivial run starts, value counts run length
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
lengths_and_num_runs[ITEM].key = head_flags[ITEM] && (!tail_flags[ITEM]);
lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM]));
}
}
//---------------------------------------------------------------------
// Scan utility methods
//---------------------------------------------------------------------
/**
* Scan of allocations
*/
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
// Load items
T items[ITEMS_PER_THREAD];
if (LAST_TILE)
BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining, T());
else
BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
if (SYNC_AFTER_LOAD)
CTA_SYNC();
// Set flags
LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD];
InitializeSelections<true, LAST_TILE>(
tile_offset,
num_remaining,
items,
lengths_and_num_runs);
// Exclusive scan of lengths and runs
LengthOffsetPair tile_aggregate;
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
// Load items
T items[ITEMS_PER_THREAD];
if (LAST_TILE)
BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining, T());
else
BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
if (SYNC_AFTER_LOAD)
CTA_SYNC();
// Set flags
LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD];
InitializeSelections<false, LAST_TILE>(
tile_offset,
num_remaining,
items,
lengths_and_num_runs);
// Exclusive scan of lengths and runs
LengthOffsetPair tile_aggregate;
xgboost/cub/cub/agent/agent_segment_fixup.cuh view on Meta::CPAN
if (IS_LAST_TILE)
BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
else
BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
CTA_SYNC();
KeyValuePairT tile_aggregate;
if (tile_idx == 0)
{
// Exclusive scan of values and segment_flags
BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate);
// Update tile status if this is not the last tile
if (threadIdx.x == 0)
{
// Set first segment id to not trigger a flush (invalid from exclusive scan)
scatter_pairs[0].key = pairs[0].key;
if (!IS_LAST_TILE)
tile_state.SetInclusive(0, tile_aggregate);
}
}
else
{
// Exclusive scan of values and segment_flags
TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op);
tile_aggregate = prefix_op.GetBlockAggregate();
}
// Scatter updated values
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
if (scatter_pairs[ITEM].key != pairs[ITEM].key)
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection
*
* Performs functor-based selection if SelectOpT functor type != NullType
* Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
* Otherwise performs discontinuity selection (keep unique)
*/
template <
typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type
typename InputIteratorT, ///< Random-access input iterator type for selection items
typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items
typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
typename OffsetT, ///< Signed integer type for global offsets
bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output
struct AgentSelectIf
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
// The flag value type
typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
// Tile status descriptor interface type
typedef ScanTileState<OffsetT> ScanTileStateT;
// Constants
enum
{
USE_SELECT_OP,
USE_SELECT_FLAGS,
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
WrappedFlagsInputIteratorT;
// Parameterized BlockLoad type for input data
typedef BlockLoad<
OutputT,
BLOCK_THREADS,
ITEMS_PER_THREAD,
AgentSelectIfPolicyT::LOAD_ALGORITHM>
BlockLoadT;
// Parameterized BlockLoad type for flags
typedef BlockLoad<
FlagT,
BLOCK_THREADS,
ITEMS_PER_THREAD,
AgentSelectIfPolicyT::LOAD_ALGORITHM>
BlockLoadFlags;
// Parameterized BlockDiscontinuity type for items
typedef BlockDiscontinuity<
OutputT,
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
{
typename BlockScanT::TempStorage scan; // Smem needed for tile scanning
typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback
typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection
};
// Smem needed for loading items
typename BlockLoadT::TempStorage load_items;
// Smem needed for loading values
typename BlockLoadFlags::TempStorage load_flags;
// Smem needed for compacting items (allows non POD items in this union)
Uninitialized<ItemExchangeT> raw_exchange;
};
// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
_TempStorage& temp_storage; ///< Reference to temp_storage
WrappedInputIteratorT d_in; ///< Input items
SelectedOutputIteratorT d_selected_out; ///< Unique output items
WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable)
InequalityWrapper<EqualityOpT> inequality_op; ///< T inequality operator
SelectOpT select_op; ///< Selection operator
OffsetT num_items; ///< Total number of input items
//---------------------------------------------------------------------
// Constructor
//---------------------------------------------------------------------
// Constructor.
// Binds the shared-memory reference and problem inputs. Note the caller's
// EqualityOpT is stored wrapped in an InequalityWrapper, so discontinuity
// detection flags *differences* between adjacent items.
__device__ __forceinline__
AgentSelectIf(
TempStorage &temp_storage, ///< Reference to temp_storage
InputIteratorT d_in, ///< Input data
FlagsInputIteratorT d_flags_in, ///< Input selection flags (if applicable)
SelectedOutputIteratorT d_selected_out, ///< Output data
SelectOpT select_op, ///< Selection operator
EqualityOpT equality_op, ///< Equality operator (wrapped into inequality_op)
OffsetT num_items) ///< Total number of input items
:
temp_storage(temp_storage.Alias()),
d_in(d_in),
d_flags_in(d_flags_in),
d_selected_out(d_selected_out),
select_op(select_op),
inequality_op(equality_op),
num_items(num_items)
{}
//---------------------------------------------------------------------
// Utility methods for initializing the selections
//---------------------------------------------------------------------
/**
 * Initialize selections (specialized for selection operator).
 *
 * Evaluates select_op on each of this thread's items and records the result
 * in selection_flags. In the last (partial) tile, items at or beyond
 * num_tile_items keep the default flag of 1; the tile consumers later
 * subtract these out-of-bounds "phantom" selections from the tile totals.
 */
template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
OffsetT /*tile_offset*/,
OffsetT num_tile_items,
OutputT (&items)[ITEMS_PER_THREAD],
OffsetT (&selection_flags)[ITEMS_PER_THREAD],
Int2Type<USE_SELECT_OP> /*select_method*/)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
// Default: out-of-bounds items are flagged selected (discounted later)
selection_flags[ITEM] = 1;
if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
selection_flags[ITEM] = select_op(items[ITEM]);
}
}
/**
 * Initialize selections (specialized for valid flags).
 *
 * Loads this tile's flags from d_flags_in and widens them into
 * selection_flags. In the last (partial) tile the guarded load fills
 * out-of-bounds slots with 1 (those phantom selections are discounted
 * later by the tile consumers). The leading CTA_SYNC() presumably fences
 * reuse of the aliased _TempStorage before load_flags is touched --
 * NOTE(review): confirm against the full _TempStorage layout.
 */
template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
OffsetT tile_offset,
OffsetT num_tile_items,
OutputT (&/*items*/)[ITEMS_PER_THREAD],
OffsetT (&selection_flags)[ITEMS_PER_THREAD],
Int2Type<USE_SELECT_FLAGS> /*select_method*/)
{
CTA_SYNC();
FlagT flags[ITEMS_PER_THREAD];
if (IS_LAST_TILE)
{
// Guarded load: out-of-bounds flags default to 1 (discounted later)
BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1);
}
else
{
BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags);
}
// Convert flag type to selection_flags type
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
selection_flags[ITEM] = flags[ITEM];
}
}
/**
 * Initialize selections (specialized for discontinuity detection, i.e.
 * "keep unique").
 *
 * Uses BlockDiscontinuity::FlagHeads with inequality_op to flag each item
 * that differs from its predecessor. The first tile flags its very first
 * item unconditionally (no predecessor); subsequent tiles fetch the last
 * item of the preceding tile via thread0 so the tile boundary is compared
 * correctly. Out-of-bounds items in the last tile are then force-flagged
 * (discounted later by the tile consumers).
 */
template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
OffsetT tile_offset,
OffsetT num_tile_items,
OutputT (&items)[ITEMS_PER_THREAD],
OffsetT (&selection_flags)[ITEMS_PER_THREAD],
Int2Type<USE_DISCONTINUITY> /*select_method*/)
{
if (IS_FIRST_TILE)
{
CTA_SYNC();
// Set head selection_flags. First tile sets the first flag for the first item
BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op);
}
else
{
// thread0 supplies the last item of the previous tile as the predecessor
OutputT tile_predecessor;
if (threadIdx.x == 0)
tile_predecessor = d_in[tile_offset - 1];
CTA_SYNC();
BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor);
}
// Set selection flags for out-of-bounds items
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
// Set selection_flags for out-of-bounds items
if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
selection_flags[ITEM] = 1;
}
}
//---------------------------------------------------------------------
// Scatter utility methods
//---------------------------------------------------------------------
/**
 * Scatter flagged items to output offsets (specialized for direct scattering).
 *
 * Each thread writes its selected items straight to d_selected_out at the
 * globally-scanned rank in selection_indices. In the last tile,
 * out-of-bounds items carry flag=1, so their scan ranks land at or past
 * num_selections and are filtered out here.
 */
template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
__device__ __forceinline__ void ScatterDirect(
OutputT (&items)[ITEMS_PER_THREAD],
OffsetT (&selection_flags)[ITEMS_PER_THREAD],
OffsetT (&selection_indices)[ITEMS_PER_THREAD],
OffsetT num_selections)
{
// Scatter flagged items
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
if (selection_flags[ITEM])
{
// Suppress phantom selections from the partial last tile
if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections)
{
d_selected_out[selection_indices[ITEM]] = items[ITEM];
}
}
}
}
/**
 * Scatter flagged items to output offsets (specialized for two-phase
 * scattering, selected partition only).
 *
 * Phase 1 compacts this tile's selected items into shared memory
 * (raw_exchange) at their tile-local ranks; phase 2 copies the compacted
 * run to d_selected_out with block-strided stores starting at
 * num_selections_prefix.
 */
template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
OutputT (&items)[ITEMS_PER_THREAD],
OffsetT (&selection_flags)[ITEMS_PER_THREAD],
OffsetT (&selection_indices)[ITEMS_PER_THREAD],
int /*num_tile_items*/, ///< Number of valid items in this tile
int num_tile_selections, ///< Number of selections in this tile
OffsetT num_selections_prefix, ///< Total number of selections prior to this tile
OffsetT /*num_rejected_prefix*/, ///< Total number of rejections prior to this tile
Int2Type<false> /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition
{
CTA_SYNC();
// Phase 1: compact selected items into shared memory at tile-local ranks
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix;
if (selection_flags[ITEM])
{
temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
}
}
CTA_SYNC();
// Phase 2: block-strided copy of the compacted items to global memory
for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
{
d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
}
}
/**
* Scatter flagged items to output offsets (specialized for two-phase scattering)
*/
template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
OutputT (&items)[ITEMS_PER_THREAD],
OffsetT (&selection_flags)[ITEMS_PER_THREAD],
OffsetT (&selection_indices)[ITEMS_PER_THREAD],
int num_tile_items, ///< Number of valid items in this tile
int num_tile_selections, ///< Number of selections in this tile
OffsetT num_selections_prefix, ///< Total number of selections prior to this tile
OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile
Int2Type<true> /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition
{
CTA_SYNC();
int tile_num_rejections = num_tile_items - num_tile_selections;
// Scatter items to shared memory (rejections first)
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
int local_selection_idx = selection_indices[ITEM] - num_selections_prefix;
int local_rejection_idx = item_idx - local_selection_idx;
int local_scatter_offset = (selection_flags[ITEM]) ?
tile_num_rejections + local_selection_idx :
local_rejection_idx;
temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
}
CTA_SYNC();
// Gather items from shared memory and scatter to global
#pragma unroll
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
if (!IS_LAST_TILE || (item_idx < num_tile_items))
{
d_selected_out[scatter_offset] = item;
}
}
}
/**
 * Scatter flagged items.
 *
 * Dispatches between the two scatter strategies: two-phase (via shared
 * memory) when keeping both partitions, or when two-phase is enabled and
 * the tile has more selections than threads; otherwise direct scattering.
 */
template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
__device__ __forceinline__ void Scatter(
OutputT (&items)[ITEMS_PER_THREAD],
OffsetT (&selection_flags)[ITEMS_PER_THREAD],
OffsetT (&selection_indices)[ITEMS_PER_THREAD],
int num_tile_items, ///< Number of valid items in this tile
int num_tile_selections, ///< Number of selections in this tile
OffsetT num_selections_prefix, ///< Total number of selections prior to this tile
OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile
OffsetT num_selections) ///< Total number of selections including this tile
{
// Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one
if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS)))
{
ScatterTwoPhase<IS_LAST_TILE, IS_FIRST_TILE>(
items,
selection_flags,
selection_indices,
num_tile_items,
num_tile_selections,
num_selections_prefix,
num_rejected_prefix,
Int2Type<KEEP_REJECTS>());
}
else
{
// Direct scatter: each thread writes its own selections to global memory
ScatterDirect<IS_LAST_TILE, IS_FIRST_TILE>(
items,
selection_flags,
selection_indices,
num_selections);
}
}
//---------------------------------------------------------------------
// Cooperatively scan a device-wide sequence of tiles with other CTAs
//---------------------------------------------------------------------
/**
 * Process first tile of input (dynamic chained scan). Returns the running
 * count of selections (including this tile).
 *
 * Pipeline: load items (guarded if last tile) -> compute 0/1
 * selection_flags -> block-wide exclusive sum to get per-item output
 * ranks and the tile total -> publish the tile aggregate to tile_state
 * (tile 0) for the decoupled look-back of later tiles -> discount
 * phantom selections from a partial tile -> scatter.
 */
template <bool IS_LAST_TILE>
__device__ __forceinline__ OffsetT ConsumeFirstTile(
int num_tile_items, ///< Number of input items comprising this tile
OffsetT tile_offset, ///< Tile offset
ScanTileStateT& tile_state) ///< Global tile state descriptor
{
OutputT items[ITEMS_PER_THREAD];
OffsetT selection_flags[ITEMS_PER_THREAD];
OffsetT selection_indices[ITEMS_PER_THREAD];
// Load items (guarded load for the partial last tile)
if (IS_LAST_TILE)
BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
else
BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
// Initialize selection_flags
InitializeSelections<true, IS_LAST_TILE>(
tile_offset,
num_tile_items,
items,
selection_flags,
Int2Type<SELECT_METHOD>());
CTA_SYNC();
// Exclusive scan of selection_flags yields per-item output ranks
OffsetT num_tile_selections;
BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections);
if (threadIdx.x == 0)
{
// Update tile status if this is not the last tile
if (!IS_LAST_TILE)
tile_state.SetInclusive(0, num_tile_selections);
}
// Discount any out-of-bounds selections (flags defaulted to 1 past the end)
if (IS_LAST_TILE)
num_tile_selections -= (TILE_ITEMS - num_tile_items);
// Scatter flagged items
Scatter<IS_LAST_TILE, true>(
items,
selection_flags,
selection_indices,
num_tile_items,
num_tile_selections,
0,
0,
num_tile_selections);
return num_tile_selections;
}
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
* Process subsequent tile of input (dynamic chained scan). Returns the running count of selections (including this tile)
*/
// Process a subsequent (non-first) tile of input as part of the dynamic
// chained scan. Returns the running count of selections including this tile.
// Same pipeline as ConsumeFirstTile, except the exclusive sum is seeded by
// a TilePrefixCallbackOpT that performs the look-back over preceding tiles'
// published aggregates in tile_state.
template <bool IS_LAST_TILE>
__device__ __forceinline__ OffsetT ConsumeSubsequentTile(
int num_tile_items, ///< Number of input items comprising this tile
int tile_idx, ///< Tile index
OffsetT tile_offset, ///< Tile offset
ScanTileStateT& tile_state) ///< Global tile state descriptor
{
OutputT items[ITEMS_PER_THREAD];
OffsetT selection_flags[ITEMS_PER_THREAD];
OffsetT selection_indices[ITEMS_PER_THREAD];
// Load items (guarded load for the partial last tile)
if (IS_LAST_TILE)
BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
else
BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
// Initialize selection_flags
InitializeSelections<false, IS_LAST_TILE>(
tile_offset,
num_tile_items,
items,
selection_flags,
Int2Type<SELECT_METHOD>());
CTA_SYNC();
// Exclusive scan of values and selection_flags, seeded with the running
// prefix from all preceding tiles
TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx);
BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op);
OffsetT num_tile_selections = prefix_op.GetBlockAggregate();
OffsetT num_selections = prefix_op.GetInclusivePrefix();
OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix();
// Rejections prior to this tile = items seen so far minus selections so far
OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - num_selections_prefix;
// Discount any out-of-bounds selections (flags defaulted to 1 past the end)
if (IS_LAST_TILE)
{
int num_discount = TILE_ITEMS - num_tile_items;
num_selections -= num_discount;
num_tile_selections -= num_discount;
}
// Scatter flagged items
Scatter<IS_LAST_TILE, false>(
items,
selection_flags,
selection_indices,
num_tile_items,
num_tile_selections,
num_selections_prefix,
num_rejected_prefix,
num_selections);
return num_selections;
}
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
num_selections = ConsumeSubsequentTile<IS_LAST_TILE>(num_tile_items, tile_idx, tile_offset, tile_state);
}
return num_selections;
}
/**
 * Scan tiles of items as part of a dynamic chained scan.
 *
 * One tile per thread block: the tile index is derived from the 2D grid
 * position (blocks are launched in increasing order). Full tiles take the
 * unguarded path; the final (possibly partial) tile takes the guarded
 * path, and its thread0 writes the grand total to d_num_selected_out.
 */
template <typename NumSelectedIteratorT> ///< Output iterator type for recording number of items selection_flags
__device__ __forceinline__ void ConsumeRange(
int num_tiles, ///< Total number of input tiles
ScanTileStateT& tile_state, ///< Global tile state descriptor
NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags
{
// Blocks are launched in increasing order, so just assign one tile per block
int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile
if (tile_idx < num_tiles - 1)
{
// Not the last tile (full)
ConsumeTile<false>(TILE_ITEMS, tile_idx, tile_offset, tile_state);
}
else
{
// The last tile (possibly partially-full)
OffsetT num_remaining = num_items - tile_offset;
OffsetT num_selections = ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
if (threadIdx.x == 0)
{
// Output the total number of selected items
*d_num_selected_out = num_selections;
}
}
}
};
} // CUB namespace
xgboost/cub/cub/agent/agent_spmv_row_based.cuh view on Meta::CPAN
row_start = temp_storage.nonzeros[local_row_nonzero_idx];
temp_storage.nonzeros[local_row_nonzero_idx] = NAN_TOKEN;
}
CTA_SYNC();
//
// Segmented scan
//
// Read strip of nonzeros into thread-blocked order, setup segment flags
KeyValuePairT scan_items[NNZ_PER_THREAD];
for (int ITEM = 0; ITEM < NNZ_PER_THREAD; ++ITEM)
{
int local_nonzero_idx = (threadIdx.x * NNZ_PER_THREAD) + ITEM;
ValueT value = temp_storage.nonzeros[local_nonzero_idx];
bool is_nan = (value != value);
scan_items[ITEM].value = (is_nan) ? 0.0 : value;
scan_items[ITEM].key = is_nan;
}
xgboost/cub/cub/agent/single_pass_scan_operators.cuh view on Meta::CPAN
int predecessor_idx, ///< Preceding tile index to inspect
StatusWord &predecessor_status, ///< [out] Preceding tile status
T &window_aggregate) ///< [out] Relevant partial reduction from this window of preceding tiles
{
T value;
tile_status.WaitForValid(predecessor_idx, predecessor_status, value);
// Perform a segmented reduction to get the prefix for the current window.
// Use the swizzled scan operator because we are now scanning *down* towards thread0.
int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce(
value,
tail_flag,
SwizzleScanOp<ScanOpT>(scan_op));
}
// BlockScan prefix callback functor (called by the first warp)
__device__ __forceinline__
T operator()(T block_aggregate)
{
// Update our status with our tile-aggregate
xgboost/cub/cub/block/block_adjacent_difference.cuh view on Meta::CPAN
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
*/
#pragma once
#include "../util_type.cuh"
#include "../util_ptx.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
xgboost/cub/cub/block/block_adjacent_difference.cuh view on Meta::CPAN
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/// Adapter that invokes the user's flag operator uniformly whether or not it
/// accepts a third (item-index) parameter. Note the swapped argument order:
/// the operator is invoked as flag_op(b, a[, idx]).
/// Specialization for when FlagOp has third index param
template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
struct ApplyOp
{
// Apply flag operator, forwarding the item index
static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
{
return flag_op(b, a, idx);
}
};
/// Specialization for when FlagOp does not have a third index param
template <typename FlagOp>
struct ApplyOp<FlagOp, false>
{
// Apply flag operator, dropping the (unused) item index
static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
{
return flag_op(b, a);
}
};
/// Templated unrolling of item comparison (inductive case).
/// Compile-time recursion over a thread's items [ITERATION, MAX_ITERATIONS):
/// each step flags one item against its neighbor, then recurses on the next.
template <int ITERATION, int MAX_ITERATIONS>
struct Iterate
{
// Head flags: item ITERATION is compared against its in-thread predecessor
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
static __device__ __forceinline__ void FlagHeads(
int linear_tid,
FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Predecessor of item i is item i-1 of the same thread (callers seed preds[0])
preds[ITERATION] = input[ITERATION - 1];
flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
flag_op,
preds[ITERATION],
input[ITERATION],
(linear_tid * ITEMS_PER_THREAD) + ITERATION);
Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
}
// Tail flags: item ITERATION is compared against its in-thread successor
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
static __device__ __forceinline__ void FlagTails(
int linear_tid,
FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITERATION],
input[ITERATION + 1],
(linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
}
};
/// Templated unrolling of item comparison (termination case):
/// ITERATION == MAX_ITERATIONS, nothing left to flag.
template <int MAX_ITERATIONS>
struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
{
// Head flags (no-op terminator)
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
static __device__ __forceinline__ void FlagHeads(
int /*linear_tid*/,
FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items
FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate
{}
// Tail flags (no-op terminator)
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
static __device__ __forceinline__ void FlagTails(
int /*linear_tid*/,
FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate
{}
};
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
xgboost/cub/cub/block/block_adjacent_difference.cuh view on Meta::CPAN
// Collective constructor using the specified memory allocation as temporary
// storage; derives this thread's linear rank from the 3D thread coordinates.
__device__ __forceinline__ BlockAdjacentDifference(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
//@} end member group
/******************************************************************//**
* \name Head flag operations
*********************************************************************/
//@{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// Sets head flags for a blocked arrangement, also returning each item's
// predecessor in preds. Cross-thread predecessors are exchanged through
// temp_storage.last_items. thread0's first item is unconditionally flagged
// as a head (it has no predecessor; preds[0] is left undefined for thread0).
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagHeads(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
if (linear_tid == 0)
{
// Set flag for first thread-item (preds[0] is undefined)
head_flags[0] = 1;
}
else
{
// Predecessor of this thread's first item is the previous thread's last item
preds[0] = temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
}
// Set head_flags for remaining items
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
}
// Sets head flags for a blocked arrangement with an explicit tile
// predecessor: thread0 compares its first item against
// tile_predecessor_item instead of being unconditionally flagged, so no
// item is exempt from comparison.
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagHeads(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items
FlagOp flag_op, ///< [in] Binary boolean flag predicate
T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input[0] from thread0)
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]
;
CTA_SYNC();
// Set flag for first thread-item: thread0 uses the tile predecessor,
// other threads use the previous thread's shared last item
preds[0] = (linear_tid == 0) ?
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
// Set head_flags for remaining items
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/// Sets head flags for a blocked arrangement, discarding the predecessor
/// items into thread-local scratch storage the caller never sees.
template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
__device__ __forceinline__ void FlagHeads(
    FlagT (&head_flags)[ITEMS_PER_THREAD],  ///< [out] Calling thread's discontinuity head_flags
    T (&input)[ITEMS_PER_THREAD],           ///< [in] Calling thread's input items
    FlagOp flag_op)                         ///< [in] Binary boolean flag predicate
{
    // Scratch array for predecessors the caller did not ask for
    T scratch_preds[ITEMS_PER_THREAD];
    FlagHeads(head_flags, input, scratch_preds, flag_op);
}
/// Sets head flags for a blocked arrangement with an explicit tile
/// predecessor for thread0, discarding the per-item predecessors into
/// thread-local scratch storage the caller never sees.
template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
__device__ __forceinline__ void FlagHeads(
    FlagT (&head_flags)[ITEMS_PER_THREAD],  ///< [out] Calling thread's discontinuity head_flags
    T (&input)[ITEMS_PER_THREAD],           ///< [in] Calling thread's input items
    FlagOp flag_op,                         ///< [in] Binary boolean flag predicate
    T tile_predecessor_item)                ///< [in] [thread0 only] Item compared against thread0's input[0]
{
    // Scratch array for predecessors the caller did not ask for
    T scratch_preds[ITEMS_PER_THREAD];
    FlagHeads(head_flags, input, scratch_preds, flag_op, tile_predecessor_item);
}
// Sets tail flags for a blocked arrangement with no tile successor:
// cross-thread successors are exchanged through temp_storage.first_items,
// and the last thread's last item is unconditionally flagged as a tail.
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagTails(
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share first item
temp_storage.first_items[linear_tid] = input[0];
CTA_SYNC();
// Set flag for last thread-item (the last thread has no successor)
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set tail_flags for remaining items
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
// Sets tail flags for a blocked arrangement with an explicit tile
// successor: the last thread compares its last item against
// tile_successor_item instead of being unconditionally flagged.
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagTails(
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op, ///< [in] Binary boolean flag predicate
T tile_successor_item) ///< [in] [thread BLOCK_THREADS-1 only] Item with which to compare the last tile item (input[ITEMS_PER_THREAD-1] from the last thread)
{
// Share first item
temp_storage.first_items[linear_tid] = input[0];
CTA_SYNC();
// Set flag for last thread-item: last thread uses the tile successor,
// other threads use the next thread's shared first item
T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set tail_flags for remaining items
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
/**
 * Sets both head and tail flags for a blocked arrangement with no tile
 * predecessor or successor: thread0's first item is always a head and the
 * last thread's last item is always a tail.
 *
 * Fix: the predecessor read is now performed only by threads other than
 * thread0. The previous code executed
 * preds[0] = temp_storage.last_items[linear_tid - 1] unconditionally,
 * which indexed last_items[-1] on thread0 -- an out-of-bounds
 * shared-memory read. thread0 never consumes preds[0] (head_flags[0] is
 * forced to 1 and Iterate<1,...> fills preds[i] from input[i-1]), so the
 * guard is behavior-preserving for all valid threads; the sibling
 * overloads already use this guarded form.
 */
template <
    int ITEMS_PER_THREAD,
    typename FlagT,
    typename FlagOp>
__device__ __forceinline__ void FlagHeadsAndTails(
    FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
    FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
    T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
    FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
    // Share first and last items
    temp_storage.first_items[linear_tid] = input[0];
    temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
    CTA_SYNC();
    T preds[ITEMS_PER_THREAD];
    // Set flag for first thread-item
    if (linear_tid == 0)
    {
        // preds[0] is never read for thread0; head_flags[0] is forced to 1
        head_flags[0] = 1;
    }
    else
    {
        preds[0] = temp_storage.last_items[linear_tid - 1];
        head_flags[0] = ApplyOp<FlagOp>::FlagT(
            flag_op,
            preds[0],
            input[0],
            linear_tid * ITEMS_PER_THREAD);
    }
    // Set flag for last thread-item (the last thread has no successor)
    tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
        1 : // Last thread
        ApplyOp<FlagOp>::FlagT(
            flag_op,
            input[ITEMS_PER_THREAD - 1],
            temp_storage.first_items[linear_tid + 1],
            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
    // Set head_flags for remaining items
    Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
    // Set tail_flags for remaining items
    Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
// Sets both head and tail flags for a blocked arrangement with an explicit
// tile successor: thread0's first item is always a head, while the last
// thread compares its last item against tile_successor_item. The
// predecessor read is correctly guarded so thread0 never indexes
// last_items[-1].
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagHeadsAndTails(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T tile_successor_item, ///< [in] [thread BLOCK_THREADS-1 only] Item with which to compare the last tile item (input[ITEMS_PER_THREAD-1] from the last thread)
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share first and last items
temp_storage.first_items[linear_tid] = input[0];
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
T preds[ITEMS_PER_THREAD];
// Set flag for first thread-item (preds[0] is undefined for thread0)
if (linear_tid == 0)
{
head_flags[0] = 1;
}
else
{
preds[0] = temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
preds[0],
input[0],
linear_tid * ITEMS_PER_THREAD);
}
// Set flag for last thread-item: last thread uses the tile successor
T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set head_flags for remaining items
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
// Set tail_flags for remaining items
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
// Compute both head and tail discontinuity flags, where thread_0 compares its
// first item against the caller-supplied tile_predecessor_item.  The last
// thread's last item is unconditionally tail-flagged.
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagHeadsAndTails(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T tile_predecessor_item, ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share first and last items
temp_storage.first_items[linear_tid] = input[0];
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
// Barrier: boundary items must be visible to neighbors before the reads below
CTA_SYNC();
T preds[ITEMS_PER_THREAD];
// Set flag for first thread-item: thread_0 uses the tile predecessor,
// all others use the previous thread's last item.
preds[0] = (linear_tid == 0) ?
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
preds[0],
input[0],
linear_tid * ITEMS_PER_THREAD);
// Set flag for last thread-item (last thread is always tail-flagged;
// only the taken ternary branch is evaluated)
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set head_flags for remaining items (compile-time unrolled)
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
// Set tail_flags for remaining items
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
// Compute both head and tail discontinuity flags, with explicit tile
// predecessor (used by thread_0) and tile successor (used by the last thread):
// no item is unconditionally flagged in this overload.
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagHeadsAndTails(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T tile_predecessor_item, ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T tile_successor_item, ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread...
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share first and last items
temp_storage.first_items[linear_tid] = input[0];
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
// Barrier: boundary items must be visible to neighbors before the reads below
CTA_SYNC();
T preds[ITEMS_PER_THREAD];
// Set flag for first thread-item: thread_0 compares against the tile
// predecessor, all others against the previous thread's last item.
preds[0] = (linear_tid == 0) ?
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
preds[0],
input[0],
linear_tid * ITEMS_PER_THREAD);
// Set flag for last thread-item: last thread compares against the tile
// successor, all others against the next thread's first item.
T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set head_flags for remaining items (compile-time unrolled)
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
// Set tail_flags for remaining items
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
*/
#pragma once
#include "../util_type.cuh"
#include "../util_ptx.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief The BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. 
* \ingroup BlockModule
*
* \tparam T The data type to be flagged.
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
*
* \par Overview
* - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
* that differ from their predecessors (or successors). For example, head flags are convenient
* for demarcating disjoint data segments as part of a segmented scan or reduction.
* - \blocked
*
* \par Performance Considerations
* - \granularity
*
* \par A Simple Example
* \blockcollective{BlockDiscontinuity}
* \par
* The code snippet below illustrates the head flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Collectively compute head flags for discontinuities in the segment
* int head_flags[4];
* BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
* The corresponding output \p head_flags in those threads will be
* <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
*
* \par Performance Considerations
* - Incurs zero bank conflicts for most types
*
*/
template <
typename T,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/// Specialization for when FlagOp has third index param.
/// HAS_PARAM is deduced by introspecting FlagOp's call signature, so callers
/// can simply write ApplyOp<FlagOp> and get the right dispatch.
template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
struct ApplyOp
{
// Apply flag operator, forwarding the global item index to the
// three-argument functor form flag_op(a, b, idx)
static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
{
return flag_op(a, b, idx);
}
};
/// Specialization for when FlagOp does not have a third index param:
/// the item index is accepted for interface uniformity but discarded.
template <typename FlagOp>
struct ApplyOp<FlagOp, false>
{
// Apply flag operator using the two-argument functor form flag_op(a, b)
static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
{
return flag_op(a, b);
}
};
/// Templated unrolling of item comparison (inductive case).
/// Compile-time recursion over ITERATION flags one item per step, letting the
/// compiler fully unroll the per-thread flagging loops.
template <int ITERATION, int MAX_ITERATIONS>
struct Iterate
{
// Head flags: flag item ITERATION against its in-thread predecessor
// (items 1..N-1; item 0 is handled by the caller using shared memory)
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
static __device__ __forceinline__ void FlagHeads(
int linear_tid,
FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Record the predecessor so callers can reuse it, then apply the
// predicate at this item's block-wide index
preds[ITERATION] = input[ITERATION - 1];
flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
flag_op,
preds[ITERATION],
input[ITERATION],
(linear_tid * ITEMS_PER_THREAD) + ITERATION);
// Recurse to the next item (terminates at the MAX_ITERATIONS specialization)
Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
}
// Tail flags: flag item ITERATION against its in-thread successor
// (items 0..N-2; the last item is handled by the caller using shared memory)
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
static __device__ __forceinline__ void FlagTails(
int linear_tid,
FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Index passed to the functor is that of the successor item
flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITERATION],
input[ITERATION + 1],
(linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
// Recurse to the next item
Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
}
};
/// Templated unrolling of item comparison (termination case).
/// Selected when ITERATION == MAX_ITERATIONS; both methods are no-ops that
/// end the compile-time recursion.
template <int MAX_ITERATIONS>
struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
{
// Head flags (no-op terminator)
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
static __device__ __forceinline__ void FlagHeads(
int /*linear_tid*/,
FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items
FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate
{}
// Tail flags (no-op terminator)
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
static __device__ __forceinline__ void FlagTails(
int /*linear_tid*/,
FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate
{}
};
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
/// Collective constructor: binds this object to the caller-allocated shared
/// storage (via Alias()) and computes the calling thread's row-major linear
/// rank within the (BLOCK_DIM_X x BLOCK_DIM_Y x BLOCK_DIM_Z) thread block.
__device__ __forceinline__ BlockDiscontinuity(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
//@} end member group
/******************************************************************//**
* \name Head flag operations
*********************************************************************/
//@{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// FlagHeads worker overload: computes head flags and also returns each item's
// predecessor in preds.  Thread_0's first item is unconditionally flagged.
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagHeads(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share last item so the next thread can use it as its first predecessor
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
// Barrier: all last items must be published before neighbors read them
CTA_SYNC();
if (linear_tid == 0)
{
// Set flag for first thread-item (preds[0] is undefined)
head_flags[0] = 1;
}
else
{
// Predecessor is the previous thread's last item
preds[0] = temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
}
// Set head_flags for remaining items (compile-time unrolled, items 1..N-1)
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
}
// FlagHeads worker overload with an explicit tile predecessor: thread_0
// compares its first item against tile_predecessor_item instead of being
// unconditionally flagged.  Also returns each item's predecessor in preds.
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagHeads(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items
FlagOp flag_op, ///< [in] Binary boolean flag predicate
T tile_predecessor_item) ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
{
// Share last item so the next thread can use it as its first predecessor
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
// Barrier: all last items must be published before neighbors read them
CTA_SYNC();
// Set flag for first thread-item: thread_0 uses the tile predecessor,
// all others use the previous thread's last item
preds[0] = (linear_tid == 0) ?
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
// Set head_flags for remaining items (compile-time unrolled, items 1..N-1)
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged.
*
* \par
* - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
* returns \p true (where <em>previous-item</em> is either the preceding item
* in the same thread or the last item in the previous thread).
* - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
* - \blocked
* - \granularity
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the head-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Collectively compute head flags for discontinuities in the segment
* int head_flags[4];
* BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
* The corresponding output \p head_flags in those threads will be
* <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type)
* \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagHeads(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Delegate to the worker overload, discarding the predecessor items it
// gathers into this scratch array.
T scratch_preds[ITEMS_PER_THREAD];
FlagHeads(head_flags, input, scratch_preds, flag_op);
}
/**
* \brief Sets head flags indicating discontinuities between items partitioned across the thread block.
*
* \par
* - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
* returns \p true (where <em>previous-item</em> is either the preceding item
* in the same thread or the last item in the previous thread).
* - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
* against \p tile_predecessor_item.
* - \blocked
* - \granularity
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the head-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Have thread0 obtain the predecessor item for the entire tile
* int tile_predecessor_item;
* if (threadIdx.x == 0) tile_predecessor_item = ...
*
* // Collectively compute head flags for discontinuities in the segment
* int head_flags[4];
* BlockDiscontinuity(temp_storage).FlagHeads(
* head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>,
* and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be
* <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type)
* \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagHeads(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op, ///< [in] Binary boolean flag predicate
T tile_predecessor_item) ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
{
// Delegate to the worker overload that also returns predecessors,
// discarding them via this scratch array.
T scratch_preds[ITEMS_PER_THREAD];
FlagHeads(head_flags, input, scratch_preds, flag_op, tile_predecessor_item);
}
//@} end member group
/******************************************************************//**
* \name Tail flag operations
*********************************************************************/
//@{
/**
* \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged.
*
* \par
* - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
* returns \p true (where <em>next-item</em> is either the next item
* in the same thread or the first item in the next thread).
* - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
* <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
* - \blocked
* - \granularity
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Collectively compute tail flags for discontinuities in the segment
* int tail_flags[4];
* BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>.
* The corresponding output \p tail_flags in those threads will be
* <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type)
* \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
*/
// Compute tail flags for a blocked arrangement.  The last thread's last item
// has no successor and is unconditionally flagged.
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagTails(
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share first item so the previous thread can use it as its last successor
temp_storage.first_items[linear_tid] = input[0];
// Barrier: all first items must be published before neighbors read them
CTA_SYNC();
// Set flag for last thread-item (last thread is always tail-flagged)
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set tail_flags for remaining items (compile-time unrolled, items 0..N-2)
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
/**
* \brief Sets tail flags indicating discontinuities between items partitioned across the thread block.
*
* \par
* - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
* returns \p true (where <em>next-item</em> is either the next item
* in the same thread or the first item in the next thread).
* - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
* <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
* against \p tile_successor_item.
* - \blocked
* - \granularity
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Have thread127 obtain the successor item for the entire tile
* int tile_successor_item;
* if (threadIdx.x == 127) tile_successor_item = ...
*
* // Collectively compute tail flags for discontinuities in the segment
* int tail_flags[4];
* BlockDiscontinuity(temp_storage).FlagTails(
* tail_flags, thread_data, cub::Inequality(), tile_successor_item);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
* and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be
* <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type)
* \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
*/
// Compute tail flags for a blocked arrangement, where the last thread
// compares its last item against the caller-supplied tile_successor_item.
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagTails(
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op, ///< [in] Binary boolean flag predicate
T tile_successor_item) ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread...
{
// Share first item so the previous thread can use it as its last successor
temp_storage.first_items[linear_tid] = input[0];
// Barrier: all first items must be published before neighbors read them
CTA_SYNC();
// Set flag for last thread-item: last thread compares against the tile
// successor, all others against the next thread's first item
T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set tail_flags for remaining items (compile-time unrolled, items 0..N-2)
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
//@} end member group
/******************************************************************//**
* \name Head & tail flag operations
*********************************************************************/
//@{
/**
* \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
*
* \par
* - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
* returns \p true (where <em>previous-item</em> is either the preceding item
* in the same thread or the last item in the previous thread).
* - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
* - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
* returns \p true (where <em>next-item</em> is either the next item
* in the same thread or the first item in the next thread).
* - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
* <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
* - \blocked
* - \granularity
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the head- and tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Collectively compute head and tail flags for discontinuities in the segment
* int head_flags[4];
* int tail_flags[4];
* BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
* head_flags, tail_flags, thread_data, cub::Inequality());
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>.
* The corresponding output \p head_flags
* in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
* and the corresponding output \p tail_flags in those threads will be
* <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type)
* \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
*/
// Compute both head and tail discontinuity flags for a blocked arrangement.
// Thread_0's first item and thread_(BLOCK_THREADS-1)'s last item are
// unconditionally flagged (no tile predecessor/successor in this overload).
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagHeadsAndTails(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share first and last items
temp_storage.first_items[linear_tid] = input[0];
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
// Barrier: boundary items must be visible to neighbors before the reads below
CTA_SYNC();
T preds[ITEMS_PER_THREAD];
// Set flag for first thread-item
if (linear_tid == 0)
{
// Thread_0 has no predecessor: always a segment head.  preds[0] is
// intentionally left undefined; Iterate<1, ...> below never reads it.
head_flags[0] = 1;
}
else
{
// Load the predecessor (previous thread's last item) only when it
// exists.  The former unconditional load read last_items[linear_tid - 1]
// in thread_0 as well — an out-of-bounds access at index -1 — matching
// the guarded pattern used by the tile_successor_item overload.
preds[0] = temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
preds[0],
input[0],
linear_tid * ITEMS_PER_THREAD);
}
// Set flag for last thread-item (last thread is always tail-flagged)
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set head_flags for remaining items (compile-time unrolled)
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
// Set tail_flags for remaining items
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
/**
* \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
*
* \par
* - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
* returns \p true (where <em>previous-item</em> is either the preceding item
* in the same thread or the last item in the previous thread).
* - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
* - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
* returns \p true (where <em>next-item</em> is either the next item
* in the same thread or the first item in the next thread).
* - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
* <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
* against \p tile_successor_item.
* - \blocked
* - \granularity
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the head- and tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Have thread127 obtain the successor item for the entire tile
* int tile_successor_item;
* if (threadIdx.x == 127) tile_successor_item = ...
*
* // Collectively compute head and tail flags for discontinuities in the segment
* int head_flags[4];
* int tail_flags[4];
* BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
* head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
* and that the tile_successor_item is \p 125. The corresponding output \p head_flags
* in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
* and the corresponding output \p tail_flags in those threads will be
* <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type)
* \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
*/
template <
    int ITEMS_PER_THREAD,
    typename FlagT,
    typename FlagOp>
__device__ __forceinline__ void FlagHeadsAndTails(
    FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
    FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
    T tile_successor_item, ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from the last thread)
    T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
    FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
    // Publish each thread's boundary items so neighbors can read them:
    // thread i needs thread i-1's last item (head flag) and thread i+1's
    // first item (tail flag).
    temp_storage.first_items[linear_tid] = input[0];
    temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
    CTA_SYNC(); // ensure the published boundary items are visible block-wide
    // Predecessor array consumed by Iterate<>::FlagHeads below; only
    // preds[0] is populated here (and only for linear_tid != 0).
    T preds[ITEMS_PER_THREAD];
    // Set flag for first thread-item.  This overload supplies no tile
    // predecessor, so thread0's first item is unconditionally flagged.
    if (linear_tid == 0)
    {
        head_flags[0] = 1;
    }
    else
    {
        preds[0] = temp_storage.last_items[linear_tid - 1];
        head_flags[0] = ApplyOp<FlagOp>::FlagT(
            flag_op,
            preds[0],
            input[0],
            linear_tid * ITEMS_PER_THREAD); // global rank of input[0] in the tile
    }
    // Set flag for last thread-item: compare against the successor item,
    // which for the last thread is the caller-provided tile_successor_item.
    T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
        tile_successor_item : // Last thread
        temp_storage.first_items[linear_tid + 1];
    tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
        flag_op,
        input[ITEMS_PER_THREAD - 1],
        successor_item,
        (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // global rank of the successor
    // Set head_flags for remaining items (thread-local comparisons only)
    Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
    // Set tail_flags for remaining items (thread-local comparisons only)
    Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
/**
* \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
*
* \par
* - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
* returns \p true (where <em>previous-item</em> is either the preceding item
* in the same thread or the last item in the previous thread).
* - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
* against \p tile_predecessor_item.
* - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
* returns \p true (where <em>next-item</em> is either the next item
* in the same thread or the first item in the next thread).
* - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
* <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
* - \blocked
* - \granularity
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the head- and tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* ...
*
* // Have thread0 obtain the predecessor item for the entire tile
* int tile_predecessor_item;
* if (threadIdx.x == 0) tile_predecessor_item = ...
*
* // Collectively compute head and tail flags for discontinuities in the segment
* int head_flags[4];
* int tail_flags[4];
* BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
* head_flags, tile_predecessor_item, tail_flags,
* thread_data, cub::Inequality());
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
* and that the \p tile_predecessor_item is \p 0. The corresponding output \p head_flags
* in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
* and the corresponding output \p tail_flags in those threads will be
* <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type)
* \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
*/
template <
    int ITEMS_PER_THREAD,
    typename FlagT,
    typename FlagOp>
__device__ __forceinline__ void FlagHeadsAndTails(
    FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
    T tile_predecessor_item, ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
    FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
    T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
    FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
    // Publish each thread's boundary items so neighbors can read them:
    // thread i needs thread i-1's last item (head flag) and thread i+1's
    // first item (tail flag).
    temp_storage.first_items[linear_tid] = input[0];
    temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
    CTA_SYNC(); // ensure the published boundary items are visible block-wide
    // Predecessor array consumed by Iterate<>::FlagHeads below; only
    // preds[0] is populated here.
    T preds[ITEMS_PER_THREAD];
    // Set flag for first thread-item: the predecessor is the caller-provided
    // tile_predecessor_item for thread0, else the previous thread's last item.
    preds[0] = (linear_tid == 0) ?
        tile_predecessor_item : // First thread
        temp_storage.last_items[linear_tid - 1];
    head_flags[0] = ApplyOp<FlagOp>::FlagT(
        flag_op,
        preds[0],
        input[0],
        linear_tid * ITEMS_PER_THREAD); // global rank of input[0] in the tile
    // Set flag for last thread-item.  This overload supplies no tile
    // successor, so the last thread's last item is unconditionally flagged.
    tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
        1 : // Last thread
        ApplyOp<FlagOp>::FlagT(
            flag_op,
            input[ITEMS_PER_THREAD - 1],
            temp_storage.first_items[linear_tid + 1],
            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // global rank of the successor
    // Set head_flags for remaining items (thread-local comparisons only)
    Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
    // Set tail_flags for remaining items (thread-local comparisons only)
    Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
/**
* \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
*
* \par
* - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
* returns \p true (where <em>previous-item</em> is either the preceding item
* in the same thread or the last item in the previous thread).
* - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
* against \p tile_predecessor_item.
* - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
* returns \p true (where <em>next-item</em> is either the next item
* in the same thread or the first item in the next thread).
* - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
* <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
* against \p tile_successor_item.
* - \blocked
* - \granularity
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the head- and tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* ...
*
* // Have thread0 obtain the predecessor item for the entire tile
* int tile_predecessor_item;
* if (threadIdx.x == 0) tile_predecessor_item = ...
*
* // Have thread127 obtain the successor item for the entire tile
* int tile_successor_item;
* if (threadIdx.x == 127) tile_successor_item = ...
*
* // Collectively compute head and tail flags for discontinuities in the segment
* int head_flags[4];
* int tail_flags[4];
* BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
* head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
* thread_data, cub::Inequality());
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
* that the \p tile_predecessor_item is \p 0, and that the
* \p tile_successor_item is \p 125. The corresponding output \p head_flags
* in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
* and the corresponding output \p tail_flags in those threads will be
* <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type)
* \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a ...
*/
template <
    int ITEMS_PER_THREAD,
    typename FlagT,
    typename FlagOp>
__device__ __forceinline__ void FlagHeadsAndTails(
    FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
    T tile_predecessor_item, ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
    FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
    T tile_successor_item, ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from the last thread)
    T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
    FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
    // Publish each thread's boundary items so neighbors can read them:
    // thread i needs thread i-1's last item (head flag) and thread i+1's
    // first item (tail flag).
    temp_storage.first_items[linear_tid] = input[0];
    temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
    CTA_SYNC(); // ensure the published boundary items are visible block-wide
    // Predecessor array consumed by Iterate<>::FlagHeads below; only
    // preds[0] is populated here.
    T preds[ITEMS_PER_THREAD];
    // Set flag for first thread-item: the predecessor is the caller-provided
    // tile_predecessor_item for thread0, else the previous thread's last item.
    preds[0] = (linear_tid == 0) ?
        tile_predecessor_item : // First thread
        temp_storage.last_items[linear_tid - 1];
    head_flags[0] = ApplyOp<FlagOp>::FlagT(
        flag_op,
        preds[0],
        input[0],
        linear_tid * ITEMS_PER_THREAD); // global rank of input[0] in the tile
    // Set flag for last thread-item: compare against the successor item,
    // which for the last thread is the caller-provided tile_successor_item.
    T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
        tile_successor_item : // Last thread
        temp_storage.first_items[linear_tid + 1];
    tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
        flag_op,
        input[ITEMS_PER_THREAD - 1],
        successor_item,
        (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // global rank of the successor
    // Set head_flags for remaining items (thread-local comparisons only)
    Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
    // Set tail_flags for remaining items (thread-local comparisons only)
    Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
//@} end member group
};
xgboost/cub/cub/block/block_exchange.cuh view on Meta::CPAN
* - \smemreuse
*
* \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets
* \tparam ValidFlag <b>[inferred]</b> FlagT type denoting which items are valid
*/
template <typename OutputT, typename OffsetT, typename ValidFlag>
__device__ __forceinline__ void ScatterToStripedFlagged(
InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = ranks[ITEM];
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
if (is_valid[ITEM])
temp_storage.buff[item_offset] = input_items[ITEM];
}
xgboost/cub/cub/block/block_exchange.cuh view on Meta::CPAN
InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToStripedGuarded(items, items, ranks);
}
template <typename OffsetT, typename ValidFlag>
__device__ __forceinline__ void ScatterToStripedFlagged(
    InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
    OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
    ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity
{
    // In-place convenience overload: forwards to the two-array variant,
    // passing the same array as both the input and the output.
    ScatterToStriped(items, items, ranks, is_valid);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
};
xgboost/cub/cub/block/specializations/block_histogram_sort.cuh view on Meta::CPAN
/// Shared memory
union _TempStorage
{
    // Storage for sorting bin values.  Unioned with the fields below:
    // the sort phase and the discontinuity/offset phase do not overlap,
    // so their storage can share the same bytes.
    typename BlockRadixSortT::TempStorage sort;
    struct
    {
        // Storage for detecting discontinuities in the tile of sorted bin values
        typename BlockDiscontinuityT::TempStorage flag;
        // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
        unsigned int run_begin[BINS];
        unsigned int run_end[BINS];
    };
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
xgboost/cub/cub/block/specializations/block_histogram_sort.cuh view on Meta::CPAN
}
// Finish up with guarded initialization if necessary
if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
{
temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
}
CTA_SYNC();
int flags[ITEMS_PER_THREAD]; // unused
// Compute head flags to demarcate contiguous runs of the same bin in the sorted tile
DiscontinuityOp flag_op(temp_storage);
BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
// Update begin for first item
if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
CTA_SYNC();
// Composite into histogram
histo_offset = 0;
#pragma unroll
xgboost/cub/cub/device/device_partition.cuh view on Meta::CPAN
* performance across different CUDA architectures for \p int32 items,
* where 50% of the items are randomly selected for the first partition.
* \plots_below
*
* \image html partition_if_int32_50_percent.png
*
*/
struct DevicePartition
{
/**
* \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. .
* - Copies of the selected items are compacted into \p d_out and maintain their original
* relative ordering, however copies of the unselected items are compacted into the
* rear of \p d_out in reverse order.
* - \devicestorage
*
* \par Snippet
* The code snippet below illustrates the compaction of items selected from an \p int device vector.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/device/device_partition.cuh>
*
* // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
* int num_items; // e.g., 8
* int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
* char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
* int *d_out; // e.g., [ , , , , , , , ]
* int *d_num_selected_out; // e.g., [ ]
* ...
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run selection
* cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
*
* // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2]
* // d_num_selected_out <-- [4]
*
* \endcode
*
* \tparam InputIteratorT <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
* \tparam FlagIterator <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
* \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
* \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
*/
template <
    typename InputIteratorT,
    typename FlagIterator,
    typename OutputIteratorT,
    typename NumSelectedIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Flagged(
    void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
    InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
    FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags
    OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
    NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
    int num_items, ///< [in] Total number of items to select from
    cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
    bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
    // Global offsets are signed 32-bit integers
    typedef int OffsetT;
    // Flag-driven partitioning: neither a selection functor nor an
    // equality operator participates, so both are NullType placeholders.
    typedef NullType SelectOp;
    typedef NullType EqualityOp;
    // The trailing 'true' template argument requests the partitioning
    // behavior (rejected items are also written to d_out; see the
    // documentation above) rather than plain selection.
    return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
        d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out,
        SelectOp(), EqualityOp(),
        num_items, stream, debug_synchronous);
}
xgboost/cub/cub/device/device_partition.cuh view on Meta::CPAN
select_op,
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
};
/**
* \example example_device_partition_flagged.cu
* \example example_device_partition_if.cu
*/
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/device_select.cuh view on Meta::CPAN
* \ingroup SingleModule
*
* \par Overview
* These operations apply a selection criterion to selectively copy
* items from a specified input sequence to a compact output sequence.
*
* \par Usage Considerations
* \cdp_class{DeviceSelect}
*
* \par Performance
* \linear_performance{select-flagged, select-if, and select-unique}
*
* \par
* The following chart illustrates DeviceSelect::If
* performance across different CUDA architectures for \p int32 items,
* where 50% of the items are randomly selected.
*
* \image html select_if_int32_50_percent.png
*
* \par
* The following chart illustrates DeviceSelect::Unique
xgboost/cub/cub/device/device_select.cuh view on Meta::CPAN
*
* \image html select_unique_int32_len_500.png
*
* \par
* \plots_below
*
*/
struct DeviceSelect
{
/**
* \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. 
*
* \par
* - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
* - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
* - \devicestorage
*
* \par Snippet
* The code snippet below illustrates the compaction of items selected from an \p int device vector.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh>
*
* // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
* int num_items; // e.g., 8
* int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
* char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
* int *d_out; // e.g., [ , , , , , , , ]
* int *d_num_selected_out; // e.g., [ ]
* ...
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run selection
* cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
*
* // d_out <-- [1, 4, 6, 7]
* // d_num_selected_out <-- [4]
*
* \endcode
*
* \tparam InputIteratorT <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
* \tparam FlagIterator <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
* \tparam OutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
* \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
*/
template <
    typename InputIteratorT,
    typename FlagIterator,
    typename OutputIteratorT,
    typename NumSelectedIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Flagged(
    void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
    InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
    FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags
    OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items
    NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
    int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
    cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
    bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
    // Global offsets are signed 32-bit integers
    typedef int OffsetT;
    // Flag-driven selection: neither a selection functor nor an
    // equality operator participates, so both are NullType placeholders.
    typedef NullType SelectOp;
    typedef NullType EqualityOp;
    // The trailing 'false' template argument requests plain selection
    // (only flagged items are written to d_out; see the documentation
    // above) rather than partitioning.
    return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
        d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out,
        SelectOp(), EqualityOp(),
        num_items, stream, debug_synchronous);
}
xgboost/cub/cub/device/device_select.cuh view on Meta::CPAN
SelectOp(),
EqualityOp(),
num_items,
stream,
debug_synchronous);
}
};
/**
* \example example_device_select_flagged.cu
* \example example_device_select_if.cu
* \example example_device_select_unique.cu
*/
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/dispatch/dispatch_rle.cuh view on Meta::CPAN
/******************************************************************************
* Kernel entry points
*****************************************************************************/
/**
* Select kernel entry point (multi-block)
*
* Performs functor-based selection if SelectOp functor type != NullType
* Otherwise performs flag-based selection if FlagIterator's value type != NullType
* Otherwise performs discontinuity selection (keep unique)
*/
template <
typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type
typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator
typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator
typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator
typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator
typename ScanTileStateT, ///< Tile status interface type
typename EqualityOpT, ///< T equality operator type