xgboost/R-package/R/xgb.ggplot.R view on Meta::CPAN
print(
plots[[i]], vp = grid::viewport(
layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col
)
)
}
}
}
globalVariables(c(
"Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme",
"element_blank", "element_text", "V1", "Weight"
))
xgboost/R-package/R/xgb.importance.R view on Meta::CPAN
Cover = Cover / sum(Cover),
Frequency = Frequency / sum(Frequency))][
order(Gain, decreasing = TRUE)]
}
result
}
# Avoid error messages during CRAN check.
# These variables are never declared; they are mainly column names inferred by data.table.
globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature"))
xgboost/R-package/R/xgb.model.dt.tree.R view on Meta::CPAN
td[, t := NULL]
td[, isLeaf := NULL]
td[order(Tree, Node)]
}
# Avoid error messages during CRAN check.
# These variables are never declared; they are mainly column names inferred by data.table.
globalVariables(c("Tree", "Node", "ID", "Feature", "t", "isLeaf",".SD", ".SDcols"))
xgboost/R-package/R/xgb.plot.deepness.R view on Meta::CPAN
# list of paths to each leaf in a tree
paths <- lapply(paths_tmp$vpath, names)
# combine into a resulting path lengths table for a tree
data.table(Depth = sapply(paths, length), ID = To[Leaf == TRUE])
}, by = Tree]
}
# Avoid error messages during CRAN check.
# These variables are never declared; they are mainly column names inferred by data.table.
globalVariables(
c(
".N", "N", "Depth", "Quality", "Cover", "Tree", "ID", "Yes", "No", "Feature", "Leaf", "Weight"
)
)
xgboost/R-package/R/xgb.plot.importance.R view on Meta::CPAN
barplot(Importance, horiz = TRUE, border = NA, add = TRUE)]
par(op)
}
invisible(importance_matrix)
}
# Avoid error messages during CRAN check.
# These variables are never declared; they are mainly column names inferred by data.table.
globalVariables(c("Feature", "Importance"))
xgboost/R-package/R/xgb.plot.multi.trees.R view on Meta::CPAN
edges <- DiagrammeR::create_edge_df(
from = match(edges.dt[,From], nodes.dt[,abs.node.position]),
to = match(edges.dt[,To], nodes.dt[,abs.node.position]),
rel = "leading_to")
graph <- DiagrammeR::create_graph(
nodes_df = nodes,
edges_df = edges,
attr_theme = NULL
) %>%
DiagrammeR::add_global_graph_attrs(
attr_type = "graph",
attr = c("layout", "rankdir"),
value = c("dot", "LR")
) %>%
DiagrammeR::add_global_graph_attrs(
attr_type = "node",
attr = c("color", "fillcolor", "style", "shape", "fontname"),
value = c("DimGray", "beige", "filled", "rectangle", "Helvetica")
) %>%
DiagrammeR::add_global_graph_attrs(
attr_type = "edge",
attr = c("color", "arrowsize", "arrowhead", "fontname"),
value = c("DimGray", "1.5", "vee", "Helvetica"))
DiagrammeR::render_graph(graph, width = plot_width, height = plot_height)
}
globalVariables(c(".N", "N", "From", "To", "Text", "Feature", "no.nodes.abs.pos",
"ID", "Yes", "No", "Tree", "yes.nodes.abs.pos", "abs.node.position"))
xgboost/R-package/R/xgb.plot.tree.R view on Meta::CPAN
c(rep("", nrow(dt[Feature != "Leaf"]))),
style = dt[Feature != "Leaf", ifelse(Missing == Yes, "bold", "solid")] %>%
c(dt[Feature != "Leaf", ifelse(Missing == No, "bold", "solid")]),
rel = "leading_to")
graph <- DiagrammeR::create_graph(
nodes_df = nodes,
edges_df = edges,
attr_theme = NULL
) %>%
DiagrammeR::add_global_graph_attrs(
attr_type = "graph",
attr = c("layout", "rankdir"),
value = c("dot", "LR")
) %>%
DiagrammeR::add_global_graph_attrs(
attr_type = "node",
attr = c("color", "style", "fontname"),
value = c("DimGray", "filled", "Helvetica")
) %>%
DiagrammeR::add_global_graph_attrs(
attr_type = "edge",
attr = c("color", "arrowsize", "arrowhead", "fontname"),
value = c("DimGray", "1.5", "vee", "Helvetica"))
if (!render) return(invisible(graph))
DiagrammeR::render_graph(graph, width = plot_width, height = plot_height)
}
# Avoid error messages during CRAN check.
# These variables are never declared; they are mainly column names inferred by data.table.
globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", "Missing", ".", "shape", "filledcolor", "label"))
xgboost/R-package/R/xgb.train.R view on Meta::CPAN
#' \itemize{
#' \item \code{reg:linear} linear regression (Default).
#' \item \code{reg:logistic} logistic regression.
#' \item \code{binary:logistic} logistic regression for binary classification. Output probability.
#' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
#' \item \code{num_class} set the number of classes; use only with multiclass objectives.
#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
#' \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each cl...
#' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
#' }
#' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
#' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to the objective (rmse for regression, and error for classification, mean average precision for ...
#' }
#'
#' @param data training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
#' \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.
#' @param nrounds max number of boosting iterations.
#' @param watchlist named list of xgb.DMatrix datasets to use for evaluating model performance.
#' Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
#' of these datasets during each boosting iteration, and stored in the end as a field named
#' \code{evaluation_log} in the resulting object. When either \code{verbose>=1} or
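For illustration, the objective strings listed above (for example binary:logistic) can also be set through the C API bundled with these sources (xgboost/c_api.h). The sketch below is not the R interface documented here: the data values are invented and error checking is omitted.

```C++
#include <xgboost/c_api.h>

int main()
{
    // Tiny made-up training matrix: 4 rows, 2 features, binary labels.
    const bst_ulong nrow = 4, ncol = 2;
    float data[8]   = {1, 2,  3, 4,  5, 6,  7, 8};
    float labels[4] = {0, 1, 0, 1};

    DMatrixHandle dtrain;
    XGDMatrixCreateFromMat(data, nrow, ncol, /*missing=*/-1.0f, &dtrain);
    XGDMatrixSetFloatInfo(dtrain, "label", labels, nrow);

    BoosterHandle booster;
    XGBoosterCreate(&dtrain, 1, &booster);
    XGBoosterSetParam(booster, "objective", "binary:logistic");  // one of the values listed above
    XGBoosterSetParam(booster, "max_depth", "2");

    for (int iter = 0; iter < 10; ++iter)   // nrounds boosting iterations
        XGBoosterUpdateOneIter(booster, iter, dtrain);

    XGBoosterFree(booster);
    XGDMatrixFree(dtrain);
    return 0;
}
```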
xgboost/R-package/configure view on Meta::CPAN
else
as_fn_append ()
{
eval $1=\$$1\$2
}
fi # as_fn_append
# as_fn_arith ARG...
# ------------------
# Perform arithmetic evaluation on the ARGs, and store the result in the
# global $as_val. Take advantage of shells that can avoid forks. The arguments
# must be portable across $(()) and expr.
if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
eval 'as_fn_arith ()
{
as_val=$(( $* ))
}'
else
as_fn_arith ()
{
as_val=`expr "$@" || test $? -eq 1`
xgboost/R-package/man/xgb.train.Rd view on Meta::CPAN
\itemize{
\item \code{reg:linear} linear regression (Default).
\item \code{reg:logistic} logistic regression.
\item \code{binary:logistic} logistic regression for binary classification. Output probability.
\item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
\item \code{num_class} set the number of classes; use only with multiclass objectives.
\item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
\item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class...
\item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
}
\item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
\item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to the objective (rmse for regression, and error for classification, mean average precision for ran...
}}
\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.}
\item{nrounds}{max number of boosting iterations.}
\item{watchlist}{named list of xgb.DMatrix datasets to use for evaluating model performance.
Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
xgboost/amalgamation/xgboost-all0.cc view on Meta::CPAN
#include "../src/tree/tree_model.cc"
#include "../src/tree/tree_updater.cc"
#include "../src/tree/updater_colmaker.cc"
#include "../src/tree/updater_fast_hist.cc"
#include "../src/tree/updater_prune.cc"
#include "../src/tree/updater_refresh.cc"
#include "../src/tree/updater_sync.cc"
#include "../src/tree/updater_histmaker.cc"
#include "../src/tree/updater_skmaker.cc"
// global
#include "../src/learner.cc"
#include "../src/logging.cc"
#include "../src/common/common.cc"
#include "../src/common/hist_util.cc"
// c_api
#include "../src/c_api/c_api.cc"
#include "../src/c_api/c_api_error.cc"
xgboost/cub/CHANGE_LOG.TXT view on Meta::CPAN
- Bug fixes:
- Fixed bug in cub::WarpScan (which affected cub::BlockScan and
cub::DeviceScan) where incorrect results (e.g., NAN) would often be
returned when parameterized for floating-point types (fp32, fp64).
- Workaround-fix for ptxas error when compiling with the -G flag on Linux
(for debug instrumentation)
- Misc. workaround-fixes for certain scan scenarios (using custom
scan operators) where code compiled for SM1x is run on newer
GPUs of higher compute-capability: the compiler could not tell
which memory space was being used by collective operations and was
mistakenly using global ops instead of shared ops.
//-----------------------------------------------------------------------------
1.2.3 04/01/2014
- Bug fixes:
- Fixed access violation bug in DeviceReduce::ReduceByKey for non-primitive value types
- Fixed code-snippet bug in ArgIndexInputIteratorT documentation
//-----------------------------------------------------------------------------
xgboost/cub/README.md view on Meta::CPAN

<br><hr>
<h3>A Simple Example</h3>
```C++
#include <cub/cub.cuh>
// Block-sorting CUDA kernel
__global__ void BlockSortKernel(int *d_in, int *d_out)
{
using namespace cub;
// Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads
// owning 16 integer items each
typedef BlockRadixSort<int, 128, 16> BlockRadixSort;
typedef BlockLoad<int, 128, 16, BLOCK_LOAD_TRANSPOSE> BlockLoad;
typedef BlockStore<int, 128, 16, BLOCK_STORE_TRANSPOSE> BlockStore;
// Allocate shared memory
xgboost/cub/README.md view on Meta::CPAN
The cub::BlockLoad and cub::BlockStore classes are similarly specialized.
Furthermore, to provide coalesced accesses to device memory, these primitives are
configured to access memory using a striped access pattern (where consecutive threads
simultaneously access consecutive items) and then <em>transpose</em> the keys into
a [<em>blocked arrangement</em>](index.html#sec4sec3) of elements across threads.
Once specialized, these classes expose opaque \p TempStorage member types.
The thread block uses these storage types to statically allocate the union of
shared memory needed by the thread block. (Alternatively these storage types
could be aliased to global memory allocations).
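A minimal sketch of the storage-union pattern described above, mirroring the block-sorting example earlier in this README (the kernel name and launch configuration are illustrative only):

```C++
#include <cub/cub.cuh>

// Block-sorting kernel sketch: the specialized TempStorage types are aliased through a
// union so the thread block allocates their shared-memory footprint only once.
__global__ void BlockSortKernelSketch(int *d_in, int *d_out)
{
    // Specializations for 128 threads owning 16 integer items each
    typedef cub::BlockLoad<int, 128, 16, cub::BLOCK_LOAD_TRANSPOSE>   BlockLoadT;
    typedef cub::BlockStore<int, 128, 16, cub::BLOCK_STORE_TRANSPOSE> BlockStoreT;
    typedef cub::BlockRadixSort<int, 128, 16>                         BlockRadixSortT;

    // Union of the collectives' shared-memory requirements
    __shared__ union
    {
        typename BlockLoadT::TempStorage      load;
        typename BlockRadixSortT::TempStorage sort;
        typename BlockStoreT::TempStorage     store;
    } temp_storage;

    int thread_keys[16];
    int block_offset = blockIdx.x * (128 * 16);

    BlockLoadT(temp_storage.load).Load(d_in + block_offset, thread_keys);
    __syncthreads();    // barrier between phases that reuse temp_storage

    BlockRadixSortT(temp_storage.sort).Sort(thread_keys);
    __syncthreads();

    BlockStoreT(temp_storage.store).Store(d_out + block_offset, thread_keys);
}
```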
<br><hr>
<h3>Stable Releases</h3>
CUB releases are labeled using version identifiers having three fields:
*epoch.feature.update*. The *epoch* field corresponds to support for
a major change in the CUDA programming model. The *feature* field
corresponds to a stable set of features, functionality, and interface. The
*update* field corresponds to a bug-fix or performance update for that
feature set. At the moment, we do not publicly provide non-stable releases
xgboost/cub/cub/agent/agent_histogram.cuh view on Meta::CPAN
/**
* Parameterizable tuning policy type for AgentHistogram
*/
template <
int _BLOCK_THREADS, ///< Threads per thread block
int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input)
BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming
BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue
struct AgentHistogramPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input)
IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming
MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue
};
static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
};
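As a purely illustrative specialization (the numeric values and the typedef name are invented here, not a tuning recommendation), the template arguments bind one-for-one to the parameters documented above:

```C++
#include <cub/cub.cuh>   // assumed to pull in the agent headers

// Hypothetical tuning policy: 256 threads, 8 pixels per thread, direct loads with default
// caching, RLE compression enabled, privatized shared-memory bins, no work stealing.
typedef cub::AgentHistogramPolicy<
        256,                        // _BLOCK_THREADS
        8,                          // _PIXELS_PER_THREAD
        cub::BLOCK_LOAD_DIRECT,     // _LOAD_ALGORITHM
        cub::LOAD_DEFAULT,          // _LOAD_MODIFIER
        true,                       // _RLE_COMPRESS
        cub::SMEM,                  // _MEM_PREFERENCE
        false>                      // _WORK_STEALING
    ExampleHistogramPolicy;
```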
/******************************************************************************
* Thread block abstractions
******************************************************************************/
xgboost/cub/cub/agent/agent_histogram.cuh view on Meta::CPAN
*/
template <
typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type
int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory.
int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels.
int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
typename SampleIteratorT, ///< Random-access input iterator type for reading samples
typename CounterT, ///< Integer type for counting sample occurrences per histogram bin
typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel
typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
typename OffsetT, ///< Signed integer type for global offsets
int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
struct AgentHistogram
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
/// The sample type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
xgboost/cub/cub/agent/agent_histogram.cuh view on Meta::CPAN
/// Reference to final output histograms (gmem)
CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];
/// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];
/// The transform operator for determining privatized counter indices from samples, one for each channel
PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
/// Whether to prefer privatized smem counters vs privatized global counters
bool prefer_smem;
//---------------------------------------------------------------------
// Initialize privatized bin counters
//---------------------------------------------------------------------
// Initialize privatized bin counters
__device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
{
xgboost/cub/cub/agent/agent_histogram.cuh view on Meta::CPAN
{
CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
InitBinCounters(privatized_histograms);
}
// Initialize privatized bin counters. Specialized for privatized global-memory counters
__device__ __forceinline__ void InitGmemBinCounters()
{
InitBinCounters(d_privatized_histograms);
}
//---------------------------------------------------------------------
// Update final output histograms
//---------------------------------------------------------------------
xgboost/cub/cub/agent/agent_histogram.cuh view on Meta::CPAN
__device__ __forceinline__ void StoreSmemOutput()
{
CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
StoreOutput(privatized_histograms);
}
// Update final output histograms from privatized histograms. Specialized for privatized global-memory counters
__device__ __forceinline__ void StoreGmemOutput()
{
StoreOutput(d_privatized_histograms);
}
//---------------------------------------------------------------------
// Tile accumulation
//---------------------------------------------------------------------
xgboost/cub/cub/agent/agent_radix_sort_downsweep.cuh view on Meta::CPAN
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
* Types of scattering strategies
*/
enum RadixSortScatterAlgorithm
{
RADIX_SORT_SCATTER_DIRECT, ///< Scatter directly from registers to global bins
RADIX_SORT_SCATTER_TWO_PHASE, ///< First scatter from registers into shared memory bins, then into global bins
};
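A tiny, hypothetical sketch (the trait name is invented) of how such a strategy enum can be consumed at compile time; only the two-phase strategy stages keys through shared memory before they reach their global bins:

```C++
#include <cub/cub.cuh>

// Illustration only: compile-time dispatch on the scattering strategy.
template <cub::RadixSortScatterAlgorithm SCATTER_ALGORITHM>
struct ScatterTraitsSketch
{
    static const bool STAGES_THROUGH_SMEM =
        (SCATTER_ALGORITHM == cub::RADIX_SORT_SCATTER_TWO_PHASE);
};
```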
/**
* Parameterizable tuning policy type for AgentRadixSortDownsweep
*/
template <
int _BLOCK_THREADS, ///< Threads per thread block
int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
xgboost/cub/cub/agent/agent_radix_sort_downsweep.cuh view on Meta::CPAN
******************************************************************************/
/**
* \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep .
*/
template <
typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type
bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low
typename KeyT, ///< KeyT type
typename ValueT, ///< ValueT type
typename OffsetT> ///< Signed integer type for global offsets
struct AgentRadixSortDownsweep
{
//---------------------------------------------------------------------
// Type definitions and constants
//---------------------------------------------------------------------
// Appropriate unsigned-bits representation of KeyT
typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
static const UnsignedBits LOWEST_KEY = Traits<KeyT>::LOWEST_KEY;
xgboost/cub/cub/agent/agent_radix_sort_downsweep.cuh view on Meta::CPAN
// Shared storage for this CTA
_TempStorage &temp_storage;
// Input and output device pointers
KeysItr d_keys_in;
ValuesItr d_values_in;
UnsignedBits *d_keys_out;
ValueT *d_values_out;
// The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
OffsetT bin_offset;
// The least-significant bit position of the current digit to extract
int current_bit;
// Number of bits in current digit
int num_bits;
// Whether to short-circuit
int short_circuit;
xgboost/cub/cub/agent/agent_radix_sort_downsweep.cuh view on Meta::CPAN
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ProcessTile(
OffsetT block_offset,
const OffsetT &valid_items = TILE_ITEMS)
{
// Per-thread tile data
UnsignedBits keys[ITEMS_PER_THREAD]; // Keys
UnsignedBits twiddled_keys[ITEMS_PER_THREAD]; // Twiddled keys
int ranks[ITEMS_PER_THREAD]; // For each key, the local rank within the CTA
OffsetT relative_bin_offsets[ITEMS_PER_THREAD]; // For each key, the global scatter base offset of the corresponding digit
// Assign default (min/max) value to all keys
UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY;
// Load tile of keys
BlockLoadKeys loader(temp_storage.load_keys);
LoadItems(
loader,
keys,
d_keys_in + block_offset,
xgboost/cub/cub/agent/agent_radix_sort_downsweep.cuh view on Meta::CPAN
{
// Get inclusive digit prefix from exclusive prefix (lower bins come first)
inclusive_digit_prefix = (threadIdx.x == RADIX_DIGITS - 1) ?
(BLOCK_THREADS * ITEMS_PER_THREAD) :
temp_storage.exclusive_digit_prefix[threadIdx.x + 1];
}
}
CTA_SYNC();
// Update global scatter base offsets for each digit
if (threadIdx.x < RADIX_DIGITS)
{
bin_offset -= exclusive_digit_prefix;
temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset;
bin_offset += inclusive_digit_prefix;
}
CTA_SYNC();
xgboost/cub/cub/agent/agent_radix_sort_upsweep.cuh view on Meta::CPAN
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
*/
template <
typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
typename KeyT, ///< KeyT type
typename OffsetT> ///< Signed integer type for global offsets
struct AgentRadixSortUpsweep
{
//---------------------------------------------------------------------
// Type definitions and constants
//---------------------------------------------------------------------
typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
// Integer type for digit counters (to be packed into words of PackedCounters)
xgboost/cub/cub/agent/agent_reduce.cuh view on Meta::CPAN
* \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction .
*
* Each thread reduces only the values it loads. If \p FIRST_TILE, this
* partial reduction is stored into \p thread_aggregate. Otherwise it is
* accumulated into \p thread_aggregate.
*/
template <
typename AgentReducePolicy, ///< Parameterized AgentReducePolicy tuning policy type
typename InputIteratorT, ///< Random-access iterator type for input
typename OutputIteratorT, ///< Random-access iterator type for output
typename OffsetT, ///< Signed integer type for global offsets
typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
struct AgentReduce
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
/// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
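The FIRST_TILE behaviour described in the comment above amounts to an initialize-or-accumulate step per tile; a hypothetical standalone helper (names invented, not the cub implementation) would look like this:

```C++
// Illustration only: fold a per-tile partial into the running thread aggregate,
// initializing it on the first tile.
template <bool FIRST_TILE, typename T, typename ReductionOp>
__device__ __forceinline__ void AccumulatePartial(
    T           &thread_aggregate,   // running per-thread total
    const T     &tile_partial,       // reduction of the values this thread loaded from the tile
    ReductionOp reduction_op)        // binary operator, e.g. cub::Sum()
{
    if (FIRST_TILE)
        thread_aggregate = tile_partial;                                  // first tile: initialize
    else
        thread_aggregate = reduction_op(thread_aggregate, tile_partial);  // later tiles: accumulate
}
```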
xgboost/cub/cub/agent/agent_reduce.cuh view on Meta::CPAN
return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
ConsumeRange(block_offset, block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeRange(block_offset, block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());
}
/**
* Reduce a contiguous segment of input tiles
*/
__device__ __forceinline__ OutputT ConsumeTiles(
OffsetT /*num_items*/, ///< [in] Total number of global input items
GridEvenShare<OffsetT> &even_share, ///< [in] GridEvenShare descriptor
GridQueue<OffsetT> &/*queue*/, ///< [in,out] GridQueue descriptor
Int2Type<GRID_MAPPING_EVEN_SHARE> /*is_even_share*/) ///< [in] Marker type indicating this is an even-share mapping
{
// Initialize even-share descriptor for this thread block
even_share.BlockInit();
return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());
xgboost/cub/cub/agent/agent_reduce.cuh view on Meta::CPAN
// Compute block-wide reduction (all threads have valid items)
return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
}
/**
* Dequeue and reduce tiles of items as part of an inter-block reduction
*/
__device__ __forceinline__ OutputT ConsumeTiles(
OffsetT num_items, ///< [in] Total number of global input items
GridEvenShare<OffsetT> &/*even_share*/, ///< [in] GridEvenShare descriptor
GridQueue<OffsetT> &queue, ///< [in,out] GridQueue descriptor
Int2Type<GRID_MAPPING_DYNAMIC> /*is_dynamic*/) ///< [in] Marker type indicating this is a dynamic mapping
{
return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
ConsumeTiles(num_items, queue, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeTiles(num_items, queue, Int2Type<false && ATTEMPT_VECTORIZATION>());
}
};
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
*/
template <
typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicy tuning policy type
typename KeysInputIteratorT, ///< Random-access input iterator type for keys
typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys
typename ValuesInputIteratorT, ///< Random-access input iterator type for values
typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of items selected
typename EqualityOpT, ///< KeyT equality operator type
typename ReductionOpT, ///< ValueT reduction operator type
typename OffsetT> ///< Signed integer type for global offsets
struct AgentReduceByKey
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// The input keys type
typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
// The output keys type
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
//---------------------------------------------------------------------
// Cooperatively scan a device-wide sequence of tiles with other CTAs
//---------------------------------------------------------------------
/**
* Process a tile of input (dynamic chained scan)
*/
template <bool IS_LAST_TILE> ///< Whether the current tile is the last tile
__device__ __forceinline__ void ConsumeTile(
OffsetT num_remaining, ///< Number of global input items remaining (including this tile)
int tile_idx, ///< Tile index
OffsetT tile_offset, ///< Tile offset
ScanTileStateT& tile_state) ///< Global tile state descriptor
{
KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys
KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile keys shuffled up
ValueOutputT values[ITEMS_PER_THREAD]; // Tile values
OffsetT head_flags[ITEMS_PER_THREAD]; // Segment head flags
OffsetT segment_indices[ITEMS_PER_THREAD]; // Segment indices
OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
/**
* \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode
*/
template <
typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type
typename InputIteratorT, ///< Random-access input iterator type for data
typename OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values
typename LengthsOutputIteratorT, ///< Random-access output iterator type for length values
typename EqualityOpT, ///< T equality operator type
typename OffsetT> ///< Signed integer type for global offsets
struct AgentRle
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
/// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type T;
/// The lengths output value type
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
//---------------------------------------------------------------------
// Utility methods for scattering selections
//---------------------------------------------------------------------
/**
* Two-phase scatter, specialized for warp time-slicing
*/
template <bool FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
OffsetT tile_num_runs_exclusive_in_global,
OffsetT warp_num_runs_aggregate,
OffsetT warp_num_runs_exclusive_in_tile,
OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD],
Int2Type<true> is_warp_time_slice)
{
unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
int lane_id = LaneId();
// Locally compact items within the warp (first warp)
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
}
}
// Global scatter
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id)
{
OffsetT item_offset =
tile_num_runs_exclusive_in_global +
warp_num_runs_exclusive_in_tile +
(ITEM * WARP_THREADS) + lane_id;
// Scatter offset
d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
// Scatter length if not the first (global) length
if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
{
d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
}
}
}
}
/**
* Two-phase scatter
*/
template <bool FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
OffsetT tile_num_runs_exclusive_in_global,
OffsetT warp_num_runs_aggregate,
OffsetT warp_num_runs_exclusive_in_tile,
OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD],
Int2Type<false> is_warp_time_slice)
{
unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
int lane_id = LaneId();
// Unzip
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
WarpExchangeLengths(temp_storage.exchange_lengths[warp_id]).ScatterToStriped(run_lengths, thread_num_runs_exclusive_in_warp);
// Global scatter
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate)
{
OffsetT item_offset =
tile_num_runs_exclusive_in_global +
warp_num_runs_exclusive_in_tile +
(ITEM * WARP_THREADS) + lane_id;
// Scatter offset
d_offsets_out[item_offset] = run_offsets[ITEM];
// Scatter length if not the first (global) length
if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
{
d_lengths_out[item_offset - 1] = run_lengths[ITEM];
}
}
}
}
/**
* Direct scatter
*/
template <bool FIRST_TILE>
__device__ __forceinline__ void ScatterDirect(
OffsetT tile_num_runs_exclusive_in_global,
OffsetT warp_num_runs_aggregate,
OffsetT warp_num_runs_exclusive_in_tile,
OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD])
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate)
{
OffsetT item_offset =
tile_num_runs_exclusive_in_global +
warp_num_runs_exclusive_in_tile +
thread_num_runs_exclusive_in_warp[ITEM];
// Scatter offset
d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
// Scatter length if not the first (global) length
if (item_offset >= 1)
{
d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
}
}
}
}
/**
* Scatter
*/
template <bool FIRST_TILE>
__device__ __forceinline__ void Scatter(
OffsetT tile_num_runs_aggregate,
OffsetT tile_num_runs_exclusive_in_global,
OffsetT warp_num_runs_aggregate,
OffsetT warp_num_runs_exclusive_in_tile,
OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD])
{
if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS))
{
// Direct scatter if the warp has any items
if (warp_num_runs_aggregate)
{
ScatterDirect<FIRST_TILE>(
tile_num_runs_exclusive_in_global,
warp_num_runs_aggregate,
warp_num_runs_exclusive_in_tile,
thread_num_runs_exclusive_in_warp,
lengths_and_offsets);
}
}
else
{
// Scatter two phase
ScatterTwoPhase<FIRST_TILE>(
tile_num_runs_exclusive_in_global,
warp_num_runs_aggregate,
warp_num_runs_exclusive_in_tile,
thread_num_runs_exclusive_in_warp,
lengths_and_offsets,
Int2Type<STORE_WARP_TIME_SLICING>());
}
}
//---------------------------------------------------------------------
// Cooperatively scan a device-wide sequence of tiles with other CTAs
//---------------------------------------------------------------------
/**
* Process a tile of input (dynamic chained scan)
*/
template <
bool LAST_TILE>
__device__ __forceinline__ LengthOffsetPair ConsumeTile(
OffsetT num_items, ///< Total number of global input items
OffsetT num_remaining, ///< Number of global input items remaining (including this tile)
int tile_idx, ///< Tile index
OffsetT tile_offset, ///< Tile offset
ScanTileStateT &tile_status) ///< Global list of tile status
{
if (tile_idx == 0)
{
// First tile
// Load items
T items[ITEMS_PER_THREAD];
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value;
lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
lengths_and_num_runs2[ITEM].key : // keep
WARP_THREADS * ITEMS_PER_THREAD; // discard
}
OffsetT tile_num_runs_aggregate = tile_aggregate.key;
OffsetT tile_num_runs_exclusive_in_global = 0;
OffsetT warp_num_runs_aggregate = warp_aggregate.key;
OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key;
// Scatter
Scatter<true>(
tile_num_runs_aggregate,
tile_num_runs_exclusive_in_global,
warp_num_runs_aggregate,
warp_num_runs_exclusive_in_tile,
thread_num_runs_exclusive_in_warp,
lengths_and_offsets);
// Return running total (inclusive of this tile)
return tile_aggregate;
}
else
{
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
if (warp_id == 0)
{
prefix_op(tile_aggregate);
if (threadIdx.x == 0)
temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
}
CTA_SYNC();
LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive;
// Update thread_exclusive_in_warp to fold in warp and tile run-lengths
LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile);
if (thread_exclusive_in_warp.key == 0)
thread_exclusive_in_warp.value += thread_exclusive.value;
// Downsweep scan through lengths_and_num_runs
LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD];
LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD];
OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value;
lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
lengths_and_num_runs2[ITEM].key : // keep
WARP_THREADS * ITEMS_PER_THREAD; // discard
}
OffsetT tile_num_runs_aggregate = tile_aggregate.key;
OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key;
OffsetT warp_num_runs_aggregate = warp_aggregate.key;
OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key;
// Scatter
Scatter<false>(
tile_num_runs_aggregate,
tile_num_runs_exclusive_in_global,
warp_num_runs_aggregate,
warp_num_runs_exclusive_in_tile,
thread_num_runs_exclusive_in_warp,
lengths_and_offsets);
// Return running total (inclusive of this tile)
return prefix_op.inclusive_prefix;
}
}
xgboost/cub/cub/agent/agent_scan.cuh view on Meta::CPAN
/**
* \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan .
*/
template <
typename AgentScanPolicyT, ///< Parameterized AgentScanPolicyT tuning policy type
typename InputIteratorT, ///< Random-access input iterator type
typename OutputIteratorT, ///< Random-access output iterator type
typename ScanOpT, ///< Scan functor type
typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan)
typename OffsetT> ///< Signed integer type for global offsets
struct AgentScan
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
// The output value type
xgboost/cub/cub/agent/agent_scan.cuh view on Meta::CPAN
//---------------------------------------------------------------------
// Cooperatively scan a device-wide sequence of tiles with other CTAs
//---------------------------------------------------------------------
/**
* Process a tile of input (dynamic chained scan)
*/
template <bool IS_LAST_TILE> ///< Whether the current tile is the last tile
__device__ __forceinline__ void ConsumeTile(
OffsetT num_remaining, ///< Number of global input items remaining (including this tile)
int tile_idx, ///< Tile index
OffsetT tile_offset, ///< Tile offset
ScanTileStateT& tile_state) ///< Global tile state descriptor
{
// Load items
OutputT items[ITEMS_PER_THREAD];
if (IS_LAST_TILE)
BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining);
else
xgboost/cub/cub/agent/agent_segment_fixup.cuh view on Meta::CPAN
/**
* \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
*/
template <
typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type
typename PairsInputIteratorT, ///< Random-access input iterator type for keys
typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
typename EqualityOpT, ///< KeyT equality operator type
typename ReductionOpT, ///< ValueT reduction operator type
typename OffsetT> ///< Signed integer type for global offsets
struct AgentSegmentFixup
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Data type of key-value input iterator
typedef typename std::iterator_traits<PairsInputIteratorT>::value_type KeyValuePairT;
// Value type
xgboost/cub/cub/agent/agent_segment_fixup.cuh view on Meta::CPAN
// Tile status descriptor interface type
typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;
// Constants
enum
{
BLOCK_THREADS = AgentSegmentFixupPolicyT::BLOCK_THREADS,
ITEMS_PER_THREAD = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
// Whether or not to do fixup using RLE + global atomics
USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) &&
(Equals<ValueT, float>::VALUE ||
Equals<ValueT, int>::VALUE ||
Equals<ValueT, unsigned int>::VALUE ||
Equals<ValueT, unsigned long long>::VALUE),
// Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
HAS_IDENTITY_ZERO = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueT>::PRIMITIVE),
};
xgboost/cub/cub/agent/agent_segment_fixup.cuh view on Meta::CPAN
//---------------------------------------------------------------------
// Cooperatively scan a device-wide sequence of tiles with other CTAs
//---------------------------------------------------------------------
/**
* Process input tile. Specialized for atomic-fixup
*/
template <bool IS_LAST_TILE>
__device__ __forceinline__ void ConsumeTile(
OffsetT num_remaining, ///< Number of global input items remaining (including this tile)
int tile_idx, ///< Tile index
OffsetT tile_offset, ///< Tile offset
ScanTileStateT& tile_state, ///< Global tile state descriptor
Int2Type<true> use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key)
{
KeyValuePairT pairs[ITEMS_PER_THREAD];
// Load pairs
KeyValuePairT oob_pair;
oob_pair.key = -1;
xgboost/cub/cub/agent/agent_segment_fixup.cuh view on Meta::CPAN
if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0))
atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value);
}
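The essence of this atomic-fixup specialization can be pictured with a toy kernel (entirely hypothetical, not the cub agent): each (key, value) pair is folded into its output aggregate with a single atomicAdd.

```C++
// Toy sketch: one atomicAdd per pair into the aggregates array.
__global__ void AtomicFixupSketch(
    const int   *d_keys,            // segment index of each pair
    const float *d_values,          // partial value of each pair
    float       *d_aggregates_out,  // output aggregates, one slot per segment
    int          num_items)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_items)
        atomicAdd(d_aggregates_out + d_keys[i], d_values[i]);
}
```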
/**
* Process input tile. Specialized for reduce-by-key fixup
*/
template <bool IS_LAST_TILE>
__device__ __forceinline__ void ConsumeTile(
OffsetT num_remaining, ///< Number of global input items remaining (including this tile)
int tile_idx, ///< Tile index
OffsetT tile_offset, ///< Tile offset
ScanTileStateT& tile_state, ///< Global tile state descriptor
Int2Type<false> use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key)
{
KeyValuePairT pairs[ITEMS_PER_THREAD];
KeyValuePairT scatter_pairs[ITEMS_PER_THREAD];
// Load pairs
KeyValuePairT oob_pair;
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
* Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
* Otherwise performs discontinuity selection (keep unique)
*/
template <
typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type
typename InputIteratorT, ///< Random-access input iterator type for selection items
typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items
typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
typename OffsetT, ///< Signed integer type for global offsets
bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output
struct AgentSelectIf
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
int local_rejection_idx = item_idx - local_selection_idx;
int local_scatter_offset = (selection_flags[ITEM]) ?
tile_num_rejections + local_selection_idx :
local_rejection_idx;
temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
}
CTA_SYNC();
// Gather items from shared memory and scatter to global
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x;
int rejection_idx = item_idx;
int selection_idx = item_idx - tile_num_rejections;
OffsetT scatter_offset = (item_idx < tile_num_rejections) ?
num_items - num_rejected_prefix - rejection_idx - 1 :
num_selections_prefix + selection_idx;
xgboost/cub/cub/agent/agent_spmv_csrt.cuh view on Meta::CPAN
* Parameterizable tuning policy type for AgentSpmv
*/
template <
int _BLOCK_THREADS, ///< Threads per thread block
int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search
CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets
CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices
CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values
CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values
bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
struct AgentSpmvPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
};
static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets
static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets
static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices
static const CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values
static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values
static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
};
xgboost/cub/cub/agent/agent_spmv_orig.cuh view on Meta::CPAN
* Parameterizable tuning policy type for AgentSpmv
*/
template <
int _BLOCK_THREADS, ///< Threads per thread block
int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search
CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets
CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices
CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values
CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values
bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
struct AgentSpmvPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
};
static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets
static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets
static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices
static const CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values
static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values
static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
};
xgboost/cub/cub/agent/agent_spmv_row_based.cuh view on Meta::CPAN
* Parameterizable tuning policy type for AgentSpmv
*/
template <
int _BLOCK_THREADS, ///< Threads per thread block
int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search
CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets
CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices
CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values
CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values
bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
struct AgentSpmvPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
};
static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets
static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets
static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices
static const CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values
static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values
static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
};
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* \par A Simple Example
* \blockcollective{BlockDiscontinuity}
* \par
* The code snippet below illustrates the head flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the head-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the head-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the head- and tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the head- and tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the head- and tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
xgboost/cub/cub/block/block_discontinuity.cuh view on Meta::CPAN
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the head- and tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_discontinuity.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
xgboost/cub/cub/block/block_exchange.cuh view on Meta::CPAN
*
* \par A Simple Example
* \blockcollective{BlockExchange}
* \par
* The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
* of 512 integer items partitioned across 128 threads where each thread owns 4 items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockExchange<int, 128, 4> BlockExchange;
*
* // Allocate shared memory for BlockExchange
* __shared__ typename BlockExchange::TempStorage temp_storage;
*
* // Load a tile of data striped across threads
* int thread_data[4];
* cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
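The excerpt stops just before the exchange itself. The sketch below shows one plausible continuation; the kernel name is made up, and the call assumes the in-place, single-array overload of StripedToBlocked (some CUB releases instead take separate input and output arrays).

#include <cub/cub.cuh>

// Hypothetical kernel: assumes a 128-thread block and a 512-item tile in d_data
__global__ void StripedToBlockedSketch(int *d_data)
{
    // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
    typedef cub::BlockExchange<int, 128, 4> BlockExchange;
    __shared__ typename BlockExchange::TempStorage temp_storage;

    // Load a tile of data striped across threads
    int thread_data[4];
    cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);

    // Collectively exchange the striped data into a blocked arrangement
    // (assumed in-place overload; newer releases also accept (input, output) array pairs)
    BlockExchange(temp_storage).StripedToBlocked(thread_data);
}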
xgboost/cub/cub/block/block_exchange.cuh view on Meta::CPAN
* \par
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
* of 512 integer items partitioned across 128 threads where each thread owns 4 items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockExchange<int, 128, 4> BlockExchange;
*
* // Allocate shared memory for BlockExchange
* __shared__ typename BlockExchange::TempStorage temp_storage;
*
* // Load a tile of ordered data into a striped arrangement across block threads
* int thread_data[4];
* cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
xgboost/cub/cub/block/block_exchange.cuh view on Meta::CPAN
* \par
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
* of 512 integer items partitioned across 128 threads where each thread owns 4 items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockExchange<int, 128, 4> BlockExchange;
*
* // Allocate shared memory for BlockExchange
* __shared__ typename BlockExchange::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
xgboost/cub/cub/block/block_exchange.cuh view on Meta::CPAN
* \par
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement
* of 512 integer items partitioned across 128 threads where each thread owns 4 items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockExchange<int, 128, 4> BlockExchange;
*
* // Allocate shared memory for BlockExchange
* __shared__ typename BlockExchange::TempStorage temp_storage;
*
* // Load a tile of ordered data into a warp-striped arrangement across warp threads
* int thread_data[4];
* cub::LoadDirectWarpStriped(threadIdx.x, d_data, thread_data);
xgboost/cub/cub/block/block_exchange.cuh view on Meta::CPAN
* \par
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement
* of 512 integer items partitioned across 128 threads where each thread owns 4 items.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_exchange.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockExchange<int, 128, 4> BlockExchange;
*
* // Allocate shared memory for BlockExchange
* __shared__ typename BlockExchange::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
xgboost/cub/cub/block/block_histogram.cuh view on Meta::CPAN
*
* \par A Simple Example
* \blockcollective{BlockHistogram}
* \par
* The code snippet below illustrates a 256-bin histogram of 512 integer samples that
* are partitioned across 128 threads where each thread owns 4 samples.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
* typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
*
* // Allocate shared memory for BlockHistogram
* __shared__ typename BlockHistogram::TempStorage temp_storage;
*
* // Allocate shared memory for block-wide histogram bin counts
* __shared__ unsigned int smem_histogram[256];
*
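The histogram excerpt is truncated before the samples are loaded and composited. A minimal sketch of the remaining steps, with illustrative kernel and pointer names, might look like this:

#include <cub/cub.cuh>

// Hypothetical kernel: assumes a 128-thread block and 512 samples per block in d_samples
__global__ void HistogramSketch(unsigned char *d_samples, unsigned int *d_histogram)
{
    // Specialize a 256-bin BlockHistogram for a 1D block of 128 threads, 4 char samples each
    typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;

    // Shared memory for the collective and for the block-wide bin counts
    __shared__ typename BlockHistogram::TempStorage temp_storage;
    __shared__ unsigned int smem_histogram[256];

    // Obtain this thread's 4 consecutive samples
    unsigned char thread_samples[4];
    cub::LoadDirectBlocked(threadIdx.x, d_samples, thread_samples);

    // Collectively build the histogram (zeroes the counters, then composites the samples)
    BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);

    // Copy the shared counters out; each of the 128 threads writes two of the 256 bins
    __syncthreads();
    d_histogram[threadIdx.x * 2]     = smem_histogram[threadIdx.x * 2];
    d_histogram[threadIdx.x * 2 + 1] = smem_histogram[threadIdx.x * 2 + 1];
}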
xgboost/cub/cub/block/block_histogram.cuh view on Meta::CPAN
* \brief Initialize the shared histogram counters to zero.
*
* \par Snippet
* The code snippet below illustrates the initialization and update of a
* histogram of 512 integer samples that are partitioned across 128 threads
* where each thread owns 4 samples.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
* typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
*
* // Allocate shared memory for BlockHistogram
* __shared__ typename BlockHistogram::TempStorage temp_storage;
*
* // Allocate shared memory for block-wide histogram bin counts
* __shared__ unsigned int smem_histogram[256];
*
xgboost/cub/cub/block/block_histogram.cuh view on Meta::CPAN
* - \granularity
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates a 256-bin histogram of 512 integer samples that
* are partitioned across 128 threads where each thread owns 4 samples.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
* typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
*
* // Allocate shared memory for BlockHistogram
* __shared__ typename BlockHistogram::TempStorage temp_storage;
*
* // Allocate shared memory for block-wide histogram bin counts
* __shared__ unsigned int smem_histogram[256];
*
xgboost/cub/cub/block/block_histogram.cuh view on Meta::CPAN
* - \smemreuse
*
* \par Snippet
* The code snippet below illustrates the initialization and update of a
* histogram of 512 integer samples that are partitioned across 128 threads
* where each thread owns 4 samples.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
* typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
*
* // Allocate shared memory for BlockHistogram
* __shared__ typename BlockHistogram::TempStorage temp_storage;
*
* // Allocate shared memory for block-wide histogram bin counts
* __shared__ unsigned int smem_histogram[256];
*
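For the two-phase pattern described above (initialize the counters once, then update them), a hedged sketch using InitHistogram() and Composite() follows; names outside the CUB API are illustrative.

#include <cub/cub.cuh>

// Hypothetical kernel: assumes a 128-thread block and 512 samples per block in d_samples
__global__ void CompositeSketch(unsigned char *d_samples)
{
    typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
    __shared__ typename BlockHistogram::TempStorage temp_storage;
    __shared__ unsigned int smem_histogram[256];

    // Zero the shared counters, then make them visible to every thread
    BlockHistogram(temp_storage).InitHistogram(smem_histogram);
    __syncthreads();

    // Fold this thread's samples into the running histogram; Composite() can be
    // called again later to accumulate further tiles into the same counters
    unsigned char thread_samples[4];
    cub::LoadDirectBlocked(threadIdx.x, d_samples, thread_samples);
    BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
}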
xgboost/cub/cub/block/block_load.cuh view on Meta::CPAN
* - The utilization of memory transactions (coalescing) decreases as the
* access stride between threads increases (i.e., the number of items per thread).
*/
BLOCK_LOAD_DIRECT,
/**
* \par Overview
*
* A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
* from memory using CUDA's built-in vectorized loads as a coalescing optimization.
* For example, <tt>ld.global.v4.s32</tt> instructions will be generated
* when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
*
* \par Performance Considerations
* - The utilization of memory transactions (coalescing) remains high until the
* access stride between threads (i.e., the number of items per thread) exceeds the
* maximum vector load width (typically 4 items or 64B, whichever is lower).
* - The following conditions will prevent vectorization and cause loading to fall back to cub::BLOCK_LOAD_DIRECT:
* - \p ITEMS_PER_THREAD is odd
* - The \p InputIteratorT is not a simple pointer type
* - The block input offset is not quadword-aligned
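To make the vectorization conditions above concrete: with T = int, ITEMS_PER_THREAD = 4, a plain int* input, and a block offset that keeps the segment 16-byte aligned, a BLOCK_LOAD_VECTORIZE specialization can issue vectorized loads; if any condition fails, it degrades gracefully to the BLOCK_LOAD_DIRECT behavior. The kernel below is only an illustrative sketch.

#include <cub/cub.cuh>

// Hypothetical kernel: each 128-thread block loads its own 512-integer tile
__global__ void VectorizedLoadSketch(int *d_data)
{
    // int elements, 128 threads, 4 items per thread, raw pointer input:
    // the conditions under which BLOCK_LOAD_VECTORIZE can use ld.global.v4.s32
    typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_VECTORIZE> BlockLoad;
    __shared__ typename BlockLoad::TempStorage temp_storage;

    // The per-block offset (blockIdx.x * 512 ints) preserves 16-byte alignment
    // as long as d_data itself is 16-byte aligned (cudaMalloc allocations are)
    int thread_data[4];
    BlockLoad(temp_storage).Load(d_data + blockIdx.x * 512, thread_data);
}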
xgboost/cub/cub/block/block_load.cuh view on Meta::CPAN
* \par
* The code snippet below illustrates the loading of a linear
* segment of 512 integers into a "blocked" arrangement across 128 threads where each
* thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
* meaning memory references are efficiently coalesced using a warp-striped access
* pattern (after which items are locally reordered among threads).
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
*
* // Allocate shared memory for BlockLoad
* __shared__ typename BlockLoad::TempStorage temp_storage;
*
* // Load a segment of consecutive items that are blocked across threads
* int thread_data[4];
* BlockLoad(temp_storage).Load(d_data, thread_data);
xgboost/cub/cub/block/block_load.cuh view on Meta::CPAN
* \par Snippet
* The code snippet below illustrates the loading of a linear
* segment of 512 integers into a "blocked" arrangement across 128 threads where each
* thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
* meaning memory references are efficiently coalesced using a warp-striped access
* pattern (after which items are locally reordered among threads).
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
*
* // Allocate shared memory for BlockLoad
* __shared__ typename BlockLoad::TempStorage temp_storage;
*
* // Load a segment of consecutive items that are blocked across threads
* int thread_data[4];
* BlockLoad(temp_storage).Load(d_data, thread_data);
xgboost/cub/cub/block/block_load.cuh view on Meta::CPAN
* \par Snippet
* The code snippet below illustrates the guarded loading of a linear
* segment of 512 integers into a "blocked" arrangement across 128 threads where each
* thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
* meaning memory references are efficiently coalesced using a warp-striped access
* pattern (after which items are locally reordered among threads).
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
*
* __global__ void ExampleKernel(int *d_data, int valid_items, ...)
* {
* // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
*
* // Allocate shared memory for BlockLoad
* __shared__ typename BlockLoad::TempStorage temp_storage;
*
* // Load a segment of consecutive items that are blocked across threads
* int thread_data[4];
* BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);
xgboost/cub/cub/block/block_load.cuh view on Meta::CPAN
* \par Snippet
* The code snippet below illustrates the guarded loading of a linear
* segment of 512 integers into a "blocked" arrangement across 128 threads where each
* thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
* meaning memory references are efficiently coalesced using a warp-striped access
* pattern (after which items are locally reordered among threads).
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
*
* __global__ void ExampleKernel(int *d_data, int valid_items, ...)
* {
* // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
*
* // Allocate shared memory for BlockLoad
* __shared__ typename BlockLoad::TempStorage temp_storage;
*
* // Load a segment of consecutive items that are blocked across threads
* int thread_data[4];
* BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
xgboost/cub/cub/block/block_radix_rank.cuh view on Meta::CPAN
* \par Performance Considerations
* - \granularity
*
* \par Examples
* \par
* - <b>Example 1:</b> Simple radix rank of 32-bit integer keys
* \code
* #include <cub/cub.cuh>
*
* template <int BLOCK_THREADS>
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockRadixRank for a 1D block of BLOCK_THREADS threads, ranking 5 bits per pass in ascending order
* typedef cub::BlockRadixRank<BLOCK_THREADS, 5, false> BlockRadixRank;
* // Allocate shared memory for BlockRadixRank
* __shared__ typename BlockRadixRank::TempStorage temp_storage;
*
* // Each thread obtains its keys and the collective computes their block-wide ranks
* unsigned int keys[1];
* int ranks[1];
* ...
* \endcode
*/
template <
int BLOCK_DIM_X,
int RADIX_BITS,
bool DESCENDING,
bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false,
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,