Alien-XGBoost

xgboost/R-package/R/xgb.ggplot.R  view on Meta::CPAN

      print(
        plots[[i]], vp = grid::viewport(
          layout.pos.row = matchidx$row,
          layout.pos.col = matchidx$col
        )
      )
    }
  }
}

globalVariables(c(
  "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme",
  "element_blank", "element_text", "V1", "Weight"
))

xgboost/R-package/R/xgb.importance.R  view on Meta::CPAN

            Cover = Cover / sum(Cover),
            Frequency = Frequency / sum(Frequency))][
      order(Gain, decreasing = TRUE)]
  }
  result
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared:
# they are mainly column names inferred by data.table.
globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature"))

xgboost/R-package/R/xgb.model.dt.tree.R  view on Meta::CPAN

  
  td[, t := NULL]
  td[, isLeaf := NULL]
  
  td[order(Tree, Node)]
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared:
# they are mainly column names inferred by data.table.
globalVariables(c("Tree", "Node", "ID", "Feature", "t", "isLeaf",".SD", ".SDcols"))

xgboost/R-package/R/xgb.plot.deepness.R  view on Meta::CPAN

    # list of paths to each leaf in a tree
    paths <- lapply(paths_tmp$vpath, names)
    # combine into a resulting path lengths table for a tree
    data.table(Depth = sapply(paths, length), ID = To[Leaf == TRUE])
  }, by = Tree]
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared:
# they are mainly column names inferred by data.table.
globalVariables(
  c(
    ".N", "N", "Depth", "Quality", "Cover", "Tree", "ID", "Yes", "No", "Feature", "Leaf", "Weight"
  )
)

xgboost/R-package/R/xgb.plot.importance.R  view on Meta::CPAN

                      barplot(Importance, horiz = TRUE, border = NA, add = TRUE)]
    par(op)
  }
  
  invisible(importance_matrix)
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared:
# they are mainly column names inferred by data.table.
globalVariables(c("Feature", "Importance"))

xgboost/R-package/R/xgb.plot.multi.trees.R  view on Meta::CPAN

  edges <- DiagrammeR::create_edge_df(
    from = match(edges.dt[,From], nodes.dt[,abs.node.position]),
    to = match(edges.dt[,To], nodes.dt[,abs.node.position]),
    rel = "leading_to")
  
  graph <- DiagrammeR::create_graph(
      nodes_df = nodes,
      edges_df = edges,
      attr_theme = NULL
      ) %>%
    DiagrammeR::add_global_graph_attrs(
      attr_type = "graph",
      attr  = c("layout", "rankdir"),
      value = c("dot", "LR")
      ) %>%
    DiagrammeR::add_global_graph_attrs(
      attr_type = "node",
      attr  = c("color", "fillcolor", "style", "shape", "fontname"),
      value = c("DimGray", "beige", "filled", "rectangle", "Helvetica")
      ) %>%
    DiagrammeR::add_global_graph_attrs(
      attr_type = "edge",
      attr  = c("color", "arrowsize", "arrowhead", "fontname"),
      value = c("DimGray", "1.5", "vee", "Helvetica"))
  
  DiagrammeR::render_graph(graph, width = plot_width, height = plot_height)  
}

globalVariables(c(".N", "N", "From", "To", "Text", "Feature", "no.nodes.abs.pos",
                  "ID", "Yes", "No", "Tree", "yes.nodes.abs.pos", "abs.node.position"))

xgboost/R-package/R/xgb.plot.tree.R  view on Meta::CPAN

            c(rep("", nrow(dt[Feature != "Leaf"]))),
    style = dt[Feature != "Leaf", ifelse(Missing == Yes, "bold", "solid")] %>%
            c(dt[Feature != "Leaf", ifelse(Missing == No, "bold", "solid")]),
    rel   = "leading_to")

  graph <- DiagrammeR::create_graph(
      nodes_df = nodes,
      edges_df = edges,
      attr_theme = NULL
      ) %>%
    DiagrammeR::add_global_graph_attrs(
      attr_type = "graph",
      attr  = c("layout", "rankdir"),
      value = c("dot", "LR")
      ) %>%
    DiagrammeR::add_global_graph_attrs(
      attr_type = "node",
      attr  = c("color", "style", "fontname"),
      value = c("DimGray", "filled", "Helvetica")
      ) %>%
    DiagrammeR::add_global_graph_attrs(
      attr_type = "edge",
      attr  = c("color", "arrowsize", "arrowhead", "fontname"),
      value = c("DimGray", "1.5", "vee", "Helvetica"))
  
  if (!render) return(invisible(graph))
  
  DiagrammeR::render_graph(graph, width = plot_width, height = plot_height)
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared:
# they are mainly column names inferred by data.table.
globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", "Missing", ".", "shape", "filledcolor", "label"))

xgboost/R-package/R/xgb.train.R  view on Meta::CPAN

#'   \itemize{
#'     \item \code{reg:linear} linear regression (Default).
#'     \item \code{reg:logistic} logistic regression.
#'     \item \code{binary:logistic} logistic regression for binary classification. Output probability.
#'     \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
#'     \item \code{num_class} set the number of classes. Use it only with multiclass objectives.
#'     \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
#'     \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each cl...
#'     \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
#'   }
#'   \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
#'   \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective (rmse for regression, and error for classification, mean average precision for ...
#' }
#' 
#' @param data training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
#'        \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.
#' @param nrounds max number of boosting iterations.
#' @param watchlist named list of xgb.DMatrix datasets to use for evaluating model performance.
#'        Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
#'        of these datasets during each boosting iteration, and stored in the end as a field named 
#'        \code{evaluation_log} in the resulting object. When either \code{verbose>=1} or 

xgboost/R-package/configure  view on Meta::CPAN

else
  as_fn_append ()
  {
    eval $1=\$$1\$2
  }
fi # as_fn_append

# as_fn_arith ARG...
# ------------------
# Perform arithmetic evaluation on the ARGs, and store the result in the
# global $as_val. Take advantage of shells that can avoid forks. The arguments
# must be portable across $(()) and expr.
if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
  eval 'as_fn_arith ()
  {
    as_val=$(( $* ))
  }'
else
  as_fn_arith ()
  {
    as_val=`expr "$@" || test $? -eq 1`

xgboost/R-package/man/xgb.train.Rd  view on Meta::CPAN

  \itemize{
    \item \code{reg:linear} linear regression (Default).
    \item \code{reg:logistic} logistic regression.
    \item \code{binary:logistic} logistic regression for binary classification. Output probability.
    \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
    \item \code{num_class} set the number of classes. Use it only with multiclass objectives.
    \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
    \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class...
    \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
  }
  \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
  \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective (rmse for regression, and error for classification, mean average precision for ran...
}}

\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.}

\item{nrounds}{max number of boosting iterations.}

\item{watchlist}{named list of xgb.DMatrix datasets to use for evaluating model performance.
Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each

xgboost/amalgamation/xgboost-all0.cc  view on Meta::CPAN

#include "../src/tree/tree_model.cc"
#include "../src/tree/tree_updater.cc"
#include "../src/tree/updater_colmaker.cc"
#include "../src/tree/updater_fast_hist.cc"
#include "../src/tree/updater_prune.cc"
#include "../src/tree/updater_refresh.cc"
#include "../src/tree/updater_sync.cc"
#include "../src/tree/updater_histmaker.cc"
#include "../src/tree/updater_skmaker.cc"

// global
#include "../src/learner.cc"
#include "../src/logging.cc"
#include "../src/common/common.cc"
#include "../src/common/hist_util.cc"

// c_api
#include "../src/c_api/c_api.cc"
#include "../src/c_api/c_api_error.cc"

xgboost/cub/CHANGE_LOG.TXT  view on Meta::CPAN

    - Bug fixes: 
        - Fixed bug in cub::WarpScan (which affected cub::BlockScan and 
          cub::DeviceScan) where incorrect results (e.g., NAN) would often be 
          returned when parameterized for floating-point types (fp32, fp64).
        - Workaround-fix for ptxas error when compiling with the -G flag on Linux
          (for debug instrumentation) 
        - Misc. workaround-fixes for certain scan scenarios (using custom 
          scan operators) where code compiled for SM1x is run on newer 
          GPUs of higher compute-capability: the compiler could not tell
          which memory space was being used by collective operations and was 
          mistakenly using global ops instead of shared ops. 

//-----------------------------------------------------------------------------

1.2.3    04/01/2014
    - Bug fixes: 
        - Fixed access violation bug in DeviceReduce::ReduceByKey for non-primitive value types
        - Fixed code-snippet bug in ArgIndexInputIteratorT documentation 

//-----------------------------------------------------------------------------

xgboost/cub/README.md  view on Meta::CPAN


![Orientation of collective primitives within the CUDA software stack](http://nvlabs.github.com/cub/cub_overview.png)

<br><hr>
<h3>A Simple Example</h3>

```C++
#include <cub/cub.cuh>
 
// Block-sorting CUDA kernel
__global__ void BlockSortKernel(int *d_in, int *d_out)
{
     using namespace cub;

     // Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads 
     // owning 16 integer items each
     typedef BlockRadixSort<int, 128, 16>                     BlockRadixSort;
     typedef BlockLoad<int, 128, 16, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
     typedef BlockStore<int, 128, 16, BLOCK_STORE_TRANSPOSE> BlockStore;
 
     // Allocate shared memory

xgboost/cub/README.md  view on Meta::CPAN


The cub::BlockLoad and cub::BlockStore classes are similarly specialized.    
Furthermore, to provide coalesced accesses to device memory, these primitives are 
configured to access memory using a striped access pattern (where consecutive threads 
simultaneously access consecutive items) and then <em>transpose</em> the keys into 
a [<em>blocked arrangement</em>](index.html#sec4sec3) of elements across threads. 

Once specialized, these classes expose opaque \p TempStorage member types.  
The thread block uses these storage types to statically allocate the union of 
shared memory needed by the thread block.  (Alternatively these storage types 
could be aliased to global memory allocations).
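
A minimal sketch of that union pattern (it mirrors, but is not copied from, the block-sorting example above; the 128-thread, 16-items-per-thread tile shape is the same assumption):

```C++
#include <cub/cub.cuh>

__global__ void BlockSortUnionKernel(int *d_in, int *d_out)
{
    // Same specializations as the example above: 128 threads owning 16 items each
    typedef cub::BlockLoad<int, 128, 16, cub::BLOCK_LOAD_TRANSPOSE>   BlockLoadT;
    typedef cub::BlockRadixSort<int, 128, 16>                         BlockRadixSortT;
    typedef cub::BlockStore<int, 128, 16, cub::BLOCK_STORE_TRANSPOSE> BlockStoreT;

    // One statically allocated shared-memory region sized for the largest phase
    __shared__ union {
        typename BlockLoadT::TempStorage      load;
        typename BlockRadixSortT::TempStorage sort;
        typename BlockStoreT::TempStorage     store;
    } temp_storage;

    int thread_keys[16];
    int block_offset = blockIdx.x * (128 * 16);

    BlockLoadT(temp_storage.load).Load(d_in + block_offset, thread_keys);
    __syncthreads();   // phases alias the same union, so synchronize between them
    BlockRadixSortT(temp_storage.sort).Sort(thread_keys);
    __syncthreads();
    BlockStoreT(temp_storage.store).Store(d_out + block_offset, thread_keys);
}
```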

<br><hr>
<h3>Stable Releases</h3>

CUB releases are labeled using version identifiers having three fields: 
*epoch.feature.update*.  The *epoch* field corresponds to support for
a major change in the CUDA programming model.  The *feature* field 
corresponds to a stable set of features, functionality, and interface.  The
*update* field corresponds to a bug-fix or performance update for that
feature set.  At the moment, we do not publicly provide non-stable releases 

xgboost/cub/cub/agent/agent_histogram.cuh  view on Meta::CPAN


/**
 * Parameterizable tuning policy type for AgentHistogram
 */
template <
    int                             _BLOCK_THREADS,                 ///< Threads per thread block
    int                             _PIXELS_PER_THREAD,             ///< Pixels per thread (per tile of input)
    BlockLoadAlgorithm              _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
    CacheLoadModifier               _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
    bool                            _RLE_COMPRESS,                  ///< Whether to perform localized RLE to compress samples before histogramming
    BlockHistogramMemoryPreference  _MEM_PREFERENCE,                ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
    bool                            _WORK_STEALING>                 ///< Whether to dequeue tiles from a global work queue
struct AgentHistogramPolicy
{
    enum
    {
        BLOCK_THREADS           = _BLOCK_THREADS,                   ///< Threads per thread block
        PIXELS_PER_THREAD       = _PIXELS_PER_THREAD,               ///< Pixels per thread (per tile of input)
        IS_RLE_COMPRESS            = _RLE_COMPRESS,                    ///< Whether to perform localized RLE to compress samples before histogramming
        MEM_PREFERENCE          = _MEM_PREFERENCE,                  ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
        IS_WORK_STEALING           = _WORK_STEALING,                   ///< Whether to dequeue tiles from a global work queue
    };

    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
};
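
For orientation, the policy is consumed by binding these compile-time knobs. The concrete values below are illustrative assumptions, not a tuned configuration, and the SMEM value assumes the GMEM/SMEM/BLEND memory-preference enum declared earlier in this header:

```C++
#include <cub/agent/agent_histogram.cuh>

// Hypothetical binding: 256 threads, 8 pixels per thread, direct loads with
// default caching, RLE compression enabled, shared-memory bins, no work stealing.
typedef cub::AgentHistogramPolicy<
    256, 8,
    cub::BLOCK_LOAD_DIRECT, cub::LOAD_DEFAULT,
    true, cub::SMEM, false>
    ExampleHistogramPolicy;
```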


/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

xgboost/cub/cub/agent/agent_histogram.cuh  view on Meta::CPAN

 */
template <
    typename    AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
    int         PRIVATIZED_SMEM_BINS,           ///< Number of privatized shared-memory histogram bins of any channel.  Zero indicates privatized counters to be maintained in device-accessible memory.
    int         NUM_CHANNELS,                   ///< Number of channels interleaved in the input data.  Supports up to four channels.
    int         NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
    typename    SampleIteratorT,                ///< Random-access input iterator type for reading samples
    typename    CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
    typename    PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
    typename    OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
    typename    OffsetT,                        ///< Signed integer type for global offsets
    int         PTX_ARCH = CUB_PTX_ARCH>        ///< PTX compute capability
struct AgentHistogram
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The sample type of the input iterator
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

xgboost/cub/cub/agent/agent_histogram.cuh  view on Meta::CPAN


    /// Reference to final output histograms (gmem)
    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];

    /// The transform operator for determining privatized counter indices from samples, one for each channel
    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];

    /// Whether to prefer privatized smem counters vs privatized global counters
    bool prefer_smem;


    //---------------------------------------------------------------------
    // Initialize privatized bin counters
    //---------------------------------------------------------------------

    // Initialize privatized bin counters
    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
    {

xgboost/cub/cub/agent/agent_histogram.cuh  view on Meta::CPAN

    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];

        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        InitBinCounters(privatized_histograms);
    }


    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void InitGmemBinCounters()
    {
        InitBinCounters(d_privatized_histograms);
    }


    //---------------------------------------------------------------------
    // Update final output histograms
    //---------------------------------------------------------------------

xgboost/cub/cub/agent/agent_histogram.cuh  view on Meta::CPAN

    __device__ __forceinline__ void StoreSmemOutput()
    {
        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];

        StoreOutput(privatized_histograms);
    }


    // Update final output histograms from privatized histograms.  Specialized for privatized global-memory counters
    __device__ __forceinline__ void StoreGmemOutput()
    {
        StoreOutput(d_privatized_histograms);
    }


    //---------------------------------------------------------------------
    // Tile accumulation
    //---------------------------------------------------------------------

xgboost/cub/cub/agent/agent_radix_sort_downsweep.cuh  view on Meta::CPAN


/******************************************************************************
 * Tuning policy types
 ******************************************************************************/

/**
 * Types of scattering strategies
 */
enum RadixSortScatterAlgorithm
{
    RADIX_SORT_SCATTER_DIRECT,      ///< Scatter directly from registers to global bins
    RADIX_SORT_SCATTER_TWO_PHASE,   ///< First scatter from registers into shared memory bins, then into global bins
};
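
As a schematic, hedged contrast of the two strategies (a sketch, not the agent's actual code: the real downsweep computes per-digit ranks and bin offsets with its ranking machinery, while this sketch uses simplified indices):

```C++
// Direct scatter: each key goes straight from a register to its global bin slot,
// producing scattered (often uncoalesced) stores.
__device__ void ScatterDirectSketch(unsigned int key, int global_offset, unsigned int *d_out)
{
    d_out[global_offset] = key;
}

// Two-phase scatter: keys are first staged in shared memory in rank order, then
// written out contiguously so that the global stores coalesce.
__device__ void ScatterTwoPhaseSketch(unsigned int key, int local_rank, int tile_items,
                                      int tile_global_base, unsigned int *smem_keys,
                                      unsigned int *d_out)
{
    smem_keys[local_rank] = key;                        // phase 1: stage in rank order
    __syncthreads();
    for (int i = threadIdx.x; i < tile_items; i += blockDim.x)
        d_out[tile_global_base + i] = smem_keys[i];     // phase 2: coalesced writes
}
```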


/**
 * Parameterizable tuning policy type for AgentRadixSortDownsweep
 */
template <
    int                         _BLOCK_THREADS,             ///< Threads per thread block
    int                         _ITEMS_PER_THREAD,          ///< Items per thread (per tile of input)
    BlockLoadAlgorithm          _LOAD_ALGORITHM,            ///< The BlockLoad algorithm to use

xgboost/cub/cub/agent/agent_radix_sort_downsweep.cuh  view on Meta::CPAN

 ******************************************************************************/

/**
 * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep.
 */
template <
    typename AgentRadixSortDownsweepPolicy,     ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type
    bool     IS_DESCENDING,                        ///< Whether or not the sorted-order is high-to-low
    typename KeyT,                              ///< KeyT type
    typename ValueT,                            ///< ValueT type
    typename OffsetT>                           ///< Signed integer type for global offsets
struct AgentRadixSortDownsweep
{
    //---------------------------------------------------------------------
    // Type definitions and constants
    //---------------------------------------------------------------------

    // Appropriate unsigned-bits representation of KeyT
    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;

    static const UnsignedBits LOWEST_KEY = Traits<KeyT>::LOWEST_KEY;

xgboost/cub/cub/agent/agent_radix_sort_downsweep.cuh  view on Meta::CPAN


    // Shared storage for this CTA
    _TempStorage    &temp_storage;

    // Input and output device pointers
    KeysItr         d_keys_in;
    ValuesItr       d_values_in;
    UnsignedBits    *d_keys_out;
    ValueT          *d_values_out;

    // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
    OffsetT         bin_offset;

    // The least-significant bit position of the current digit to extract
    int             current_bit;

    // Number of bits in current digit
    int             num_bits;

    // Whether to short-circuit
    int             short_circuit;

xgboost/cub/cub/agent/agent_radix_sort_downsweep.cuh  view on Meta::CPAN

     */
    template <bool FULL_TILE>
    __device__ __forceinline__ void ProcessTile(
        OffsetT block_offset,
        const OffsetT &valid_items = TILE_ITEMS)
    {
        // Per-thread tile data
        UnsignedBits    keys[ITEMS_PER_THREAD];                     // Keys
        UnsignedBits    twiddled_keys[ITEMS_PER_THREAD];            // Twiddled keys
        int             ranks[ITEMS_PER_THREAD];                    // For each key, the local rank within the CTA
        OffsetT         relative_bin_offsets[ITEMS_PER_THREAD];     // For each key, the global scatter base offset of the corresponding digit

        // Assign default (min/max) value to all keys
        UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY;

        // Load tile of keys
        BlockLoadKeys loader(temp_storage.load_keys);
        LoadItems(
            loader,
            keys,
            d_keys_in + block_offset,

xgboost/cub/cub/agent/agent_radix_sort_downsweep.cuh  view on Meta::CPAN

            {
                // Get inclusive digit prefix from exclusive prefix (lower bins come first)
                inclusive_digit_prefix = (threadIdx.x == RADIX_DIGITS - 1) ?
                    (BLOCK_THREADS * ITEMS_PER_THREAD) :
                    temp_storage.exclusive_digit_prefix[threadIdx.x + 1];
            }
        }

        CTA_SYNC();

        // Update global scatter base offsets for each digit
        if (threadIdx.x < RADIX_DIGITS)
        {


            bin_offset -= exclusive_digit_prefix;
            temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset;
            bin_offset += inclusive_digit_prefix;
        }

        CTA_SYNC();

xgboost/cub/cub/agent/agent_radix_sort_upsweep.cuh  view on Meta::CPAN

/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
 */
template <
    typename AgentRadixSortUpsweepPolicy,   ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
    typename KeyT,                          ///< KeyT type
    typename OffsetT>                       ///< Signed integer type for global offsets
struct AgentRadixSortUpsweep
{

    //---------------------------------------------------------------------
    // Type definitions and constants
    //---------------------------------------------------------------------

    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;

    // Integer type for digit counters (to be packed into words of PackedCounters)

xgboost/cub/cub/agent/agent_reduce.cuh  view on Meta::CPAN

 * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
 *
 * Each thread reduces only the values it loads. If \p FIRST_TILE, this
 * partial reduction is stored into \p thread_aggregate.  Otherwise it is
 * accumulated into \p thread_aggregate.
 */
template <
    typename AgentReducePolicy,        ///< Parameterized AgentReducePolicy tuning policy type
    typename InputIteratorT,           ///< Random-access iterator type for input
    typename OutputIteratorT,          ///< Random-access iterator type for output
    typename OffsetT,                  ///< Signed integer type for global offsets
    typename ReductionOp>              ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
struct AgentReduce
{

    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

xgboost/cub/cub/agent/agent_reduce.cuh  view on Meta::CPAN

        return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
            ConsumeRange(block_offset, block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
            ConsumeRange(block_offset, block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());
    }


    /**
     * Reduce a contiguous segment of input tiles
     */
    __device__ __forceinline__ OutputT ConsumeTiles(
        OffsetT                             /*num_items*/,      ///< [in] Total number of global input items
        GridEvenShare<OffsetT>              &even_share,        ///< [in] GridEvenShare descriptor
        GridQueue<OffsetT>                  &/*queue*/,         ///< [in,out] GridQueue descriptor
        Int2Type<GRID_MAPPING_EVEN_SHARE>   /*is_even_share*/)  ///< [in] Marker type indicating this is an even-share mapping
    {
        // Initialize even-share descriptor for this thread block
        even_share.BlockInit();

        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
            ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
            ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());

xgboost/cub/cub/agent/agent_reduce.cuh  view on Meta::CPAN


        // Compute block-wide reduction (all threads have valid items)
        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);

    }

    /**
     * Dequeue and reduce tiles of items as part of an inter-block reduction
     */
    __device__ __forceinline__ OutputT ConsumeTiles(
        OffsetT                         num_items,          ///< [in] Total number of global input items
        GridEvenShare<OffsetT>          &/*even_share*/,    ///< [in] GridEvenShare descriptor
        GridQueue<OffsetT>              &queue,             ///< [in,out] GridQueue descriptor
        Int2Type<GRID_MAPPING_DYNAMIC>  /*is_dynamic*/)     ///< [in] Marker type indicating this is a dynamic mapping
    {
        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
            ConsumeTiles(num_items, queue, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
            ConsumeTiles(num_items, queue, Int2Type<false && ATTEMPT_VECTORIZATION>());
    }

};

xgboost/cub/cub/agent/agent_reduce_by_key.cuh  view on Meta::CPAN

 */
template <
    typename    AgentReduceByKeyPolicyT,        ///< Parameterized AgentReduceByKeyPolicy tuning policy type
    typename    KeysInputIteratorT,             ///< Random-access input iterator type for keys
    typename    UniqueOutputIteratorT,          ///< Random-access output iterator type for keys
    typename    ValuesInputIteratorT,           ///< Random-access input iterator type for values
    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
    typename    NumRunsOutputIteratorT,         ///< Output iterator type for recording number of items selected
    typename    EqualityOpT,                    ///< KeyT equality operator type
    typename    ReductionOpT,                   ///< ValueT reduction operator type
    typename    OffsetT>                        ///< Signed integer type for global offsets
struct AgentReduceByKey
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // The input keys type
    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;

    // The output keys type

xgboost/cub/cub/agent/agent_reduce_by_key.cuh  view on Meta::CPAN


    //---------------------------------------------------------------------
    // Cooperatively scan a device-wide sequence of tiles with other CTAs
    //---------------------------------------------------------------------

    /**
     * Process a tile of input (dynamic chained scan)
     */
    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
    __device__ __forceinline__ void ConsumeTile(
        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
        int                 tile_idx,           ///< Tile index
        OffsetT             tile_offset,        ///< Tile offset
        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
    {
        KeyOutputT          keys[ITEMS_PER_THREAD];             // Tile keys
        KeyOutputT          prev_keys[ITEMS_PER_THREAD];        // Tile keys shuffled up
        ValueOutputT        values[ITEMS_PER_THREAD];           // Tile values
        OffsetT             head_flags[ITEMS_PER_THREAD];       // Segment head flags
        OffsetT             segment_indices[ITEMS_PER_THREAD];  // Segment indices
        OffsetValuePairT    scan_items[ITEMS_PER_THREAD];       // Zipped values and segment flags|indices

xgboost/cub/cub/agent/agent_rle.cuh  view on Meta::CPAN


/**
 * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode 
 */
template <
    typename    AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
    typename    InputIteratorT,         ///< Random-access input iterator type for data
    typename    OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values
    typename    LengthsOutputIteratorT, ///< Random-access output iterator type for length values
    typename    EqualityOpT,            ///< T equality operator type
    typename    OffsetT>                ///< Signed integer type for global offsets
struct AgentRle
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type T;

    /// The lengths output value type

xgboost/cub/cub/agent/agent_rle.cuh  view on Meta::CPAN


    //---------------------------------------------------------------------
    // Utility methods for scattering selections
    //---------------------------------------------------------------------

    /**
     * Two-phase scatter, specialized for warp time-slicing
     */
    template <bool FIRST_TILE>
    __device__ __forceinline__ void ScatterTwoPhase(
        OffsetT             tile_num_runs_exclusive_in_global,
        OffsetT             warp_num_runs_aggregate,
        OffsetT             warp_num_runs_exclusive_in_tile,
        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
        Int2Type<true>      is_warp_time_slice)
    {
        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
        int lane_id = LaneId();

        // Locally compact items within the warp (first warp)

xgboost/cub/cub/agent/agent_rle.cuh  view on Meta::CPAN

            }
        }

        // Global scatter
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id)
            {
                OffsetT item_offset =
                    tile_num_runs_exclusive_in_global +
                    warp_num_runs_exclusive_in_tile +
                    (ITEM * WARP_THREADS) + lane_id;

                // Scatter offset
                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;

                // Scatter length if not the first (global) length
                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
                {
                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
                }
            }
        }
    }


    /**
     * Two-phase scatter
     */
    template <bool FIRST_TILE>
    __device__ __forceinline__ void ScatterTwoPhase(
        OffsetT             tile_num_runs_exclusive_in_global,
        OffsetT             warp_num_runs_aggregate,
        OffsetT             warp_num_runs_exclusive_in_tile,
        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
        Int2Type<false>     is_warp_time_slice)
    {
        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
        int lane_id = LaneId();

        // Unzip

xgboost/cub/cub/agent/agent_rle.cuh  view on Meta::CPAN


        WarpExchangeLengths(temp_storage.exchange_lengths[warp_id]).ScatterToStriped(run_lengths, thread_num_runs_exclusive_in_warp);

        // Global scatter
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate)
            {
                OffsetT item_offset =
                    tile_num_runs_exclusive_in_global +
                    warp_num_runs_exclusive_in_tile +
                    (ITEM * WARP_THREADS) + lane_id;

                // Scatter offset
                d_offsets_out[item_offset] = run_offsets[ITEM];

                // Scatter length if not the first (global) length
                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
                {
                    d_lengths_out[item_offset - 1] = run_lengths[ITEM];
                }
            }
        }
    }


    /**
     * Direct scatter
     */
    template <bool FIRST_TILE>
    __device__ __forceinline__ void ScatterDirect(
        OffsetT             tile_num_runs_exclusive_in_global,
        OffsetT             warp_num_runs_aggregate,
        OffsetT             warp_num_runs_exclusive_in_tile,
        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate)
            {
                OffsetT item_offset =
                    tile_num_runs_exclusive_in_global +
                    warp_num_runs_exclusive_in_tile +
                    thread_num_runs_exclusive_in_warp[ITEM];

                // Scatter offset
                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;

                // Scatter length if not the first (global) length
                if (item_offset >= 1)
                {
                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
                }
            }
        }
    }


    /**
     * Scatter
     */
    template <bool FIRST_TILE>
    __device__ __forceinline__ void Scatter(
        OffsetT             tile_num_runs_aggregate,
        OffsetT             tile_num_runs_exclusive_in_global,
        OffsetT             warp_num_runs_aggregate,
        OffsetT             warp_num_runs_exclusive_in_tile,
        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
    {
        if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS))
        {
            // Direct scatter if the warp has any items
            if (warp_num_runs_aggregate)
            {
                ScatterDirect<FIRST_TILE>(
                    tile_num_runs_exclusive_in_global,
                    warp_num_runs_aggregate,
                    warp_num_runs_exclusive_in_tile,
                    thread_num_runs_exclusive_in_warp,
                    lengths_and_offsets);
            }
        }
        else
        {
            // Scatter two phase
            ScatterTwoPhase<FIRST_TILE>(
                tile_num_runs_exclusive_in_global,
                warp_num_runs_aggregate,
                warp_num_runs_exclusive_in_tile,
                thread_num_runs_exclusive_in_warp,
                lengths_and_offsets,
                Int2Type<STORE_WARP_TIME_SLICING>());
        }
    }



    //---------------------------------------------------------------------
    // Cooperatively scan a device-wide sequence of tiles with other CTAs
    //---------------------------------------------------------------------

    /**
     * Process a tile of input (dynamic chained scan)
     */
    template <
        bool                LAST_TILE>
    __device__ __forceinline__ LengthOffsetPair ConsumeTile(
        OffsetT             num_items,          ///< Total number of global input items
        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
        int                 tile_idx,           ///< Tile index
        OffsetT             tile_offset,       ///< Tile offset
        ScanTileStateT       &tile_status)       ///< Global list of tile status
    {
        if (tile_idx == 0)
        {
            // First tile

            // Load items
            T items[ITEMS_PER_THREAD];

xgboost/cub/cub/agent/agent_rle.cuh  view on Meta::CPAN

            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
                                                                lengths_and_num_runs2[ITEM].key :         // keep
                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
            }

            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
            OffsetT tile_num_runs_exclusive_in_global    = 0;
            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;

            // Scatter
            Scatter<true>(
                tile_num_runs_aggregate,
                tile_num_runs_exclusive_in_global,
                warp_num_runs_aggregate,
                warp_num_runs_exclusive_in_tile,
                thread_num_runs_exclusive_in_warp,
                lengths_and_offsets);

            // Return running total (inclusive of this tile)
            return tile_aggregate;
        }
        else
        {

xgboost/cub/cub/agent/agent_rle.cuh  view on Meta::CPAN

            unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
            if (warp_id == 0)
            {
                prefix_op(tile_aggregate);
                if (threadIdx.x == 0)
                    temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
            }

            CTA_SYNC();

            LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive;

            // Update thread_exclusive_in_warp to fold in warp and tile run-lengths
            LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile);
            if (thread_exclusive_in_warp.key == 0)
                thread_exclusive_in_warp.value += thread_exclusive.value;

            // Downsweep scan through lengths_and_num_runs
            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];

            ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);

xgboost/cub/cub/agent/agent_rle.cuh  view on Meta::CPAN

            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
                                                                lengths_and_num_runs2[ITEM].key :         // keep
                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
            }

            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
            OffsetT tile_num_runs_exclusive_in_global    = tile_exclusive_in_global.key;
            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;

            // Scatter
            Scatter<false>(
                tile_num_runs_aggregate,
                tile_num_runs_exclusive_in_global,
                warp_num_runs_aggregate,
                warp_num_runs_exclusive_in_tile,
                thread_num_runs_exclusive_in_warp,
                lengths_and_offsets);

            // Return running total (inclusive of this tile)
            return prefix_op.inclusive_prefix;
        }
    }

xgboost/cub/cub/agent/agent_scan.cuh  view on Meta::CPAN


/**
 * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
 */
template <
    typename AgentScanPolicyT,      ///< Parameterized AgentScanPolicyT tuning policy type
    typename InputIteratorT,        ///< Random-access input iterator type
    typename OutputIteratorT,       ///< Random-access output iterator type
    typename ScanOpT,               ///< Scan functor type
    typename InitValueT,            ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan)
    typename OffsetT>               ///< Signed integer type for global offsets
struct AgentScan
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type

xgboost/cub/cub/agent/agent_scan.cuh  view on Meta::CPAN


    //---------------------------------------------------------------------
    // Cooperatively scan a device-wide sequence of tiles with other CTAs
    //---------------------------------------------------------------------

    /**
     * Process a tile of input (dynamic chained scan)
     */
    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
    __device__ __forceinline__ void ConsumeTile(
        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
        int                 tile_idx,           ///< Tile index
        OffsetT             tile_offset,        ///< Tile offset
        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
    {
        // Load items
        OutputT items[ITEMS_PER_THREAD];

        if (IS_LAST_TILE)
            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining);
        else

xgboost/cub/cub/agent/agent_segment_fixup.cuh  view on Meta::CPAN


/**
 * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
 */
template <
    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
    typename    EqualityOpT,                    ///< KeyT equality operator type
    typename    ReductionOpT,                   ///< ValueT reduction operator type
    typename    OffsetT>                        ///< Signed integer type for global offsets
struct AgentSegmentFixup
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // Data type of key-value input iterator
    typedef typename std::iterator_traits<PairsInputIteratorT>::value_type KeyValuePairT;

    // Value type

xgboost/cub/cub/agent/agent_segment_fixup.cuh  view on Meta::CPAN

    // Tile status descriptor interface type
    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;

    // Constants
    enum
    {
        BLOCK_THREADS       = AgentSegmentFixupPolicyT::BLOCK_THREADS,
        ITEMS_PER_THREAD    = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD,
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,

        // Whether or not to do fixup using RLE + global atomics
        USE_ATOMIC_FIXUP    = (CUB_PTX_ARCH >= 350) && 
                                (Equals<ValueT, float>::VALUE || 
                                 Equals<ValueT, int>::VALUE ||
                                 Equals<ValueT, unsigned int>::VALUE ||
                                 Equals<ValueT, unsigned long long>::VALUE),

        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueT>::PRIMITIVE),
    };

xgboost/cub/cub/agent/agent_segment_fixup.cuh  view on Meta::CPAN

    //---------------------------------------------------------------------
    // Cooperatively scan a device-wide sequence of tiles with other CTAs
    //---------------------------------------------------------------------


    /**
     * Process input tile.  Specialized for atomic-fixup
     */
    template <bool IS_LAST_TILE>
    __device__ __forceinline__ void ConsumeTile(
        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
        int                 tile_idx,           ///< Tile index
        OffsetT             tile_offset,        ///< Tile offset
        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
        Int2Type<true>      use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
    {
        KeyValuePairT   pairs[ITEMS_PER_THREAD];

        // Load pairs
        KeyValuePairT oob_pair;
        oob_pair.key = -1;

xgboost/cub/cub/agent/agent_segment_fixup.cuh  view on Meta::CPAN

        if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0))
            atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value);
    }


    /**
     * Process input tile.  Specialized for reduce-by-key fixup
     */
    template <bool IS_LAST_TILE>
    __device__ __forceinline__ void ConsumeTile(
        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
        int                 tile_idx,           ///< Tile index
        OffsetT             tile_offset,        ///< Tile offset
        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
        Int2Type<false>     use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
    {
        KeyValuePairT   pairs[ITEMS_PER_THREAD];
        KeyValuePairT   scatter_pairs[ITEMS_PER_THREAD];

        // Load pairs
        KeyValuePairT oob_pair;

xgboost/cub/cub/agent/agent_select_if.cuh  view on Meta::CPAN

 * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
 * Otherwise performs discontinuity selection (keep unique)
 */
template <
    typename    AgentSelectIfPolicyT,           ///< Parameterized AgentSelectIfPolicy tuning policy type
    typename    InputIteratorT,                 ///< Random-access input iterator type for selection items
    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
    typename    SelectedOutputIteratorT,        ///< Random-access output iterator type for selected items
    typename    SelectOpT,                      ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
    typename    OffsetT,                        ///< Signed integer type for global offsets
    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
struct AgentSelectIf
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
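
AgentSelectIf is the per-block agent behind the device-wide cub::DeviceSelect / cub::DevicePartition entry points (DevicePartition corresponds to KEEP_REJECTS = true). A hedged host-side sketch of the functor-based selection path, with the predicate and device buffers as assumptions:

```C++
#include <cub/cub.cuh>

struct IsPositive
{
    __host__ __device__ bool operator()(const int &x) const { return x > 0; }
};

// d_in, d_out, and d_num_selected are assumed to be pre-allocated device buffers.
void SelectPositive(int *d_in, int *d_out, int *d_num_selected, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;

    // First call only sizes the temporary storage
    cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
                          d_in, d_out, d_num_selected, num_items, IsPositive());
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call runs the selection (selected items are compacted to d_out)
    cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
                          d_in, d_out, d_num_selected, num_items, IsPositive());
    cudaFree(d_temp_storage);
}
```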

xgboost/cub/cub/agent/agent_select_if.cuh  view on Meta::CPAN

            int local_rejection_idx     = item_idx - local_selection_idx;
            int local_scatter_offset    = (selection_flags[ITEM]) ?
                                            tile_num_rejections + local_selection_idx :
                                            local_rejection_idx;

            temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
        }

        CTA_SYNC();

        // Gather items from shared memory and scatter to global
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            int item_idx            = (ITEM * BLOCK_THREADS) + threadIdx.x;
            int rejection_idx       = item_idx;
            int selection_idx       = item_idx - tile_num_rejections;
            OffsetT scatter_offset  = (item_idx < tile_num_rejections) ?
                                        num_items - num_rejected_prefix - rejection_idx - 1 :
                                        num_selections_prefix + selection_idx;

xgboost/cub/cub/agent/agent_spmv_csrt.cuh  view on Meta::CPAN

 * Parameterizable tuning policy type for AgentSpmv
 */
template <
    int                             _BLOCK_THREADS,                         ///< Threads per thread block
    int                             _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
    CacheLoadModifier               _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,      ///< Cache load modifier for reading CSR row-offsets during search
    CacheLoadModifier               _ROW_OFFSETS_LOAD_MODIFIER,             ///< Cache load modifier for reading CSR row-offsets
    CacheLoadModifier               _COLUMN_INDICES_LOAD_MODIFIER,          ///< Cache load modifier for reading CSR column-indices
    CacheLoadModifier               _VALUES_LOAD_MODIFIER,                  ///< Cache load modifier for reading CSR values
    CacheLoadModifier               _VECTOR_VALUES_LOAD_MODIFIER,           ///< Cache load modifier for reading vector values
    bool                            _DIRECT_LOAD_NONZEROS,                  ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
    BlockScanAlgorithm              _SCAN_ALGORITHM>                        ///< The BlockScan algorithm to use
struct AgentSpmvPolicy
{
    enum
    {
        BLOCK_THREADS                                                   = _BLOCK_THREADS,                       ///< Threads per thread block
        ITEMS_PER_THREAD                                                = _ITEMS_PER_THREAD,                    ///< Items per thread (per tile of input)
        DIRECT_LOAD_NONZEROS                                            = _DIRECT_LOAD_NONZEROS,                ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory)
    };

    static const CacheLoadModifier  ROW_OFFSETS_SEARCH_LOAD_MODIFIER    = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;    ///< Cache load modifier for reading CSR row-offsets
    static const CacheLoadModifier  ROW_OFFSETS_LOAD_MODIFIER           = _ROW_OFFSETS_LOAD_MODIFIER;           ///< Cache load modifier for reading CSR row-offsets
    static const CacheLoadModifier  COLUMN_INDICES_LOAD_MODIFIER        = _COLUMN_INDICES_LOAD_MODIFIER;        ///< Cache load modifier for reading CSR column-indices
    static const CacheLoadModifier  VALUES_LOAD_MODIFIER                = _VALUES_LOAD_MODIFIER;                ///< Cache load modifier for reading CSR values
    static const CacheLoadModifier  VECTOR_VALUES_LOAD_MODIFIER         = _VECTOR_VALUES_LOAD_MODIFIER;         ///< Cache load modifier for reading vector values
    static const BlockScanAlgorithm SCAN_ALGORITHM                      = _SCAN_ALGORITHM;                      ///< The BlockScan algorithm to use

};

xgboost/cub/cub/agent/agent_spmv_orig.cuh  view on Meta::CPAN

 * Parameterizable tuning policy type for AgentSpmv
 */
template <
    int                             _BLOCK_THREADS,                         ///< Threads per thread block
    int                             _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
    CacheLoadModifier               _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,      ///< Cache load modifier for reading CSR row-offsets during search
    CacheLoadModifier               _ROW_OFFSETS_LOAD_MODIFIER,             ///< Cache load modifier for reading CSR row-offsets
    CacheLoadModifier               _COLUMN_INDICES_LOAD_MODIFIER,          ///< Cache load modifier for reading CSR column-indices
    CacheLoadModifier               _VALUES_LOAD_MODIFIER,                  ///< Cache load modifier for reading CSR values
    CacheLoadModifier               _VECTOR_VALUES_LOAD_MODIFIER,           ///< Cache load modifier for reading vector values
    bool                            _DIRECT_LOAD_NONZEROS,                  ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
    BlockScanAlgorithm              _SCAN_ALGORITHM>                        ///< The BlockScan algorithm to use
struct AgentSpmvPolicy
{
    enum
    {
        BLOCK_THREADS                                                   = _BLOCK_THREADS,                       ///< Threads per thread block
        ITEMS_PER_THREAD                                                = _ITEMS_PER_THREAD,                    ///< Items per thread (per tile of input)
        DIRECT_LOAD_NONZEROS                                            = _DIRECT_LOAD_NONZEROS,                ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
    };

    static const CacheLoadModifier  ROW_OFFSETS_SEARCH_LOAD_MODIFIER    = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;    ///< Cache load modifier for reading CSR row-offsets during search
    static const CacheLoadModifier  ROW_OFFSETS_LOAD_MODIFIER           = _ROW_OFFSETS_LOAD_MODIFIER;           ///< Cache load modifier for reading CSR row-offsets
    static const CacheLoadModifier  COLUMN_INDICES_LOAD_MODIFIER        = _COLUMN_INDICES_LOAD_MODIFIER;        ///< Cache load modifier for reading CSR column-indices
    static const CacheLoadModifier  VALUES_LOAD_MODIFIER                = _VALUES_LOAD_MODIFIER;                ///< Cache load modifier for reading CSR values
    static const CacheLoadModifier  VECTOR_VALUES_LOAD_MODIFIER         = _VECTOR_VALUES_LOAD_MODIFIER;         ///< Cache load modifier for reading vector values
    static const BlockScanAlgorithm SCAN_ALGORITHM                      = _SCAN_ALGORITHM;                      ///< The BlockScan algorithm to use

};

xgboost/cub/cub/agent/agent_spmv_row_based.cuh  view on Meta::CPAN

 * Parameterizable tuning policy type for AgentSpmv
 */
template <
    int                             _BLOCK_THREADS,                         ///< Threads per thread block
    int                             _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
    CacheLoadModifier               _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,      ///< Cache load modifier for reading CSR row-offsets during search
    CacheLoadModifier               _ROW_OFFSETS_LOAD_MODIFIER,             ///< Cache load modifier for reading CSR row-offsets
    CacheLoadModifier               _COLUMN_INDICES_LOAD_MODIFIER,          ///< Cache load modifier for reading CSR column-indices
    CacheLoadModifier               _VALUES_LOAD_MODIFIER,                  ///< Cache load modifier for reading CSR values
    CacheLoadModifier               _VECTOR_VALUES_LOAD_MODIFIER,           ///< Cache load modifier for reading vector values
    bool                            _DIRECT_LOAD_NONZEROS,                  ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
    BlockScanAlgorithm              _SCAN_ALGORITHM>                        ///< The BlockScan algorithm to use
struct AgentSpmvPolicy
{
    enum
    {
        BLOCK_THREADS                                                   = _BLOCK_THREADS,                       ///< Threads per thread block
        ITEMS_PER_THREAD                                                = _ITEMS_PER_THREAD,                    ///< Items per thread (per tile of input)
        DIRECT_LOAD_NONZEROS                                            = _DIRECT_LOAD_NONZEROS,                ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
    };

    static const CacheLoadModifier  ROW_OFFSETS_SEARCH_LOAD_MODIFIER    = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;    ///< Cache load modifier for reading CSR row-offsets during search
    static const CacheLoadModifier  ROW_OFFSETS_LOAD_MODIFIER           = _ROW_OFFSETS_LOAD_MODIFIER;           ///< Cache load modifier for reading CSR row-offsets
    static const CacheLoadModifier  COLUMN_INDICES_LOAD_MODIFIER        = _COLUMN_INDICES_LOAD_MODIFIER;        ///< Cache load modifier for reading CSR column-indices
    static const CacheLoadModifier  VALUES_LOAD_MODIFIER                = _VALUES_LOAD_MODIFIER;                ///< Cache load modifier for reading CSR values
    static const CacheLoadModifier  VECTOR_VALUES_LOAD_MODIFIER         = _VECTOR_VALUES_LOAD_MODIFIER;         ///< Cache load modifier for reading vector values
    static const BlockScanAlgorithm SCAN_ALGORITHM                      = _SCAN_ALGORITHM;                      ///< The BlockScan algorithm to use

};

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

 * \par A Simple Example
 * \blockcollective{BlockDiscontinuity}
 * \par
 * The code snippet below illustrates the head flagging of 512 integer items that
 * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
 * where each thread owns 4 consecutive items.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
 *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
 *
 *     // Allocate shared memory for BlockDiscontinuity
 *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
 *
 *     // Obtain a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     ...
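 *     // (Continuation sketch, not part of the vendored excerpt above: collectively
 *     //  compute head flags for discontinuities; cub::Inequality() is CUB's stock functor)
 *     int head_flags[4];
 *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());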

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the head-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
     *
     *     // Allocate shared memory for BlockDiscontinuity
     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the head-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
     *
     *     // Allocate shared memory for BlockDiscontinuity
     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the tail-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
     *
     *     // Allocate shared memory for BlockDiscontinuity
     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
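     *     // (Continuation sketch, not part of the vendored excerpt: collectively compute tail flags)
     *     int tail_flags[4];
     *     BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());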

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the tail-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
     *
     *     // Allocate shared memory for BlockDiscontinuity
     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
     *
     *     // Allocate shared memory for BlockDiscontinuity
     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
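     *     // (Continuation sketch, not part of the vendored excerpt: compute head and tail flags in one collective call)
     *     int head_flags[4];
     *     int tail_flags[4];
     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags, thread_data, cub::Inequality());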

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
     *
     *     // Allocate shared memory for BlockDiscontinuity
     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
     *
     *     // Allocate shared memory for BlockDiscontinuity
     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...

xgboost/cub/cub/block/block_discontinuity.cuh  view on Meta::CPAN

     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
     * where each thread owns 4 consecutive items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
     *
     *     // Allocate shared memory for BlockDiscontinuity
     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...

xgboost/cub/cub/block/block_exchange.cuh  view on Meta::CPAN

 *
 * \par A Simple Example
 * \blockcollective{BlockExchange}
 * \par
 * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
 * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, ...)
 * {
 *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
 *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
 *
 *     // Allocate shared memory for BlockExchange
 *     __shared__ typename BlockExchange::TempStorage temp_storage;
 *
 *     // Load a tile of data striped across threads
 *     int thread_data[4];
 *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
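 *
 *     // (Continuation sketch, not part of the vendored excerpt: exchange the striped tile
 *     //  into a blocked arrangement across threads)
 *     BlockExchange(temp_storage).StripedToBlocked(thread_data);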

xgboost/cub/cub/block/block_exchange.cuh  view on Meta::CPAN

     * \par
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
     *
     *     // Allocate shared memory for BlockExchange
     *     __shared__ typename BlockExchange::TempStorage temp_storage;
     *
     *     // Load a tile of ordered data into a striped arrangement across block threads
     *     int thread_data[4];
     *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);

xgboost/cub/cub/block/block_exchange.cuh  view on Meta::CPAN

     * \par
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
     *
     *     // Allocate shared memory for BlockExchange
     *     __shared__ typename BlockExchange::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
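     *     // (Continuation sketch, not part of the vendored excerpt: exchange the blocked tile
     *     //  into a striped arrangement, e.g. before storing it with cub::StoreDirectStriped<128>)
     *     BlockExchange(temp_storage).BlockedToStriped(thread_data);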

xgboost/cub/cub/block/block_exchange.cuh  view on Meta::CPAN

     * \par
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement
     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
     *
     *     // Allocate shared memory for BlockExchange
     *     __shared__ typename BlockExchange::TempStorage temp_storage;
     *
     *     // Load a tile of ordered data into a warp-striped arrangement across warp threads
     *     int thread_data[4];
     *     cub::LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_data, thread_data);
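     *
     *     // (Continuation sketch, not part of the vendored excerpt: exchange the warp-striped
     *     //  tile into a blocked arrangement across threads)
     *     BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);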

xgboost/cub/cub/block/block_exchange.cuh  view on Meta::CPAN

     * \par
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement
     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
     *
     *     // Allocate shared memory for BlockExchange
     *     __shared__ typename BlockExchange::TempStorage temp_storage;
     *
     *     // Obtain a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     ...
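     *     // (Continuation sketch, not part of the vendored excerpt: exchange the blocked tile
     *     //  into a warp-striped arrangement across warp threads)
     *     BlockExchange(temp_storage).BlockedToWarpStriped(thread_data);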

xgboost/cub/cub/block/block_histogram.cuh  view on Meta::CPAN

 *
 * \par A Simple Example
 * \blockcollective{BlockHistogram}
 * \par
 * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
 * are partitioned across 128 threads where each thread owns 4 samples.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
 *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
 *
 *     // Allocate shared memory for BlockHistogram
 *     __shared__ typename BlockHistogram::TempStorage temp_storage;
 *
 *     // Allocate shared memory for block-wide histogram bin counts
 *     __shared__ unsigned int smem_histogram[256];
 *
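 *     // (Continuation sketch, not part of the vendored excerpt: each thread contributes
 *     //  4 samples, obtained elsewhere, and the block-wide histogram is computed collectively)
 *     unsigned char thread_samples[4];
 *     BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);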

xgboost/cub/cub/block/block_histogram.cuh  view on Meta::CPAN

     * \brief Initialize the shared histogram counters to zero.
     *
     * \par Snippet
     * The code snippet below illustrates the initialization and update of a
     * histogram of 512 integer samples that are partitioned across 128 threads
     * where each thread owns 4 samples.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
     *
     *     // Allocate shared memory for BlockHistogram
     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
     *
     *     // Allocate shared memory for block-wide histogram bin counts
     *     __shared__ unsigned int smem_histogram[256];
     *

xgboost/cub/cub/block/block_histogram.cuh  view on Meta::CPAN

     * - \granularity
     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
     * are partitioned across 128 threads where each thread owns 4 samples.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
     *
     *     // Allocate shared memory for BlockHistogram
     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
     *
     *     // Allocate shared memory for block-wide histogram bin counts
     *     __shared__ unsigned int smem_histogram[256];
     *

xgboost/cub/cub/block/block_histogram.cuh  view on Meta::CPAN

     * - \smemreuse
     *
     * \par Snippet
     * The code snippet below illustrates the initialization and update of a
     * histogram of 512 integer samples that are partitioned across 128 threads
     * where each thread owns 4 samples.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
     *
     *     // Allocate shared memory for BlockHistogram
     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
     *
     *     // Allocate shared memory for block-wide histogram bin counts
     *     __shared__ unsigned int smem_histogram[256];
     *
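     *     // (Continuation sketch, not part of the vendored excerpt: zero the bin counts, then
     *     //  composite this thread's 4 samples, obtained elsewhere, into the shared histogram)
     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
     *     __syncthreads();
     *     unsigned char thread_samples[4];
     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);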

xgboost/cub/cub/block/block_load.cuh  view on Meta::CPAN

     * - The utilization of memory transactions (coalescing) decreases as the
     *   access stride between threads increases (i.e., the number of items per thread).
     */
    BLOCK_LOAD_DIRECT,

    /**
     * \par Overview
     *
     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
     * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
     * For example, <tt>ld.global.v4.s32</tt> instructions will be generated
     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
     *
     * \par Performance Considerations
     * - The utilization of memory transactions (coalescing) remains high until the
     *   access stride between threads (i.e., the number of items per thread) exceeds the
     *   maximum vector load width (typically 4 items or 64B, whichever is lower).
     * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
     *   - \p ITEMS_PER_THREAD is odd
     *   - The \p InputIteratorT is not a simple pointer type
     *   - The block input offset is not quadword-aligned
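
For illustration only (not taken from the vendored header), the sketch below requests the vectorized path; with int data, 4 items per thread, and an aligned base pointer the conditions above are satisfied, so ld.global.v4.s32 loads can be emitted, and otherwise loading quietly falls back to cub::BLOCK_LOAD_DIRECT as described.

#include <cub/cub.cuh>

// Hypothetical kernel fragment: 128 threads x 4 ints per thread via BLOCK_LOAD_VECTORIZE
__global__ void ExampleVectorizedLoadKernel(int *d_data)
{
    typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_VECTORIZE> BlockLoad;
    __shared__ typename BlockLoad::TempStorage temp_storage;

    // Load a blocked segment of 4 consecutive ints per thread (vectorized when alignment allows)
    int thread_data[4];
    BlockLoad(temp_storage).Load(d_data, thread_data);
}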

xgboost/cub/cub/block/block_load.cuh  view on Meta::CPAN

 * \par
 * The code snippet below illustrates the loading of a linear
 * segment of 512 integers into a "blocked" arrangement across 128 threads where each
 * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
 * meaning memory references are efficiently coalesced using a warp-striped access
 * pattern (after which items are locally reordered among threads).
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, ...)
 * {
 *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
 *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
 *
 *     // Allocate shared memory for BlockLoad
 *     __shared__ typename BlockLoad::TempStorage temp_storage;
 *
 *     // Load a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     BlockLoad(temp_storage).Load(d_data, thread_data);

xgboost/cub/cub/block/block_load.cuh  view on Meta::CPAN

     * \par Snippet
     * The code snippet below illustrates the loading of a linear
     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
     * meaning memory references are efficiently coalesced using a warp-striped access
     * pattern (after which items are locally reordered among threads).
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, ...)
     * {
     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
     *
     *     // Allocate shared memory for BlockLoad
     *     __shared__ typename BlockLoad::TempStorage temp_storage;
     *
     *     // Load a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     BlockLoad(temp_storage).Load(d_data, thread_data);

xgboost/cub/cub/block/block_load.cuh  view on Meta::CPAN

     * \par Snippet
     * The code snippet below illustrates the guarded loading of a linear
     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
     * meaning memory references are efficiently coalesced using a warp-striped access
     * pattern (after which items are locally reordered among threads).
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
     * {
     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
     *
     *     // Allocate shared memory for BlockLoad
     *     __shared__ typename BlockLoad::TempStorage temp_storage;
     *
     *     // Load a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);

xgboost/cub/cub/block/block_load.cuh  view on Meta::CPAN

     * \par Snippet
     * The code snippet below illustrates the guarded loading of a linear
     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
     * meaning memory references are efficiently coalesced using a warp-striped access
     * pattern (after which items are locally reordered among threads).
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
     *
     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
     * {
     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
     *
     *     // Allocate shared memory for BlockLoad
     *     __shared__ typename BlockLoad::TempStorage temp_storage;
     *
     *     // Load a segment of consecutive items that are blocked across threads
     *     int thread_data[4];
     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);

xgboost/cub/cub/block/block_radix_rank.cuh  view on Meta::CPAN

 * \par Performance Considerations
 * - \granularity
 *
 * \par Examples
 * \par
 * - <b>Example 1:</b> Simple radix rank of 32-bit integer keys
 *      \code
 *      #include <cub/cub.cuh>
 *
 *      template <int BLOCK_THREADS>
 *      __global__ void ExampleKernel(...)
 *      {
 *          // Specialize and allocate BlockRadixRank: BLOCK_THREADS threads, 4 radix bits per pass
 *          typedef cub::BlockRadixRank<BLOCK_THREADS, 4, false> BlockRadixRank;
 *          __shared__ typename BlockRadixRank::TempStorage temp_storage;
 *          // Rank one 32-bit key per thread over its lowest 4 bits (completion sketch)
 *          unsigned int keys[1];
 *          int ranks[1];
 *          ...
 *          BlockRadixRank(temp_storage).RankKeys(keys, ranks, 0, 4);
 *      }
 *      \endcode
 */
template <
    int                     BLOCK_DIM_X,
    int                     RADIX_BITS,
    bool                    DESCENDING,
    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,


