Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/block/block_histogram.cuh view on Meta::CPAN
*/
#pragma once
#include "specializations/block_histogram_sort.cuh"
#include "specializations/block_histogram_atomic.cuh"
#include "../util_ptx.cuh"
#include "../util_arch.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Algorithmic variants
******************************************************************************/
/**
* \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms.
*/
enum BlockHistogramAlgorithm
{
/**
* \par Overview
* Sorting followed by differentiation. Execution is comprised of two phases:
* -# Sort the data using efficient radix sort
* -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
*
* \par Performance Considerations
* Delivers consistent throughput regardless of sample bin distribution.
*/
BLOCK_HISTO_SORT,
/**
* \par Overview
* Use atomic addition to update byte counts directly
*
* \par Performance Considerations
* Performance is strongly tied to the hardware implementation of atomic
* addition, and may be significantly degraded for non uniformly-random
* input distributions where many concurrent updates are likely to be
* made to the same bin counter.
*/
BLOCK_HISTO_ATOMIC,
};
/******************************************************************************
* Block histogram
******************************************************************************/
/**
* \brief The BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
* \ingroup BlockModule
*
* \tparam T The sample type being histogrammed (must be castable to an integer bin identifier)
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam ITEMS_PER_THREAD The number of items per thread
* \tparam BINS The number bins within the histogram
* \tparam ALGORITHM <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
*
* \par Overview
* - A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
* counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
* - BlockHistogram can be optionally specialized to use different algorithms:
* -# <b>cub::BLOCK_HISTO_SORT</b>. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm)
* -# <b>cub::BLOCK_HISTO_ATOMIC</b>. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm)
*
* \par Performance Considerations
* - \granularity
*
* \par A Simple Example
* \blockcollective{BlockHistogram}
* \par
* The code snippet below illustrates a 256-bin histogram of 512 integer samples that
* are partitioned across 128 threads where each thread owns 4 samples.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
* typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
*
* // Allocate shared memory for BlockHistogram
* __shared__ typename BlockHistogram::TempStorage temp_storage;
*
* // Allocate shared memory for block-wide histogram bin counts
* __shared__ unsigned int smem_histogram[256];
*
* // Obtain input samples per thread
* unsigned char data[4];
* ...
*
* // Compute the block-wide histogram
* BlockHistogram(temp_storage).Histogram(data, smem_histogram);
*
* \endcode
*
* \par Performance and Usage Considerations
* - The histogram output can be constructed in shared or device-accessible memory
* - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
*
*/
template <
typename T,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
int BINS,
( run in 0.515 second using v1.01-cache-2.11-cpan-5623c5533a1 )