/******************************************************************************
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
*/
#pragma once
#include <iterator>
#include "../util_type.cuh"
#include "../block/block_load.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Tuning policy
******************************************************************************/
/**
 * Memory preference for privatized histogram counters
 */
enum BlockHistogramMemoryPreference
{
GMEM, ///< Privatized counters in (global) device-accessible memory
SMEM, ///< Privatized counters in shared memory
BLEND ///< Blend of shared-memory and global-memory privatized counters
};
/**
* Parameterizable tuning policy type for AgentHistogram
*/
template <
int _BLOCK_THREADS, ///< Threads per thread block
int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input)
BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming
BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue
struct AgentHistogramPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input)
IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming
MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue
};
static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
};
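// A minimal usage sketch (editor's illustration, not part of the original
// header): instantiating the policy above with hypothetical values. The
// numbers and the typedef name are assumptions for demonstration only,
// not CUB's tuned defaults for any particular architecture.
typedef AgentHistogramPolicy<
256, // 256 threads per block
8, // eight pixels per thread per tile
BLOCK_LOAD_DIRECT, // simple blocked loads
LOAD_LDG, // read-only cache load modifier
true, // RLE-compress runs of same-bin samples
SMEM, // prefer privatized shared-memory bins
false> // statically assigned tiles (no work stealing)
ExampleHistogramPolicy; // hypothetical name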
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
*/
template <
typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type
int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory.
int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels.
int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
typename SampleIteratorT, ///< Random-access input iterator type for reading samples
typename CounterT, ///< Integer type for counting sample occurrences per histogram bin
typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel
typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
typename OffsetT, ///< Signed integer type for global offsets
int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
struct AgentHistogram
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
/// The sample type of the input iterator
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
/// The pixel type of SampleT
typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;
/// The quad type of SampleT
typedef typename CubVector<SampleT, 4>::Type QuadT;
/// Constants
enum
{
BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS,
PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD,
SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS,
QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4,
TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS,
TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS,
IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS,
MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ?
AgentHistogramPolicyT::MEM_PREFERENCE :
GMEM,
IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING,
};
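// Worked example of the derived constants above (editor's illustration,
// assuming a hypothetical 256-thread block, 8 pixels per thread, and
// 4-channel interleaved input such as RGBA):
// SAMPLES_PER_THREAD = 8 * 4 = 32
// QUADS_PER_THREAD = 32 / 4 = 8
// TILE_PIXELS = 8 * 256 = 2048
// TILE_SAMPLES = 32 * 256 = 8192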
/// Cache load modifier for reading input elements
static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
/// Input iterator wrapper type (for applying cache modifier)
typedef typename If<IsPointer<SampleIteratorT>::VALUE,
CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
SampleIteratorT>::Type // Directly use the supplied input iterator type
WrappedSampleIteratorT;
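// For instance, if SampleIteratorT is a native pointer such as unsigned
// char*, the If<> above selects CacheModifiedInputIterator so that loads
// are issued with LOAD_MODIFIER; any other iterator type is passed
// through unchanged, since cache modifiers apply only to raw pointers.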
/// Pixel input iterator type (for applying cache modifier)
typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>
WrappedPixelIteratorT;
/// Quad input iterator type (for applying cache modifier)
typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
WrappedQuadIteratorT;
/// Parameterized BlockLoad type for samples
typedef BlockLoad<
SampleT,
BLOCK_THREADS,
SAMPLES_PER_THREAD,
AgentHistogramPolicyT::LOAD_ALGORITHM>
BlockLoadSampleT;
/// Parameterized BlockLoad type for pixels
typedef BlockLoad<
PixelT,
BLOCK_THREADS,
PIXELS_PER_THREAD,
AgentHistogramPolicyT::LOAD_ALGORITHM>
BlockLoadPixelT;
/// Parameterized BlockLoad type for quads
typedef BlockLoad<
QuadT,
BLOCK_THREADS,
QUADS_PER_THREAD,
AgentHistogramPolicyT::LOAD_ALGORITHM>
BlockLoadQuadT;
/// Shared memory type required by this thread block
struct _TempStorage
{
CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding)
int tile_idx;
union
{
typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples
typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels
typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads
};
};
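// Note: the three BlockLoad staging areas share one union because each
// tile is loaded through exactly one of the sample, pixel, or quad paths,
// so their shared-memory footprints can safely overlap.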
/// Temporary storage type (unionable)
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
/// Reference to temp_storage
_TempStorage &temp_storage;
/// Sample input iterator (with cache modifier applied, if possible)
WrappedSampleIteratorT d_wrapped_samples;
/// Native pointer for input samples (possibly NULL if unavailable)
SampleT* d_native_samples;
/// The number of output bins for each channel
int (&num_output_bins)[NUM_ACTIVE_CHANNELS];
/// The number of privatized bins for each channel
int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];
/// Reference to gmem privatized histograms for each channel
CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];
/// Reference to final output histograms (gmem)
CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];
/// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];
/// The transform operator for determining privatized counter indices from samples, one for each channel
PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
/// Whether to prefer privatized smem counters vs privatized global counters
bool prefer_smem;
//---------------------------------------------------------------------
// Initialize privatized bin counters
//---------------------------------------------------------------------
// Initialize privatized bin counters
__device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
{
// Initialize histogram bin counts to zeros
#pragma unroll
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
{
for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
{
privatized_histograms[CHANNEL][privatized_bin] = 0;
}
}
// Barrier to make sure all threads are done updating counters
CTA_SYNC();
}
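// Example of the block-strided loop above (editor's illustration with
// hypothetical sizes): for BLOCK_THREADS = 256 and 512 privatized bins,
// thread 0 zeroes bins 0 and 256, thread 1 zeroes bins 1 and 257, and so
// on, dividing the initialization evenly across the thread block.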
// Initialize privatized bin counters. Specialized for privatized shared-memory counters
__device__ __forceinline__ void InitSmemBinCounters()
{
CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
InitBinCounters(privatized_histograms);
}
// Initialize privatized bin counters. Specialized for privatized global-memory counters
__device__ __forceinline__ void InitGmemBinCounters()
{
InitBinCounters(d_privatized_histograms);
}
//---------------------------------------------------------------------
// Update final output histograms
//---------------------------------------------------------------------
// Update final output histograms from privatized histograms
__device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
{
// Barrier to make sure all threads are done updating counters
CTA_SYNC();
// Apply privatized bin counts to output bin counts
#pragma unroll
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
{
int channel_bins = num_privatized_bins[CHANNEL];
for (int privatized_bin = threadIdx.x;
privatized_bin < channel_bins;
privatized_bin += BLOCK_THREADS)
{
int output_bin = -1;
CounterT count = privatized_histograms[CHANNEL][privatized_bin];
bool is_valid = count > 0;
output_decode_op[CHANNEL].BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);
if (output_bin >= 0)
{
atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
}
}
}
}
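// Note: passing is_valid (count > 0) into BinSelect lets the decode
// operator leave output_bin at -1 for empty privatized bins, so the
// guard above skips the global atomicAdd entirely for zero counts.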
// Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters
__device__ __forceinline__ void StoreSmemOutput()
{
CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
StoreOutput(privatized_histograms);
}
// Update final output histograms from privatized histograms. Specialized for privatized global-memory counters
__device__ __forceinline__ void StoreGmemOutput()
{
StoreOutput(d_privatized_histograms);
}
//---------------------------------------------------------------------
// Tile accumulation
//---------------------------------------------------------------------
// Accumulate pixels. Specialized for RLE compression.
__device__ __forceinline__ void AccumulatePixels(
SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
bool is_valid[PIXELS_PER_THREAD],
CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS],
Int2Type<true> is_rle_compress)
{
#pragma unroll
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
{
// Bin pixels
int bins[PIXELS_PER_THREAD];
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
{
bins[PIXEL] = -1;
privatized_decode_op[CHANNEL].BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
}
CounterT accumulator = 1;
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
{
if (bins[PIXEL] == bins[PIXEL + 1])
{
accumulator++;
}
else
{
if (bins[PIXEL] >= 0)
atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);
accumulator = 1;
}
}
// Last pixel
if (bins[PIXELS_PER_THREAD - 1] >= 0)
atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
}
}
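// Worked trace of the RLE path above (editor's illustration): with
// PIXELS_PER_THREAD = 4 and one thread's bins = {5, 5, 5, 9}, the run of
// three 5s folds into a single atomicAdd(privatized_histograms[CHANNEL] + 5, 3),
// followed by atomicAdd(privatized_histograms[CHANNEL] + 9, 1) for the
// last pixel; bins left at -1 (invalid samples) are dropped by the
// bins[PIXEL] >= 0 guards.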
// Accumulate pixels. Specialized for individual accumulation of each pixel.
__device__ __forceinline__ void AccumulatePixels(
SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
bool is_valid[PIXELS_PER_THREAD],
CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS],
Int2Type<false> is_rle_compress)