Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/agent/agent_histogram.cuh view on Meta::CPAN
// Load partially-full, aligned tile using the pixel iterator
__device__ __forceinline__ void LoadTile(
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<false> is_full_tile,
Int2Type<true> is_aligned)
{
typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
int valid_pixels = valid_samples / NUM_CHANNELS;
// Load using a wrapped pixel iterator
BlockLoadPixelT(temp_storage.pixel_load).Load(
d_wrapped_pixels,
reinterpret_cast<AliasedPixels&>(samples),
valid_pixels);
}
// Load partially-full, mis-aligned tile using sample iterator
__device__ __forceinline__ void LoadTile(
OffsetT block_offset,
int valid_samples,
SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
Int2Type<false> is_full_tile,
Int2Type<false> is_aligned)
{
typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
BlockLoadSampleT(temp_storage.sample_load).Load(
d_wrapped_samples + block_offset,
reinterpret_cast<AliasedSamples&>(samples),
valid_samples);
}
//---------------------------------------------------------------------
// Tile processing
//---------------------------------------------------------------------
// Consume a tile of data samples
template <
bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
bool IS_FULL_TILE> // Whether the tile is full
__device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
{
SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS];
bool is_valid[PIXELS_PER_THREAD];
// Load tile
LoadTile(
block_offset,
valid_samples,
samples,
Int2Type<IS_FULL_TILE>(),
Int2Type<IS_ALIGNED>());
// Set valid flags
#pragma unroll
for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
// Accumulate samples
#if CUB_PTX_ARCH >= 120
if (prefer_smem)
AccumulateSmemPixels(samples, is_valid);
else
AccumulateGmemPixels(samples, is_valid);
#else
AccumulateGmemPixels(samples, is_valid);
#endif
}
// Consume row tiles. Specialized for work-stealing from queue
template <bool IS_ALIGNED>
__device__ __forceinline__ void ConsumeTiles(
OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< The number of rows in the region of interest
OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest
int tiles_per_row, ///< Number of image tiles per row
GridQueue<int> tile_queue,
Int2Type<true> is_work_stealing)
{
int num_tiles = num_rows * tiles_per_row;
int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x;
OffsetT num_even_share_tiles = gridDim.x * gridDim.y;
while (tile_idx < num_tiles)
{
int row = tile_idx / tiles_per_row;
int col = tile_idx - (row * tiles_per_row);
OffsetT row_offset = row * row_stride_samples;
OffsetT col_offset = (col * TILE_SAMPLES);
OffsetT tile_offset = row_offset + col_offset;
if (col == tiles_per_row - 1)
{
// Consume a partially-full tile at the end of the row
OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
}
else
{
// Consume full tile
ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
}
CTA_SYNC();
// Get next tile
if (threadIdx.x == 0)
temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;
CTA_SYNC();
( run in 0.898 second using v1.01-cache-2.11-cpan-140bd7fdf52 )