Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/experimental/histogram/histogram_gmem_atomics.h view on Meta::CPAN
atomicAdd(&gmem[(NUM_BINS * CHANNEL) + bins[CHANNEL]], 1);
}
}
}
// Second pass histogram kernel (accumulation)
template <
int NUM_PARTS,
int ACTIVE_CHANNELS,
int NUM_BINS>
__global__ void histogram_gmem_accum(
const unsigned int *in,
int n,
unsigned int *out)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i > ACTIVE_CHANNELS * NUM_BINS)
return; // out of range
unsigned int total = 0;
for (int j = 0; j < n; j++)
total += in[i + NUM_PARTS * j];
out[i] = total;
}
} // namespace histogram_gmem_atomics
template <
int ACTIVE_CHANNELS,
int NUM_BINS,
typename PixelType>
double run_gmem_atomics(
PixelType *d_image,
int width,
int height,
unsigned int *d_hist,
bool warmup)
{
enum
{
NUM_PARTS = 1024
};
cudaDeviceProp props;
cudaGetDeviceProperties(&props, 0);
dim3 block(32, 4);
dim3 grid(16, 16);
int total_blocks = grid.x * grid.y;
// allocate partial histogram
unsigned int *d_part_hist;
cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));
dim3 block2(128);
dim3 grid2((3 * NUM_BINS + block.x - 1) / block.x);
GpuTimer gpu_timer;
gpu_timer.Start();
histogram_gmem_atomics::histogram_gmem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
d_image,
width,
height,
d_part_hist);
histogram_gmem_atomics::histogram_gmem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
d_part_hist,
total_blocks,
d_hist);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
cudaFree(d_part_hist);
return elapsed_millis;
}
( run in 0.942 second using v1.01-cache-2.11-cpan-d7f47b0818f )