Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Simple demonstration of cub::BlockScan
*
* To compile using the command line:
* nvcc -arch=sm_XX example_block_scan.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console (define before including cub.h)
#define CUB_STDERR
#include <stdio.h>
#include <iostream>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/block/block_scan.cuh>
#include "../../test/test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
/// Verbose output flag; starts out disabled (false).
/// NOTE(review): the code that sets or reads it is not visible in this excerpt.
bool g_verbose = false;
/// Number of timing iterations; starts at 100.
/// NOTE(review): presumably the number of timed kernel launches -- confirm against the timing loop.
int g_timing_iterations = 100;
/// Default grid size; starts at 1.
/// NOTE(review): presumably the number of thread blocks per launch -- confirm against the launch site.
int g_grid_size = 1;
//---------------------------------------------------------------------
// Kernels
//---------------------------------------------------------------------
/**
 * Block-wide exclusive prefix sum over a single tile of integers.
 *
 * A tile is BLOCK_THREADS * ITEMS_PER_THREAD ints. The tile is read from
 * d_in, scanned, and written back to d_out. Thread 0 additionally writes
 *   - the scan aggregate to d_out[BLOCK_THREADS * ITEMS_PER_THREAD]
 *     (so d_out needs room for one int past the tile), and
 *   - the absolute clock() difference measured around the scan to *d_elapsed.
 *
 * No per-block offset is applied to d_in / d_out: every block that runs this
 * kernel addresses the same tile.
 *
 * NOTE(review): presumably launched with BLOCK_THREADS threads per block --
 * confirm against the launch site.
 */
template <
    int                 BLOCK_THREADS,
    int                 ITEMS_PER_THREAD,
    BlockScanAlgorithm  ALGORITHM>
__global__ void BlockPrefixSumKernel(
    int         *d_in,          // Tile of input
    int         *d_out,         // Tile of output (plus one trailing slot for the aggregate)
    clock_t     *d_elapsed)     // Elapsed cycle count of block scan
{
    // Collective load specialized for this block shape (BLOCK_LOAD_WARP_TRANSPOSE policy)
    typedef BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE> TileLoadT;

    // Collective store specialized for this block shape (BLOCK_STORE_WARP_TRANSPOSE policy)
    typedef BlockStore<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_WARP_TRANSPOSE> TileStoreT;

    // Collective scan specialized for this block shape and the requested algorithm
    typedef BlockScan<int, BLOCK_THREADS, ALGORITHM> TileScanT;

    // The three collectives run one after another, so their scratch space
    // shares a single shared-memory union
    __shared__ union TileScratch
    {
        typename TileLoadT::TempStorage     load;
        typename TileStoreT::TempStorage    store;
        typename TileScanT::TempStorage     scan;
    } scratch;

    // This thread's slice of the tile
    int items[ITEMS_PER_THREAD];

    // Phase 1: load the tile
    TileLoadT(scratch.load).Load(d_in, items);

    // The load scratch is about to be reused by the scan
    __syncthreads();

    // Phase 2: exclusive prefix sum in place, bracketed by cycle-counter reads
    clock_t scan_begin = clock();

    int block_total;
    TileScanT(scratch.scan).ExclusiveSum(items, items, block_total);

    clock_t scan_end = clock();

    // The scan scratch is about to be reused by the store
    __syncthreads();

    // Phase 3: store the scanned tile
    TileStoreT(scratch.store).Store(d_out, items);

    // Only thread 0 publishes the timing and the aggregate
    if (threadIdx.x == 0)
    {
        // Report the magnitude of the difference, whichever sample is larger
        if (scan_begin > scan_end)
        {
            *d_elapsed = scan_begin - scan_end;
        }
        else
        {
            *d_elapsed = scan_end - scan_begin;
        }

        // The aggregate lives in the slot immediately after the tile
        d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = block_total;
    }
}
//---------------------------------------------------------------------
// Host utilities
//---------------------------------------------------------------------
/**
 * Build an exclusive prefix sum problem together with its expected solution.
 *
 * For every index i in [0, num_items):
 *   h_in[i]        receives i % 17
 *   h_reference[i] receives the sum of h_in[0 .. i-1] (0 for i == 0)
 *
 * Returns the sum of all generated inputs (0 when num_items <= 0, in which
 * case neither array is touched).
 */
int Initialize(
    int *h_in,
    int *h_reference,
    int num_items)
{
    int running_total = 0;
    int idx = 0;

    while (idx < num_items)
    {
        // Generate the input value for this position
        h_in[idx] = idx % 17;

        // Exclusive scan: record the total *before* adding this element
        h_reference[idx] = running_total;

        running_total += h_in[idx];
        ++idx;
    }

    return running_total;
}
/**
* Test thread block scan
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockScanAlgorithm ALGORITHM>
void Test()
{
const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
// Allocate host arrays
int *h_in = new int[TILE_SIZE];
int *h_reference = new int[TILE_SIZE];
int *h_gpu = new int[TILE_SIZE + 1];
// Initialize problem and reference output on host
int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);
// Initialize device arrays
int *d_in = NULL;
( run in 2.666 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )