Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/block/specializations/block_scan_warp_scans3.cuh view on Meta::CPAN
/**
 * \file
 * cub::BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
 */
#pragma once
#include "../../util_arch.cuh"
#include "../../util_ptx.cuh"
#include "../../warp/warp_scan.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
*/
template <
typename T,
int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
struct BlockScanWarpScans
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
/// Compile-time constants derived from the block dimensions and the target PTX architecture
enum
{
/// The thread block size in threads (product of the X, Y and Z block dimensions)
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
/// Number of warp threads, as reported by CUB_WARP_THREADS for PTX_ARCH; used as the width of the inner scan
INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
/// Width of each outer scan: BLOCK_THREADS / INNER_WARP_THREADS.
/// NOTE(review): this is integer division, so it truncates when BLOCK_THREADS is not a multiple of INNER_WARP_THREADS
OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS,
/// Number of outer scan warps (defined equal to INNER_WARP_THREADS; presumably so that one inner scan can cover all outer-warp aggregates -- confirm against the scan bodies)
OUTER_WARPS = INNER_WARP_THREADS
};
/// Outer WarpScan utility type (WarpScan instantiated with OUTER_WARP_THREADS)
typedef WarpScan<T, OUTER_WARP_THREADS, PTX_ARCH> OuterWarpScanT;
/// Inner WarpScan utility type (WarpScan instantiated with INNER_WARP_THREADS)
typedef WarpScan<T, INNER_WARP_THREADS, PTX_ARCH> InnerWarpScanT;
/// Array type holding one OuterWarpScanT::TempStorage for each of the OUTER_WARPS outer scan warps
typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS];
/// Shared memory storage layout type
struct _TempStorage
{
// Anonymous union: the outer-scan and inner-scan buffers occupy the same bytes,
// so only one of the two can hold live data at any given time.
union
{
Uninitialized<OuterScanArray> outer_warp_scan; ///< Buffer for warp-synchronous outer scans (one OuterWarpScanT::TempStorage per outer warp)
typename InnerWarpScanT::TempStorage inner_warp_scan; ///< Buffer for warp-synchronous inner scan
};
T warp_aggregates[OUTER_WARPS]; ///< One slot of type T per outer scan warp (presumably each outer warp's total -- confirm against the scan bodies)
T block_aggregate; ///< Shared prefix for the entire threadblock
};
/// Alias wrapper allowing storage to be unioned.
/// This is the storage type accepted by the constructor, which obtains the underlying _TempStorage through Alias().
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
// Thread fields
// Declaration order matters: the constructor initializes warp_id and lane_id from linear_tid,
// so linear_tid must stay declared before them.
_TempStorage &temp_storage; ///< Reference to the storage obtained from TempStorage::Alias() in the constructor
unsigned int linear_tid; ///< Value returned by RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z) for the calling thread
unsigned int warp_id; ///< linear_tid / OUTER_WARP_THREADS, or 0 when OUTER_WARPS == 1
unsigned int lane_id; ///< linear_tid % OUTER_WARP_THREADS, or linear_tid itself when OUTER_WARPS == 1
//---------------------------------------------------------------------
// Constructors
//---------------------------------------------------------------------
/// Constructor: binds the aliased temporary storage and derives the calling thread's
/// linear id, outer-warp id and lane id within that outer warp.
__device__ __forceinline__ BlockScanWarpScans(TempStorage &temp_storage)
    : temp_storage(temp_storage.Alias()),
      linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
      // When there is only one outer warp, every thread is in warp 0 and its lane is its linear id
      warp_id((OUTER_WARPS != 1) ? linear_tid / OUTER_WARP_THREADS : 0),
      lane_id((OUTER_WARPS != 1) ? linear_tid % OUTER_WARP_THREADS : linear_tid)
{
}
//---------------------------------------------------------------------
// Exclusive scans
//---------------------------------------------------------------------
/// Exclusive threadblock-wide prefix scan driven by the binary \p scan_op functor, with each thread supplying a single input element. Because no initial value is given, the output produced for <em>thread</em><sub>0</sub> is undefined.
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
    T input,               ///< [in] Calling thread's input item
    T &exclusive_output,   ///< [out] Calling thread's output item (may be aliased to \p input)
    ScanOp scan_op)        ///< [in] Binary scan operator
{
    // Delegate to the four-argument overload (defined outside this excerpt); the extra
    // output it fills in is not needed here.  The exclusive output from tid0 is invalid.
    T discarded_aggregate;
    ExclusiveScan(input, exclusive_output, scan_op, discarded_aggregate);
}
/// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element.
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input items
T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input)
const T &initial_value, ///< [in] Initial value to seed the exclusive scan
ScanOp scan_op) ///< [in] Binary scan operator
( run in 1.136 second using v1.01-cache-2.11-cpan-39bf76dae61 )