Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/block/specializations/block_scan_warp_scans.cuh view on Meta::CPAN
*
******************************************************************************/
/**
* \file
* cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
*/
#pragma once
#include "../../util_arch.cuh"
#include "../../util_ptx.cuh"
#include "../../warp/warp_scan.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
*/
template <
typename T,
int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective
struct BlockScanWarpScans
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
/// Constants
enum
{
/// Number of warp threads
WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
/// Number of active warps
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
};
/// WarpScan utility type
typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
/// WarpScan utility type
typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScan;
/// Shared memory storage layout type
struct __align__(32) _TempStorage
{
T warp_aggregates[WARPS];
typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans
T block_prefix; ///< Shared prefix for the entire threadblock
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
// Thread fields
_TempStorage &temp_storage;
unsigned int linear_tid;
unsigned int warp_id;
unsigned int lane_id;
//---------------------------------------------------------------------
// Constructors
//---------------------------------------------------------------------
/// Constructor
__device__ __forceinline__ BlockScanWarpScans(
TempStorage &temp_storage)
:
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
lane_id(LaneId())
{}
//---------------------------------------------------------------------
// Utility methods
//---------------------------------------------------------------------
template <typename ScanOp, int WARP>
__device__ __forceinline__ void ApplyWarpAggregates(
T &warp_prefix, ///< [out] The calling thread's partial reduction
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items
Int2Type<WARP> /*addend_warp*/)
{
if (warp_id == WARP)
warp_prefix = block_aggregate;
T addend = temp_storage.warp_aggregates[WARP];
block_aggregate = scan_op(block_aggregate, addend);
ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
}
template <typename ScanOp>
__device__ __forceinline__ void ApplyWarpAggregates(
T &/*warp_prefix*/, ///< [out] The calling thread's partial reduction
ScanOp /*scan_op*/, ///< [in] Binary scan operator
T &/*block_aggregate*/, ///< [out] Threadblock-wide aggregate reduction of input items
Int2Type<WARPS> /*addend_warp*/)
( run in 0.968 second using v1.01-cache-2.11-cpan-39bf76dae61 )