Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/block/specializations/block_scan_warp_scans2.cuh view on Meta::CPAN
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
*/
#pragma once
#include "../../util_arch.cuh"
#include "../../util_ptx.cuh"
#include "../../warp/warp_scan.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
*/
template <
typename T,
int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective
struct BlockScanWarpScans
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
/// Constants
enum
{
/// Number of warp threads
WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
/// Number of active warps
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
};
/// WarpScan utility type
typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
/// WarpScan utility type
typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScanT;
/// Shared memory storage layout type
struct _TempStorage
{
typename WarpAggregateScanT::TempStorage inner_scan[WARPS]; ///< Buffer for warp-synchronous scans
typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans
T warp_aggregates[WARPS];
T block_prefix; ///< Shared prefix for the entire threadblock
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
// Thread fields
_TempStorage &temp_storage;
unsigned int linear_tid;
unsigned int warp_id;
unsigned int lane_id;
//---------------------------------------------------------------------
// Constructors
//---------------------------------------------------------------------
/// Constructor
__device__ __forceinline__ BlockScanWarpScans(
TempStorage &temp_storage)
:
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
lane_id(LaneId())
{}
//---------------------------------------------------------------------
// Utility methods
//---------------------------------------------------------------------
template <typename ScanOp, int WARP>
__device__ __forceinline__ void ApplyWarpAggregates(
T &warp_prefix, ///< [out] The calling thread's partial reduction
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items
Int2Type<WARP> addend_warp)
{
if (warp_id == WARP)
warp_prefix = block_aggregate;
T addend = temp_storage.warp_aggregates[WARP];
block_aggregate = scan_op(block_aggregate, addend);
ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
}
template <typename ScanOp>
__device__ __forceinline__ void ApplyWarpAggregates(
T &warp_prefix, ///< [out] The calling thread's partial reduction
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items
( run in 2.022 seconds using v1.01-cache-2.11-cpan-437f7b0c052 )