// Excerpt of xgboost/cub/experimental/defunct/example_coo_spmv.cu
// (page-extraction artifacts removed; some regions of the file are elided below)
ReduceByKeyOp scan_op;
PartialProduct retval = running_prefix;
running_prefix = scan_op(running_prefix, block_aggregate);
return retval;
}
};
/**
 * Binary functor for detecting discontinuities within a sequence of row IDs.
 */
struct NewRowOp
{
    /// Evaluates to true when the two adjacent row IDs differ, i.e.,
    /// when row_b begins a new row segment.
    template <typename VertexId>
    __device__ __forceinline__ bool operator()(
        const VertexId& row_a,
        const VertexId& row_b)
    {
        bool same_row = (row_a == row_b);
        return !same_row;
    }
};
/******************************************************************************
* Persistent thread block types
******************************************************************************/
/**
* SpMV threadblock abstraction for processing a contiguous segment of
* sparse COO tiles.
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
typename VertexId,
typename Value>
struct PersistentBlockSpmv
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Constants
enum
{
// Number of COO nonzeros consumed per full tile by this thread block
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
};
// Head flag type (int per CUB BlockDiscontinuity convention: nonzero marks a row-segment head)
typedef int HeadFlag;
// Partial dot product type (a row ID paired with a partial accumulation for that row)
typedef PartialProduct<VertexId, Value> PartialProduct;
// Parameterized BlockScan type for the reduce-value-by-row prefix scan
typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;
// Parameterized BlockExchange type for exchanging rows between warp-striped -> blocked arrangements
typedef BlockExchange<VertexId, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeRows;
// Parameterized BlockExchange type for exchanging values between warp-striped -> blocked arrangements
typedef BlockExchange<Value, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeValues;
// Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;
// Shared memory type for this threadblock.
// The union overlays the two exchange buffers with the scan+discontinuity
// pair; this aliasing is safe because ProcessTile separates the phases
// that use them with __syncthreads() barriers.
struct TempStorage
{
union
{
typename BlockExchangeRows::TempStorage exchange_rows; // Smem needed for BlockExchangeRows
typename BlockExchangeValues::TempStorage exchange_values; // Smem needed for BlockExchangeValues
struct
{
typename BlockScan::TempStorage scan; // Smem needed for BlockScan
typename BlockDiscontinuity::TempStorage discontinuity; // Smem needed for BlockDiscontinuity
};
};
VertexId first_block_row; ///< The first row-ID seen by this thread block
VertexId last_block_row; ///< The last row-ID seen by this thread block
Value first_product; ///< The first dot-product written by this thread block
};
//---------------------------------------------------------------------
// Thread fields
//---------------------------------------------------------------------
TempStorage &temp_storage; // Reference to this block's shared memory
BlockPrefixCallbackOp<PartialProduct> prefix_op; // Carries the running (row, partial) prefix across tiles
VertexId *d_rows; // COO row indices (device)
VertexId *d_columns; // COO column indices (device)
Value *d_values; // COO nonzero values (device)
Value *d_vector; // Dense input vector x (device)
Value *d_result; // Dense output vector y (device)
PartialProduct *d_block_partials; // Per-block carry-out partials — presumably consumed by FinalizeSpmvBlock; use-site not visible here, TODO confirm
int block_offset; // Current offset (in nonzeros) of this block's segment
int block_end; // End offset (exclusive) of this block's segment
//---------------------------------------------------------------------
// Operations
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__
PersistentBlockSpmv(
TempStorage &temp_storage,
VertexId *d_rows,
VertexId *d_columns,
Value *d_values,
Value *d_vector,
Value *d_result,
PartialProduct *d_block_partials,
int block_offset,
int block_end)
:
temp_storage(temp_storage),
// ... [extraction gap: remainder of the constructor's member-initializer list elided] ...
if (threadIdx.x == 0)
{
VertexId first_block_row = d_rows[block_offset];
VertexId last_block_row = d_rows[block_end - 1];
temp_storage.first_block_row = first_block_row;
temp_storage.last_block_row = last_block_row;
temp_storage.first_product = Value(0);
// Initialize prefix_op to identity
prefix_op.running_prefix.row = first_block_row;
prefix_op.running_prefix.partial = Value(0);
}
__syncthreads();
}
/**
 * Processes one COO input tile of edges, outputting dot products for each row.
 *
 * FULL_TILE=true  : all TILE_ITEMS inputs are in bounds (unguarded loads).
 * FULL_TILE=false : only guarded_items inputs are valid; out-of-bounds slots
 *                   are padded with zero-valued entries carrying the block's
 *                   last row ID so they cannot perturb valid rows.
 */
template <bool FULL_TILE>
__device__ __forceinline__ void ProcessTile(
int block_offset,
int guarded_items = 0)
{
VertexId columns[ITEMS_PER_THREAD];
VertexId rows[ITEMS_PER_THREAD];
Value values[ITEMS_PER_THREAD];
PartialProduct partial_sums[ITEMS_PER_THREAD];
HeadFlag head_flags[ITEMS_PER_THREAD];
// Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values)
if (FULL_TILE)
{
// Unguarded loads
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns);
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values);
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows);
}
else
{
// This is a partial-tile (e.g., the last tile of input). Extend the coordinates of the last
// vertex for out-of-bound items, but zero-valued
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns, guarded_items, VertexId(0));
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values, guarded_items, Value(0));
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows, guarded_items, temp_storage.last_block_row);
}
// Load the referenced values from x and compute the dot product partials sums.
// SM35+ uses the read-only data cache (LDG); older architectures fall back to
// a texture-path load (TexVector helper defined elsewhere in this file).
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
#if CUB_PTX_ARCH >= 350
values[ITEM] *= ThreadLoad<LOAD_LDG>(d_vector + columns[ITEM]);
#else
values[ITEM] *= TexVector<Value>::Load(columns[ITEM]);
#endif
}
// Transpose values from warp-striped to blocked arrangement
BlockExchangeValues(temp_storage.exchange_values).WarpStripedToBlocked(values);
// Barrier: exchange_values and exchange_rows alias the same smem union
__syncthreads();
// Transpose rows from warp-striped to blocked arrangement
BlockExchangeRows(temp_storage.exchange_rows).WarpStripedToBlocked(rows);
// Barrier for smem reuse and coherence (exchange buffers alias scan/discontinuity storage)
__syncthreads();
// Flag row heads by looking for discontinuities
BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
head_flags, // (Out) Head flags
rows, // Original row ids
NewRowOp(), // Functor for detecting start of new rows
prefix_op.running_prefix.row); // Last row ID from previous tile to compare with first row ID in this tile
// Assemble partial product structures
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
partial_sums[ITEM].partial = values[ITEM];
partial_sums[ITEM].row = rows[ITEM];
}
// Reduce value-by-row across partial_sums using an exclusive prefix scan,
// seeded with the running (row, partial) total carried over from prior tiles
PartialProduct block_aggregate;
BlockScan(temp_storage.scan).ExclusiveScan(
partial_sums, // Scan input
partial_sums, // Scan output
ReduceByKeyOp(), // Scan operator
block_aggregate, // Block-wide total (unused)
prefix_op); // Prefix operator for seeding the block-wide scan with the running total
// Barrier for smem reuse and coherence
__syncthreads();
// Scatter an accumulated dot product if it is the head of a valid row
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
if (head_flags[ITEM])
{
d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
// Save off the first partial product that this thread block will scatter
// (presumably fixed up later across block boundaries — TODO confirm against full file)
if (partial_sums[ITEM].row == temp_storage.first_block_row)
{
temp_storage.first_product = partial_sums[ITEM].partial;
}
}
}
}
/**
* Iterate over input tiles belonging to this thread block
*/
__device__ __forceinline__
void ProcessTiles()
{
// Process full tiles
while (block_offset <= block_end - TILE_ITEMS)
{
ProcessTile<true>(block_offset);
block_offset += TILE_ITEMS;
// ... [extraction gap: end of PersistentBlockSpmv::ProcessTiles and the FinalizeSpmvBlock declaration header elided] ...
TempStorage &temp_storage; // Reference to this block's shared memory
BlockPrefixCallbackOp<PartialProduct> prefix_op; // Carries the running (row, partial) prefix across tiles
Value *d_result; // Dense output vector y (device)
PartialProduct *d_block_partials; // Per-block partial products produced by the previous kernel
int num_partials; // Total number of partial products to reduce
//---------------------------------------------------------------------
// Operations
//---------------------------------------------------------------------
/**
 * Constructor.  Binds device pointers/extents and seeds the block-wide
 * running prefix from the first block-partial.
 */
__device__ __forceinline__
FinalizeSpmvBlock(
TempStorage &temp_storage,
Value *d_result,
PartialProduct *d_block_partials,
int num_partials)
:
temp_storage(temp_storage),
d_result(d_result),
d_block_partials(d_block_partials),
num_partials(num_partials)
{
// Initialize scalar shared memory values (single writer: thread 0)
if (threadIdx.x == 0)
{
VertexId first_block_row = d_block_partials[0].row;
VertexId last_block_row = d_block_partials[num_partials - 1].row;
temp_storage.last_block_row = last_block_row;
// Initialize prefix_op to identity: first row ID with a zero partial,
// so the first tile's scan carries in no spurious accumulation
prefix_op.running_prefix.row = first_block_row;
prefix_op.running_prefix.partial = Value(0);
}
// Publish the shared scalars before any thread reads them
__syncthreads();
}
/**
 * Processes one tile of per-block partial products (produced by the first
 * SpMV kernel), reducing value-by-row and scattering a final dot product
 * for every row that ends inside the tile.
 *
 * FULL_TILE=true  : all TILE_ITEMS partials are in bounds (unguarded loads).
 * FULL_TILE=false : only guarded_items partials are valid; out-of-bounds
 *                   slots are padded with a zero-valued partial carrying the
 *                   block's last row ID so they cannot perturb valid rows.
 */
template <bool FULL_TILE>
__device__ __forceinline__
void ProcessTile(
int block_offset,
int guarded_items = 0)
{
VertexId rows[ITEMS_PER_THREAD];
PartialProduct partial_sums[ITEMS_PER_THREAD];
HeadFlag head_flags[ITEMS_PER_THREAD];
// Load a tile of block partials from previous kernel
// (SM35+ routes through the read-only data cache)
if (FULL_TILE)
{
// Full tile
#if CUB_PTX_ARCH >= 350
LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums);
#else
LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums);
#endif
}
else
{
// Partial tile (extend zero-valued coordinates of the last partial-product for out-of-bounds items)
PartialProduct default_sum;
default_sum.row = temp_storage.last_block_row;
default_sum.partial = Value(0);
#if CUB_PTX_ARCH >= 350
LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
#else
LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
#endif
}
// Copy out row IDs for row-head flagging
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
rows[ITEM] = partial_sums[ITEM].row;
}
// Flag row heads by looking for discontinuities.
// FIX: BlockDiscontinuity::FlagHeads takes the output flags array FIRST,
// then the input items (matching the sibling call in PersistentBlockSpmv);
// the arguments were previously passed in the reverse order.
BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
head_flags, // (Out) Head flags
rows, // Original row ids
NewRowOp(), // Functor for detecting start of new rows
prefix_op.running_prefix.row); // Last row ID from previous tile to compare with first row ID in this tile
// Reduce value-by-row across partial_sums using an exclusive prefix scan,
// seeded with the running (row, partial) total carried over from prior tiles
PartialProduct block_aggregate;
BlockScan(temp_storage.scan).ExclusiveScan(
partial_sums, // Scan input
partial_sums, // Scan output
ReduceByKeyOp(), // Scan operator
block_aggregate, // Block-wide total (unused)
prefix_op); // Prefix operator for seeding the block-wide scan with the running total
// Scatter an accumulated dot product if it is the head of a valid row
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
if (head_flags[ITEM])
{
d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
}
}
}
/**
* Iterate over input tiles belonging to this thread block
*/
__device__ __forceinline__
void ProcessTiles()
{
// Process full tiles
int block_offset = 0;
while (block_offset <= num_partials - TILE_ITEMS)
{
ProcessTile<true>(block_offset);
block_offset += TILE_ITEMS;
}
// Process final partial tile (if present)
int guarded_items = num_partials - block_offset;
if (guarded_items)
{
ProcessTile<false>(block_offset, guarded_items);
}
// Scatter the final aggregate (this kernel contains only 1 threadblock)
// ... [end of excerpt: remainder of file elided] ...