Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/cub/cub/agent/agent_spmv_orig.cuh view on Meta::CPAN
ValueT vector_value = wd_vector_x[column_idx];
row_total += value * vector_value;
}
// Warp reduce
row_total = WarpReduceT(temp_storage.warp_reduce[warp_idx]).Sum(row_total);
// Output
if (lane_idx == 0)
{
spmv_params.d_vector_y[tile_start_coord.x + warp_coord.x] = row_total;
}
}
// Return the tile's running carry-out
KeyValuePairT tile_carry(tile_num_rows, 0.0);
return tile_carry;
}
*/
/**
* Consume a merge tile, specialized for indirect load of nonzeros
* /
__device__ __forceinline__ KeyValuePairT ConsumeTile2(
int tile_idx,
CoordinateT tile_start_coord,
CoordinateT tile_end_coord,
Int2Type<false> is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
{
int tile_num_rows = tile_end_coord.x - tile_start_coord.x;
int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y;
ValueT* s_tile_nonzeros = &temp_storage.merge_items[0].nonzero;
ValueT nonzeros[ITEMS_PER_THREAD];
// Gather the nonzeros for the merge tile into shared memory
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);
OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx];
ValueT value = wd_values[tile_start_coord.y + nonzero_idx];
ValueT vector_value = spmv_params.t_vector_x[column_idx];
#if (CUB_PTX_ARCH >= 350)
vector_value = wd_vector_x[column_idx];
#endif
nonzeros[ITEM] = value * vector_value;
}
// Exchange striped->blocked
BlockExchangeT(temp_storage.exchange).StripedToBlocked(nonzeros);
CTA_SYNC();
// Compute an inclusive prefix sum
BlockPrefixSumT(temp_storage.prefix_sum).InclusiveSum(nonzeros, nonzeros);
CTA_SYNC();
if (threadIdx.x == 0)
s_tile_nonzeros[0] = 0.0;
// Scatter back to smem
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM + 1;
s_tile_nonzeros[item_idx] = nonzeros[ITEM];
}
CTA_SYNC();
// Gather the row end-offsets for the merge tile into shared memory
#pragma unroll 1
for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
{
OffsetT start = CUB_MAX(wd_row_end_offsets[tile_start_coord.x + item - 1], tile_start_coord.y);
OffsetT end = wd_row_end_offsets[tile_start_coord.x + item];
start -= tile_start_coord.y;
end -= tile_start_coord.y;
ValueT row_partial = s_tile_nonzeros[end] - s_tile_nonzeros[start];
spmv_params.d_vector_y[tile_start_coord.x + item] = row_partial;
}
// Get the tile's carry-out
KeyValuePairT tile_carry;
if (threadIdx.x == 0)
{
tile_carry.key = tile_num_rows;
OffsetT start = CUB_MAX(wd_row_end_offsets[tile_end_coord.x - 1], tile_start_coord.y);
start -= tile_start_coord.y;
OffsetT end = tile_num_nonzeros;
tile_carry.value = s_tile_nonzeros[end] - s_tile_nonzeros[start];
}
// Return the tile's running carry-out
return tile_carry;
}
*/
/**
* Consume input tile
*/
__device__ __forceinline__ void ConsumeTile(
CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates
( run in 1.284 second using v1.01-cache-2.11-cpan-39bf76dae61 )