view release on metacpan or search on metacpan
xgboost/cub/README.md
xgboost/cub/common.mk
xgboost/cub/cub/agent/agent_histogram.cuh
xgboost/cub/cub/agent/agent_radix_sort_downsweep.cuh
xgboost/cub/cub/agent/agent_radix_sort_upsweep.cuh
xgboost/cub/cub/agent/agent_reduce.cuh
xgboost/cub/cub/agent/agent_reduce_by_key.cuh
xgboost/cub/cub/agent/agent_rle.cuh
xgboost/cub/cub/agent/agent_scan.cuh
xgboost/cub/cub/agent/agent_segment_fixup.cuh
xgboost/cub/cub/agent/agent_select_if.cuh
xgboost/cub/cub/agent/agent_spmv_csrt.cuh
xgboost/cub/cub/agent/agent_spmv_orig.cuh
xgboost/cub/cub/agent/agent_spmv_row_based.cuh
xgboost/cub/cub/agent/single_pass_scan_operators.cuh
xgboost/cub/cub/block/block_adjacent_difference.cuh
xgboost/cub/cub/block/block_discontinuity.cuh
xgboost/cub/cub/block/block_exchange.cuh
xgboost/cub/cub/block/block_histogram.cuh
xgboost/cub/cub/block/block_load.cuh
xgboost/cub/cub/block/block_radix_rank.cuh
xgboost/cub/cub/block/specializations/block_scan_warp_scans3.cuh
xgboost/cub/cub/cub.cuh
xgboost/cub/cub/device/device_histogram.cuh
xgboost/cub/cub/device/device_partition.cuh
xgboost/cub/cub/device/device_radix_sort.cuh
xgboost/cub/cub/device/device_reduce.cuh
xgboost/cub/cub/device/device_run_length_encode.cuh
xgboost/cub/cub/device/device_scan.cuh
xgboost/cub/cub/device/device_segmented_radix_sort.cuh
xgboost/cub/cub/device/device_segmented_reduce.cuh
xgboost/cub/cub/device/device_select.cuh
xgboost/cub/cub/device/device_spmv.cuh
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh
xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
xgboost/cub/cub/device/dispatch/dispatch_rle.cuh
xgboost/cub/cub/device/dispatch/dispatch_scan.cuh
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh
xgboost/cub/cub/device/dispatch/dispatch_spmv_csrt.cuh
xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh
xgboost/cub/cub/grid/grid_barrier.cuh
xgboost/cub/cub/grid/grid_even_share.cuh
xgboost/cub/cub/grid/grid_mapping.cuh
xgboost/cub/cub/grid/grid_queue.cuh
xgboost/cub/cub/host/mutex.cuh
xgboost/cub/cub/iterator/arg_index_input_iterator.cuh
xgboost/cub/cub/iterator/cache_modified_input_iterator.cuh
xgboost/cub/examples/block/example_block_radix_sort.cu
xgboost/cub/examples/block/example_block_reduce.cu
xgboost/cub/examples/block/example_block_scan.cu
xgboost/cub/examples/block/reduce_by_key.cu
xgboost/cub/examples/device/Makefile
xgboost/cub/examples/device/example_device_partition_flagged.cu
xgboost/cub/examples/device/example_device_partition_if.cu
xgboost/cub/examples/device/example_device_radix_sort.cu
xgboost/cub/examples/device/example_device_reduce.cu
xgboost/cub/examples/device/example_device_scan.cu
xgboost/cub/examples/device/example_device_select_flagged.cu
xgboost/cub/examples/device/example_device_select_if.cu
xgboost/cub/examples/device/example_device_select_unique.cu
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu
xgboost/cub/experimental/Makefile
xgboost/cub/experimental/defunct/example_coo_spmv.cu
xgboost/cub/experimental/defunct/test_device_seg_reduce.cu
xgboost/cub/experimental/histogram/histogram_cub.h
xgboost/cub/experimental/histogram/histogram_gmem_atomics.h
xgboost/cub/experimental/histogram/histogram_smem_atomics.h
xgboost/cub/experimental/histogram_compare.cu
xgboost/cub/experimental/sparse_matrix.h
xgboost/cub/experimental/spmv_compare.cu
xgboost/cub/test/test_block_load_store.cu
xgboost/cub/test/test_block_radix_sort.cu
xgboost/cub/test/test_block_reduce.cu
xgboost/cub/test/test_block_scan.cu
xgboost/cub/test/test_device_histogram.cu
xgboost/cub/test/test_device_radix_sort.cu
xgboost/cub/test/test_device_reduce.cu
xgboost/cub/test/test_device_reduce_by_key.cu
xgboost/cub/test/test_device_run_length_encode.cu
xgboost/cub/test/test_device_scan.cu
xgboost/cub/test/test_device_select_if.cu
xgboost/cub/test/test_device_select_unique.cu
xgboost/cub/test/test_grid_barrier.cu
xgboost/cub/test/test_iterator.cu
xgboost/cub/test/test_util.h
xgboost/cub/test/test_warp_reduce.cu
xgboost/cub/test/test_warp_scan.cu
xgboost/cub/tune/Makefile
xgboost/cub/tune/tune_device_reduce.cu
xgboost/demo/README.md
xgboost/demo/binary_classification/README.md
xgboost/demo/binary_classification/agaricus-lepiota.data
xgboost/R-package/configure view on Meta::CPAN
cat >>confdefs.h <<_ACEOF
#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT"
_ACEOF
cat >>confdefs.h <<_ACEOF
#define PACKAGE_URL "$PACKAGE_URL"
_ACEOF
# Let the site file select an alternate cache file if it wants to.
# Prefer an explicitly selected file to automatically selected ones.
ac_site_file1=NONE
ac_site_file2=NONE
if test -n "$CONFIG_SITE"; then
# We do not want a PATH search for config.site.
case $CONFIG_SITE in #((
-*) ac_site_file1=./$CONFIG_SITE;;
*/*) ac_site_file1=$CONFIG_SITE;;
*) ac_site_file1=./$CONFIG_SITE;;
esac
elif test "x$prefix" != xNONE; then
xgboost/R-package/demo/caret_wrapper.R view on Meta::CPAN
require(xgboost)
require(data.table)
require(vcd)
require(e1071)
# Load Arthritis dataset in memory.
data(Arthritis)
# Create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance is really good).
df <- data.table(Arthritis, keep.rownames = F)
# Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, e...
# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treats them as independent values.
df[,AgeDiscret:= as.factor(round(Age/10,0))]
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you alre...
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[,ID:=NULL]
#-------------Basic Training using XGBoost in caret Library-----------------
xgboost/R-package/demo/create_sparse_matrix.R view on Meta::CPAN
df <- data.table(Arthritis, keep.rownames = F)
# Let's have a look to the data.table
cat("Print the dataset\n")
print(df)
# 2 columns have factor type, one has ordinal type (an ordinal variable is a categorical variable with values which can be ordered, here: None > Some > Marked).
cat("Structure of the dataset\n")
str(df)
# Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, e...
# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treats them as independent values.
df[,AgeDiscret:= as.factor(round(Age/10,0))]
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you alre...
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[,ID:=NULL]
xgboost/R-package/vignettes/discoverYourData.Rmd view on Meta::CPAN
For example, the column `Treatment` will be replaced by two columns, `TreatmentPlacebo`, and `TreatmentTreated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation wi...
Column `Improved` is excluded because it will be our `label` column, the one we want to predict.
```{r, warning=FALSE,message=FALSE}
sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[,-1]
head(sparse_matrix)
```
> Formula `Improved ~ .` used above means transform all *categorical* features but column `Improved` to binary values. The `-1` column selection removes the intercept column which is full of `1` (this column is generated by the conversion). For more ...
Create the output `numeric` vector (not as a sparse `Matrix`):
```{r}
output_vector = df[,Improved] == "Marked"
```
1. set `Y` vector to `0`;
2. set `Y` to `1` for rows where `Improved == Marked` is `TRUE` ;
3. return `Y` vector.
xgboost/R-package/vignettes/discoverYourData.Rmd view on Meta::CPAN
```{r}
importanceRaw <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst, data = sparse_matrix, label = output_vector)
# Cleaning for better display
importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
head(importanceClean)
```
> In the table above we have removed two unneeded columns and selected only the first lines.
First thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the tree. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature `Age` is used seve...
How is the split applied to count the co-occurrences? It is always `<`. For instance, in the second line, we measure the number of persons under 61.5 years with the illness gone after the treatment.
The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observations in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the whole populati...
Therefore, according to our findings, getting a placebo doesn't seem to help but being younger than 61 years may help (seems logic).
> You may wonder how to interpret the `< 1.00001` on the first line. Basically, in a sparse `Matrix`, there is no `0`, therefore, looking for one hot-encoded categorical observations validating the rule `< 1.00001` is like just looking for `1` for th...
xgboost/cub/CHANGE_LOG.TXT view on Meta::CPAN
- Fixed BlockScan bug where certain exclusive scans on custom data types for the BLOCK_SCAN_WARP_SCANS variant would return incorrect results for the first thread in the block
- Added workaround for TexRefInputIteratorT to work with CUDA 6.0
//-----------------------------------------------------------------------------
1.1.1 12/11/2013
- New features:
- Added TexObjInputIteratorT, TexRefInputIteratorT, CacheModifiedInputIteratorT, and CacheModifiedOutputIterator types for loading & storing arbitrary types through the cache hierarchy. Compatible with Thrust API.
- Added descending sorting to DeviceRadixSort and BlockRadixSort
- Added min, max, arg-min, and arg-max to DeviceReduce
- Added DeviceSelect (select-unique, select-if, and select-flagged)
- Added DevicePartition (partition-if, partition-flagged)
- Added generic cub::ShuffleUp(), cub::ShuffleDown(), and cub::ShuffleIndex() for warp-wide communication of arbitrary data types (SM3x+)
- Added cub::MaxSmOccupancy() for accurately determining SM occupancy for any given kernel function pointer
- Performance
- Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
- Interface changes:
- Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces. The CacheModifiedInputIteratorT and CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
- Rename device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
- Documentation and testing:
- Added simple examples of device-wide methods
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
/**
* \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
*/
template <
typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicy tuning policy type
typename KeysInputIteratorT, ///< Random-access input iterator type for keys
typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys
typename ValuesInputIteratorT, ///< Random-access input iterator type for values
typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of items selected
typename EqualityOpT, ///< KeyT equality operator type
typename ReductionOpT, ///< ValueT reduction operator type
typename OffsetT> ///< Signed integer type for global offsets
struct AgentReduceByKey
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// The input keys type
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
/**
* Scatter flagged items
*/
__device__ __forceinline__ void Scatter(
KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD],
OffsetT (&segment_flags)[ITEMS_PER_THREAD],
OffsetT (&segment_indices)[ITEMS_PER_THREAD],
OffsetT num_tile_segments,
OffsetT num_tile_segments_prefix)
{
// Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one
if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS))
{
ScatterTwoPhase(
scatter_items,
segment_flags,
segment_indices,
num_tile_segments,
num_tile_segments_prefix);
}
else
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
OffsetT num_segments = num_segments_prefix + num_tile_segments;
// If the last tile is a whole tile, output the final_value
if (num_remaining == TILE_ITEMS)
{
d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1];
d_aggregates_out[num_segments] = total_aggregate;
num_segments++;
}
// Output the total number of items selected
*d_num_runs_out = num_segments;
}
}
/**
* Scan tiles of items as part of a dynamic chained scan
*/
__device__ __forceinline__ void ConsumeRange(
int num_items, ///< Total number of input items
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
d_in(d_in),
d_offsets_out(d_offsets_out),
d_lengths_out(d_lengths_out),
equality_op(equality_op),
scan_op(cub::Sum()),
num_items(num_items)
{}
//---------------------------------------------------------------------
// Utility methods for initializing the selections
//---------------------------------------------------------------------
template <bool FIRST_TILE, bool LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
OffsetT tile_offset,
OffsetT num_remaining,
T (&items)[ITEMS_PER_THREAD],
LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD])
{
bool head_flags[ITEMS_PER_THREAD];
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
thread_exclusive_in_warp,
identity,
scan_op);
// Last lane in each warp shares its warp-aggregate
if (lane_id == WARP_THREADS - 1)
temp_storage.warp_aggregates.Alias()[warp_id] = thread_inclusive;
CTA_SYNC();
// Accumulate total selected and the warp-wide prefix
warp_exclusive_in_tile = identity;
warp_aggregate = temp_storage.warp_aggregates.Alias()[warp_id];
tile_aggregate = temp_storage.warp_aggregates.Alias()[0];
#pragma unroll
for (int WARP = 1; WARP < WARPS; ++WARP)
{
if (warp_id == WARP)
warp_exclusive_in_tile = tile_aggregate;
tile_aggregate = scan_op(tile_aggregate, temp_storage.warp_aggregates.Alias()[WARP]);
}
}
//---------------------------------------------------------------------
// Utility methods for scattering selections
//---------------------------------------------------------------------
/**
* Two-phase scatter, specialized for warp time-slicing
*/
template <bool FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
OffsetT tile_num_runs_exclusive_in_global,
OffsetT warp_num_runs_aggregate,
OffsetT warp_num_runs_exclusive_in_tile,
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
// Return running total (inclusive of this tile)
return prefix_op.inclusive_prefix;
}
}
/**
* Scan tiles of items as part of a dynamic chained scan
*/
template <typename NumRunsIteratorT> ///< Output iterator type for recording number of items selected
__device__ __forceinline__ void ConsumeRange(
int num_tiles, ///< Total number of input tiles
ScanTileStateT& tile_status, ///< Global list of tile status
NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified
{
// Blocks are launched in increasing order, so just assign one tile per block
int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile
OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile)
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
// Not the last tile (full)
ConsumeTile<false>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
}
else if (num_remaining > 0)
{
// The last tile (possibly partially-full)
LengthOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
if (threadIdx.x == 0)
{
// Output the total number of items selected
*d_num_runs_out = running_total.key;
// The inclusive prefix contains accumulated length reduction for the last run
if (running_total.key > 0)
d_lengths_out[running_total.key - 1] = running_total.value;
}
}
}
};
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select.
*/
#pragma once
#include <iterator>
#include "single_pass_scan_operators.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection
*
* Performs functor-based selection if SelectOpT functor type != NullType
* Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
* Otherwise performs discontinuity selection (keep unique)
*/
template <
typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type
typename InputIteratorT, ///< Random-access input iterator type for selection items
typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items
typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
typename OffsetT, ///< Signed integer type for global offsets
bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output
struct AgentSelectIf
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
_TempStorage& temp_storage; ///< Reference to temp_storage
WrappedInputIteratorT d_in; ///< Input items
SelectedOutputIteratorT d_selected_out; ///< Unique output items
WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable)
InequalityWrapper<EqualityOpT> inequality_op; ///< T inequality operator
SelectOpT select_op; ///< Selection operator
OffsetT num_items; ///< Total number of input items
//---------------------------------------------------------------------
// Constructor
//---------------------------------------------------------------------
// Constructor
/// Binds the agent to its inputs/outputs and operators; performs no work itself.
__device__ __forceinline__
AgentSelectIf(
    TempStorage &temp_storage,              ///< Reference to temp_storage
    InputIteratorT d_in,                    ///< Input data
    FlagsInputIteratorT d_flags_in,         ///< Input selection flags (if applicable)
    SelectedOutputIteratorT d_selected_out, ///< Output data
    SelectOpT select_op,                    ///< Selection operator
    EqualityOpT equality_op,                ///< Equality operator
    OffsetT num_items)                      ///< Total number of input items
:
    temp_storage(temp_storage.Alias()),     // Alias() exposes the Uninitialized<_TempStorage> payload
    d_in(d_in),
    d_flags_in(d_flags_in),
    d_selected_out(d_selected_out),
    select_op(select_op),
    inequality_op(equality_op),             // equality_op is wrapped so discontinuity detection can test "not equal"
    num_items(num_items)
{}
//---------------------------------------------------------------------
// Utility methods for initializing the selections
//---------------------------------------------------------------------
/**
 * Initialize selections (specialized for selection operator)
 *
 * Applies the user-supplied select_op to every in-bounds item.  On a
 * partial last tile, out-of-bounds items are flagged as selected (1);
 * the caller later discounts exactly that overcount from the tile total.
 */
template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
    OffsetT                     /*tile_offset*/,
    OffsetT                     num_tile_items,
    OutputT                     (&items)[ITEMS_PER_THREAD],
    OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
    Int2Type<USE_SELECT_OP>     /*select_method*/)
{
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        // Blocked arrangement: this thread owns items [threadIdx.x * ITEMS_PER_THREAD, +ITEMS_PER_THREAD)
        bool in_bounds = (!IS_LAST_TILE) || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items);
        if (in_bounds)
            selection_flags[ITEM] = select_op(items[ITEM]);
        else
            selection_flags[ITEM] = 1;      // out-of-bounds items default to "selected"
    }
}
/**
 * Initialize selections (specialized for valid flags)
 *
 * Loads per-item flags from d_flags_in and copies them into
 * selection_flags.  On a partial last tile, out-of-bounds items receive
 * a default flag of 1 ("selected"); the caller discounts these later.
 */
template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
    OffsetT                     tile_offset,
    OffsetT                     num_tile_items,
    OutputT                     (&/*items*/)[ITEMS_PER_THREAD],
    OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
    Int2Type<USE_SELECT_FLAGS>  /*select_method*/)
{
    // NOTE(review): sync before the flag load — presumably because
    // load_flags aliases temp storage used by the preceding item load;
    // confirm against the _TempStorage union layout.
    CTA_SYNC();

    FlagT flags[ITEMS_PER_THREAD];

    if (IS_LAST_TILE)
    {
        // Guarded load: out-of-bounds items get the default flag value 1
        BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1);
    }
    else
    {
        BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags);
    }

    // Convert flag type to selection_flags type
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        selection_flags[ITEM] = flags[ITEM];
    }
}
/**
 * Initialize selections (specialized for discontinuity detection, i.e.
 * select-unique)
 *
 * Flags an item as selected when it differs from its predecessor.  The
 * first tile treats its very first item as a head; subsequent tiles read
 * the item just before the tile (d_in[tile_offset - 1]) so comparisons
 * are seamless across tile boundaries.
 */
template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
    OffsetT                     tile_offset,
    OffsetT                     num_tile_items,
    OutputT                     (&items)[ITEMS_PER_THREAD],
    OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
    Int2Type<USE_DISCONTINUITY> /*select_method*/)
{
    if (IS_FIRST_TILE)
    {
        // NOTE(review): sync presumably guards reuse of union'd temp
        // storage before the discontinuity pass — confirm layout.
        CTA_SYNC();

        // Set head selection_flags.  First tile sets the first flag for the first item
        BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op);
    }
    else
    {
        OutputT tile_predecessor;
        if (threadIdx.x == 0)
            tile_predecessor = d_in[tile_offset - 1];   // presumably only thread 0's value is consumed by FlagHeads — confirm

        CTA_SYNC();

        BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor);
    }

    // Set selection flags for out-of-bounds items
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        // Flag out-of-bounds items as selected (1); discounted by the caller
        if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
            selection_flags[ITEM] = 1;
    }
}
//---------------------------------------------------------------------
// Scatter utility methods
//---------------------------------------------------------------------
/**
 * Scatter flagged items to output offsets (specialized for direct scattering)
 *
 * Each thread writes its selected items straight to d_selected_out at the
 * offsets computed by the prefix scan.  On the last (partial) tile,
 * indices at or beyond num_selections belong to out-of-bounds items and
 * are suppressed.
 */
template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
__device__ __forceinline__ void ScatterDirect(
    OutputT (&items)[ITEMS_PER_THREAD],
    OffsetT (&selection_flags)[ITEMS_PER_THREAD],
    OffsetT (&selection_indices)[ITEMS_PER_THREAD],
    OffsetT num_selections)
{
    // Scatter flagged items
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        bool is_selected = (selection_flags[ITEM] != 0);
        bool index_valid = (!IS_LAST_TILE) || (selection_indices[ITEM] < num_selections);
        if (is_selected && index_valid)
        {
            d_selected_out[selection_indices[ITEM]] = items[ITEM];
        }
    }
}
/**
 * Scatter flagged items to output offsets (specialized for two-phase scattering)
 *
 * Compacting variant (selections only).  Phase 1: each thread deposits
 * its selected items into the exchange buffer at their tile-local rank.
 * Phase 2: the block copies the now-compacted run to d_selected_out with
 * block-strided writes.
 */
template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
    OutputT         (&items)[ITEMS_PER_THREAD],
    OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
    OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
    int             /*num_tile_items*/,         ///< Number of valid items in this tile
    int             num_tile_selections,        ///< Number of selections in this tile
    OffsetT         num_selections_prefix,      ///< Total number of selections prior to this tile
    OffsetT         /*num_rejected_prefix*/,    ///< Total number of rejections prior to this tile
    Int2Type<false> /*is_keep_rejects*/)        ///< Marker type indicating whether to keep rejected items in the second partition
{
    // NOTE(review): sync presumably guards reuse of the union'd exchange
    // buffer by earlier phases — confirm against _TempStorage layout.
    CTA_SYNC();

    // Compact and scatter items
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        // Global selection index -> rank within this tile
        int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix;
        if (selection_flags[ITEM])
        {
            temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
        }
    }

    CTA_SYNC();

    // Block-strided copy of the compacted selections out to global memory
    for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
    {
        d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
    }
}
/**
 * Scatter flagged items to output offsets (specialized for two-phase scattering)
 *
 * Partitioning variant (KEEP_REJECTS).  The tile is staged in shared
 * memory with rejected items first and selected items after; selections
 * are then written to the front of the output sequence while rejections
 * fill the output tail in reverse order.
 */
template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
    OutputT         (&items)[ITEMS_PER_THREAD],
    OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
    OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
    int             num_tile_items,         ///< Number of valid items in this tile
    int             num_tile_selections,    ///< Number of selections in this tile
    OffsetT         num_selections_prefix,  ///< Total number of selections prior to this tile
    OffsetT         num_rejected_prefix,    ///< Total number of rejections prior to this tile
    Int2Type<true>  /*is_keep_rejects*/)    ///< Marker type indicating whether to keep rejected items in the second partition
{
    CTA_SYNC();

    int tile_num_rejections = num_tile_items - num_tile_selections;

    // Scatter items to shared memory (rejections first)
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        int item_idx             = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;      // blocked arrangement (thread owns a contiguous run)
        int local_selection_idx  = selection_indices[ITEM] - num_selections_prefix;
        int local_rejection_idx  = item_idx - local_selection_idx;               // rank among this tile's rejections
        int local_scatter_offset = (selection_flags[ITEM]) ?
            tile_num_rejections + local_selection_idx :                          // selections placed after all rejections
            local_rejection_idx;

        temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
    }

    CTA_SYNC();

    // Gather items from shared memory and scatter to global
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        int     item_idx       = (ITEM * BLOCK_THREADS) + threadIdx.x;           // striped arrangement for the copy-out
        int     rejection_idx  = item_idx;
        int     selection_idx  = item_idx - tile_num_rejections;
        OffsetT scatter_offset = (item_idx < tile_num_rejections) ?
            num_items - num_rejected_prefix - rejection_idx - 1 :                // rejections fill the output tail, reversed
            num_selections_prefix + selection_idx;                               // selections fill the output head

        OutputT item = temp_storage.raw_exchange.Alias()[item_idx];

        // Skip out-of-bounds slots of a partial last tile
        if (!IS_LAST_TILE || (item_idx < num_tile_items))
        {
            d_selected_out[scatter_offset] = item;
        }
    }
}
/**
 * Scatter flagged items
 *
 * Chooses between direct scattering and two-phase (shared-memory)
 * scattering.  Two-phase is mandatory when both partitions are kept
 * (KEEP_REJECTS); otherwise it is used only when enabled and this tile
 * holds more selections than there are threads in the block.
 */
template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
__device__ __forceinline__ void Scatter(
    OutputT (&items)[ITEMS_PER_THREAD],
    OffsetT (&selection_flags)[ITEMS_PER_THREAD],
    OffsetT (&selection_indices)[ITEMS_PER_THREAD],
    int     num_tile_items,         ///< Number of valid items in this tile
    int     num_tile_selections,    ///< Number of selections in this tile
    OffsetT num_selections_prefix,  ///< Total number of selections prior to this tile
    OffsetT num_rejected_prefix,    ///< Total number of rejections prior to this tile
    OffsetT num_selections)         ///< Total number of selections including this tile
{
    bool use_two_phase = KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS));

    if (!use_two_phase)
    {
        ScatterDirect<IS_LAST_TILE, IS_FIRST_TILE>(
            items,
            selection_flags,
            selection_indices,
            num_selections);
    }
    else
    {
        ScatterTwoPhase<IS_LAST_TILE, IS_FIRST_TILE>(
            items,
            selection_flags,
            selection_indices,
            num_tile_items,
            num_tile_selections,
            num_selections_prefix,
            num_rejected_prefix,
            Int2Type<KEEP_REJECTS>());
    }
}
//---------------------------------------------------------------------
// Cooperatively scan a device-wide sequence of tiles with other CTAs
//---------------------------------------------------------------------
/**
 * Process first tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
 *
 * Unlike subsequent tiles, no prefix callback is needed here: the
 * exclusive selection prefix of tile 0 is zero.  The tile scans its own
 * flags, publishes its inclusive total to tile_state (for tile 1's
 * look-up), and scatters.
 */
template <bool IS_LAST_TILE>
__device__ __forceinline__ OffsetT ConsumeFirstTile(
    int             num_tile_items,     ///< Number of input items comprising this tile
    OffsetT         tile_offset,        ///< Tile offset
    ScanTileStateT& tile_state)         ///< Global tile state descriptor
{
    OutputT items[ITEMS_PER_THREAD];
    OffsetT selection_flags[ITEMS_PER_THREAD];
    OffsetT selection_indices[ITEMS_PER_THREAD];

    // Load items (guarded load if this tile is partial)
    if (IS_LAST_TILE)
        BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
    else
        BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);

    // Initialize selection_flags (dispatches on SELECT_METHOD: functor, flags, or discontinuity)
    InitializeSelections<true, IS_LAST_TILE>(
        tile_offset,
        num_tile_items,
        items,
        selection_flags,
        Int2Type<SELECT_METHOD>());

    CTA_SYNC();

    // Exclusive scan of selection_flags yields each item's output rank and the tile's total
    OffsetT num_tile_selections;
    BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections);

    if (threadIdx.x == 0)
    {
        // Update tile status if this is not the last tile
        if (!IS_LAST_TILE)
            tile_state.SetInclusive(0, num_tile_selections);
    }

    // Discount any out-of-bounds selections: InitializeSelections flagged
    // every out-of-bounds item with 1, so the overcount is exactly
    // TILE_ITEMS - num_tile_items
    if (IS_LAST_TILE)
        num_tile_selections -= (TILE_ITEMS - num_tile_items);

    // Scatter flagged items
    Scatter<IS_LAST_TILE, true>(
        items,
        selection_flags,
        selection_indices,
        num_tile_items,
        num_tile_selections,
        0,                      // no selections before the first tile
        0,                      // no rejections before the first tile
        num_tile_selections);

    return num_tile_selections;
}
/**
 * Process subsequent tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
 */
template <bool IS_LAST_TILE>
__device__ __forceinline__ OffsetT ConsumeSubsequentTile(
    int             num_tile_items,     ///< Number of input items comprising this tile
    int             tile_idx,           ///< Tile index
    OffsetT         tile_offset,        ///< Tile offset
    ScanTileStateT& tile_state)         ///< Global tile state descriptor
{
    OutputT items[ITEMS_PER_THREAD];
    OffsetT selection_flags[ITEMS_PER_THREAD];
    OffsetT selection_indices[ITEMS_PER_THREAD];

    // Load items (guarded load if this tile is partial)
    if (IS_LAST_TILE)
        BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
    else
        BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);

    // Initialize selection_flags (dispatches on SELECT_METHOD: functor, flags, or discontinuity)
    InitializeSelections<false, IS_LAST_TILE>(
        tile_offset,
        num_tile_items,
        items,
        selection_flags,
        Int2Type<SELECT_METHOD>());

    CTA_SYNC();

    // Exclusive scan of values and selection_flags.  NOTE(review): the
    // prefix callback presumably obtains the running selection count from
    // predecessor tiles via tile_state and publishes this tile's
    // inclusive total — confirm in single_pass_scan_operators.cuh.
    TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx);
    BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op);

    OffsetT num_tile_selections   = prefix_op.GetBlockAggregate();     // selections in this tile only
    OffsetT num_selections        = prefix_op.GetInclusivePrefix();    // running total, including this tile
    OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix();    // running total before this tile
    OffsetT num_rejected_prefix   = (tile_idx * TILE_ITEMS) - num_selections_prefix;

    // Discount any out-of-bounds selections (out-of-bounds items were flagged with 1)
    if (IS_LAST_TILE)
    {
        int num_discount    = TILE_ITEMS - num_tile_items;
        num_selections      -= num_discount;
        num_tile_selections -= num_discount;
    }

    // Scatter flagged items
    Scatter<IS_LAST_TILE, false>(
        items,
        selection_flags,
        selection_indices,
        num_tile_items,
        num_tile_selections,
        num_selections_prefix,
        num_rejected_prefix,
        num_selections);

    return num_selections;
}
/**
 * Process a tile of input.  Returns the running count of selections
 * (including this tile).
 *
 * Dispatches on tile position: tile 0 seeds the chained scan (no predecessor
 * status to wait on), while every later tile consumes its predecessors' state.
 */
template <bool IS_LAST_TILE>
__device__ __forceinline__ OffsetT ConsumeTile(
    int num_tile_items, ///< Number of input items comprising this tile
    int tile_idx, ///< Tile index
    OffsetT tile_offset, ///< Tile offset
    ScanTileStateT& tile_state) ///< Global tile state descriptor
{
    return (tile_idx == 0) ?
        ConsumeFirstTile<IS_LAST_TILE>(num_tile_items, tile_offset, tile_state) :
        ConsumeSubsequentTile<IS_LAST_TILE>(num_tile_items, tile_idx, tile_offset, tile_state);
}
/**
 * Scan tiles of items as part of a dynamic chained scan.
 *
 * Each thread block processes exactly one tile; only the block that owns the
 * last (possibly partial) tile writes out the grand total of selected items.
 */
template <typename NumSelectedIteratorT> ///< Output iterator type for recording number of items selection_flags
__device__ __forceinline__ void ConsumeRange(
    int num_tiles, ///< Total number of input tiles
    ScanTileStateT& tile_state, ///< Global tile state descriptor
    NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags
{
    // Blocks are launched in increasing order, so just assign one tile per block
    int tile = (blockIdx.x * gridDim.y) + blockIdx.y;  // Current tile index
    OffsetT offset = tile * TILE_ITEMS;                // Global offset for the current tile
    if (tile < num_tiles - 1)
    {
        // Interior tile: always full, and its running count is not the final answer
        ConsumeTile<false>(TILE_ITEMS, tile, offset, tile_state);
        return;
    }
    // Last tile (possibly partially-full): its inclusive running count is the grand total
    OffsetT items_remaining = num_items - offset;
    OffsetT total_selected = ConsumeTile<true>(items_remaining, tile, offset, tile_state);
    if (threadIdx.x == 0)
    {
        // Record the total number of selected items
        *d_num_selected_out = total_selected;
    }
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/block/block_scan.cuh view on Meta::CPAN
* - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
* -# <b>cub::BLOCK_SCAN_RAKING</b>. An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
* -# <b>cub::BLOCK_SCAN_RAKING_MEMOIZE</b>. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
* -# <b>cub::BLOCK_SCAN_WARP_SCANS</b>. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
*
* \par Performance Considerations
* - \granularity
* - Uses special instructions when applicable (e.g., warp \p SHFL)
* - Uses synchronization-free communication between warp lanes when applicable
* - Invokes a minimal number of minimal block-wide synchronization barriers (only
* one or two depending on algorithm selection)
* - Incurs zero bank conflicts for most types
* - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
* - Prefix sum variants (<b><em>vs.</em></b> generic scan)
* - \blocksize
* - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
*
* \par A Simple Example
* \blockcollective{BlockScan}
* \par
* The code snippet below illustrates an exclusive prefix sum of 512 integer items that
xgboost/cub/cub/cub.cuh view on Meta::CPAN
// Device
#include "device/device_histogram.cuh"
#include "device/device_partition.cuh"
#include "device/device_radix_sort.cuh"
#include "device/device_reduce.cuh"
#include "device/device_run_length_encode.cuh"
#include "device/device_scan.cuh"
#include "device/device_segmented_radix_sort.cuh"
#include "device/device_segmented_reduce.cuh"
#include "device/device_select.cuh"
#include "device/device_spmv.cuh"
// Grid
//#include "grid/grid_barrier.cuh"
#include "grid/grid_even_share.cuh"
#include "grid/grid_mapping.cuh"
#include "grid/grid_queue.cuh"
// Thread
#include "thread/thread_load.cuh"
xgboost/cub/cub/device/device_partition.cuh view on Meta::CPAN
/**
* \file
* cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include "dispatch/dispatch_select_if.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. 
* \ingroup SingleModule
*
* \par Overview
* These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from
* a specified input sequence.
*
* \par Usage Considerations
* \cdp_class{DevicePartition}
*
* \par Performance
* \linear_performance{partition}
*
* \par
* The following chart illustrates DevicePartition::If
* performance across different CUDA architectures for \p int32 items,
* where 50% of the items are randomly selected for the first partition.
* \plots_below
*
* \image html partition_if_int32_50_percent.png
*
*/
struct DevicePartition
{
    /**
     * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out.
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original
     *   relative ordering, however copies of the unselected items are compacted into the
     *   rear of \p d_out in reverse order.
     * - \devicestorage
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
     *
     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // d_out                 <-- [1, 4, 6, 7, 8, 5, 3, 2]
     * // d_num_selected_out    <-- [4]
     *
     * \endcode
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     */
    template <
        typename InputIteratorT,
        typename FlagIterator,
        typename OutputIteratorT,
        typename NumSelectedIteratorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
        FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int num_items, ///< [in] Total number of items to select from
        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        typedef int OffsetT; // Signed integer type for global offsets
        typedef NullType SelectOp; // Selection op (not used)
        typedef NullType EqualityOp; // Equality operator (not used)
        // Final template argument (KEEP_REJECTS = true) selects the partitioning
        // variant: rejected items are compacted into the rear of d_out.
        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
    /**
     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out.
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original
     *   relative ordering, however copies of the unselected items are compacted into the
     *   rear of \p d_out in reverse order.
     * - \devicestorage
     *
     * \par Performance
     * The following charts illustrate saturated partition-if performance across different
     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
     * selected for the first partition with 50% probability.
     *
     * \image html partition_if_int32_50_percent.png
     * \image html partition_if_int64_50_percent.png
     *
     * \par
     * The following charts are similar, but 5% selection probability for the first partition:
     *
     * \image html partition_if_int32_5_percent.png
     * \image html partition_if_int64_5_percent.png
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
     *
     * // Functor type for selecting values less than some criteria
     * struct LessThan
     * {
     *     int compare;
     *
     *     CUB_RUNTIME_FUNCTION __forceinline__
     *     LessThan(int compare) : compare(compare) {}
     *
     *     CUB_RUNTIME_FUNCTION __forceinline__
     *     bool operator()(const int &a) const {
     *         return (a < compare);
     *     }
     * };
     *
     * // Declare, allocate, and initialize device-accessible pointers for input and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * LessThan select_op(7);
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
     *
     * // d_out                 <-- [0, 2, 3, 5, 2, 8, 81, 9]
     * // d_num_selected_out    <-- [5]
     *
     * \endcode
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     * \tparam SelectOp             <b>[inferred]</b> Selection functor type having member <tt>bool operator()(const T &a)</tt>
     */
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename NumSelectedIteratorT,
        typename SelectOp>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t If(
        void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int num_items, ///< [in] Total number of items to select from
        SelectOp select_op, ///< [in] Unary selection operator
        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        typedef int OffsetT; // Signed integer type for global offsets
        typedef NullType* FlagIterator; // FlagT iterator type (not used)
        typedef NullType EqualityOp; // Equality operator (not used)
        // Final template argument (KEEP_REJECTS = true) selects the partitioning
        // variant: rejected items are compacted into the rear of d_out.
        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            select_op,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
};
/**
* \example example_device_partition_flagged.cu
xgboost/cub/cub/device/device_select.cuh view on Meta::CPAN
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory.
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include "dispatch/dispatch_select_if.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. 
* \ingroup SingleModule
*
* \par Overview
* These operations apply a selection criterion to selectively copy
* items from a specified input sequence to a compact output sequence.
*
* \par Usage Considerations
* \cdp_class{DeviceSelect}
*
* \par Performance
* \linear_performance{select-flagged, select-if, and select-unique}
*
* \par
* The following chart illustrates DeviceSelect::If
* performance across different CUDA architectures for \p int32 items,
* where 50% of the items are randomly selected.
*
* \image html select_if_int32_50_percent.png
*
* \par
* The following chart illustrates DeviceSelect::Unique
* performance across different CUDA architectures for \p int32 items
* where segments have lengths uniformly sampled from [1,1000].
*
* \image html select_unique_int32_len_500.png
*
* \par
* \plots_below
*
*/
struct DeviceSelect
{
    /**
     * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out.
     *
     * \par
     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
     * - \devicestorage
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
     *
     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // d_out                 <-- [1, 4, 6, 7]
     * // d_num_selected_out    <-- [4]
     *
     * \endcode
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     */
    template <
        typename InputIteratorT,
        typename FlagIterator,
        typename OutputIteratorT,
        typename NumSelectedIteratorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
        FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        typedef int OffsetT; // Signed integer type for global offsets
        typedef NullType SelectOp; // Selection op (not used)
        typedef NullType EqualityOp; // Equality operator (not used)
        // Final template argument (KEEP_REJECTS = false) selects the compaction
        // variant: only selected items are written to d_out.
        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
    /**
     * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out.
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
     * - \devicestorage
     *
     * \par Performance
     * The following charts illustrate saturated select-if performance across different
     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
     * selected with 50% probability.
     *
     * \image html select_if_int32_50_percent.png
     * \image html select_if_int64_50_percent.png
     *
     * \par
     * The following charts are similar, but 5% selection probability:
     *
     * \image html select_if_int32_5_percent.png
     * \image html select_if_int64_5_percent.png
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
     *
     * // Functor type for selecting values less than some criteria
     * struct LessThan
     * {
     *     int compare;
     *
     *     CUB_RUNTIME_FUNCTION __forceinline__
     *     LessThan(int compare) : compare(compare) {}
     *
     *     CUB_RUNTIME_FUNCTION __forceinline__
     *     bool operator()(const int &a) const {
     *         return (a < compare);
     *     }
     * };
     *
     * // Declare, allocate, and initialize device-accessible pointers for input and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * LessThan select_op(7);
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
     *
     * // d_out                 <-- [0, 2, 3, 5, 2]
     * // d_num_selected_out    <-- [5]
     *
     * \endcode
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     * \tparam SelectOp             <b>[inferred]</b> Selection operator type having member <tt>bool operator()(const T &a)</tt>
     */
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename NumSelectedIteratorT,
        typename SelectOp>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t If(
        void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
        SelectOp select_op, ///< [in] Unary selection operator
        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        typedef int OffsetT; // Signed integer type for global offsets
        typedef NullType* FlagIterator; // FlagT iterator type (not used)
        typedef NullType EqualityOp; // Equality operator (not used)
        // Final template argument (KEEP_REJECTS = false) selects the compaction
        // variant: only selected items are written to d_out.
        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            select_op,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
    /**
     * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out.
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
     * - \devicestorage
     *
     * \par Snippet
     * The code snippet below illustrates the run-length de-duplication of an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
     *
     * // Declare, allocate, and initialize device-accessible pointers for input and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
     *
     * // d_out                 <-- [0, 2, 9, 5, 8]
     * // d_num_selected_out    <-- [5]
     *
     * \endcode
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     */
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename NumSelectedIteratorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Unique(
        void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        typedef int OffsetT; // Signed integer type for global offsets
        typedef NullType* FlagIterator; // FlagT iterator type (not used)
        typedef NullType SelectOp; // Selection op (not used)
        typedef Equality EqualityOp; // Default == operator flags the first item of each run
        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
};
/**
* \example example_device_select_flagged.cu
* \example example_device_select_if.cu
* \example example_device_select_unique.cu
*/
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
num_items,
begin_bit,
end_bit);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Update selector
d_keys.selector ^= 1;
d_values.selector ^= 1;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
//------------------------------------------------------------------------------
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
if (CubDebug(error = InvokePass(
d_keys.Current(), d_keys_remaining_passes.Current(),
d_values.Current(), d_values_remaining_passes.Current(),
d_spine, spine_length, current_bit,
(current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
// Run remaining passes
while (current_bit < end_bit)
{
if (CubDebug(error = InvokePass(
d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
d_spine, spine_length, current_bit,
(current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;;
// Invert selectors
d_keys_remaining_passes.selector ^= 1;
d_values_remaining_passes.selector ^= 1;
}
// Update selector
if (!is_overwrite_okay) {
num_passes = 1; // Sorted data always ends up in the other vector
}
d_keys.selector = (d_keys.selector + num_passes) & 1;
d_values.selector = (d_values.selector + num_passes) & 1;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
//------------------------------------------------------------------------------
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
if (CubDebug(error = InvokePass(
d_keys.Current(), d_keys_remaining_passes.Current(),
d_values.Current(), d_values_remaining_passes.Current(),
current_bit,
(current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
// Run remaining passes
while (current_bit < end_bit)
{
if (CubDebug(error = InvokePass(
d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
current_bit,
(current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
// Invert selectors and update current bit
d_keys_remaining_passes.selector ^= 1;
d_values_remaining_passes.selector ^= 1;
}
// Update selector
if (!is_overwrite_okay) {
num_passes = 1; // Sorted data always ends up in the other vector
}
d_keys.selector = (d_keys.selector + num_passes) & 1;
d_values.selector = (d_values.selector + num_passes) & 1;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
//------------------------------------------------------------------------------
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh view on Meta::CPAN
__global__ void DeviceReduceByKeyKernel(
KeysInputIteratorT d_keys_in, ///< Pointer to the input sequence of keys
UniqueOutputIteratorT d_unique_out, ///< Pointer to the output sequence of unique keys (one key per run)
ValuesInputIteratorT d_values_in, ///< Pointer to the input sequence of corresponding values
AggregatesOutputIteratorT d_aggregates_out, ///< Pointer to the output sequence of value aggregates (one aggregate per run)
NumRunsOutputIteratorT d_num_runs_out, ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out)
ScanTileStateT tile_state, ///< Tile status interface
int start_tile, ///< The starting tile for the current grid
EqualityOpT equality_op, ///< KeyT equality operator
ReductionOpT reduction_op, ///< ValueT reduction operator
OffsetT num_items) ///< Total number of items to select from
{
// Thread block type for reducing tiles of value segments
typedef AgentReduceByKey<
AgentReduceByKeyPolicyT,
KeysInputIteratorT,
UniqueOutputIteratorT,
ValuesInputIteratorT,
AggregatesOutputIteratorT,
NumRunsOutputIteratorT,
EqualityOpT,
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh view on Meta::CPAN
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys
UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run)
ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values
AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
EqualityOpT equality_op, ///< [in] KeyT equality operator
ReductionOpT reduction_op, ///< [in] ValueT reduction operator
OffsetT num_items, ///< [in] Total number of items to select from
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
int /*ptx_version*/, ///< [in] PTX version of dispatch kernels
ScanInitKernelT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
ReduceByKeyKernelT reduce_by_key_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel
KernelConfig reduce_by_key_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for
{
#ifndef CUB_RUNTIME_ENABLED
(void)d_temp_storage;
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh view on Meta::CPAN
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys
UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run)
ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values
AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
EqualityOpT equality_op, ///< [in] KeyT equality operator
ReductionOpT reduction_op, ///< [in] ValueT reduction operator
OffsetT num_items, ///< [in] Total number of items to select from
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p fals...
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
xgboost/cub/cub/device/dispatch/dispatch_rle.cuh view on Meta::CPAN
namespace cub {
/******************************************************************************
* Kernel entry points
*****************************************************************************/
/**
 * RLE sweep kernel entry point (multi-block)
 *
 * Identifies runs of consecutive equal-valued items in the input sequence
 * and records the offset and length of each run
 */
template <
typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type
typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator
typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator
typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator
typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator
typename ScanTileStateT, ///< Tile status interface type
typename EqualityOpT, ///< T equality operator type
typename OffsetT> ///< Signed integer type for global offsets
xgboost/cub/cub/device/dispatch/dispatch_rle.cuh view on Meta::CPAN
__global__ void DeviceRleSweepKernel(
InputIteratorT d_in, ///< [in] Pointer to input sequence of data items
OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets
LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
ScanTileStateT tile_status, ///< [in] Tile status interface
EqualityOpT equality_op, ///< [in] Equality operator for input items
OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
int num_tiles) ///< [in] Total number of tiles for the entire problem
{
// Thread block type for selecting data from input tiles
typedef AgentRle<
AgentRlePolicyT,
InputIteratorT,
OffsetsOutputIteratorT,
LengthsOutputIteratorT,
EqualityOpT,
OffsetT> AgentRleT;
// Shared memory for AgentRle
__shared__ typename AgentRleT::TempStorage temp_storage;
xgboost/cub/cub/device/dispatch/dispatch_scan.cuh view on Meta::CPAN
{
// Initialize tile status
tile_state.InitializeStatus(num_tiles);
}
/**
 * Initialization kernel for tile status initialization (multi-block)
 */
template <
    typename                ScanTileStateT,         ///< Tile status interface type
    typename                NumSelectedIteratorT>   ///< Output iterator type for recording the number of items selected
__global__ void DeviceCompactInitKernel(
    ScanTileStateT          tile_state,             ///< [in] Tile status interface
    int                     num_tiles,              ///< [in] Number of tiles
    NumSelectedIteratorT    d_num_selected_out)     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
{
    // Reset the status word of every tile descriptor
    tile_state.InitializeStatus(num_tiles);

    // Exactly one thread (thread 0 of block 0) zeroes the selection count
    if ((threadIdx.x == 0) && (blockIdx.x == 0))
    {
        *d_num_selected_out = 0;
    }
}
/**
* Scan kernel entry point (multi-block)
*/
template <
typename ScanPolicyT, ///< Parameterized ScanPolicyT tuning policy type
typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator
typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory.
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include "dispatch_scan.cuh"
#include "../../agent/agent_select_if.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Kernel entry points
*****************************************************************************/
/**
 * Select kernel entry point (multi-block)
 *
 * Performs functor-based selection if SelectOpT functor type != NullType
 * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
 * Otherwise performs discontinuity selection (keep unique)
 */
template <
    typename            AgentSelectIfPolicyT,       ///< Parameterized AgentSelectIfPolicyT tuning policy type
    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items
    typename            FlagsInputIteratorT,        ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
    typename            SelectedOutputIteratorT,    ///< Random-access output iterator type for writing selected items
    typename            NumSelectedIteratorT,       ///< Output iterator type for recording the number of items selected
    typename            ScanTileStateT,             ///< Tile status interface type
    typename            SelectOpT,                  ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
    typename            EqualityOpT,                ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
    typename            OffsetT,                    ///< Signed integer type for global offsets
    bool                KEEP_REJECTS>               ///< Whether or not we push rejected items to the back of the output
__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS))
__global__ void DeviceSelectSweepKernel(
    InputIteratorT          d_in,                   ///< [in] Pointer to the input sequence of data items
    FlagsInputIteratorT     d_flags,                ///< [in] Pointer to the input sequence of selection flags (if applicable)
    SelectedOutputIteratorT d_selected_out,         ///< [out] Pointer to the output sequence of selected data items
    NumSelectedIteratorT    d_num_selected_out,     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
    ScanTileStateT          tile_status,            ///< [in] Tile status interface
    SelectOpT               select_op,              ///< [in] Selection operator
    EqualityOpT             equality_op,            ///< [in] Equality operator
    OffsetT                 num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
    int                     num_tiles)              ///< [in] Total number of tiles for the entire problem
{
    // Parameterized thread-block abstraction that performs the selection pass
    typedef AgentSelectIf<
        AgentSelectIfPolicyT,
        InputIteratorT,
        FlagsInputIteratorT,
        SelectedOutputIteratorT,
        SelectOpT,
        EqualityOpT,
        OffsetT,
        KEEP_REJECTS>
        AgentSelectIfT;

    // Per-block shared memory for the agent
    __shared__ typename AgentSelectIfT::TempStorage temp_storage;

    // Construct the agent and have it consume this block's share of input tiles
    AgentSelectIfT agent(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items);
    agent.ConsumeRange(
        num_tiles,
        tile_status,
        d_num_selected_out);
}
/******************************************************************************
* Dispatch
******************************************************************************/
/**
* Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
*/
template <
typename InputIteratorT, ///< Random-access input iterator type for reading input items
typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items
typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected
typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
typename OffsetT, ///< Signed integer type for global offsets
bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output
struct DispatchSelectIf
{
/******************************************************************************
* Types and constants
******************************************************************************/
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
* Utilities
******************************************************************************/
/**
 * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
 */
template <typename KernelConfig>
CUB_RUNTIME_FUNCTION __forceinline__
static void InitConfigs(
    int             ptx_version,        ///< [in] PTX version to select a tuning policy for
    KernelConfig    &select_if_config)  ///< [out] Kernel dispatch configuration to initialize
{
#if (CUB_PTX_ARCH > 0)
    (void)ptx_version;

    // Device-side: the compile-time PTX policy is the only one available
    select_if_config.template Init<PtxSelectIfPolicyT>();
#else
    // Host-side: select the tuning policy matching the device's PTX version
    // (ordered from newest architecture to oldest)
    if      (ptx_version >= 350)    select_if_config.template Init<typename Policy350::SelectIfPolicyT>();
    else if (ptx_version >= 300)    select_if_config.template Init<typename Policy300::SelectIfPolicyT>();
    else if (ptx_version >= 200)    select_if_config.template Init<typename Policy200::SelectIfPolicyT>();
    else if (ptx_version >= 130)    select_if_config.template Init<typename Policy130::SelectIfPolicyT>();
    else                            select_if_config.template Init<typename Policy100::SelectIfPolicyT>();
#endif
}
/**
* Kernel dispatch configuration.
*/
struct KernelConfig
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
tile_items = block_threads * items_per_thread;
}
};
/******************************************************************************
* Dispatch entrypoints
******************************************************************************/
/**
* Internal dispatch routine for computing a device-wide selection using the
* specified kernel functions.
*/
template <
typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel
typename SelectIfKernelPtrT> ///< Function type of cub::SelectIfKernelPtrT
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable)
SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items
NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
SelectOpT select_op, ///< [in] Selection operator
EqualityOpT equality_op, ///< [in] Equality operator
OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p fals...
int /*ptx_version*/, ///< [in] PTX version of dispatch kernels
ScanInitKernelPtrT scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
SelectIfKernelPtrT select_if_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
KernelConfig select_if_config) ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for
{
#ifndef CUB_RUNTIME_ENABLED
(void)d_temp_storage;
(void)temp_storage_bytes;
(void)d_in;
(void)d_flags;
(void)d_selected_out;
(void)d_num_selected_out;
(void)select_op;
(void)equality_op;
(void)num_items;
(void)stream;
(void)debug_synchronous;
(void)scan_init_kernel;
(void)select_if_kernel;
(void)select_if_config;
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);
#else
cudaError error = cudaSuccess;
do
{
// Get device ordinal
int device_ordinal;
if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
// Get SM count
int sm_count;
if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
// Number of input tiles
int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
int num_tiles = (num_items + tile_size - 1) / tile_size;
// Specify temporary storage allocation requirements
size_t allocation_sizes[1];
if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors
// Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
void* allocations[1];
if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
if (d_temp_storage == NULL)
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
// Log scan_init_kernel configuration
int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
// Invoke scan_init_kernel to initialize tile descriptors
scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
tile_status,
num_tiles,
d_num_selected_out);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Return if empty problem
if (num_items == 0)
break;
// Get SM occupancy for select_if_kernel
int range_select_sm_occupancy;
if (CubDebug(error = MaxSmOccupancy(
range_select_sm_occupancy, // out
select_if_kernel,
select_if_config.block_threads))) break;
// Get max x-dimension of grid
int max_dim_x;
if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
// Get grid size for scanning tiles
dim3 scan_grid_size;
scan_grid_size.z = 1;
scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
// Log select_if_kernel configuration
if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);
// Invoke select_if_kernel
select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
d_in,
d_flags,
d_selected_out,
d_num_selected_out,
tile_status,
select_op,
equality_op,
num_items,
num_tiles);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
/**
 * Internal dispatch routine: determines the PTX version, selects the matching
 * tuning configuration, and forwards to the kernel-pointer Dispatch overload.
 */
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
    void*                       d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t&                     temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
    InputIteratorT              d_in,                   ///< [in] Pointer to the input sequence of data items
    FlagsInputIteratorT         d_flags,                ///< [in] Pointer to the input sequence of selection flags (if applicable)
    SelectedOutputIteratorT     d_selected_out,         ///< [in] Pointer to the output sequence of selected data items
    NumSelectedIteratorT        d_num_selected_out,     ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
    SelectOpT                   select_op,              ///< [in] Selection operator
    EqualityOpT                 equality_op,            ///< [in] Equality operator
    OffsetT                     num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
    cudaStream_t                stream,                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
    bool                        debug_synchronous)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
{
    cudaError status = cudaSuccess;
    do
    {
        // Determine the PTX version the kernels will run under
        int ptx_version;
#if (CUB_PTX_ARCH == 0)
        // Host path: query the runtime for the PTX version
        if (CubDebug(status = PtxVersion(ptx_version))) break;
#else
        // Device path: the PTX version is fixed at compile time
        ptx_version = CUB_PTX_ARCH;
#endif

        // Select the kernel dispatch configuration matching this PTX version
        KernelConfig select_if_config;
        InitConfigs(ptx_version, select_if_config);

        // Forward to the workhorse dispatch routine with concrete kernel pointers
        status = Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_selected_out,
            d_num_selected_out,
            select_op,
            equality_op,
            num_items,
            stream,
            debug_synchronous,
            ptx_version,
            DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
            DeviceSelectSweepKernel<
                PtxSelectIfPolicyT,
                InputIteratorT,
                FlagsInputIteratorT,
                SelectedOutputIteratorT,
                NumSelectedIteratorT,
                ScanTileStateT,
                SelectOpT,
                EqualityOpT,
                OffsetT,
                KEEP_REJECTS>,
            select_if_config);
        if (CubDebug(status)) break;
    }
    while (0);

    return status;
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh view on Meta::CPAN
template <
typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type
typename PairsInputIteratorT, ///< Random-access input iterator type for keys
typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
typename OffsetT, ///< Signed integer type for global offsets
typename ScanTileStateT> ///< Tile status interface type
__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
__global__ void DeviceSegmentFixupKernel(
PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block
AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates
OffsetT num_items, ///< [in] Total number of items to select from
int num_tiles, ///< [in] Total number of tiles for the entire problem
ScanTileStateT tile_state) ///< [in] Tile status interface
{
// Thread block type for reducing tiles of value segments
typedef AgentSegmentFixup<
AgentSegmentFixupPolicyT,
PairsInputIteratorT,
AggregatesOutputIteratorT,
cub::Equality,
cub::Sum,
xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh view on Meta::CPAN
template <
typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type
typename PairsInputIteratorT, ///< Random-access input iterator type for keys
typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
typename OffsetT, ///< Signed integer type for global offsets
typename ScanTileStateT> ///< Tile status interface type
__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
__global__ void DeviceSegmentFixupKernel(
PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block
AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates
OffsetT num_items, ///< [in] Total number of items to select from
int num_tiles, ///< [in] Total number of tiles for the entire problem
ScanTileStateT tile_state) ///< [in] Tile status interface
{
// Thread block type for reducing tiles of value segments
typedef AgentSegmentFixup<
AgentSegmentFixupPolicyT,
PairsInputIteratorT,
AggregatesOutputIteratorT,
cub::Equality,
cub::Sum,
xgboost/cub/cub/util_ptx.cuh view on Meta::CPAN
return x;
}
/**
 * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later.
 *
 * \par
 * The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
 * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes
 * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
 * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0}
 *
 * \par Snippet
 * The code snippet below illustrates byte-permute.
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     int a        = 0x03020100;
 *     int b        = 0x07060504;
 *     int index    = 0x00007531;
 *
 *     int selected = PRMT(a, b, index);    // 0x07050301
 *
 * \endcode
 *
 */
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
{
    int result;
    // Single PTX byte-permute instruction; the selector nibbles in index
    // pick which source bytes of {b, a} land in each destination byte
    asm volatile("prmt.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(a), "r"(b), "r"(index));
    return result;
}
xgboost/cub/cub/util_type.cuh view on Meta::CPAN
* @{
*/
/******************************************************************************
* Type equality
******************************************************************************/
/**
 * \brief Type selection (<tt>IF ? ThenType : ElseType</tt>)
 *
 * Primary template: yields \p ThenType when \p IF is true.  A partial
 * specialization (defined elsewhere in this header) yields \p ElseType
 * when \p IF is false.
 */
template <bool IF, typename ThenType, typename ElseType>
struct If
{
    /// Conditional type result
    typedef ThenType Type;      // true
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
xgboost/cub/cub/util_type.cuh view on Meta::CPAN
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
 * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth.
 *
 * Multi-pass computations often "ping-pong" between two buffers, reading
 * from one and writing to the other on each pass and then swapping roles.
 * This structure bundles such a buffer pair with a "selector" member that
 * tracks which of the two is currently the valid one.
 */
template <typename T>
struct DoubleBuffer
{
    /// Pair of device buffer pointers
    T *d_buffers[2];

    /// Selector into \p d_buffers (i.e., the active/valid buffer)
    int selector;

    /// \brief Constructor (both buffers empty; buffer 0 is selected)
    __host__ __device__ __forceinline__ DoubleBuffer()
    {
        d_buffers[0]    = NULL;
        d_buffers[1]    = NULL;
        selector        = 0;
    }

    /// \brief Constructor (buffer 0 holds \p d_current and is selected)
    __host__ __device__ __forceinline__ DoubleBuffer(
        T *d_current,         ///< The currently valid buffer
        T *d_alternate)       ///< Alternate storage buffer of the same size as \p d_current
    {
        d_buffers[0]    = d_current;
        d_buffers[1]    = d_alternate;
        selector        = 0;
    }

    /// \brief Return pointer to the currently valid buffer
    __host__ __device__ __forceinline__ T* Current()
    {
        return d_buffers[selector];
    }

    /// \brief Return pointer to the currently invalid buffer
    __host__ __device__ __forceinline__ T* Alternate()
    {
        return d_buffers[selector ^ 1];
    }
};
/******************************************************************************
* Typedef-detection
******************************************************************************/
xgboost/cub/examples/device/Makefile view on Meta::CPAN
DEPS = $(CUB_DEPS) \
$(CUB_DIR)test/Makefile \
$(CUB_DIR)test/test_util.h \
$(CUB_DIR)test/mersenne.h \
ALL = example_device_partition_flagged \
example_device_partition_if \
example_device_radix_sort \
example_device_reduce \
example_device_scan \
example_device_select_unique \
example_device_select_flagged \
example_device_select_if \
example_device_sort_find_non_trivial_runs
#-------------------------------------------------------------------------------
# make default
#-------------------------------------------------------------------------------
default:
xgboost/cub/examples/device/Makefile view on Meta::CPAN
#-------------------------------------------------------------------------------
example_device_radix_sort: bin/example_device_radix_sort_$(BIN_SUFFIX)
bin/example_device_radix_sort_$(BIN_SUFFIX) : example_device_radix_sort.cu $(DEPS)
mkdir -p bin
$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_radix_sort_$(BIN_SUFFIX) example_device_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
#-------------------------------------------------------------------------------
# make example_device_select_unique
#-------------------------------------------------------------------------------
example_device_select_unique: bin/example_device_select_unique_$(BIN_SUFFIX)
bin/example_device_select_unique_$(BIN_SUFFIX) : example_device_select_unique.cu $(DEPS)
mkdir -p bin
$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_unique_$(BIN_SUFFIX) example_device_select_unique.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
#-------------------------------------------------------------------------------
# make example_device_select_flagged
#-------------------------------------------------------------------------------
example_device_select_flagged: bin/example_device_select_flagged_$(BIN_SUFFIX)
bin/example_device_select_flagged_$(BIN_SUFFIX) : example_device_select_flagged.cu $(DEPS)
mkdir -p bin
$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_flagged_$(BIN_SUFFIX) example_device_select_flagged.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
#-------------------------------------------------------------------------------
# make example_device_select_if
#-------------------------------------------------------------------------------
example_device_select_if: bin/example_device_select_if_$(BIN_SUFFIX)
bin/example_device_select_if_$(BIN_SUFFIX) : example_device_select_if.cu $(DEPS)
mkdir -p bin
$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_if_$(BIN_SUFFIX) example_device_select_if.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
#-------------------------------------------------------------------------------
# make example_device_sort_find_non_trivial_runs
#-------------------------------------------------------------------------------
example_device_sort_find_non_trivial_runs: bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX)
bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX) : example_device_sort_find_non_trivial_runs.cu $(DEPS)
mkdir -p bin
xgboost/cub/examples/device/example_device_partition_flagged.cu view on Meta::CPAN
/**
 * Compute the reference solution on the host: selected (flagged) items are
 * compacted to the front of h_reference in order; rejected items are placed
 * at the back in reverse order (matching DevicePartition semantics).
 * Returns the number of selected items.
 */
int Solve(
    int             *h_in,
    unsigned char   *h_flags,
    int             *h_reference,
    int             num_items)
{
    int num_selected = 0;
    for (int i = 0; i < num_items; ++i)
    {
        if (h_flags[i])
            h_reference[num_selected++] = h_in[i];                          // compact to front
        else
            h_reference[num_items - (i - num_selected) - 1] = h_in[i];      // reversed at back
    }
    return num_selected;
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
xgboost/cub/examples/device/example_device_partition_flagged.cu view on Meta::CPAN
// Initialize device
CubDebugExit(args.DeviceInit());
// Allocate host arrays
int *h_in = new int[num_items];
int *h_reference = new int[num_items];
unsigned char *h_flags = new unsigned char[num_items];
// Initialize problem and solution
Initialize(h_in, h_flags, num_items, max_segment);
int num_selected = Solve(h_in, h_flags, h_reference, num_items);
printf("cub::DevicePartition::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n",
num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
fflush(stdout);
// Allocate problem device arrays
int *d_in = NULL;
unsigned char *d_flags = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array and num selected
int *d_out = NULL;
int *d_num_selected_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
CubDebugExit(DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
printf("\t Data %s ", compare ? "FAIL" : "PASS");
compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s ", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
printf("\n\n");
return 0;
}
xgboost/cub/examples/device/example_device_partition_if.cu view on Meta::CPAN
*
******************************************************************************/
/******************************************************************************
* Simple example of DevicePartition::If().
*
* Partitions items from a sequence of int keys using a
* selection functor (greater-than)
*
* To compile using the command line:
* nvcc -arch=sm_XX example_device_partition_if.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <cub/util_allocator.cuh>
#include <cub/device/device_partition.cuh>
xgboost/cub/examples/device/example_device_partition_if.cu view on Meta::CPAN
*/
void Initialize(
int *h_in,
int num_items,
int max_segment)
{
int key = 0;
int i = 0;
while (i < num_items)
{
// Randomly select number of repeating occurrences uniformly from [1..max_segment]
unsigned short max_short = (unsigned short) -1;
unsigned short repeat;
RandomBits(repeat);
repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
repeat = CUB_MAX(1, repeat);
int j = i;
while (j < CUB_MIN(i + repeat, num_items))
{
h_in[j] = key;
xgboost/cub/examples/device/example_device_partition_if.cu view on Meta::CPAN
}
}
/**
 * Compute the reference solution for the partition problem.
 *
 * Items satisfying select_op are compacted to the front of h_reference
 * in input order; rejected items fill the tail back-to-front (i.e. in
 * reverse input order), matching cub::DevicePartition output ordering.
 *
 * Returns the number of selected items.
 */
template <typename SelectOp>
int Solve(
    int         *h_in,
    SelectOp    select_op,
    int         *h_reference,
    int         num_items)
{
    int selected = 0;
    for (int idx = 0; idx < num_items; ++idx)
    {
        int item = h_in[idx];
        if (!select_op(item))
        {
            // Rejected item: place into the tail, growing backwards
            h_reference[num_items - (idx - selected) - 1] = item;
        }
        else
        {
            // Selected item: compact to the front
            h_reference[selected++] = item;
        }
    }
    return selected;
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
xgboost/cub/examples/device/example_device_partition_if.cu view on Meta::CPAN
// DevicePartition a pivot index
unsigned int pivot_index;
unsigned int max_int = (unsigned int) -1;
RandomBits(pivot_index);
pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int))));
printf("Pivot idx: %d\n", pivot_index); fflush(stdout);
// Initialize problem and solution
Initialize(h_in, num_items, max_segment);
GreaterThan select_op(h_in[pivot_index]);
int num_selected = Solve(h_in, select_op, h_reference, num_items);
printf("cub::DevicePartition::If %d items, %d selected (avg run length %d), %d-byte elements\n",
num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
fflush(stdout);
// Allocate problem device arrays
int *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array and num selected
int *d_out = NULL;
int *d_num_selected_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
CubDebugExit(DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
printf("\t Data %s ", compare ? "FAIL" : "PASS");
compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s ", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
xgboost/cub/examples/device/example_device_radix_sort.cu view on Meta::CPAN
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(int) * num_items));
// Allocate temporary storage
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Initialize device arrays
CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice));
// Run
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose);
printf("\t Compare keys (selector %d): %s\n", d_keys.selector, compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
printf("\t Compare values (selector %d): %s\n", d_values.selector, compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_keys) delete[] h_keys;
if (h_reference_keys) delete[] h_reference_keys;
if (h_values) delete[] h_values;
if (h_reference_values) delete[] h_reference_values;
if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
xgboost/cub/examples/device/example_device_select_flagged.cu view on Meta::CPAN
*
******************************************************************************/
/******************************************************************************
* Simple example of DeviceSelect::Flagged().
*
* Selects flagged items from a sequence of int keys using a
* corresponding sequence of unsigned char flags.
*
* To compile using the command line:
* nvcc -arch=sm_XX example_device_select_flagged.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <cub/util_allocator.cuh>
#include <cub/device/device_select.cuh>
#include "../../test/test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
xgboost/cub/examples/device/example_device_select_flagged.cu view on Meta::CPAN
/**
 * Compute the reference solution for flagged selection.
 *
 * Items whose corresponding flag is nonzero are compacted to the front
 * of h_reference in input order; unflagged items are written into the
 * tail back-to-front.
 *
 * Returns the number of flagged (selected) items.
 */
int Solve(
    int             *h_in,
    unsigned char   *h_flags,
    int             *h_reference,
    int             num_items)
{
    int selected = 0;
    for (int idx = 0; idx < num_items; ++idx)
    {
        int item = h_in[idx];
        if (h_flags[idx] == 0)
        {
            // Unflagged item: place into the tail, growing backwards
            h_reference[num_items - (idx - selected) - 1] = item;
        }
        else
        {
            // Flagged item: compact to the front
            h_reference[selected++] = item;
        }
    }
    return selected;
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
xgboost/cub/examples/device/example_device_select_flagged.cu view on Meta::CPAN
// Initialize device
CubDebugExit(args.DeviceInit());
// Allocate host arrays
int *h_in = new int[num_items];
int *h_reference = new int[num_items];
unsigned char *h_flags = new unsigned char[num_items];
// Initialize problem and solution
Initialize(h_in, h_flags, num_items, max_segment);
int num_selected = Solve(h_in, h_flags, h_reference, num_items);
printf("cub::DeviceSelect::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n",
num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
fflush(stdout);
// Allocate problem device arrays
int *d_in = NULL;
unsigned char *d_flags = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array and num selected
int *d_out = NULL;
int *d_num_selected_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
CubDebugExit(DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
printf("\t Data %s ", compare ? "FAIL" : "PASS");
compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s ", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
printf("\n\n");
return 0;
}
xgboost/cub/examples/device/example_device_select_if.cu view on Meta::CPAN
*
******************************************************************************/
/******************************************************************************
* Simple example of DeviceSelect::If().
*
* Selects items from a sequence of int keys using a
* selection functor (greater-than)
*
* To compile using the command line:
* nvcc -arch=sm_XX example_device_select_if.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <cub/util_allocator.cuh>
#include <cub/device/device_select.cuh>
#include "../../test/test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
xgboost/cub/examples/device/example_device_select_if.cu view on Meta::CPAN
*/
void Initialize(
int *h_in,
int num_items,
int max_segment)
{
int key = 0;
int i = 0;
while (i < num_items)
{
// Randomly select number of repeating occurrences uniformly from [1..max_segment]
unsigned short max_short = (unsigned short) -1;
unsigned short repeat;
RandomBits(repeat);
repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
repeat = CUB_MAX(1, repeat);
int j = i;
while (j < CUB_MIN(i + repeat, num_items))
{
h_in[j] = key;
xgboost/cub/examples/device/example_device_select_if.cu view on Meta::CPAN
}
}
/**
 * Compute the reference solution for predicate-based selection.
 *
 * Items for which select_op returns true are compacted to the front of
 * h_reference in input order; the remaining items are written into the
 * tail in reverse input order (the layout cub::DevicePartition/Select
 * examples compare against).
 *
 * Returns the number of selected items.
 */
template <typename SelectOp>
int Solve(
    int         *h_in,
    SelectOp    select_op,
    int         *h_reference,
    int         num_items)
{
    int kept = 0;
    int pos = 0;
    while (pos < num_items)
    {
        int item = h_in[pos];
        if (select_op(item))
        {
            // Passing item: compact to the front
            h_reference[kept] = item;
            ++kept;
        }
        else
        {
            // Failing item: fill the tail, growing backwards
            h_reference[num_items - (pos - kept) - 1] = item;
        }
        ++pos;
    }
    return kept;
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
xgboost/cub/examples/device/example_device_select_if.cu view on Meta::CPAN
// Select a pivot index
unsigned int pivot_index;
unsigned int max_int = (unsigned int) -1;
RandomBits(pivot_index);
pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int))));
printf("Pivot idx: %d\n", pivot_index); fflush(stdout);
// Initialize problem and solution
Initialize(h_in, num_items, max_segment);
GreaterThan select_op(h_in[pivot_index]);
int num_selected = Solve(h_in, select_op, h_reference, num_items);
printf("cub::DeviceSelect::If %d items, %d selected (avg run length %d), %d-byte elements\n",
num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
fflush(stdout);
// Allocate problem device arrays
int *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array and num selected
int *d_out = NULL;
int *d_num_selected_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
CubDebugExit(DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
printf("\t Data %s ", compare ? "FAIL" : "PASS");
compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s ", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
xgboost/cub/examples/device/example_device_select_unique.cu view on Meta::CPAN
*
******************************************************************************/
/******************************************************************************
* Simple example of DeviceSelect::Unique().
*
* Selects the first element from each run of identical values from a sequence
* of int keys.
*
* To compile using the command line:
* nvcc -arch=sm_XX example_device_select_unique.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <cub/util_allocator.cuh>
#include <cub/device/device_select.cuh>
#include "../../test/test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
xgboost/cub/examples/device/example_device_select_unique.cu view on Meta::CPAN
*/
void Initialize(
int *h_in,
int num_items,
int max_segment)
{
int key = 0;
int i = 0;
while (i < num_items)
{
// Randomly select number of repeating occurrences uniformly from [1..max_segment]
unsigned short max_short = (unsigned short) -1;
unsigned short repeat;
RandomBits(repeat);
repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
repeat = CUB_MAX(1, repeat);
int j = i;
while (j < CUB_MIN(i + repeat, num_items))
{
h_in[j] = key;
xgboost/cub/examples/device/example_device_select_unique.cu view on Meta::CPAN
/**
 * Compute the reference solution for the unique problem.
 *
 * Writes the first item of every run of identical consecutive values
 * from h_in into h_reference, preserving input order.
 *
 * Returns the number of unique (run-head) items.
 */
int Solve(
    int *h_in,
    int *h_reference,
    int num_items)
{
    int num_unique = 0;
    for (int i = 0; i < num_items; ++i)
    {
        // Keep an item iff it starts a new run (first item always does)
        if ((i == 0) || (h_in[i] != h_in[i - 1]))
        {
            h_reference[num_unique++] = h_in[i];
        }
    }
    return num_unique;
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
xgboost/cub/examples/device/example_device_select_unique.cu view on Meta::CPAN
// Initialize device
CubDebugExit(args.DeviceInit());
// Allocate host arrays
int* h_in = new int[num_items];
int* h_reference = new int[num_items];
// Initialize problem and solution
Initialize(h_in, num_items, max_segment);
int num_selected = Solve(h_in, h_reference, num_items);
printf("cub::DeviceSelect::Unique %d items (%d-byte elements), %d selected (avg run length %d)\n",
num_items, (int) sizeof(int), num_selected, num_items / num_selected);
fflush(stdout);
// Allocate problem device arrays
int *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array and num selected
int *d_out = NULL;
int *d_num_selected_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
printf("\t Data %s ", compare ? "FAIL" : "PASS");
compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s ", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Simple example of sorting a sequence of keys and values (each pair is a
* randomly-selected int32 paired with its original offset in the unsorted sequence), and then
* isolating all maximal, non-trivial (having length > 1) "runs" of duplicates.
*
* To compile using the command line:
* nvcc -arch=sm_XX example_device_sort_find_non_trivial_runs.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN
{
// Allocate and initialize device arrays for sorting
DoubleBuffer<Key> d_keys;
DoubleBuffer<Value> d_values;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(Key) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(Key) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(Value) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(Value) * num_items));
CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice));
// Start timer
gpu_timer.Start();
// Allocate temporary storage for sorting
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Do the sort
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
// Free unused buffers and sorting temporary storage
if (d_keys.d_buffers[d_keys.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector ^ 1]));
if (d_values.d_buffers[d_values.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector ^ 1]));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Start timer
gpu_rle_timer.Start();
// Allocate device arrays for enumerating non-trivial runs
int *d_offests_out = NULL;
int *d_lengths_out = NULL;
int *d_num_runs = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offests_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int) * 1));
// Allocate temporary storage for isolating non-trivial runs
d_temp_storage = NULL;
CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns(
d_temp_storage,
temp_storage_bytes,
d_keys.d_buffers[d_keys.selector],
d_offests_out,
d_lengths_out,
d_num_runs,
num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Do the isolation
CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns(
d_temp_storage,
temp_storage_bytes,
d_keys.d_buffers[d_keys.selector],
d_offests_out,
d_lengths_out,
d_num_runs,
num_items));
// Free keys buffer
if (d_keys.d_buffers[d_keys.selector]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector]));
//
// Hypothetically do stuff with the original key-indices corresponding to non-trivial runs of identical keys
//
// Stop sort timer
gpu_timer.Stop();
gpu_rle_timer.Stop();
if (i == 0)
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN
AssertEquals(0, compare);
}
else
{
elapsed_millis += gpu_timer.ElapsedMillis();
elapsed_rle_millis += gpu_rle_timer.ElapsedMillis();
}
// GPU cleanup
if (d_values.d_buffers[d_values.selector]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector]));
if (d_offests_out) CubDebugExit(g_allocator.DeviceFree(d_offests_out));
if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
}
// Host cleanup
if (h_keys) delete[] h_keys;
if (h_values) delete[] h_values;
if (h_offsets_reference) delete[] h_offsets_reference;