view release on metacpan or search on metacpan
xgboost/cub/README.md
xgboost/cub/common.mk
xgboost/cub/cub/agent/agent_histogram.cuh
xgboost/cub/cub/agent/agent_radix_sort_downsweep.cuh
xgboost/cub/cub/agent/agent_radix_sort_upsweep.cuh
xgboost/cub/cub/agent/agent_reduce.cuh
xgboost/cub/cub/agent/agent_reduce_by_key.cuh
xgboost/cub/cub/agent/agent_rle.cuh
xgboost/cub/cub/agent/agent_scan.cuh
xgboost/cub/cub/agent/agent_segment_fixup.cuh
xgboost/cub/cub/agent/agent_select_if.cuh
xgboost/cub/cub/agent/agent_spmv_csrt.cuh
xgboost/cub/cub/agent/agent_spmv_orig.cuh
xgboost/cub/cub/agent/agent_spmv_row_based.cuh
xgboost/cub/cub/agent/single_pass_scan_operators.cuh
xgboost/cub/cub/block/block_adjacent_difference.cuh
xgboost/cub/cub/block/block_discontinuity.cuh
xgboost/cub/cub/block/block_exchange.cuh
xgboost/cub/cub/block/block_histogram.cuh
xgboost/cub/cub/block/block_load.cuh
xgboost/cub/cub/block/block_radix_rank.cuh
xgboost/cub/cub/block/specializations/block_scan_warp_scans3.cuh
xgboost/cub/cub/cub.cuh
xgboost/cub/cub/device/device_histogram.cuh
xgboost/cub/cub/device/device_partition.cuh
xgboost/cub/cub/device/device_radix_sort.cuh
xgboost/cub/cub/device/device_reduce.cuh
xgboost/cub/cub/device/device_run_length_encode.cuh
xgboost/cub/cub/device/device_scan.cuh
xgboost/cub/cub/device/device_segmented_radix_sort.cuh
xgboost/cub/cub/device/device_segmented_reduce.cuh
xgboost/cub/cub/device/device_select.cuh
xgboost/cub/cub/device/device_spmv.cuh
xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh
xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
xgboost/cub/cub/device/dispatch/dispatch_rle.cuh
xgboost/cub/cub/device/dispatch/dispatch_scan.cuh
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh
xgboost/cub/cub/device/dispatch/dispatch_spmv_csrt.cuh
xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh
xgboost/cub/cub/grid/grid_barrier.cuh
xgboost/cub/cub/grid/grid_even_share.cuh
xgboost/cub/cub/grid/grid_mapping.cuh
xgboost/cub/cub/grid/grid_queue.cuh
xgboost/cub/cub/host/mutex.cuh
xgboost/cub/cub/iterator/arg_index_input_iterator.cuh
xgboost/cub/cub/iterator/cache_modified_input_iterator.cuh
xgboost/cub/examples/block/example_block_radix_sort.cu
xgboost/cub/examples/block/example_block_reduce.cu
xgboost/cub/examples/block/example_block_scan.cu
xgboost/cub/examples/block/reduce_by_key.cu
xgboost/cub/examples/device/Makefile
xgboost/cub/examples/device/example_device_partition_flagged.cu
xgboost/cub/examples/device/example_device_partition_if.cu
xgboost/cub/examples/device/example_device_radix_sort.cu
xgboost/cub/examples/device/example_device_reduce.cu
xgboost/cub/examples/device/example_device_scan.cu
xgboost/cub/examples/device/example_device_select_flagged.cu
xgboost/cub/examples/device/example_device_select_if.cu
xgboost/cub/examples/device/example_device_select_unique.cu
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu
xgboost/cub/experimental/Makefile
xgboost/cub/experimental/defunct/example_coo_spmv.cu
xgboost/cub/experimental/defunct/test_device_seg_reduce.cu
xgboost/cub/experimental/histogram/histogram_cub.h
xgboost/cub/experimental/histogram/histogram_gmem_atomics.h
xgboost/cub/experimental/histogram/histogram_smem_atomics.h
xgboost/cub/experimental/histogram_compare.cu
xgboost/cub/experimental/sparse_matrix.h
xgboost/cub/experimental/spmv_compare.cu
xgboost/cub/test/test_block_load_store.cu
xgboost/cub/test/test_block_radix_sort.cu
xgboost/cub/test/test_block_reduce.cu
xgboost/cub/test/test_block_scan.cu
xgboost/cub/test/test_device_histogram.cu
xgboost/cub/test/test_device_radix_sort.cu
xgboost/cub/test/test_device_reduce.cu
xgboost/cub/test/test_device_reduce_by_key.cu
xgboost/cub/test/test_device_run_length_encode.cu
xgboost/cub/test/test_device_scan.cu
xgboost/cub/test/test_device_select_if.cu
xgboost/cub/test/test_device_select_unique.cu
xgboost/cub/test/test_grid_barrier.cu
xgboost/cub/test/test_iterator.cu
xgboost/cub/test/test_util.h
xgboost/cub/test/test_warp_reduce.cu
xgboost/cub/test/test_warp_scan.cu
xgboost/cub/tune/Makefile
xgboost/cub/tune/tune_device_reduce.cu
xgboost/demo/README.md
xgboost/demo/binary_classification/README.md
xgboost/demo/binary_classification/agaricus-lepiota.data
xgboost/R-package/configure view on Meta::CPAN
cat >>confdefs.h <<_ACEOF
#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT"
_ACEOF
cat >>confdefs.h <<_ACEOF
#define PACKAGE_URL "$PACKAGE_URL"
_ACEOF
# Let the site file select an alternate cache file if it wants to.
# Prefer an explicitly selected file to automatically selected ones.
ac_site_file1=NONE
ac_site_file2=NONE
if test -n "$CONFIG_SITE"; then
# We do not want a PATH search for config.site.
case $CONFIG_SITE in #((
-*) ac_site_file1=./$CONFIG_SITE;;
*/*) ac_site_file1=$CONFIG_SITE;;
*) ac_site_file1=./$CONFIG_SITE;;
esac
elif test "x$prefix" != xNONE; then
xgboost/R-package/demo/caret_wrapper.R view on Meta::CPAN
require(xgboost)
require(data.table)
require(vcd)
require(e1071)
# Load Arthritis dataset in memory.
data(Arthritis)
# Create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance is really good).
df <- data.table(Arthritis, keep.rownames = F)
# Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, e...
# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treats them as independent values.
df[,AgeDiscret:= as.factor(round(Age/10,0))]
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you alre...
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[,ID:=NULL]
#-------------Basic Training using XGBoost in caret Library-----------------
xgboost/R-package/demo/create_sparse_matrix.R view on Meta::CPAN
df <- data.table(Arthritis, keep.rownames = F)
# Let's have a look to the data.table
cat("Print the dataset\n")
print(df)
# 2 columns have factor type, one has ordinal type (an ordinal variable is a categorical variable with values which can be ordered, here: None > Some > Marked).
cat("Structure of the dataset\n")
str(df)
# Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, e...
# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treats them as independent values.
df[,AgeDiscret:= as.factor(round(Age/10,0))]
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you alre...
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[,ID:=NULL]
xgboost/R-package/vignettes/discoverYourData.Rmd view on Meta::CPAN
For example, the column `Treatment` will be replaced by two columns, `TreatmentPlacebo`, and `TreatmentTreated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation wi...
Column `Improved` is excluded because it will be our `label` column, the one we want to predict.
```{r, warning=FALSE,message=FALSE}
sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[,-1]
head(sparse_matrix)
```
> Formula `Improved ~ .` used above means transform all *categorical* features but column `Improved` to binary values. The `-1` column selection removes the intercept column which is full of `1` (this column is generated by the conversion). For more ...
Create the output `numeric` vector (not as a sparse `Matrix`):
```{r}
output_vector = df[,Improved] == "Marked"
```
1. set `Y` vector to `0`;
2. set `Y` to `1` for rows where `Improved == Marked` is `TRUE` ;
3. return `Y` vector.
xgboost/R-package/vignettes/discoverYourData.Rmd view on Meta::CPAN
```{r}
importanceRaw <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst, data = sparse_matrix, label = output_vector)
# Cleaning for better display
importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
head(importanceClean)
```
> In the table above we have removed two unneeded columns and selected only the first lines.
First thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the tree. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature `Age` is used seve...
How is the split applied to count the co-occurrences? It is always `<`. For instance, in the second line, we measure the number of persons under 61.5 years with the illness gone after the treatment.
The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observations in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the whole populati...
Therefore, according to our findings, getting a placebo doesn't seem to help but being younger than 61 years may help (seems logic).
> You may wonder how to interpret the `< 1.00001` on the first line. Basically, in a sparse `Matrix`, there is no `0`, therefore, looking for one hot-encoded categorical observations validating the rule `< 1.00001` is like just looking for `1` for th...
xgboost/cub/CHANGE_LOG.TXT view on Meta::CPAN
- Fixed BlockScan bug where certain exclusive scans on custom data types for the BLOCK_SCAN_WARP_SCANS variant would return incorrect results for the first thread in the block
- Added workaround for TexRefInputIteratorT to work with CUDA 6.0
//-----------------------------------------------------------------------------
1.1.1 12/11/2013
- New features:
- Added TexObjInputIteratorT, TexRefInputIteratorT, CacheModifiedInputIteratorT, and CacheModifiedOutputIterator types for loading & storing arbitrary types through the cache hierarchy. Compatible with Thrust API.
- Added descending sorting to DeviceRadixSort and BlockRadixSort
- Added min, max, arg-min, and arg-max to DeviceReduce
- Added DeviceSelect (select-unique, select-if, and select-flagged)
- Added DevicePartition (partition-if, partition-flagged)
- Added generic cub::ShuffleUp(), cub::ShuffleDown(), and cub::ShuffleIndex() for warp-wide communication of arbitrary data types (SM3x+)
- Added cub::MaxSmOccupancy() for accurately determining SM occupancy for any given kernel function pointer
- Performance
- Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
- Interface changes:
- Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces. The CacheModifiedInputIteratorT and CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
- Rename device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
- Documentation and testing:
- Added simple examples of device-wide methods
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
/**
* \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
*/
template <
typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicy tuning policy type
typename KeysInputIteratorT, ///< Random-access input iterator type for keys
typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys
typename ValuesInputIteratorT, ///< Random-access input iterator type for values
typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of items selected
typename EqualityOpT, ///< KeyT equality operator type
typename ReductionOpT, ///< ValueT reduction operator type
typename OffsetT> ///< Signed integer type for global offsets
struct AgentReduceByKey
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// The input keys type
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
/**
* Scatter flagged items
*/
__device__ __forceinline__ void Scatter(
KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD],
OffsetT (&segment_flags)[ITEMS_PER_THREAD],
OffsetT (&segment_indices)[ITEMS_PER_THREAD],
OffsetT num_tile_segments,
OffsetT num_tile_segments_prefix)
{
// Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one
if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS))
{
ScatterTwoPhase(
scatter_items,
segment_flags,
segment_indices,
num_tile_segments,
num_tile_segments_prefix);
}
else
xgboost/cub/cub/agent/agent_reduce_by_key.cuh view on Meta::CPAN
OffsetT num_segments = num_segments_prefix + num_tile_segments;
// If the last tile is a whole tile, output the final_value
if (num_remaining == TILE_ITEMS)
{
d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1];
d_aggregates_out[num_segments] = total_aggregate;
num_segments++;
}
// Output the total number of items selected
*d_num_runs_out = num_segments;
}
}
/**
* Scan tiles of items as part of a dynamic chained scan
*/
__device__ __forceinline__ void ConsumeRange(
int num_items, ///< Total number of input items
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
d_in(d_in),
d_offsets_out(d_offsets_out),
d_lengths_out(d_lengths_out),
equality_op(equality_op),
scan_op(cub::Sum()),
num_items(num_items)
{}
//---------------------------------------------------------------------
// Utility methods for initializing the selections
//---------------------------------------------------------------------
template <bool FIRST_TILE, bool LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
OffsetT tile_offset,
OffsetT num_remaining,
T (&items)[ITEMS_PER_THREAD],
LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD])
{
bool head_flags[ITEMS_PER_THREAD];
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
thread_exclusive_in_warp,
identity,
scan_op);
// Last lane in each warp shares its warp-aggregate
if (lane_id == WARP_THREADS - 1)
temp_storage.warp_aggregates.Alias()[warp_id] = thread_inclusive;
CTA_SYNC();
// Accumulate total selected and the warp-wide prefix
warp_exclusive_in_tile = identity;
warp_aggregate = temp_storage.warp_aggregates.Alias()[warp_id];
tile_aggregate = temp_storage.warp_aggregates.Alias()[0];
#pragma unroll
for (int WARP = 1; WARP < WARPS; ++WARP)
{
if (warp_id == WARP)
warp_exclusive_in_tile = tile_aggregate;
tile_aggregate = scan_op(tile_aggregate, temp_storage.warp_aggregates.Alias()[WARP]);
}
}
//---------------------------------------------------------------------
// Utility methods for scattering selections
//---------------------------------------------------------------------
/**
* Two-phase scatter, specialized for warp time-slicing
*/
template <bool FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
OffsetT tile_num_runs_exclusive_in_global,
OffsetT warp_num_runs_aggregate,
OffsetT warp_num_runs_exclusive_in_tile,
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
// Return running total (inclusive of this tile)
return prefix_op.inclusive_prefix;
}
}
/**
* Scan tiles of items as part of a dynamic chained scan
*/
template <typename NumRunsIteratorT> ///< Output iterator type for recording number of items selected
__device__ __forceinline__ void ConsumeRange(
int num_tiles, ///< Total number of input tiles
ScanTileStateT& tile_status, ///< Global list of tile status
NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified
{
// Blocks are launched in increasing order, so just assign one tile per block
int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile
OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile)
xgboost/cub/cub/agent/agent_rle.cuh view on Meta::CPAN
// Not the last tile (full)
ConsumeTile<false>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
}
else if (num_remaining > 0)
{
// The last tile (possibly partially-full)
LengthOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
if (threadIdx.x == 0)
{
// Output the total number of items selected
*d_num_runs_out = running_total.key;
// The inclusive prefix contains accumulated length reduction for the last run
if (running_total.key > 0)
d_lengths_out[running_total.key - 1] = running_total.value;
}
}
}
};
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select.
*/
#pragma once
#include <iterator>
#include "single_pass_scan_operators.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection
*
* Performs functor-based selection if SelectOpT functor type != NullType
* Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
* Otherwise performs discontinuity selection (keep unique)
*/
template <
typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type
typename InputIteratorT, ///< Random-access input iterator type for selection items
typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items
typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
typename OffsetT, ///< Signed integer type for global offsets
bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output
struct AgentSelectIf
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
xgboost/cub/cub/agent/agent_select_if.cuh view on Meta::CPAN
// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
_TempStorage& temp_storage; ///< Reference to temp_storage
WrappedInputIteratorT d_in; ///< Input items
SelectedOutputIteratorT d_selected_out; ///< Unique output items
WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable)
InequalityWrapper<EqualityOpT> inequality_op; ///< T inequality operator
SelectOpT select_op; ///< Selection operator
OffsetT num_items; ///< Total number of input items
//---------------------------------------------------------------------
// Constructor
//---------------------------------------------------------------------
// Constructor
/// Binds the agent to its inputs/outputs and operators; performs no work itself.
__device__ __forceinline__
AgentSelectIf(
    TempStorage &temp_storage,              ///< Reference to temp_storage
    InputIteratorT d_in,                    ///< Input data
    FlagsInputIteratorT d_flags_in,         ///< Input selection flags (if applicable)
    SelectedOutputIteratorT d_selected_out, ///< Output data
    SelectOpT select_op,                    ///< Selection operator
    EqualityOpT equality_op,                ///< Equality operator
    OffsetT num_items)                      ///< Total number of input items
:
    temp_storage(temp_storage.Alias()),     // Alias() exposes the Uninitialized<_TempStorage> payload
    d_in(d_in),
    d_flags_in(d_flags_in),
    d_selected_out(d_selected_out),
    select_op(select_op),
    inequality_op(equality_op),             // equality_op is wrapped so discontinuity detection can test "not equal"
    num_items(num_items)
{}
//---------------------------------------------------------------------
// Utility methods for initializing the selections
//---------------------------------------------------------------------
/**
 * Initialize selections (specialized for selection operator)
 *
 * Applies the user-supplied select_op to every in-bounds item.  On a
 * partial last tile, out-of-bounds items are flagged as selected (1);
 * the caller later discounts exactly that overcount from the tile total.
 */
template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
    OffsetT                     /*tile_offset*/,
    OffsetT                     num_tile_items,
    OutputT                     (&items)[ITEMS_PER_THREAD],
    OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
    Int2Type<USE_SELECT_OP>     /*select_method*/)
{
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        // Blocked arrangement: this thread owns items [threadIdx.x * ITEMS_PER_THREAD, +ITEMS_PER_THREAD)
        bool in_bounds = (!IS_LAST_TILE) || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items);
        if (in_bounds)
            selection_flags[ITEM] = select_op(items[ITEM]);
        else
            selection_flags[ITEM] = 1;      // out-of-bounds items default to "selected"
    }
}
/**
 * Initialize selections (specialized for valid flags)
 *
 * Loads per-item flags from d_flags_in and copies them into
 * selection_flags.  On a partial last tile, out-of-bounds items receive
 * a default flag of 1 ("selected"); the caller discounts these later.
 */
template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
    OffsetT                     tile_offset,
    OffsetT                     num_tile_items,
    OutputT                     (&/*items*/)[ITEMS_PER_THREAD],
    OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
    Int2Type<USE_SELECT_FLAGS>  /*select_method*/)
{
    // NOTE(review): sync before the flag load — presumably because
    // load_flags aliases temp storage used by the preceding item load;
    // confirm against the _TempStorage union layout.
    CTA_SYNC();

    FlagT flags[ITEMS_PER_THREAD];

    if (IS_LAST_TILE)
    {
        // Guarded load: out-of-bounds items get the default flag value 1
        BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1);
    }
    else
    {
        BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags);
    }

    // Convert flag type to selection_flags type
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        selection_flags[ITEM] = flags[ITEM];
    }
}
/**
 * Initialize selections (specialized for discontinuity detection, i.e.
 * select-unique)
 *
 * Flags an item as selected when it differs from its predecessor.  The
 * first tile treats its very first item as a head; subsequent tiles read
 * the item just before the tile (d_in[tile_offset - 1]) so comparisons
 * are seamless across tile boundaries.
 */
template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
    OffsetT                     tile_offset,
    OffsetT                     num_tile_items,
    OutputT                     (&items)[ITEMS_PER_THREAD],
    OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
    Int2Type<USE_DISCONTINUITY> /*select_method*/)
{
    if (IS_FIRST_TILE)
    {
        // NOTE(review): sync presumably guards reuse of union'd temp
        // storage before the discontinuity pass — confirm layout.
        CTA_SYNC();

        // Set head selection_flags.  First tile sets the first flag for the first item
        BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op);
    }
    else
    {
        OutputT tile_predecessor;
        if (threadIdx.x == 0)
            tile_predecessor = d_in[tile_offset - 1];   // presumably only thread 0's value is consumed by FlagHeads — confirm

        CTA_SYNC();

        BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor);
    }

    // Set selection flags for out-of-bounds items
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        // Flag out-of-bounds items as selected (1); discounted by the caller
        if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
            selection_flags[ITEM] = 1;
    }
}
//---------------------------------------------------------------------
// Scatter utility methods
//---------------------------------------------------------------------
/**
 * Scatter flagged items to output offsets (specialized for direct scattering)
 *
 * Each thread writes its selected items straight to d_selected_out at the
 * offsets computed by the prefix scan.  On the last (partial) tile,
 * indices at or beyond num_selections belong to out-of-bounds items and
 * are suppressed.
 */
template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
__device__ __forceinline__ void ScatterDirect(
    OutputT (&items)[ITEMS_PER_THREAD],
    OffsetT (&selection_flags)[ITEMS_PER_THREAD],
    OffsetT (&selection_indices)[ITEMS_PER_THREAD],
    OffsetT num_selections)
{
    // Scatter flagged items
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        bool is_selected = (selection_flags[ITEM] != 0);
        bool index_valid = (!IS_LAST_TILE) || (selection_indices[ITEM] < num_selections);
        if (is_selected && index_valid)
        {
            d_selected_out[selection_indices[ITEM]] = items[ITEM];
        }
    }
}
/**
 * Scatter flagged items to output offsets (specialized for two-phase scattering)
 *
 * Compacting variant (selections only).  Phase 1: each thread deposits
 * its selected items into the exchange buffer at their tile-local rank.
 * Phase 2: the block copies the now-compacted run to d_selected_out with
 * block-strided writes.
 */
template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
    OutputT         (&items)[ITEMS_PER_THREAD],
    OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
    OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
    int             /*num_tile_items*/,         ///< Number of valid items in this tile
    int             num_tile_selections,        ///< Number of selections in this tile
    OffsetT         num_selections_prefix,      ///< Total number of selections prior to this tile
    OffsetT         /*num_rejected_prefix*/,    ///< Total number of rejections prior to this tile
    Int2Type<false> /*is_keep_rejects*/)        ///< Marker type indicating whether to keep rejected items in the second partition
{
    // NOTE(review): sync presumably guards reuse of the union'd exchange
    // buffer by earlier phases — confirm against _TempStorage layout.
    CTA_SYNC();

    // Compact and scatter items
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        // Global selection index -> rank within this tile
        int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix;
        if (selection_flags[ITEM])
        {
            temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
        }
    }

    CTA_SYNC();

    // Block-strided copy of the compacted selections out to global memory
    for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
    {
        d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
    }
}
/**
 * Scatter flagged items to output offsets (specialized for two-phase scattering)
 *
 * Partitioning variant (KEEP_REJECTS).  The tile is staged in shared
 * memory with rejected items first and selected items after; selections
 * are then written to the front of the output sequence while rejections
 * fill the output tail in reverse order.
 */
template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
    OutputT         (&items)[ITEMS_PER_THREAD],
    OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
    OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
    int             num_tile_items,         ///< Number of valid items in this tile
    int             num_tile_selections,    ///< Number of selections in this tile
    OffsetT         num_selections_prefix,  ///< Total number of selections prior to this tile
    OffsetT         num_rejected_prefix,    ///< Total number of rejections prior to this tile
    Int2Type<true>  /*is_keep_rejects*/)    ///< Marker type indicating whether to keep rejected items in the second partition
{
    CTA_SYNC();

    int tile_num_rejections = num_tile_items - num_tile_selections;

    // Scatter items to shared memory (rejections first)
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        int item_idx             = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;      // blocked arrangement (thread owns a contiguous run)
        int local_selection_idx  = selection_indices[ITEM] - num_selections_prefix;
        int local_rejection_idx  = item_idx - local_selection_idx;               // rank among this tile's rejections
        int local_scatter_offset = (selection_flags[ITEM]) ?
            tile_num_rejections + local_selection_idx :                          // selections placed after all rejections
            local_rejection_idx;

        temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
    }

    CTA_SYNC();

    // Gather items from shared memory and scatter to global
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        int     item_idx       = (ITEM * BLOCK_THREADS) + threadIdx.x;           // striped arrangement for the copy-out
        int     rejection_idx  = item_idx;
        int     selection_idx  = item_idx - tile_num_rejections;
        OffsetT scatter_offset = (item_idx < tile_num_rejections) ?
            num_items - num_rejected_prefix - rejection_idx - 1 :                // rejections fill the output tail, reversed
            num_selections_prefix + selection_idx;                               // selections fill the output head

        OutputT item = temp_storage.raw_exchange.Alias()[item_idx];

        // Skip out-of-bounds slots of a partial last tile
        if (!IS_LAST_TILE || (item_idx < num_tile_items))
        {
            d_selected_out[scatter_offset] = item;
        }
    }
}
/**
 * Scatter flagged items
 *
 * Chooses between direct scattering and two-phase (shared-memory)
 * scattering.  Two-phase is mandatory when both partitions are kept
 * (KEEP_REJECTS); otherwise it is used only when enabled and this tile
 * holds more selections than there are threads in the block.
 */
template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
__device__ __forceinline__ void Scatter(
    OutputT (&items)[ITEMS_PER_THREAD],
    OffsetT (&selection_flags)[ITEMS_PER_THREAD],
    OffsetT (&selection_indices)[ITEMS_PER_THREAD],
    int     num_tile_items,         ///< Number of valid items in this tile
    int     num_tile_selections,    ///< Number of selections in this tile
    OffsetT num_selections_prefix,  ///< Total number of selections prior to this tile
    OffsetT num_rejected_prefix,    ///< Total number of rejections prior to this tile
    OffsetT num_selections)         ///< Total number of selections including this tile
{
    bool use_two_phase = KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS));

    if (!use_two_phase)
    {
        ScatterDirect<IS_LAST_TILE, IS_FIRST_TILE>(
            items,
            selection_flags,
            selection_indices,
            num_selections);
    }
    else
    {
        ScatterTwoPhase<IS_LAST_TILE, IS_FIRST_TILE>(
            items,
            selection_flags,
            selection_indices,
            num_tile_items,
            num_tile_selections,
            num_selections_prefix,
            num_rejected_prefix,
            Int2Type<KEEP_REJECTS>());
    }
}
//---------------------------------------------------------------------
// Cooperatively scan a device-wide sequence of tiles with other CTAs
//---------------------------------------------------------------------
/**
 * Process first tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
 *
 * Unlike subsequent tiles, no prefix callback is needed here: the
 * exclusive selection prefix of tile 0 is zero.  The tile scans its own
 * flags, publishes its inclusive total to tile_state (for tile 1's
 * look-up), and scatters.
 */
template <bool IS_LAST_TILE>
__device__ __forceinline__ OffsetT ConsumeFirstTile(
    int             num_tile_items,     ///< Number of input items comprising this tile
    OffsetT         tile_offset,        ///< Tile offset
    ScanTileStateT& tile_state)         ///< Global tile state descriptor
{
    OutputT items[ITEMS_PER_THREAD];
    OffsetT selection_flags[ITEMS_PER_THREAD];
    OffsetT selection_indices[ITEMS_PER_THREAD];

    // Load items (guarded load if this tile is partial)
    if (IS_LAST_TILE)
        BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
    else
        BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);

    // Initialize selection_flags (dispatches on SELECT_METHOD: functor, flags, or discontinuity)
    InitializeSelections<true, IS_LAST_TILE>(
        tile_offset,
        num_tile_items,
        items,
        selection_flags,
        Int2Type<SELECT_METHOD>());

    CTA_SYNC();

    // Exclusive scan of selection_flags yields each item's output rank and the tile's total
    OffsetT num_tile_selections;
    BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections);

    if (threadIdx.x == 0)
    {
        // Update tile status if this is not the last tile
        if (!IS_LAST_TILE)
            tile_state.SetInclusive(0, num_tile_selections);
    }

    // Discount any out-of-bounds selections: InitializeSelections flagged
    // every out-of-bounds item with 1, so the overcount is exactly
    // TILE_ITEMS - num_tile_items
    if (IS_LAST_TILE)
        num_tile_selections -= (TILE_ITEMS - num_tile_items);

    // Scatter flagged items
    Scatter<IS_LAST_TILE, true>(
        items,
        selection_flags,
        selection_indices,
        num_tile_items,
        num_tile_selections,
        0,                      // no selections before the first tile
        0,                      // no rejections before the first tile
        num_tile_selections);

    return num_tile_selections;
}
/**
 * Process subsequent tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
 */
template <bool IS_LAST_TILE>
__device__ __forceinline__ OffsetT ConsumeSubsequentTile(
    int             num_tile_items,     ///< Number of input items comprising this tile
    int             tile_idx,           ///< Tile index
    OffsetT         tile_offset,        ///< Tile offset
    ScanTileStateT& tile_state)         ///< Global tile state descriptor
{
    OutputT items[ITEMS_PER_THREAD];
    OffsetT selection_flags[ITEMS_PER_THREAD];
    OffsetT selection_indices[ITEMS_PER_THREAD];

    // Load items (guarded load if this tile is partial)
    if (IS_LAST_TILE)
        BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
    else
        BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);

    // Initialize selection_flags (dispatches on SELECT_METHOD: functor, flags, or discontinuity)
    InitializeSelections<false, IS_LAST_TILE>(
        tile_offset,
        num_tile_items,
        items,
        selection_flags,
        Int2Type<SELECT_METHOD>());

    CTA_SYNC();

    // Exclusive scan of values and selection_flags.  NOTE(review): the
    // prefix callback presumably obtains the running selection count from
    // predecessor tiles via tile_state and publishes this tile's
    // inclusive total — confirm in single_pass_scan_operators.cuh.
    TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx);
    BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op);

    OffsetT num_tile_selections   = prefix_op.GetBlockAggregate();     // selections in this tile only
    OffsetT num_selections        = prefix_op.GetInclusivePrefix();    // running total, including this tile
    OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix();    // running total before this tile
    OffsetT num_rejected_prefix   = (tile_idx * TILE_ITEMS) - num_selections_prefix;

    // Discount any out-of-bounds selections (out-of-bounds items were flagged with 1)
    if (IS_LAST_TILE)
    {
        int num_discount    = TILE_ITEMS - num_tile_items;
        num_selections      -= num_discount;
        num_tile_selections -= num_discount;
    }

    // Scatter flagged items
    Scatter<IS_LAST_TILE, false>(
        items,
        selection_flags,
        selection_indices,
        num_tile_items,
        num_tile_selections,
        num_selections_prefix,
        num_rejected_prefix,
        num_selections);

    return num_selections;
}
/**
 * Process a tile of input.  Returns the running count of selections
 * (including this tile).
 *
 * Dispatches on tile position: tile 0 seeds the chained scan (no predecessor
 * status to wait on), while every later tile consumes its predecessors' state.
 */
template <bool IS_LAST_TILE>
__device__ __forceinline__ OffsetT ConsumeTile(
    int num_tile_items, ///< Number of input items comprising this tile
    int tile_idx, ///< Tile index
    OffsetT tile_offset, ///< Tile offset
    ScanTileStateT& tile_state) ///< Global tile state descriptor
{
    return (tile_idx == 0) ?
        ConsumeFirstTile<IS_LAST_TILE>(num_tile_items, tile_offset, tile_state) :
        ConsumeSubsequentTile<IS_LAST_TILE>(num_tile_items, tile_idx, tile_offset, tile_state);
}
/**
 * Scan tiles of items as part of a dynamic chained scan.
 *
 * Each thread block processes exactly one tile; only the block that owns the
 * last (possibly partial) tile writes out the grand total of selected items.
 */
template <typename NumSelectedIteratorT> ///< Output iterator type for recording number of items selection_flags
__device__ __forceinline__ void ConsumeRange(
    int num_tiles, ///< Total number of input tiles
    ScanTileStateT& tile_state, ///< Global tile state descriptor
    NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags
{
    // Blocks are launched in increasing order, so just assign one tile per block
    int tile = (blockIdx.x * gridDim.y) + blockIdx.y;  // Current tile index
    OffsetT offset = tile * TILE_ITEMS;                // Global offset for the current tile
    if (tile < num_tiles - 1)
    {
        // Interior tile: always full, and its running count is not the final answer
        ConsumeTile<false>(TILE_ITEMS, tile, offset, tile_state);
        return;
    }
    // Last tile (possibly partially-full): its inclusive running count is the grand total
    OffsetT items_remaining = num_items - offset;
    OffsetT total_selected = ConsumeTile<true>(items_remaining, tile, offset, tile_state);
    if (threadIdx.x == 0)
    {
        // Record the total number of selected items
        *d_num_selected_out = total_selected;
    }
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/block/block_scan.cuh view on Meta::CPAN
* - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
* -# <b>cub::BLOCK_SCAN_RAKING</b>. An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
* -# <b>cub::BLOCK_SCAN_RAKING_MEMOIZE</b>. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
* -# <b>cub::BLOCK_SCAN_WARP_SCANS</b>. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
*
* \par Performance Considerations
* - \granularity
* - Uses special instructions when applicable (e.g., warp \p SHFL)
* - Uses synchronization-free communication between warp lanes when applicable
* - Invokes a minimal number of minimal block-wide synchronization barriers (only
* one or two depending on algorithm selection)
* - Incurs zero bank conflicts for most types
* - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
* - Prefix sum variants (<b><em>vs.</em></b> generic scan)
* - \blocksize
* - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
*
* \par A Simple Example
* \blockcollective{BlockScan}
* \par
* The code snippet below illustrates an exclusive prefix sum of 512 integer items that
xgboost/cub/cub/cub.cuh view on Meta::CPAN
// Device
#include "device/device_histogram.cuh"
#include "device/device_partition.cuh"
#include "device/device_radix_sort.cuh"
#include "device/device_reduce.cuh"
#include "device/device_run_length_encode.cuh"
#include "device/device_scan.cuh"
#include "device/device_segmented_radix_sort.cuh"
#include "device/device_segmented_reduce.cuh"
#include "device/device_select.cuh"
#include "device/device_spmv.cuh"
// Grid
//#include "grid/grid_barrier.cuh"
#include "grid/grid_even_share.cuh"
#include "grid/grid_mapping.cuh"
#include "grid/grid_queue.cuh"
// Thread
#include "thread/thread_load.cuh"
xgboost/cub/cub/device/device_partition.cuh view on Meta::CPAN
/**
* \file
* cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include "dispatch/dispatch_select_if.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. 
* \ingroup SingleModule
*
* \par Overview
* These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from
* a specified input sequence.
*
* \par Usage Considerations
* \cdp_class{DevicePartition}
*
* \par Performance
* \linear_performance{partition}
*
* \par
* The following chart illustrates DevicePartition::If
* performance across different CUDA architectures for \p int32 items,
* where 50% of the items are randomly selected for the first partition.
* \plots_below
*
* \image html partition_if_int32_50_percent.png
*
*/
struct DevicePartition
{
    /**
     * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out.
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original
     *   relative ordering, however copies of the unselected items are compacted into the
     *   rear of \p d_out in reverse order.
     * - \devicestorage
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
     *
     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // d_out                 <-- [1, 4, 6, 7, 8, 5, 3, 2]
     * // d_num_selected_out    <-- [4]
     *
     * \endcode
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     */
    template <
        typename InputIteratorT,
        typename FlagIterator,
        typename OutputIteratorT,
        typename NumSelectedIteratorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
        FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int num_items, ///< [in] Total number of items to select from
        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        typedef int OffsetT; // Signed integer type for global offsets
        typedef NullType SelectOp; // Selection op (not used)
        typedef NullType EqualityOp; // Equality operator (not used)
        // Final template argument (KEEP_REJECTS = true) selects the partitioning
        // variant: rejected items are compacted into the rear of d_out.
        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
    /**
     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out.
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original
     *   relative ordering, however copies of the unselected items are compacted into the
     *   rear of \p d_out in reverse order.
     * - \devicestorage
     *
     * \par Performance
     * The following charts illustrate saturated partition-if performance across different
     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
     * selected for the first partition with 50% probability.
     *
     * \image html partition_if_int32_50_percent.png
     * \image html partition_if_int64_50_percent.png
     *
     * \par
     * The following charts are similar, but 5% selection probability for the first partition:
     *
     * \image html partition_if_int32_5_percent.png
     * \image html partition_if_int64_5_percent.png
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
     *
     * // Functor type for selecting values less than some criteria
     * struct LessThan
     * {
     *     int compare;
     *
     *     CUB_RUNTIME_FUNCTION __forceinline__
     *     LessThan(int compare) : compare(compare) {}
     *
     *     CUB_RUNTIME_FUNCTION __forceinline__
     *     bool operator()(const int &a) const {
     *         return (a < compare);
     *     }
     * };
     *
     * // Declare, allocate, and initialize device-accessible pointers for input and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * LessThan select_op(7);
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
     *
     * // d_out                 <-- [0, 2, 3, 5, 2, 8, 81, 9]
     * // d_num_selected_out    <-- [5]
     *
     * \endcode
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     * \tparam SelectOp             <b>[inferred]</b> Selection functor type having member <tt>bool operator()(const T &a)</tt>
     */
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename NumSelectedIteratorT,
        typename SelectOp>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t If(
        void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int num_items, ///< [in] Total number of items to select from
        SelectOp select_op, ///< [in] Unary selection operator
        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        typedef int OffsetT; // Signed integer type for global offsets
        typedef NullType* FlagIterator; // FlagT iterator type (not used)
        typedef NullType EqualityOp; // Equality operator (not used)
        // Final template argument (KEEP_REJECTS = true) selects the partitioning
        // variant: rejected items are compacted into the rear of d_out.
        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            select_op,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
};
/**
* \example example_device_partition_flagged.cu
xgboost/cub/cub/device/device_select.cuh view on Meta::CPAN
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory.
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include "dispatch/dispatch_select_if.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. 
* \ingroup SingleModule
*
* \par Overview
* These operations apply a selection criterion to selectively copy
* items from a specified input sequence to a compact output sequence.
*
* \par Usage Considerations
* \cdp_class{DeviceSelect}
*
* \par Performance
* \linear_performance{select-flagged, select-if, and select-unique}
*
* \par
* The following chart illustrates DeviceSelect::If
* performance across different CUDA architectures for \p int32 items,
* where 50% of the items are randomly selected.
*
* \image html select_if_int32_50_percent.png
*
* \par
* The following chart illustrates DeviceSelect::Unique
* performance across different CUDA architectures for \p int32 items
* where segments have lengths uniformly sampled from [1,1000].
*
* \image html select_unique_int32_len_500.png
*
* \par
* \plots_below
*
*/
struct DeviceSelect
{
    /**
     * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out.
     *
     * \par
     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
     * - \devicestorage
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
     *
     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     *
     * // d_out                 <-- [1, 4, 6, 7]
     * // d_num_selected_out    <-- [4]
     *
     * \endcode
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     */
    template <
        typename InputIteratorT,
        typename FlagIterator,
        typename OutputIteratorT,
        typename NumSelectedIteratorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
        FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        typedef int OffsetT; // Signed integer type for global offsets
        typedef NullType SelectOp; // Selection op (not used)
        typedef NullType EqualityOp; // Equality operator (not used)
        // Final template argument (KEEP_REJECTS = false) selects the compaction
        // variant: only selected items are written to d_out.
        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
    /**
     * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out.
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
     * - \devicestorage
     *
     * \par Performance
     * The following charts illustrate saturated select-if performance across different
     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
     * selected with 50% probability.
     *
     * \image html select_if_int32_50_percent.png
     * \image html select_if_int64_50_percent.png
     *
     * \par
     * The following charts are similar, but 5% selection probability:
     *
     * \image html select_if_int32_5_percent.png
     * \image html select_if_int64_5_percent.png
     *
     * \par Snippet
     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
     *
     * // Functor type for selecting values less than some criteria
     * struct LessThan
     * {
     *     int compare;
     *
     *     CUB_RUNTIME_FUNCTION __forceinline__
     *     LessThan(int compare) : compare(compare) {}
     *
     *     CUB_RUNTIME_FUNCTION __forceinline__
     *     bool operator()(const int &a) const {
     *         return (a < compare);
     *     }
     * };
     *
     * // Declare, allocate, and initialize device-accessible pointers for input and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * LessThan select_op(7);
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
     *
     * // d_out                 <-- [0, 2, 3, 5, 2]
     * // d_num_selected_out    <-- [5]
     *
     * \endcode
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     * \tparam SelectOp             <b>[inferred]</b> Selection operator type having member <tt>bool operator()(const T &a)</tt>
     */
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename NumSelectedIteratorT,
        typename SelectOp>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t If(
        void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
        SelectOp select_op, ///< [in] Unary selection operator
        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        typedef int OffsetT; // Signed integer type for global offsets
        typedef NullType* FlagIterator; // FlagT iterator type (not used)
        typedef NullType EqualityOp; // Equality operator (not used)
        // Final template argument (KEEP_REJECTS = false) selects the compaction
        // variant: only selected items are written to d_out.
        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            select_op,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
    /**
     * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out.
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
     * - \devicestorage
     *
     * \par Snippet
     * The code snippet below illustrates the run-length de-duplication of an \p int device vector.
     * \par
     * \code
     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
     *
     * // Declare, allocate, and initialize device-accessible pointers for input and output
     * int  num_items;              // e.g., 8
     * int  *d_in;                  // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
     * int  *d_num_selected_out;    // e.g., [ ]
     * ...
     *
     * // Determine temporary device storage requirements
     * void     *d_temp_storage = NULL;
     * size_t   temp_storage_bytes = 0;
     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
     *
     * // Allocate temporary storage
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     *
     * // Run selection
     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
     *
     * // d_out                 <-- [0, 2, 9, 5, 8]
     * // d_num_selected_out    <-- [5]
     *
     * \endcode
     *
     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
     */
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename NumSelectedIteratorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Unique(
        void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        typedef int OffsetT; // Signed integer type for global offsets
        typedef NullType* FlagIterator; // FlagT iterator type (not used)
        typedef NullType SelectOp; // Selection op (not used)
        typedef Equality EqualityOp; // Default == operator flags the first item of each run
        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }
};
/**
* \example example_device_select_flagged.cu
* \example example_device_select_if.cu
* \example example_device_select_unique.cu
*/
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
num_items,
begin_bit,
end_bit);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Update selector
d_keys.selector ^= 1;
d_values.selector ^= 1;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
//------------------------------------------------------------------------------
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
if (CubDebug(error = InvokePass(
d_keys.Current(), d_keys_remaining_passes.Current(),
d_values.Current(), d_values_remaining_passes.Current(),
d_spine, spine_length, current_bit,
(current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
// Run remaining passes
while (current_bit < end_bit)
{
if (CubDebug(error = InvokePass(
d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
d_spine, spine_length, current_bit,
(current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;;
// Invert selectors
d_keys_remaining_passes.selector ^= 1;
d_values_remaining_passes.selector ^= 1;
}
// Update selector
if (!is_overwrite_okay) {
num_passes = 1; // Sorted data always ends up in the other vector
}
d_keys.selector = (d_keys.selector + num_passes) & 1;
d_values.selector = (d_values.selector + num_passes) & 1;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
//------------------------------------------------------------------------------
xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh view on Meta::CPAN
if (CubDebug(error = InvokePass(
d_keys.Current(), d_keys_remaining_passes.Current(),
d_values.Current(), d_values_remaining_passes.Current(),
current_bit,
(current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
// Run remaining passes
while (current_bit < end_bit)
{
if (CubDebug(error = InvokePass(
d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
current_bit,
(current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
// Invert selectors and update current bit
d_keys_remaining_passes.selector ^= 1;
d_values_remaining_passes.selector ^= 1;
}
// Update selector
if (!is_overwrite_okay) {
num_passes = 1; // Sorted data always ends up in the other vector
}
d_keys.selector = (d_keys.selector + num_passes) & 1;
d_values.selector = (d_values.selector + num_passes) & 1;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
//------------------------------------------------------------------------------
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh view on Meta::CPAN
__global__ void DeviceReduceByKeyKernel(
KeysInputIteratorT d_keys_in, ///< Pointer to the input sequence of keys
UniqueOutputIteratorT d_unique_out, ///< Pointer to the output sequence of unique keys (one key per run)
ValuesInputIteratorT d_values_in, ///< Pointer to the input sequence of corresponding values
AggregatesOutputIteratorT d_aggregates_out, ///< Pointer to the output sequence of value aggregates (one aggregate per run)
NumRunsOutputIteratorT d_num_runs_out, ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out)
ScanTileStateT tile_state, ///< Tile status interface
int start_tile, ///< The starting tile for the current grid
EqualityOpT equality_op, ///< KeyT equality operator
ReductionOpT reduction_op, ///< ValueT reduction operator
OffsetT num_items) ///< Total number of items to select from
{
// Thread block type for reducing tiles of value segments
typedef AgentReduceByKey<
AgentReduceByKeyPolicyT,
KeysInputIteratorT,
UniqueOutputIteratorT,
ValuesInputIteratorT,
AggregatesOutputIteratorT,
NumRunsOutputIteratorT,
EqualityOpT,
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh view on Meta::CPAN
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys
UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run)
ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values
AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
EqualityOpT equality_op, ///< [in] KeyT equality operator
ReductionOpT reduction_op, ///< [in] ValueT reduction operator
OffsetT num_items, ///< [in] Total number of items to select from
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
int /*ptx_version*/, ///< [in] PTX version of dispatch kernels
ScanInitKernelT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
ReduceByKeyKernelT reduce_by_key_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel
KernelConfig reduce_by_key_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for
{
#ifndef CUB_RUNTIME_ENABLED
(void)d_temp_storage;
xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh view on Meta::CPAN
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys
UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run)
ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values
AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
EqualityOpT equality_op, ///< [in] KeyT equality operator
ReductionOpT reduction_op, ///< [in] ValueT reduction operator
OffsetT num_items, ///< [in] Total number of items to select from
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p fals...
{
cudaError error = cudaSuccess;
do
{
// Get PTX version
int ptx_version;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
xgboost/cub/cub/device/dispatch/dispatch_rle.cuh view on Meta::CPAN
namespace cub {
/******************************************************************************
* Kernel entry points
*****************************************************************************/
/**
 * RLE sweep kernel entry point (multi-block)
 *
 * Identifies runs of consecutive equal-valued items in the input sequence
 * and records the offset and length of each run
 */
template <
typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type
typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator
typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator
typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator
typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator
typename ScanTileStateT, ///< Tile status interface type
typename EqualityOpT, ///< T equality operator type
typename OffsetT> ///< Signed integer type for global offsets
xgboost/cub/cub/device/dispatch/dispatch_rle.cuh view on Meta::CPAN
__global__ void DeviceRleSweepKernel(
InputIteratorT d_in, ///< [in] Pointer to input sequence of data items
OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets
LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths
NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
ScanTileStateT tile_status, ///< [in] Tile status interface
EqualityOpT equality_op, ///< [in] Equality operator for input items
OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
int num_tiles) ///< [in] Total number of tiles for the entire problem
{
// Thread block type for selecting data from input tiles
typedef AgentRle<
AgentRlePolicyT,
InputIteratorT,
OffsetsOutputIteratorT,
LengthsOutputIteratorT,
EqualityOpT,
OffsetT> AgentRleT;
// Shared memory for AgentRle
__shared__ typename AgentRleT::TempStorage temp_storage;
xgboost/cub/cub/device/dispatch/dispatch_scan.cuh view on Meta::CPAN
{
// Initialize tile status
tile_state.InitializeStatus(num_tiles);
}
/**
 * Initialization kernel for tile status initialization (multi-block)
 */
template <
    typename                ScanTileStateT,         ///< Tile status interface type
    typename                NumSelectedIteratorT>   ///< Output iterator type for recording the number of items selected
__global__ void DeviceCompactInitKernel(
    ScanTileStateT          tile_state,             ///< [in] Tile status interface
    int                     num_tiles,              ///< [in] Number of tiles
    NumSelectedIteratorT    d_num_selected_out)     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
{
    // Reset the status word of every tile descriptor
    tile_state.InitializeStatus(num_tiles);

    // Exactly one thread (thread 0 of block 0) zeroes the selection count
    if ((threadIdx.x == 0) && (blockIdx.x == 0))
    {
        *d_num_selected_out = 0;
    }
}
/**
* Scan kernel entry point (multi-block)
*/
template <
typename ScanPolicyT, ///< Parameterized ScanPolicyT tuning policy type
typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator
typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory.
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include "dispatch_scan.cuh"
#include "../../agent/agent_select_if.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Kernel entry points
*****************************************************************************/
/**
 * Select kernel entry point (multi-block)
 *
 * Performs functor-based selection if SelectOpT functor type != NullType
 * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
 * Otherwise performs discontinuity selection (keep unique)
 */
template <
    typename            AgentSelectIfPolicyT,       ///< Parameterized AgentSelectIfPolicyT tuning policy type
    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items
    typename            FlagsInputIteratorT,        ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
    typename            SelectedOutputIteratorT,    ///< Random-access output iterator type for writing selected items
    typename            NumSelectedIteratorT,       ///< Output iterator type for recording the number of items selected
    typename            ScanTileStateT,             ///< Tile status interface type
    typename            SelectOpT,                  ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
    typename            EqualityOpT,                ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
    typename            OffsetT,                    ///< Signed integer type for global offsets
    bool                KEEP_REJECTS>               ///< Whether or not we push rejected items to the back of the output
__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS))
__global__ void DeviceSelectSweepKernel(
    InputIteratorT          d_in,                   ///< [in] Pointer to the input sequence of data items
    FlagsInputIteratorT     d_flags,                ///< [in] Pointer to the input sequence of selection flags (if applicable)
    SelectedOutputIteratorT d_selected_out,         ///< [out] Pointer to the output sequence of selected data items
    NumSelectedIteratorT    d_num_selected_out,     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
    ScanTileStateT          tile_status,            ///< [in] Tile status interface
    SelectOpT               select_op,              ///< [in] Selection operator
    EqualityOpT             equality_op,            ///< [in] Equality operator
    OffsetT                 num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
    int                     num_tiles)              ///< [in] Total number of tiles for the entire problem
{
    // Parameterized thread-block abstraction that performs the selection pass
    typedef AgentSelectIf<
        AgentSelectIfPolicyT,
        InputIteratorT,
        FlagsInputIteratorT,
        SelectedOutputIteratorT,
        SelectOpT,
        EqualityOpT,
        OffsetT,
        KEEP_REJECTS>
        AgentSelectIfT;

    // Per-block shared memory for the agent
    __shared__ typename AgentSelectIfT::TempStorage temp_storage;

    // Construct the agent and have it consume this block's share of input tiles
    AgentSelectIfT agent(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items);
    agent.ConsumeRange(
        num_tiles,
        tile_status,
        d_num_selected_out);
}
/******************************************************************************
* Dispatch
******************************************************************************/
/**
* Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
*/
template <
typename InputIteratorT, ///< Random-access input iterator type for reading input items
typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items
typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected
typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
typename OffsetT, ///< Signed integer type for global offsets
bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output
struct DispatchSelectIf
{
/******************************************************************************
* Types and constants
******************************************************************************/
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
* Utilities
******************************************************************************/
/**
 * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
 */
template <typename KernelConfig>
CUB_RUNTIME_FUNCTION __forceinline__
static void InitConfigs(
    int             ptx_version,        ///< [in] PTX version to select a tuning policy for
    KernelConfig    &select_if_config)  ///< [out] Kernel dispatch configuration to initialize
{
#if (CUB_PTX_ARCH > 0)
    (void)ptx_version;

    // Device-side: the compile-time PTX policy is the only one available
    select_if_config.template Init<PtxSelectIfPolicyT>();
#else
    // Host-side: select the tuning policy matching the device's PTX version
    // (ordered from newest architecture to oldest)
    if      (ptx_version >= 350)    select_if_config.template Init<typename Policy350::SelectIfPolicyT>();
    else if (ptx_version >= 300)    select_if_config.template Init<typename Policy300::SelectIfPolicyT>();
    else if (ptx_version >= 200)    select_if_config.template Init<typename Policy200::SelectIfPolicyT>();
    else if (ptx_version >= 130)    select_if_config.template Init<typename Policy130::SelectIfPolicyT>();
    else                            select_if_config.template Init<typename Policy100::SelectIfPolicyT>();
#endif
}
/**
* Kernel dispatch configuration.
*/
struct KernelConfig
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
tile_items = block_threads * items_per_thread;
}
};
/******************************************************************************
* Dispatch entrypoints
******************************************************************************/
/**
* Internal dispatch routine for computing a device-wide selection using the
* specified kernel functions.
*/
template <
typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel
typename SelectIfKernelPtrT> ///< Function type of cub::SelectIfKernelPtrT
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable)
SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items
NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
SelectOpT select_op, ///< [in] Selection operator
EqualityOpT equality_op, ///< [in] Equality operator
OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p fals...
int /*ptx_version*/, ///< [in] PTX version of dispatch kernels
ScanInitKernelPtrT scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
SelectIfKernelPtrT select_if_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
KernelConfig select_if_config) ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for
{
#ifndef CUB_RUNTIME_ENABLED
(void)d_temp_storage;
(void)temp_storage_bytes;
(void)d_in;
(void)d_flags;
(void)d_selected_out;
(void)d_num_selected_out;
(void)select_op;
(void)equality_op;
(void)num_items;
(void)stream;
(void)debug_synchronous;
(void)scan_init_kernel;
(void)select_if_kernel;
(void)select_if_config;
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);
#else
cudaError error = cudaSuccess;
do
{
// Get device ordinal
int device_ordinal;
if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
// Get SM count
int sm_count;
if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
// Number of input tiles
int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
int num_tiles = (num_items + tile_size - 1) / tile_size;
// Specify temporary storage allocation requirements
size_t allocation_sizes[1];
if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors
// Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
void* allocations[1];
if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
if (d_temp_storage == NULL)
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
// Log scan_init_kernel configuration
int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
// Invoke scan_init_kernel to initialize tile descriptors
scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
tile_status,
num_tiles,
d_num_selected_out);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Return if empty problem
if (num_items == 0)
break;
// Get SM occupancy for select_if_kernel
int range_select_sm_occupancy;
if (CubDebug(error = MaxSmOccupancy(
range_select_sm_occupancy, // out
select_if_kernel,
select_if_config.block_threads))) break;
// Get max x-dimension of grid
int max_dim_x;
if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
// Get grid size for scanning tiles
dim3 scan_grid_size;
scan_grid_size.z = 1;
scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
// Log select_if_kernel configuration
if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);
// Invoke select_if_kernel
select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
d_in,
d_flags,
d_selected_out,
d_num_selected_out,
tile_status,
select_op,
equality_op,
num_items,
num_tiles);
// Check for failure to launch
if (CubDebug(error = cudaPeekAtLastError())) break;
// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh view on Meta::CPAN
/**
 * Internal dispatch routine: determines the PTX version, selects the matching
 * tuning configuration, and forwards to the kernel-pointer Dispatch overload.
 */
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Dispatch(
    void*                       d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t&                     temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
    InputIteratorT              d_in,                   ///< [in] Pointer to the input sequence of data items
    FlagsInputIteratorT         d_flags,                ///< [in] Pointer to the input sequence of selection flags (if applicable)
    SelectedOutputIteratorT     d_selected_out,         ///< [in] Pointer to the output sequence of selected data items
    NumSelectedIteratorT        d_num_selected_out,     ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
    SelectOpT                   select_op,              ///< [in] Selection operator
    EqualityOpT                 equality_op,            ///< [in] Equality operator
    OffsetT                     num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
    cudaStream_t                stream,                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
    bool                        debug_synchronous)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
{
    cudaError status = cudaSuccess;
    do
    {
        // Determine the PTX version the kernels will run under
        int ptx_version;
#if (CUB_PTX_ARCH == 0)
        // Host path: query the runtime for the PTX version
        if (CubDebug(status = PtxVersion(ptx_version))) break;
#else
        // Device path: the PTX version is fixed at compile time
        ptx_version = CUB_PTX_ARCH;
#endif

        // Select the kernel dispatch configuration matching this PTX version
        KernelConfig select_if_config;
        InitConfigs(ptx_version, select_if_config);

        // Forward to the workhorse dispatch routine with concrete kernel pointers
        status = Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_selected_out,
            d_num_selected_out,
            select_op,
            equality_op,
            num_items,
            stream,
            debug_synchronous,
            ptx_version,
            DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
            DeviceSelectSweepKernel<
                PtxSelectIfPolicyT,
                InputIteratorT,
                FlagsInputIteratorT,
                SelectedOutputIteratorT,
                NumSelectedIteratorT,
                ScanTileStateT,
                SelectOpT,
                EqualityOpT,
                OffsetT,
                KEEP_REJECTS>,
            select_if_config);
        if (CubDebug(status)) break;
    }
    while (0);

    return status;
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh view on Meta::CPAN
template <
typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type
typename PairsInputIteratorT, ///< Random-access input iterator type for keys
typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
typename OffsetT, ///< Signed integer type for global offsets
typename ScanTileStateT> ///< Tile status interface type
__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
__global__ void DeviceSegmentFixupKernel(
PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block
AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates
OffsetT num_items, ///< [in] Total number of items to select from
int num_tiles, ///< [in] Total number of tiles for the entire problem
ScanTileStateT tile_state) ///< [in] Tile status interface
{
// Thread block type for reducing tiles of value segments
typedef AgentSegmentFixup<
AgentSegmentFixupPolicyT,
PairsInputIteratorT,
AggregatesOutputIteratorT,
cub::Equality,
cub::Sum,
xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh view on Meta::CPAN
template <
typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type
typename PairsInputIteratorT, ///< Random-access input iterator type for keys
typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
typename OffsetT, ///< Signed integer type for global offsets
typename ScanTileStateT> ///< Tile status interface type
__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
__global__ void DeviceSegmentFixupKernel(
PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block
AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates
OffsetT num_items, ///< [in] Total number of items to select from
int num_tiles, ///< [in] Total number of tiles for the entire problem
ScanTileStateT tile_state) ///< [in] Tile status interface
{
// Thread block type for reducing tiles of value segments
typedef AgentSegmentFixup<
AgentSegmentFixupPolicyT,
PairsInputIteratorT,
AggregatesOutputIteratorT,
cub::Equality,
cub::Sum,
xgboost/cub/cub/util_ptx.cuh view on Meta::CPAN
return x;
}
/**
 * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later.
 *
 * \par
 * The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
 * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes
 * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
 * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0}
 *
 * \par Snippet
 * The code snippet below illustrates byte-permute.
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     int a        = 0x03020100;
 *     int b        = 0x07060504;
 *     int index    = 0x00007531;
 *
 *     int selected = PRMT(a, b, index);    // 0x07050301
 *
 * \endcode
 *
 */
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
{
    int result;
    // Single PTX byte-permute instruction; the selector nibbles in index
    // pick which source bytes of {b, a} land in each destination byte
    asm volatile("prmt.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(a), "r"(b), "r"(index));
    return result;
}
xgboost/cub/cub/util_type.cuh view on Meta::CPAN
* @{
*/
/******************************************************************************
* Type equality
******************************************************************************/
/**
 * \brief Type selection (<tt>IF ? ThenType : ElseType</tt>)
 *
 * Primary template: yields \p ThenType when \p IF is true.  A partial
 * specialization (defined elsewhere in this header) yields \p ElseType
 * when \p IF is false.
 */
template <bool IF, typename ThenType, typename ElseType>
struct If
{
    /// Conditional type result
    typedef ThenType Type;      // true
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
xgboost/cub/cub/util_type.cuh view on Meta::CPAN
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
 * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth.
 *
 * Multi-pass computations often "ping-pong" between two buffers, reading
 * from one and writing to the other on each pass and then swapping roles.
 * This structure bundles such a buffer pair with a "selector" member that
 * tracks which of the two is currently the valid one.
 */
template <typename T>
struct DoubleBuffer
{
    /// Pair of device buffer pointers
    T *d_buffers[2];

    /// Selector into \p d_buffers (i.e., the active/valid buffer)
    int selector;

    /// \brief Constructor (both buffers empty; buffer 0 is selected)
    __host__ __device__ __forceinline__ DoubleBuffer()
    {
        d_buffers[0]    = NULL;
        d_buffers[1]    = NULL;
        selector        = 0;
    }

    /// \brief Constructor (buffer 0 holds \p d_current and is selected)
    __host__ __device__ __forceinline__ DoubleBuffer(
        T *d_current,         ///< The currently valid buffer
        T *d_alternate)       ///< Alternate storage buffer of the same size as \p d_current
    {
        d_buffers[0]    = d_current;
        d_buffers[1]    = d_alternate;
        selector        = 0;
    }

    /// \brief Return pointer to the currently valid buffer
    __host__ __device__ __forceinline__ T* Current()
    {
        return d_buffers[selector];
    }

    /// \brief Return pointer to the currently invalid buffer
    __host__ __device__ __forceinline__ T* Alternate()
    {
        return d_buffers[selector ^ 1];
    }
};
/******************************************************************************
* Typedef-detection
******************************************************************************/
xgboost/cub/examples/device/Makefile view on Meta::CPAN
DEPS = $(CUB_DEPS) \
$(CUB_DIR)test/Makefile \
$(CUB_DIR)test/test_util.h \
$(CUB_DIR)test/mersenne.h \
ALL = example_device_partition_flagged \
example_device_partition_if \
example_device_radix_sort \
example_device_reduce \
example_device_scan \
example_device_select_unique \
example_device_select_flagged \
example_device_select_if \
example_device_sort_find_non_trivial_runs
#-------------------------------------------------------------------------------
# make default
#-------------------------------------------------------------------------------
default:
xgboost/cub/examples/device/Makefile view on Meta::CPAN
#-------------------------------------------------------------------------------
example_device_radix_sort: bin/example_device_radix_sort_$(BIN_SUFFIX)
bin/example_device_radix_sort_$(BIN_SUFFIX) : example_device_radix_sort.cu $(DEPS)
mkdir -p bin
$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_radix_sort_$(BIN_SUFFIX) example_device_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
#-------------------------------------------------------------------------------
# make example_device_select_unique
#-------------------------------------------------------------------------------
example_device_select_unique: bin/example_device_select_unique_$(BIN_SUFFIX)
bin/example_device_select_unique_$(BIN_SUFFIX) : example_device_select_unique.cu $(DEPS)
mkdir -p bin
$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_unique_$(BIN_SUFFIX) example_device_select_unique.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
#-------------------------------------------------------------------------------
# make example_device_select_flagged
#-------------------------------------------------------------------------------
example_device_select_flagged: bin/example_device_select_flagged_$(BIN_SUFFIX)
bin/example_device_select_flagged_$(BIN_SUFFIX) : example_device_select_flagged.cu $(DEPS)
mkdir -p bin
$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_flagged_$(BIN_SUFFIX) example_device_select_flagged.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
#-------------------------------------------------------------------------------
# make example_device_select_if
#-------------------------------------------------------------------------------
example_device_select_if: bin/example_device_select_if_$(BIN_SUFFIX)
bin/example_device_select_if_$(BIN_SUFFIX) : example_device_select_if.cu $(DEPS)
mkdir -p bin
$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_if_$(BIN_SUFFIX) example_device_select_if.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
#-------------------------------------------------------------------------------
# make example_device_sort_find_non_trivial_runs
#-------------------------------------------------------------------------------
example_device_sort_find_non_trivial_runs: bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX)
bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX) : example_device_sort_find_non_trivial_runs.cu $(DEPS)
mkdir -p bin
xgboost/cub/examples/device/example_device_partition_flagged.cu view on Meta::CPAN
/**
 * Compute the reference solution on the host: selected (flagged) items are
 * compacted to the front of h_reference in order; rejected items are placed
 * at the back in reverse order (matching DevicePartition semantics).
 * Returns the number of selected items.
 */
int Solve(
    int             *h_in,
    unsigned char   *h_flags,
    int             *h_reference,
    int             num_items)
{
    int num_selected = 0;
    for (int i = 0; i < num_items; ++i)
    {
        if (h_flags[i])
            h_reference[num_selected++] = h_in[i];                          // compact to front
        else
            h_reference[num_items - (i - num_selected) - 1] = h_in[i];      // reversed at back
    }
    return num_selected;
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
xgboost/cub/examples/device/example_device_partition_flagged.cu view on Meta::CPAN
// Initialize device
CubDebugExit(args.DeviceInit());
// Allocate host arrays
int *h_in = new int[num_items];
int *h_reference = new int[num_items];
unsigned char *h_flags = new unsigned char[num_items];
// Initialize problem and solution
Initialize(h_in, h_flags, num_items, max_segment);
int num_selected = Solve(h_in, h_flags, h_reference, num_items);
printf("cub::DevicePartition::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n",
num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
fflush(stdout);
// Allocate problem device arrays
int *d_in = NULL;
unsigned char *d_flags = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array and num selected
int *d_out = NULL;
int *d_num_selected_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
CubDebugExit(DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
printf("\t Data %s ", compare ? "FAIL" : "PASS");
compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s ", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
printf("\n\n");
return 0;
}
xgboost/cub/examples/device/example_device_partition_if.cu view on Meta::CPAN
*
******************************************************************************/
/******************************************************************************
* Simple example of DevicePartition::If().
*
* Partitions items from a sequence of int keys using a
* selection functor (greater-than)
*
* To compile using the command line:
* nvcc -arch=sm_XX example_device_partition_if.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <cub/util_allocator.cuh>
#include <cub/device/device_partition.cuh>
xgboost/cub/examples/device/example_device_partition_if.cu view on Meta::CPAN
*/
void Initialize(
int *h_in,
int num_items,
int max_segment)
{
int key = 0;
int i = 0;
while (i < num_items)
{
// Randomly select number of repeating occurrences uniformly from [1..max_segment]
unsigned short max_short = (unsigned short) -1;
unsigned short repeat;
RandomBits(repeat);
repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
repeat = CUB_MAX(1, repeat);
int j = i;
while (j < CUB_MIN(i + repeat, num_items))
{
h_in[j] = key;
xgboost/cub/examples/device/example_device_partition_if.cu view on Meta::CPAN
}
}
/**
 * Compute the reference solution for the partition problem.
 *
 * Items satisfying select_op are compacted to the front of h_reference
 * in input order; rejected items fill the tail back-to-front (i.e. in
 * reverse input order), matching cub::DevicePartition output ordering.
 *
 * Returns the number of selected items.
 */
template <typename SelectOp>
int Solve(
    int         *h_in,
    SelectOp    select_op,
    int         *h_reference,
    int         num_items)
{
    int selected = 0;
    for (int idx = 0; idx < num_items; ++idx)
    {
        int item = h_in[idx];
        if (!select_op(item))
        {
            // Rejected item: place into the tail, growing backwards
            h_reference[num_items - (idx - selected) - 1] = item;
        }
        else
        {
            // Selected item: compact to the front
            h_reference[selected++] = item;
        }
    }
    return selected;
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
xgboost/cub/examples/device/example_device_partition_if.cu view on Meta::CPAN
// DevicePartition a pivot index
unsigned int pivot_index;
unsigned int max_int = (unsigned int) -1;
RandomBits(pivot_index);
pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int))));
printf("Pivot idx: %d\n", pivot_index); fflush(stdout);
// Initialize problem and solution
Initialize(h_in, num_items, max_segment);
GreaterThan select_op(h_in[pivot_index]);
int num_selected = Solve(h_in, select_op, h_reference, num_items);
printf("cub::DevicePartition::If %d items, %d selected (avg run length %d), %d-byte elements\n",
num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
fflush(stdout);
// Allocate problem device arrays
int *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array and num selected
int *d_out = NULL;
int *d_num_selected_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
CubDebugExit(DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
printf("\t Data %s ", compare ? "FAIL" : "PASS");
compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s ", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
xgboost/cub/examples/device/example_device_radix_sort.cu view on Meta::CPAN
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(int) * num_items));
// Allocate temporary storage
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Initialize device arrays
CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice));
// Run
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose);
printf("\t Compare keys (selector %d): %s\n", d_keys.selector, compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
printf("\t Compare values (selector %d): %s\n", d_values.selector, compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_keys) delete[] h_keys;
if (h_reference_keys) delete[] h_reference_keys;
if (h_values) delete[] h_values;
if (h_reference_values) delete[] h_reference_values;
if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
xgboost/cub/examples/device/example_device_select_flagged.cu view on Meta::CPAN
*
******************************************************************************/
/******************************************************************************
* Simple example of DeviceSelect::Flagged().
*
* Selects flagged items from a sequence of int keys using a
* corresponding sequence of unsigned char flags.
*
* To compile using the command line:
* nvcc -arch=sm_XX example_device_select_flagged.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <cub/util_allocator.cuh>
#include <cub/device/device_select.cuh>
#include "../../test/test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
xgboost/cub/examples/device/example_device_select_flagged.cu view on Meta::CPAN
/**
 * Compute the reference solution for flagged selection.
 *
 * Items whose corresponding flag is nonzero are compacted to the front
 * of h_reference in input order; unflagged items are written into the
 * tail back-to-front.
 *
 * Returns the number of flagged (selected) items.
 */
int Solve(
    int             *h_in,
    unsigned char   *h_flags,
    int             *h_reference,
    int             num_items)
{
    int selected = 0;
    for (int idx = 0; idx < num_items; ++idx)
    {
        int item = h_in[idx];
        if (h_flags[idx] == 0)
        {
            // Unflagged item: place into the tail, growing backwards
            h_reference[num_items - (idx - selected) - 1] = item;
        }
        else
        {
            // Flagged item: compact to the front
            h_reference[selected++] = item;
        }
    }
    return selected;
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
xgboost/cub/examples/device/example_device_select_flagged.cu view on Meta::CPAN
// Initialize device
CubDebugExit(args.DeviceInit());
// Allocate host arrays
int *h_in = new int[num_items];
int *h_reference = new int[num_items];
unsigned char *h_flags = new unsigned char[num_items];
// Initialize problem and solution
Initialize(h_in, h_flags, num_items, max_segment);
int num_selected = Solve(h_in, h_flags, h_reference, num_items);
printf("cub::DeviceSelect::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n",
num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
fflush(stdout);
// Allocate problem device arrays
int *d_in = NULL;
unsigned char *d_flags = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array and num selected
int *d_out = NULL;
int *d_num_selected_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
CubDebugExit(DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
printf("\t Data %s ", compare ? "FAIL" : "PASS");
compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s ", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
printf("\n\n");
return 0;
}
xgboost/cub/examples/device/example_device_select_if.cu view on Meta::CPAN
*
******************************************************************************/
/******************************************************************************
* Simple example of DeviceSelect::If().
*
* Selects items from a sequence of int keys using a
* selection functor (greater-than)
*
* To compile using the command line:
* nvcc -arch=sm_XX example_device_select_if.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <cub/util_allocator.cuh>
#include <cub/device/device_select.cuh>
#include "../../test/test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
xgboost/cub/examples/device/example_device_select_if.cu view on Meta::CPAN
*/
void Initialize(
int *h_in,
int num_items,
int max_segment)
{
int key = 0;
int i = 0;
while (i < num_items)
{
// Randomly select number of repeating occurrences uniformly from [1..max_segment]
unsigned short max_short = (unsigned short) -1;
unsigned short repeat;
RandomBits(repeat);
repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
repeat = CUB_MAX(1, repeat);
int j = i;
while (j < CUB_MIN(i + repeat, num_items))
{
h_in[j] = key;
xgboost/cub/examples/device/example_device_select_if.cu view on Meta::CPAN
}
}
/**
 * Compute the reference solution for predicate-based selection.
 *
 * Items for which select_op returns true are compacted to the front of
 * h_reference in input order; the remaining items are written into the
 * tail in reverse input order (the layout cub::DevicePartition/Select
 * examples compare against).
 *
 * Returns the number of selected items.
 */
template <typename SelectOp>
int Solve(
    int         *h_in,
    SelectOp    select_op,
    int         *h_reference,
    int         num_items)
{
    int kept = 0;
    int pos = 0;
    while (pos < num_items)
    {
        int item = h_in[pos];
        if (select_op(item))
        {
            // Passing item: compact to the front
            h_reference[kept] = item;
            ++kept;
        }
        else
        {
            // Failing item: fill the tail, growing backwards
            h_reference[num_items - (pos - kept) - 1] = item;
        }
        ++pos;
    }
    return kept;
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
xgboost/cub/examples/device/example_device_select_if.cu view on Meta::CPAN
// Select a pivot index
unsigned int pivot_index;
unsigned int max_int = (unsigned int) -1;
RandomBits(pivot_index);
pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int))));
printf("Pivot idx: %d\n", pivot_index); fflush(stdout);
// Initialize problem and solution
Initialize(h_in, num_items, max_segment);
GreaterThan select_op(h_in[pivot_index]);
int num_selected = Solve(h_in, select_op, h_reference, num_items);
printf("cub::DeviceSelect::If %d items, %d selected (avg run length %d), %d-byte elements\n",
num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
fflush(stdout);
// Allocate problem device arrays
int *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array and num selected
int *d_out = NULL;
int *d_num_selected_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
CubDebugExit(DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
printf("\t Data %s ", compare ? "FAIL" : "PASS");
compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s ", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
xgboost/cub/examples/device/example_device_select_unique.cu view on Meta::CPAN
*
******************************************************************************/
/******************************************************************************
* Simple example of DeviceSelect::Unique().
*
* Selects the first element from each run of identical values from a sequence
* of int keys.
*
* To compile using the command line:
* nvcc -arch=sm_XX example_device_select_unique.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <cub/util_allocator.cuh>
#include <cub/device/device_select.cuh>
#include "../../test/test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
xgboost/cub/examples/device/example_device_select_unique.cu view on Meta::CPAN
*/
void Initialize(
int *h_in,
int num_items,
int max_segment)
{
int key = 0;
int i = 0;
while (i < num_items)
{
// Randomly select number of repeating occurrences uniformly from [1..max_segment]
unsigned short max_short = (unsigned short) -1;
unsigned short repeat;
RandomBits(repeat);
repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
repeat = CUB_MAX(1, repeat);
int j = i;
while (j < CUB_MIN(i + repeat, num_items))
{
h_in[j] = key;
xgboost/cub/examples/device/example_device_select_unique.cu view on Meta::CPAN
/**
 * Compute the reference solution for the unique problem.
 *
 * Writes the first item of every run of identical consecutive values
 * from h_in into h_reference, preserving input order.
 *
 * Returns the number of unique (run-head) items.
 */
int Solve(
    int *h_in,
    int *h_reference,
    int num_items)
{
    int num_unique = 0;
    for (int i = 0; i < num_items; ++i)
    {
        // Keep an item iff it starts a new run (first item always does)
        if ((i == 0) || (h_in[i] != h_in[i - 1]))
        {
            h_reference[num_unique++] = h_in[i];
        }
    }
    return num_unique;
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
xgboost/cub/examples/device/example_device_select_unique.cu view on Meta::CPAN
// Initialize device
CubDebugExit(args.DeviceInit());
// Allocate host arrays
int* h_in = new int[num_items];
int* h_reference = new int[num_items];
// Initialize problem and solution
Initialize(h_in, num_items, max_segment);
int num_selected = Solve(h_in, h_reference, num_items);
printf("cub::DeviceSelect::Unique %d items (%d-byte elements), %d selected (avg run length %d)\n",
num_items, (int) sizeof(int), num_selected, num_items / num_selected);
fflush(stdout);
// Allocate problem device arrays
int *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array and num selected
int *d_out = NULL;
int *d_num_selected_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items));
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
printf("\t Data %s ", compare ? "FAIL" : "PASS");
compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s ", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Simple example of sorting a sequence of keys and values (each pair is a
* randomly-selected int32 paired with its original offset in the unsorted sequence), and then
* isolating all maximal, non-trivial (having length > 1) "runs" of duplicates.
*
* To compile using the command line:
* nvcc -arch=sm_XX example_device_sort_find_non_trivial_runs.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN
{
// Allocate and initialize device arrays for sorting
DoubleBuffer<Key> d_keys;
DoubleBuffer<Value> d_values;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(Key) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(Key) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(Value) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(Value) * num_items));
CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice));
// Start timer
gpu_timer.Start();
// Allocate temporary storage for sorting
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Do the sort
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
// Free unused buffers and sorting temporary storage
if (d_keys.d_buffers[d_keys.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector ^ 1]));
if (d_values.d_buffers[d_values.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector ^ 1]));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
// Start timer
gpu_rle_timer.Start();
// Allocate device arrays for enumerating non-trivial runs
int *d_offests_out = NULL;
int *d_lengths_out = NULL;
int *d_num_runs = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offests_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(int) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int) * 1));
// Allocate temporary storage for isolating non-trivial runs
d_temp_storage = NULL;
CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns(
d_temp_storage,
temp_storage_bytes,
d_keys.d_buffers[d_keys.selector],
d_offests_out,
d_lengths_out,
d_num_runs,
num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Do the isolation
CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns(
d_temp_storage,
temp_storage_bytes,
d_keys.d_buffers[d_keys.selector],
d_offests_out,
d_lengths_out,
d_num_runs,
num_items));
// Free keys buffer
if (d_keys.d_buffers[d_keys.selector]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector]));
//
// Hypothetically do stuff with the original key-indices corresponding to non-trivial runs of identical keys
//
// Stop sort timer
gpu_timer.Stop();
gpu_rle_timer.Stop();
if (i == 0)
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN
AssertEquals(0, compare);
}
else
{
elapsed_millis += gpu_timer.ElapsedMillis();
elapsed_rle_millis += gpu_rle_timer.ElapsedMillis();
}
// GPU cleanup
if (d_values.d_buffers[d_values.selector]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector]));
if (d_offests_out) CubDebugExit(g_allocator.DeviceFree(d_offests_out));
if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
}
// Host cleanup
if (h_keys) delete[] h_keys;
if (h_values) delete[] h_values;
if (h_offsets_reference) delete[] h_offsets_reference;