Alien-XGBoost

 view release on metacpan or  search on metacpan

MANIFEST  view on Meta::CPAN

xgboost/cub/cub/iterator/tex_ref_input_iterator.cuh
xgboost/cub/cub/iterator/transform_input_iterator.cuh
xgboost/cub/cub/thread/thread_load.cuh
xgboost/cub/cub/thread/thread_operators.cuh
xgboost/cub/cub/thread/thread_reduce.cuh
xgboost/cub/cub/thread/thread_scan.cuh
xgboost/cub/cub/thread/thread_search.cuh
xgboost/cub/cub/thread/thread_store.cuh
xgboost/cub/cub/util_allocator.cuh
xgboost/cub/cub/util_arch.cuh
xgboost/cub/cub/util_debug.cuh
xgboost/cub/cub/util_device.cuh
xgboost/cub/cub/util_macro.cuh
xgboost/cub/cub/util_namespace.cuh
xgboost/cub/cub/util_ptx.cuh
xgboost/cub/cub/util_type.cuh
xgboost/cub/cub/warp/specializations/warp_reduce_shfl.cuh
xgboost/cub/cub/warp/specializations/warp_reduce_smem.cuh
xgboost/cub/cub/warp/specializations/warp_scan_shfl.cuh
xgboost/cub/cub/warp/specializations/warp_scan_smem.cuh
xgboost/cub/cub/warp/warp_reduce.cuh

xgboost/R-package/configure  view on Meta::CPAN

fi
ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work"
ac_abs_confdir=`(
	cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg"
	pwd)`
# When building in place, set srcdir=.
if test "$ac_abs_confdir" = "$ac_pwd"; then
  srcdir=.
fi
# Remove unnecessary trailing slashes from srcdir.
# Double slashes in file names in object file debugging info
# mess up M-x gdb in Emacs.
case $srcdir in
*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;;
esac
for ac_var in $ac_precious_vars; do
  eval ac_env_${ac_var}_set=\${${ac_var}+set}
  eval ac_env_${ac_var}_value=\$${ac_var}
  eval ac_cv_env_${ac_var}_set=\${${ac_var}+set}
  eval ac_cv_env_${ac_var}_value=\$${ac_var}
done

xgboost/R-package/configure  view on Meta::CPAN

gives unlimited permission to copy, distribute and modify it.
_ACEOF
  exit
fi

## ------------------------ ##
## Autoconf initialization. ##
## ------------------------ ##
cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.

It was created by xgboost $as_me 0.6-3, which was
generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@

_ACEOF
exec 5>>config.log
{
cat <<_ASUNAME

xgboost/R-package/configure  view on Meta::CPAN

done
{ ac_configure_args0=; unset ac_configure_args0;}
{ ac_configure_args1=; unset ac_configure_args1;}

# When interrupted or exit'd, cleanup temporary files, and complete
# config.log.  We remove comments because anyway the quotes in there
# would cause problems or look ugly.
# WARNING: Use '\'' to represent an apostrophe within the trap.
# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug.
trap 'exit_status=$?
  # Save into config.log some information that might help in debugging.
  {
    echo

    $as_echo "## ---------------- ##
## Cache variables. ##
## ---------------- ##"
    echo
    # The following way of writing the cache mishandles newlines in values,
(
  for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do

xgboost/R-package/configure  view on Meta::CPAN

ac_write_fail=0
ac_clean_files_save=$ac_clean_files
ac_clean_files="$ac_clean_files $CONFIG_STATUS"
{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5
$as_echo "$as_me: creating $CONFIG_STATUS" >&6;}
as_write_fail=0
cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1
#! $SHELL
# Generated by $as_me.
# Run this file to recreate the current configuration.
# Compiler output produced by configure, useful for debugging
# configure, is in config.log if it exists.

debug=false
ac_cs_recheck=false
ac_cs_silent=false

SHELL=\${CONFIG_SHELL-$SHELL}
export SHELL
_ASEOF
cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1
## -------------------- ##
## M4sh Initialization. ##
## -------------------- ##

xgboost/R-package/configure  view on Meta::CPAN

from templates according to the current configuration.  Unless the files
and actions are specified as TAGs, all are instantiated by default.

Usage: $0 [OPTION]... [TAG]...

  -h, --help       print this help, then exit
  -V, --version    print version number and configuration settings, then exit
      --config     print configuration, then exit
  -q, --quiet, --silent
                   do not print progress messages
  -d, --debug      don't remove temporary files
      --recheck    update $as_me by reconfiguring in the same conditions
      --file=FILE[:TEMPLATE]
                   instantiate the configuration file FILE

Configuration files:
$config_files

Report bugs to the package provider."

_ACEOF

xgboost/R-package/configure  view on Meta::CPAN

  esac

  case $ac_option in
  # Handling of the options.
  -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
    ac_cs_recheck=: ;;
  --version | --versio | --versi | --vers | --ver | --ve | --v | -V )
    $as_echo "$ac_cs_version"; exit ;;
  --config | --confi | --conf | --con | --co | --c )
    $as_echo "$ac_cs_config"; exit ;;
  --debug | --debu | --deb | --de | --d | -d )
    debug=: ;;
  --file | --fil | --fi | --f )
    $ac_shift
    case $ac_optarg in
    *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
    '') as_fn_error $? "missing file argument" ;;
    esac
    as_fn_append CONFIG_FILES " '$ac_optarg'"
    ac_need_defaults=false;;
  --he | --h |  --help | --hel | -h )
    $as_echo "$ac_cs_usage"; exit ;;

xgboost/R-package/configure  view on Meta::CPAN

# then the envvar interface is used.  Set only those that are not.
# We use the long form for the default assignment because of an extremely
# bizarre bug on SunOS 4.1.3.
if $ac_need_defaults; then
  test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files
fi

# Have a temporary directory for convenience.  Make it in the build tree
# simply because there is no reason against having it here, and in addition,
# creating and moving files from /tmp can sometimes cause problems.
# Hook for its removal unless debugging.
# Note that there is a small window in which the directory will not be cleaned:
# after its creation but before its name has been assigned to `$tmp'.
$debug ||
{
  tmp= ac_tmp=
  trap 'exit_status=$?
  : "${ac_tmp:=$tmp}"
  { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status
' 0
  trap 'as_fn_exit 1' 1 2 13 15
}
# Create a (secure) tmp directory for tmp files.

xgboost/cmake/Utils.cmake  view on Meta::CPAN

      )
      foreach(variable ${variables})
          if(${variable} MATCHES "/MD")
              string(REGEX REPLACE "/MD" "/MT" ${variable} "${${variable}}")
              set(${variable} "${${variable}}"  PARENT_SCOPE)
          endif()
      endforeach()
  endif()
endfunction(msvc_use_static_runtime)

# Set output directory of target, ignoring debug or release
function(set_output_directory target dir)
	set_target_properties(${target} PROPERTIES 
		RUNTIME_OUTPUT_DIRECTORY ${dir}
		RUNTIME_OUTPUT_DIRECTORY_DEBUG ${dir}
		RUNTIME_OUTPUT_DIRECTORY_RELEASE ${dir}
		LIBRARY_OUTPUT_DIRECTORY ${dir} 
		LIBRARY_OUTPUT_DIRECTORY_DEBUG ${dir} 
		LIBRARY_OUTPUT_DIRECTORY_RELEASE ${dir} 
	)
endfunction(set_output_directory)

xgboost/cub/CHANGE_LOG.TXT  view on Meta::CPAN

      different from output sequence types (as long as they are coercible)
      value") for seeding the computation with an arbitrary prefix
    - Reduce repository size (move doxygen binary to doc repository)
    - Minor reductions in block-scan instruction count
    - Bug fixes: 
        - Issue #55: warning in cub/device/dispatch/dispatch_reduce_by_key.cuh 
        - Issue #59: cub::DeviceScan::ExclusiveSum can't prefix sum of float into double
        - Issue #58: Infinite loop in cub::CachingDeviceAllocator::NearestPowerOf
        - Issue #47: Caching allocator needs to clean up cuda error upon successful retry 
        - Issue #46: Very high amount of needed memory from the cub::DeviceHistogram::HistogramEven routine
        - Issue #45: Caching Device Allocator fails with debug output enabled
        - Fix for generic-type reduce-by-key warpscan (sm3.x and newer)

//-----------------------------------------------------------------------------

1.5.2    03/21/2016
	- Improved medium-size scan performance for sm5x (Maxwell)
    - Refactored caching allocator for device memory
   		- Spends less time locked
		- Failure to allocate a block from the runtime will retry once after
		  freeing cached allocations

xgboost/cub/CHANGE_LOG.TXT  view on Meta::CPAN

              allocated dynamically by the host at the kernel call site.   
            - Most CUB programs having typical 1D usage should not require any 
              changes to accomodate these updates.
        - Added new "combination" WarpScan methods for efficiently computing 
          both inclusive and exclusive prefix scans (and sums).
    - Bug fixes: 
        - Fixed bug in cub::WarpScan (which affected cub::BlockScan and 
          cub::DeviceScan) where incorrect results (e.g., NAN) would often be 
          returned when parameterized for floating-point types (fp32, fp64).
        - Workaround-fix for ptxas error when compiling with with -G flag on Linux 
          (for debug instrumentation) 
        - Misc. workaround-fixes for certain scan scenarios (using custom 
          scan operators) where code compiled for SM1x is run on newer 
          GPUs of higher compute-capability: the compiler could not tell
          which memory space was being used collective operations and was 
          mistakenly using global ops instead of shared ops. 

//-----------------------------------------------------------------------------

1.2.3    04/01/2014
    - Bug fixes: 

xgboost/cub/CHANGE_LOG.TXT  view on Meta::CPAN

        - Added descending sorting to DeviceRadixSort and BlockRadixSort
        - Added min, max, arg-min, and arg-max to DeviceReduce
        - Added DeviceSelect (select-unique, select-if, and select-flagged)
        - Added DevicePartition (partition-if, partition-flagged)
        - Added generic cub::ShuffleUp(), cub::ShuffleDown(), and cub::ShuffleIndex() for warp-wide communication of arbitrary data types (SM3x+)
        - Added cub::MaxSmOccupancy() for accurately determining SM occupancy for any given kernel function pointer
    - Performance
        - Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
    - Interface changes:
        - Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces.  The CacheModifiedInputIteratorTand CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
        - Rename device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
    - Documentation and testing:
        - Added simple examples of device-wide methods
        - Improved doxygen documentation and example snippets
        - Improved test coverege to include up to 21,000 kernel variants and 851,000 unit tests (per architecture, per platform)
    - Bug fixes 
        - Fixed misc DeviceScan, BlockScan, DeviceReduce, and BlockReduce bugs when operating on non-primitive types for older architectures SM10-SM13
        - Fixed DeviceScan / WarpReduction bug: SHFL-based segmented reduction producting incorrect results for multi-word types (size > 4B) on Linux 
        - Fixed BlockScan bug: For warpscan-based scans, not all threads in the first warp were entering the prefix callback functor
        - Fixed DeviceRadixSort bug: race condition with key-value pairs for pre-SM35 architectures
        - Fixed DeviceRadixSort bug: incorrect bitfield-extract behavior with long keys on 64bit Linux

xgboost/cub/common.mk  view on Meta::CPAN

ifeq ($(verbose), 1)
	NVCCFLAGS += -v
endif


# [keep=<0|1>] Keep intermediate compilation artifacts option
ifeq ($(keep), 1)
	NVCCFLAGS += -keep
endif

# [debug=<0|1>] Generate debug mode code
ifeq ($(debug), 1)
	NVCCFLAGS += -G
endif


#-------------------------------------------------------------------------------
# Compiler and compilation platform
#-------------------------------------------------------------------------------

CUB_DIR = $(dir $(lastword $(MAKEFILE_LIST)))

xgboost/cub/cub/cub.cuh  view on Meta::CPAN

#include "iterator/cache_modified_output_iterator.cuh"
#include "iterator/constant_input_iterator.cuh"
#include "iterator/counting_input_iterator.cuh"
#include "iterator/tex_obj_input_iterator.cuh"
#include "iterator/tex_ref_input_iterator.cuh"
#include "iterator/transform_input_iterator.cuh"

// Util
#include "util_allocator.cuh"
#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_device.cuh"
#include "util_macro.cuh"
#include "util_ptx.cuh"
#include "util_type.cuh"

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

    static cudaError_t HistogramEven(
        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
        OffsetT             num_samples,                                ///< [in] The number of input samples (i.e., the length of \p d_samples)
        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        /// The sample value type of the input iterator
        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

        CounterT*           d_histogram1[1]     = {d_histogram};
        int                 num_levels1[1]      = {num_levels};
        LevelT              lower_level1[1]     = {lower_level};
        LevelT              upper_level1[1]     = {upper_level};

        return MultiHistogramEven<1, 1>(

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

            temp_storage_bytes,
            d_samples,
            d_histogram1,
            num_levels1,
            lower_level1,
            upper_level1,
            num_samples,
            1,
            sizeof(SampleT) * num_samples,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
     *
     * \par
     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
     * - The row stride must be a whole multiple of the sample data type

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
        OffsetT             num_row_samples,                            ///< [in] The number of data samples per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        CounterT*           d_histogram1[1]     = {d_histogram};
        int                 num_levels1[1]      = {num_levels};
        LevelT              lower_level1[1]     = {lower_level};
        LevelT              upper_level1[1]     = {upper_level};

        return MultiHistogramEven<1, 1>(
            d_temp_storage,
            temp_storage_bytes,
            d_samples,
            d_histogram1,
            num_levels1,
            lower_level1,
            upper_level1,
            num_row_samples,
            num_rows,
            row_stride_bytes,
            stream,
            debug_synchronous);
    }

    /**
     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
     *
     * \par
     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

    static cudaError_t MultiHistogramEven(
        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels wher...
        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> s...
        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num...
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_pixels,                                 ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        /// The sample value type of the input iterator
        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

        return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
            d_temp_storage,
            temp_storage_bytes,
            d_samples,
            d_histogram,
            num_levels,
            lower_level,
            upper_level,
            num_pixels,
            1,
            sizeof(SampleT) * NUM_CHANNELS * num_pixels,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
     *
     * \par
     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels wher...
        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> s...
        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num...
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        /// The sample value type of the input iterator
        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
        Int2Type<sizeof(SampleT) == 1> is_byte_sample;

        if ((sizeof(OffsetT) > sizeof(int)) &&
            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
        {
            // Down-convert OffsetT data type


            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven(
                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
                stream, debug_synchronous, is_byte_sample);
        }

        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven(
            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
            stream, debug_synchronous, is_byte_sample);
    }


    //@}  end member group
    /******************************************************************//**
     * \name Custom bin ranges
     *********************************************************************/
    //@{

    /**

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t HistogramRange(
        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample valu...
        OffsetT             num_samples,                            ///< [in] The number of data samples per row in the region of interest
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        /// The sample value type of the input iterator
        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

        CounterT*           d_histogram1[1] = {d_histogram};
        int                 num_levels1[1]  = {num_levels};
        LevelT*             d_levels1[1]    = {d_levels};

        return MultiHistogramRange<1, 1>(
            d_temp_storage,
            temp_storage_bytes,
            d_samples,
            d_histogram1,
            num_levels1,
            d_levels1,
            num_samples,
            1,
            sizeof(SampleT) * num_samples,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
     *
     * \par
     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
     * - The row stride must be a whole multiple of the sample data type

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample valu...
        OffsetT             num_row_samples,                        ///< [in] The number of data samples per row in the region of interest
        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        CounterT*           d_histogram1[1]     = {d_histogram};
        int                 num_levels1[1]      = {num_levels};
        LevelT*             d_levels1[1]        = {d_levels};

        return MultiHistogramRange<1, 1>(
            d_temp_storage,
            temp_storage_bytes,
            d_samples,
            d_histogram1,
            num_levels1,
            d_levels1,
            num_row_samples,
            num_rows,
            row_stride_bytes,
            stream,
            debug_synchronous);
    }

    /**
     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
     *
     * \par
     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t MultiHistogramRange(
        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> shoul...
        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
        OffsetT             num_pixels,                             ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        /// The sample value type of the input iterator
        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

        return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
            d_temp_storage,
            temp_storage_bytes,
            d_samples,
            d_histogram,
            num_levels,
            d_levels,
            num_pixels,
            1,
            sizeof(SampleT) * NUM_CHANNELS * num_pixels,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
     *
     * \par
     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms

xgboost/cub/cub/device/device_histogram.cuh  view on Meta::CPAN

        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> shoul...
        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
        OffsetT             num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        /// The sample value type of the input iterator
        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
        Int2Type<sizeof(SampleT) == 1> is_byte_sample;

        if ((sizeof(OffsetT) > sizeof(int)) &&
            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
        {
            // Down-convert OffsetT data type
            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
                stream, debug_synchronous, is_byte_sample);
        }

        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
            stream, debug_synchronous, is_byte_sample);
    }



    //@}  end member group
};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

xgboost/cub/cub/device/device_partition.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int                         num_items,                      ///< [in] Total number of items to select from
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType                SelectOp;       // Selection op (not used)
        typedef NullType                EqualityOp;     // Equality operator (not used)

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png)
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original
     *   relative ordering, however copies of the unselected items are compacted into the
     *   rear of \p d_out in reverse order.

xgboost/cub/cub/device/device_partition.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t If(
        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int                         num_items,                      ///< [in] Total number of items to select from
        SelectOp                    select_op,                      ///< [in] Unary selection operator
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
        typedef NullType                EqualityOp;     // Equality operator (not used)

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            select_op,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }

};

/**
 * \example example_device_partition_flagged.cu
 * \example example_device_partition_if.cu
 */

}               // CUB namespace

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
        int                 num_items,                              ///< [in] Number of items to sort
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);

        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            false,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
     *
     * \par
     * - The sorting operation is given a pair of key buffers and a corresponding
     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
     *   structure that indicates which of the two buffers is "current" (and thus

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t SortPairs(
        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
        int                     num_items,                              ///< [in] Number of items to sort
        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            true,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
     *
     * \par
     * - The contents of the input data are not altered by the sorting operation
     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
        int                 num_items,                              ///< [in] Number of items to sort
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);

        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            false,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
     *
     * \par
     * - The sorting operation is given a pair of key buffers and a corresponding
     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
     *   structure that indicates which of the two buffers is "current" (and thus

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t SortPairsDescending(
        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
        int                     num_items,                              ///< [in] Number of items to sort
        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            true,
            stream,
            debug_synchronous);
    }


    //@}  end member group
    /******************************************************************//**
     * \name Keys-only
     *********************************************************************/
    //@{


xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t SortKeys(
        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
        int                 num_items,                              ///< [in] Number of items to sort
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // Null value type
        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<NullType>  d_values;

        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            false,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts keys into ascending order. (~<em>N </em>auxiliary storage required).
     *
     * \par
     * - The sorting operation is given a pair of key buffers managed by a
     *   DoubleBuffer structure that indicates which of the two buffers is
     *   "current" (and thus contains the input data to be sorted).

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

    template <typename KeyT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t SortKeys(
        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted out...
        int                 num_items,                              ///< [in] Number of items to sort
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // Null value type
        DoubleBuffer<NullType> d_values;

        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            true,
            stream,
            debug_synchronous);
    }

    /**
     * \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required).
     *
     * \par
     * - The contents of the input data are not altered by the sorting operation
     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
     * - \devicestorage

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t SortKeysDescending(
        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
        int                 num_items,                              ///< [in] Number of items to sort
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<NullType>  d_values;

        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            false,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts keys into descending order. (~<em>N </em>auxiliary storage required).
     *
     * \par
     * - The sorting operation is given a pair of key buffers managed by a
     *   DoubleBuffer structure that indicates which of the two buffers is
     *   "current" (and thus contains the input data to be sorted).

xgboost/cub/cub/device/device_radix_sort.cuh  view on Meta::CPAN

    template <typename KeyT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t SortKeysDescending(
        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted out...
        int                 num_items,                              ///< [in] Number of items to sort
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // Null value type
        DoubleBuffer<NullType> d_values;

        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            begin_bit,
            end_bit,
            true,
            stream,
            debug_synchronous);
    }


    //@}  end member group


};

/**
 * \example example_device_radix_sort.cu

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t Reduce(
        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        ReductionOpT                reduction_op,                       ///< [in] Binary reduction functor
        T                           init,                               ///< [in] Initial value of the reduction
        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_items,
            reduction_op,
            init,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes a device-wide sum using the addition (\p +) operator.
     *
     * \par
     * - Uses \p 0 as the initial value of the reduction.
     * - Does not support \p + operators that are non-commutative..
     * - \devicestorage

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

        typename                    InputIteratorT,
        typename                    OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Sum(
        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The output value type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_items,
            cub::Sum(),
            OutputT(),            // zero-initialize
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes a device-wide minimum using the less-than ('<') operator.
     *
     * \par
     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction.
     * - Does not support \p < operators that are non-commutative.
     * - \devicestorage

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

        typename                    InputIteratorT,
        typename                    OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Min(
        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input value type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_items,
            cub::Min(),
            Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
            stream,
            debug_synchronous);
    }


    /**
     * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
     *
     * \par
     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
     *   - The minimum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

        typename                    InputIteratorT,
        typename                    OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t ArgMin(
        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;

        // The output tuple type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN


        return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_indexed_in,
            d_out,
            num_items,
            cub::ArgMin(),
            initial_value,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes a device-wide maximum using the greater-than ('>') operator.
     *
     * \par
     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
     * - Does not support \p > operators that are non-commutative.
     * - \devicestorage

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

        typename                    InputIteratorT,
        typename                    OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Max(
        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input value type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_items,
            cub::Max(),
            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
            stream,
            debug_synchronous);
    }


    /**
     * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item
     *
     * \par
     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
     *   - The maximum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

        typename                    InputIteratorT,
        typename                    OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t ArgMax(
        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;

        // The output tuple type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN


        return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_indexed_in,
            d_out,
            num_items,
            cub::ArgMax(),
            initial_value,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
     *
     * \par
     * This operation computes segmented reductions within \p d_values_in using
     * the specified binary \p reduction_op functor.  The segments are identified by
     * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

        void                        *d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        KeysInputIteratorT          d_keys_in,                      ///< [in] Pointer to the input sequence of keys
        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
        ValuesInputIteratorT        d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
        ReductionOpT                reduction_op,                   ///< [in] Binary reduction functor
        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // FlagT iterator type (not used)

        // Selection op (not used)

        // Default == operator
        typedef Equality EqualityOp;

xgboost/cub/cub/device/device_reduce.cuh  view on Meta::CPAN

            temp_storage_bytes,
            d_keys_in,
            d_unique_out,
            d_values_in,
            d_aggregates_out,
            d_num_runs_out,
            EqualityOp(),
            reduction_op,
            num_items,
            stream,
            debug_synchronous);
    }

};

/**
 * \example example_device_reduce.cu
 */

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

xgboost/cub/cub/device/device_run_length_encode.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Encode(
        void*                       d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of keys
        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
        LengthsOutputIteratorT      d_counts_out,                   ///< [out] Pointer to the output sequence of run-lengths (one count per run)
        NumRunsOutputIteratorT      d_num_runs_out,                     ///< [out] Pointer to total number of runs
        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int         OffsetT;                    // Signed integer type for global offsets
        typedef NullType*   FlagIterator;               // FlagT iterator type (not used)
        typedef NullType    SelectOp;                   // Selection op (not used)
        typedef Equality    EqualityOp;                 // Default == operator
        typedef cub::Sum    ReductionOp;                // Value reduction operator

        // The lengths output value type
        typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
            OffsetT,                                                                                                    // ... then the OffsetT type,

xgboost/cub/cub/device/device_run_length_encode.cuh  view on Meta::CPAN

            temp_storage_bytes,
            d_in,
            d_unique_out,
            LengthsInputIteratorT((LengthT) 1),
            d_counts_out,
            d_num_runs_out,
            EqualityOp(),
            ReductionOp(),
            num_items,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in.
     *
     * \par
     * - For the <em>i</em><sup>th</sup> non-trivial run, the run's starting offset
     *   and its length are written to <tt>d_offsets_out[<em>i</em>]</tt> and
     *   <tt>d_lengths_out[<em>i</em>]</tt>, respectively.

xgboost/cub/cub/device/device_run_length_encode.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t NonTrivialRuns(
        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT          d_in,                           ///< [in] Pointer to input sequence of data items
        OffsetsOutputIteratorT  d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run)
        LengthsOutputIteratorT  d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run)
        NumRunsOutputIteratorT  d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
        int                     num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
        cudaStream_t            stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int         OffsetT;                    // Signed integer type for global offsets
        typedef Equality    EqualityOp;                 // Default == operator

        return DeviceRleDispatch<InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_offsets_out,
            d_lengths_out,
            d_num_runs_out,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }


};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)


xgboost/cub/cub/device/device_scan.cuh  view on Meta::CPAN

        typename        InputIteratorT,
        typename        OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t ExclusiveSum(
        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The output value type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

        // Initial value

xgboost/cub/cub/device/device_scan.cuh  view on Meta::CPAN


        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, OutputT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            Sum(),
            init_value,
            num_items,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.  The \p init_value value is applied as the initial value, and is assigned to *d_out.
     *
     * \par
     * - Supports non-commutative scan operators.
     * - \devicestorage
     *

xgboost/cub/cub/device/device_scan.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t ExclusiveScan(
        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
        ScanOpT         scan_op,                            ///< [in] Binary scan functor
        InitValueT      init_value,                         ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out)
        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            scan_op,
            init_value,
            num_items,
            stream,
            debug_synchronous);
    }


    //@}  end member group
    /******************************************************************//**
     * \name Inclusive scans
     *********************************************************************/
    //@{


xgboost/cub/cub/device/device_scan.cuh  view on Meta::CPAN

        typename            InputIteratorT,
        typename            OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t InclusiveSum(
        void*               d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                           ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                          ///< [out] Pointer to the output sequence of data items
        int                 num_items,                      ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t        stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            Sum(),
            NullType(),
            num_items,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
     *
     * \par
     * - Supports non-commutative scan operators.
     * - \devicestorage
     *

xgboost/cub/cub/device/device_scan.cuh  view on Meta::CPAN

        typename        ScanOpT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t InclusiveScan(
        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
        ScanOpT         scan_op,                            ///< [in] Binary scan functor
        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t    stream             = 0,             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool            debug_synchronous  = false)         ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            scan_op,
            NullType(),
            num_items,
            stream,
            debug_synchronous);
    }

    //@}  end member group

};

/**
 * \example example_device_scan.cu
 */

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int           *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
        const int           *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);

        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            false,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts segments of key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
     *
     * \par
     * - The sorting operation is given a pair of key buffers and a corresponding
     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
     *   structure that indicates which of the two buffers is "current" (and thus

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int               *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><...
        const int               *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>...
        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            true,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts segments of key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
     *
     * \par
     * - The contents of the input data are not altered by the sorting operation
     * - When input a contiguous sequence of segments, a single sequence
     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int           *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
        const int           *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);

        return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            false,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts segments of key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
     *
     * \par
     * - The sorting operation is given a pair of key buffers and a corresponding
     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
     *   structure that indicates which of the two buffers is "current" (and thus

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted...
        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output value...
        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int               *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><...
        const int               *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>...
        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the consol...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            true,
            stream,
            debug_synchronous);
    }


    //@}  end member group
    /******************************************************************//**
     * \name Keys-only
     *********************************************************************/
    //@{


xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int           *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
        const int           *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // Null value type
        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<NullType>  d_values;

        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            false,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts segments of keys into ascending order. (~<em>N </em>auxiliary storage required).
     *
     * \par
     * - The sorting operation is given a pair of key buffers managed by a
     *   DoubleBuffer structure that indicates which of the two buffers is
     *   "current" (and thus contains the input data to be sorted).

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted out...
        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int           *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
        const int           *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // Null value type
        DoubleBuffer<NullType> d_values;

        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            true,
            stream,
            debug_synchronous);
    }

    /**
     * \brief Sorts segments of keys into descending order. (~<em>2N</em> auxiliary storage required).
     *
     * \par
     * - The contents of the input data are not altered by the sorting operation
     * - When input a contiguous sequence of segments, a single sequence
     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int           *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
        const int           *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
        DoubleBuffer<NullType>  d_values;

        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            false,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Sorts segments of keys into descending order. (~<em>N </em>auxiliary storage required).
     *
     * \par
     * - The sorting operation is given a pair of key buffers managed by a
     *   DoubleBuffer structure that indicates which of the two buffers is
     *   "current" (and thus contains the input data to be sorted).

xgboost/cub/cub/device/device_segmented_radix_sort.cuh  view on Meta::CPAN

        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted out...
        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
        const int           *d_begin_offsets,                       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>...
        const int           *d_end_offsets,                         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</...
        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // Null value type
        DoubleBuffer<NullType> d_values;

        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            begin_bit,
            end_bit,
            true,
            stream,
            debug_synchronous);
    }


    //@}  end member group


};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
        int                 *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</...
        int                 *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>...
        ReductionOp         reduction_op,                       ///< [in] Binary reduction functor 
        T                   initial_value,                               ///< [in] Initial value of the reduction for each segment
        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Defa...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOp>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            reduction_op,
            initial_value,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes a device-wide segmented sum using the addition ('+') operator.
     *
     * \par
     * - Uses \p 0 as the initial value of the reduction for each segment.
     * - When input a contiguous sequence of segments, a single sequence
     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t Sum(
        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
        int                 *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</...
        int                 *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>...
        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Defa...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The output value type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            cub::Sum(),
            OutputT(),            // zero-initialize
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes a device-wide segmented minimum using the less-than ('<') operator.
     *
     * \par
     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction for each segment.
     * - When input a contiguous sequence of segments, a single sequence
     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t Min(
        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
        int                 *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</...
        int                 *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>...
        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Defa...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input value type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            cub::Min(),
            Traits<InputT>::Max(),    // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
            stream,
            debug_synchronous);
    }


    /**
     * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item.
     *
     * \par
     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
     *   - The minimum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t ArgMin(
        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
        int                 *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</...
        int                 *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>...
        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Defa...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;

        // The output tuple type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

            d_temp_storage,
            temp_storage_bytes,
            d_indexed_in,
            d_out,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            cub::ArgMin(),
            initial_value,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator.
     *
     * \par
     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
     * - When input a contiguous sequence of segments, a single sequence
     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t Max(
        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
        int                 *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</...
        int                 *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>...
        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Defa...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input value type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            cub::Max(),
            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
            stream,
            debug_synchronous);
    }


    /**
     * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item
     *
     * \par
     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
     *   - The maximum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION
    static cudaError_t ArgMax(
        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
        int                 *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</...
        int                 *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup>...
        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Defa...
    {
        // Signed integer type for global offsets
        typedef int OffsetT;

        // The input type
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;

        // The output tuple type
        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT

xgboost/cub/cub/device/device_segmented_reduce.cuh  view on Meta::CPAN

            d_temp_storage,
            temp_storage_bytes,
            d_indexed_in,
            d_out,
            num_segments,
            d_begin_offsets,
            d_end_offsets,
            cub::ArgMax(),
            initial_value,
            stream,
            debug_synchronous);
    }

};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)


xgboost/cub/cub/device/device_select.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType                SelectOp;       // Selection op (not used)
        typedef NullType                EqualityOp;     // Equality operator (not used)

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png)
     *
     * \par
     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
     * - \devicestorage
     *

xgboost/cub/cub/device/device_select.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t If(
        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
        SelectOp                    select_op,                      ///< [in] Unary selection operator
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
        typedef NullType                EqualityOp;     // Equality operator (not used)

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            select_op,
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }


    /**
     * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.p...
     *
     * \par
     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
     * - \devicestorage

xgboost/cub/cub/device/device_select.cuh  view on Meta::CPAN

        typename                    NumSelectedIteratorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Unique(
        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
        NumSelectedIteratorT         d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        typedef int                     OffsetT;         // Signed integer type for global offsets
        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
        typedef NullType                SelectOp;       // Selection op (not used)
        typedef Equality                EqualityOp;     // Default == operator

        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            NULL,
            d_out,
            d_num_selected_out,
            SelectOp(),
            EqualityOp(),
            num_items,
            stream,
            debug_synchronous);
    }

};

/**
 * \example example_device_select_flagged.cu
 * \example example_device_select_if.cu
 * \example example_device_select_unique.cu
 */

xgboost/cub/cub/device/device_spmv.cuh  view on Meta::CPAN

        size_t&             temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        ValueT*             d_values,                           ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
        int*                d_row_offsets,                      ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
        int*                d_column_indices,                   ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
        ValueT*             d_vector_x,                         ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
        ValueT*             d_vector_y,                         ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
        int                 num_rows,                           ///< [in] number of rows of matrix <b>A</b>.
        int                 num_cols,                           ///< [in] number of columns of matrix <b>A</b>.
        int                 num_nonzeros,                       ///< [in] number of nonzero elements of matrix <b>A</b>.
        cudaStream_t        stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        SpmvParams<ValueT, int> spmv_params;
        spmv_params.d_values             = d_values;
        spmv_params.d_row_end_offsets    = d_row_offsets + 1;
        spmv_params.d_column_indices     = d_column_indices;
        spmv_params.d_vector_x           = d_vector_x;
        spmv_params.d_vector_y           = d_vector_y;
        spmv_params.num_rows             = num_rows;
        spmv_params.num_cols             = num_cols;
        spmv_params.num_nonzeros         = num_nonzeros;
        spmv_params.alpha                = 1.0;
        spmv_params.beta                 = 0.0;

        return DispatchSpmv<ValueT, int>::Dispatch(
            d_temp_storage,
            temp_storage_bytes,
            spmv_params,
            stream,
            debug_synchronous);
    }

    //@}  end member group
};



}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

 * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
 */

#pragma once

#include <stdio.h>
#include <iterator>
#include <limits>

#include "../../agent/agent_histogram.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

        int                                 num_output_levels[NUM_ACTIVE_CHANNELS],         ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i<...
        OutputDecodeOpT                     output_decode_op[NUM_ACTIVE_CHANNELS],          ///< [in] Transform operators for determining bin-ids from samples, one for each channel
        int                                 max_num_output_bins,                            ///< [in] Maximum number of output bins in any channel
        OffsetT                             num_row_pixels,                                 ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT                             num_rows,                                       ///< [in] The number of rows in the region of interest
        OffsetT                             row_stride_samples,                             ///< [in] The number of samples between starts of consecutive rows in the region of interest
        DeviceHistogramInitKernelT          histogram_init_kernel,                          ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
        DeviceHistogramSweepKernelT         histogram_sweep_kernel,                         ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
        KernelConfig                        histogram_sweep_config,                         ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
        cudaStream_t                        stream,                                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                                debug_synchronous)                              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
    #ifndef CUB_RUNTIME_ENABLED

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

    #else

        cudaError error = cudaSuccess;
        do

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN


            // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
                num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;

            int histogram_init_block_threads    = 256;
            int histogram_init_grid_dims        = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;

            // Log DeviceHistogramInitKernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
                histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);

            // Invoke histogram_init_kernel
            histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
                num_output_bins_wrapper,
                d_output_histograms_wrapper,
                tile_queue);

            // Return if empty problem
            if ((blocks_per_row == 0) || (blocks_per_col == 0))
                break;

            // Log histogram_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
                sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
                histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);

            // Invoke histogram_sweep_kernel
            histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
                d_samples,
                num_output_bins_wrapper,
                num_privatized_bins_wrapper,
                d_output_histograms_wrapper,
                d_privatized_histograms_wrapper,

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                num_row_pixels,
                num_rows,
                row_stride_samples,
                tiles_per_row,
                tile_queue);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

        }
        while (0);

        return error;

    #endif // CUB_RUNTIME_ENABLED
    }


xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels wher...
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> s...
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num...
        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries ...
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
            else
            {
                // Dispatch shared-privatized approach
                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }

        } while (0);

        return error;
    }


    /**
     * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels wher...
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> ...
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num...
        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries ...
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                num_output_levels,
                output_decode_op,
                max_num_output_bins,
                num_row_pixels,
                num_rows,
                row_stride_samples,
                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                histogram_sweep_config,
                stream,
                debug_synchronous))) break;

        } while (0);

        return error;
    }


    /**
     * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
     */

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel c...
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> s...
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>nu...
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
            else
            {
                // Dispatch shared-privatized approach
                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;

                if (CubDebug(error = PrivatizedDispatch(
                    d_temp_storage,
                    temp_storage_bytes,
                    d_samples,

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                    num_output_levels,
                    output_decode_op,
                    max_num_output_bins,
                    num_row_pixels,
                    num_rows,
                    row_stride_samples,
                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                    histogram_sweep_config,
                    stream,
                    debug_synchronous))) break;
            }
        }
        while (0);

        return error;
    }


    /**
     * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel c...
        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> s...
        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>nu...
        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else

xgboost/cub/cub/device/dispatch/dispatch_histogram.cuh  view on Meta::CPAN

                num_output_levels,
                output_decode_op,
                max_num_output_bins,
                num_row_pixels,
                num_rows,
                row_stride_samples,
                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
                histogram_sweep_config,
                stream,
                debug_synchronous))) break;

        }
        while (0);

        return error;
    }

};


xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN


#include <stdio.h>
#include <iterator>

#include "../../agent/agent_radix_sort_upsweep.cuh"
#include "../../agent/agent_radix_sort_downsweep.cuh"
#include "../../agent/agent_scan.cuh"
#include "../../block/block_radix_sort.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

    //------------------------------------------------------------------------------

    void                    *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
    OffsetT                 num_items;              ///< [in] Number of items to sort
    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    int                     ptx_version;            ///< [in] PTX version
    bool                    is_overwrite_okay;      ///< [in] Whether is okay to overwrite source buffers


    //------------------------------------------------------------------------------
    // Constructor
    //------------------------------------------------------------------------------

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchRadixSort(
        void*                   d_temp_storage,
        size_t                  &temp_storage_bytes,
        DoubleBuffer<KeyT>      &d_keys,
        DoubleBuffer<ValueT>    &d_values,
        OffsetT                 num_items,
        int                     begin_bit,
        int                     end_bit,
        bool                    is_overwrite_okay,
        cudaStream_t            stream,
        bool                    debug_synchronous,
        int                     ptx_version)
    :
        d_temp_storage(d_temp_storage),
        temp_storage_bytes(temp_storage_bytes),
        d_keys(d_keys),
        d_values(d_values),
        num_items(num_items),
        begin_bit(begin_bit),
        end_bit(end_bit),
        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version),
        is_overwrite_okay(is_overwrite_okay)
    {}


    //------------------------------------------------------------------------------
    // Small-problem (single tile) invocation
    //------------------------------------------------------------------------------

    /// Invoke a single block to sort in-core

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

            {
                temp_storage_bytes = 1;
                break;
            }

            // Return if empty problem
            if (num_items == 0)
                break;

            // Log single_tile_kernel configuration
            if (debug_synchronous)
                _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                    1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
                    ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);

            // Invoke upsweep_kernel with same grid size as downsweep_kernel
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_keys.Current(),
                d_keys.Alternate(),
                d_values.Current(),
                d_values.Alternate(),
                num_items,
                begin_bit,
                end_bit);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Update selector
            d_keys.selector ^= 1;
            d_values.selector ^= 1;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

        int             spine_length,
        int             &current_bit,
        PassConfigT     &pass_config)
    {
        cudaError error = cudaSuccess;
        do
        {
            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));

            // Log upsweep_kernel configuration
            if (debug_synchronous)
                _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
                pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);

            // Invoke upsweep_kernel with same grid size as downsweep_kernel
            pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
                d_keys_in,
                d_spine,
                num_items,
                current_bit,
                pass_bits,
                pass_config.even_share);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log scan_kernel configuration
            if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
                1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);

            // Invoke scan_kernel
            pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
                d_spine,
                spine_length);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log downsweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
                pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);

            // Invoke downsweep_kernel
            pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
                d_keys_in,
                d_keys_out,
                d_values_in,
                d_values_out,
                d_spine,
                num_items,
                current_bit,
                pass_bits,
                pass_config.even_share);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Update current bit
            current_bit += pass_bits;
        }
        while (0);

        return error;
    }


xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

    static cudaError_t Dispatch(
        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
        OffsetT                 num_items,              ///< [in] Number of items to sort
        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;

        cudaError_t error;
        do {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchRadixSort dispatch(
                d_temp_storage, temp_storage_bytes,
                d_keys, d_values,
                num_items, begin_bit, end_bit, is_overwrite_okay,
                stream, debug_synchronous, ptx_version);

            // Dispatch to chained policy
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;

        } while (0);

        return error;
    }
};

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
    OffsetT                 num_items;              ///< [in] Number of items to sort
    OffsetT                 num_segments;           ///< [in] The number of segments that comprise the sorting data
    const OffsetT           *d_begin_offsets;       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data se...
    const OffsetT           *d_end_offsets;         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segmen...
    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    int                     ptx_version;            ///< [in] PTX version
    bool                    is_overwrite_okay;      ///< [in] Whether is okay to overwrite source buffers


    //------------------------------------------------------------------------------
    // Constructors
    //------------------------------------------------------------------------------

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

        DoubleBuffer<KeyT>      &d_keys,
        DoubleBuffer<ValueT>    &d_values,
        OffsetT                 num_items,
        OffsetT                 num_segments,
        const OffsetT           *d_begin_offsets,
        const OffsetT           *d_end_offsets,
        int                     begin_bit,
        int                     end_bit,
        bool                    is_overwrite_okay,
        cudaStream_t            stream,
        bool                    debug_synchronous,
        int                     ptx_version)
    :
        d_temp_storage(d_temp_storage),
        temp_storage_bytes(temp_storage_bytes),
        d_keys(d_keys),
        d_values(d_values),
        num_items(num_items),
        num_segments(num_segments),
        d_begin_offsets(d_begin_offsets),
        d_end_offsets(d_end_offsets),
        begin_bit(begin_bit),
        end_bit(end_bit),
        is_overwrite_okay(is_overwrite_okay),
        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version)
    {}


    //------------------------------------------------------------------------------
    // Multi-segment invocation
    //------------------------------------------------------------------------------

    /// Invoke a three-kernel sorting pass at the current bit.
    template <typename PassConfigT>

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

        ValueT          *d_values_out,
        int             &current_bit,
        PassConfigT     &pass_config)
    {
        cudaError error = cudaSuccess;
        do
        {
            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));

            // Log kernel configuration
            if (debug_synchronous)
                _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                    num_segments, pass_config.segmented_config.block_threads, (long long) stream,
                pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);

            pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
                d_keys_in, d_keys_out,
                d_values_in,  d_values_out,
                d_begin_offsets, d_end_offsets, num_segments,
                current_bit, pass_bits);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Update current bit
            current_bit += pass_bits;
        }
        while (0);

        return error;
    }


xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
        int                     num_items,              ///< [in] Number of items to sort
        int                     num_segments,           ///< [in] The number of segments that comprise the sorting data
        const int               *d_begin_offsets,       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> dat...
        const int               *d_end_offsets,         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data se...
        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;

        cudaError_t error;
        do {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchSegmentedRadixSort dispatch(
                d_temp_storage, temp_storage_bytes,
                d_keys, d_values,
                num_items, num_segments, d_begin_offsets, d_end_offsets,
                begin_bit, end_bit, is_overwrite_okay,
                stream, debug_synchronous, ptx_version);

            // Dispatch to chained policy
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;

        } while (0);

        return error;
    }
};

xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh  view on Meta::CPAN


#include <stdio.h>
#include <iterator>

#include "../../agent/agent_reduce.cuh"
#include "../../iterator/arg_index_input_iterator.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../iterator/arg_index_input_iterator.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************

xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh  view on Meta::CPAN

    //------------------------------------------------------------------------------

    void                *d_temp_storage;                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t              &temp_storage_bytes;            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
    InputIteratorT      d_in;                           ///< [in] Pointer to the input sequence of data items
    OutputIteratorT     d_out;                          ///< [out] Pointer to the output aggregate
    OffsetT             num_items;                      ///< [in] Total number of input items (i.e., length of \p d_in)
    ReductionOpT        reduction_op;                   ///< [in] Binary reduction functor 
    OutputT             init;                           ///< [in] The initial value of the reduction
    cudaStream_t        stream;                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
    bool                debug_synchronous;              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    int                 ptx_version;                    ///< [in] PTX version

    //------------------------------------------------------------------------------
    // Constructor
    //------------------------------------------------------------------------------

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchReduce(
        void*                   d_temp_storage,
        size_t                  &temp_storage_bytes,
        InputIteratorT          d_in,
        OutputIteratorT         d_out,
        OffsetT                 num_items,
        ReductionOpT            reduction_op,
        OutputT                 init,
        cudaStream_t            stream,
        bool                    debug_synchronous,
        int                     ptx_version)
    :
        d_temp_storage(d_temp_storage),
        temp_storage_bytes(temp_storage_bytes),
        d_in(d_in),
        d_out(d_out),
        num_items(num_items),
        reduction_op(reduction_op),
        init(init),
        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version)
    {}


    //------------------------------------------------------------------------------
    // Small-problem (single tile) invocation
    //------------------------------------------------------------------------------

    /// Invoke a single block block to reduce in-core
    template <

xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh  view on Meta::CPAN

        do
        {
            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                temp_storage_bytes = 1;
                break;
            }

            // Log single_reduce_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

            // Invoke single_reduce_sweep_kernel
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in,
                d_out,
                num_items,
                reduction_op,
                init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }


    //------------------------------------------------------------------------------

xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh  view on Meta::CPAN

            }
            else if (ActivePolicyT::ReducePolicy::GRID_MAPPING == GRID_MAPPING_DYNAMIC)
            {
                // Work is distributed dynamically
                int num_tiles       = (num_items + reduce_config.tile_size - 1) / reduce_config.tile_size;
                reduce_grid_size    = (num_tiles < reduce_device_occupancy) ?
                                        num_tiles :                 // Not enough to fill the device with threadblocks
                                        reduce_device_occupancy;    // Fill the device with threadblocks

                // Prepare the dynamic queue descriptor if necessary
                if (debug_synchronous) _CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream);

                // Invoke prepare_drain_kernel
                prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }
            else
            {
                error = CubDebug(cudaErrorNotSupported ); break;
            }

            // Log device_reduce_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                reduce_grid_size,
                ActivePolicyT::ReducePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
                reduce_config.sm_occupancy);

            // Invoke DeviceReduceKernel
            reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in,
                d_block_reductions,
                num_items,
                even_share,
                queue,
                reduction_op);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log single_reduce_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

            // Invoke DeviceReduceSingleTileKernel
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_block_reductions,
                d_out,
                reduce_grid_size,
                reduction_op,
                init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED

    }


xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                              ///< [out] Pointer to the output aggregate
        OffsetT         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
        ReductionOpT    reduction_op,                       ///< [in] Binary reduction functor 
        OutputT         init,                               ///< [in] The initial value of the reduction
        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default ...
    {
        typedef typename DispatchReduce::MaxPolicy MaxPolicyT;

        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchReduce dispatch(
                d_temp_storage, temp_storage_bytes,
                d_in, d_out, num_items, reduction_op, init,
                stream, debug_synchronous, ptx_version);

            // Dispatch to chained policy
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
        }
        while (0);

        return error;
    }
};

xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh  view on Meta::CPAN

    void                *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t              &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
    InputIteratorT      d_in;                   ///< [in] Pointer to the input sequence of data items
    OutputIteratorT     d_out;                  ///< [out] Pointer to the output aggregate
    OffsetT             num_segments;           ///< [in] The number of segments that comprise the sorting data
    OffsetT             *d_begin_offsets;       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segmen...
    OffsetT             *d_end_offsets;         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in...
    ReductionOpT        reduction_op;           ///< [in] Binary reduction functor 
    OutputT             init;                   ///< [in] The initial value of the reduction
    cudaStream_t        stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
    bool                debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    int                 ptx_version;            ///< [in] PTX version

    //------------------------------------------------------------------------------
    // Constructor
    //------------------------------------------------------------------------------

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchSegmentedReduce(
        void*                   d_temp_storage,
        size_t                  &temp_storage_bytes,
        InputIteratorT          d_in,
        OutputIteratorT         d_out,
        OffsetT                 num_segments,
        OffsetT                 *d_begin_offsets,
        OffsetT                 *d_end_offsets,
        ReductionOpT            reduction_op,
        OutputT                 init,
        cudaStream_t            stream,
        bool                    debug_synchronous,
        int                     ptx_version)
    :
        d_temp_storage(d_temp_storage),
        temp_storage_bytes(temp_storage_bytes),
        d_in(d_in),
        d_out(d_out),
        num_segments(num_segments),
        d_begin_offsets(d_begin_offsets),
        d_end_offsets(d_end_offsets),
        reduction_op(reduction_op),
        init(init),
        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version)
    {}



    //------------------------------------------------------------------------------
    // Chained policy invocation
    //------------------------------------------------------------------------------

    /// Invocation

xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh  view on Meta::CPAN

            {
                temp_storage_bytes = 1;
                return cudaSuccess;
            }

            // Init kernel configuration
            KernelConfig segmented_reduce_config;
            if (CubDebug(error = segmented_reduce_config.Init<typename ActivePolicyT::SegmentedReducePolicy>(segmented_reduce_kernel))) break;

            // Log device_reduce_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                num_segments,
                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
                (long long) stream,
                ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
                segmented_reduce_config.sm_occupancy);

            // Invoke DeviceReduceKernel
            segmented_reduce_kernel<<<num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream>>>(
                d_in,
                d_out,
                d_begin_offsets,
                d_end_offsets,
                num_segments,
                reduction_op,
                init);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED

    }


xgboost/cub/cub/device/dispatch/dispatch_reduce.cuh  view on Meta::CPAN

        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                              ///< [out] Pointer to the output aggregate
        int             num_segments,                       ///< [in] The number of segments that comprise the sorting data
        int             *d_begin_offsets,                   ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup>...
        int             *d_end_offsets,                     ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> dat...
        ReductionOpT    reduction_op,                       ///< [in] Binary reduction functor 
        OutputT         init,                               ///< [in] The initial value of the reduction
        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default ...
    {
        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;

        if (num_segments <= 0)
            return cudaSuccess;

        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchSegmentedReduce dispatch(
                d_temp_storage, temp_storage_bytes,
                d_in, d_out,
                num_segments, d_begin_offsets, d_end_offsets,
                reduction_op, init,
                stream, debug_synchronous, ptx_version);

            // Dispatch to chained policy
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
        }
        while (0);

        return error;
    }
};

xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh  view on Meta::CPAN

        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        KeysInputIteratorT          d_keys_in,                  ///< [in] Pointer to the input sequence of keys
        UniqueOutputIteratorT       d_unique_out,               ///< [out] Pointer to the output sequence of unique keys (one key per run)
        ValuesInputIteratorT        d_values_in,                ///< [in] Pointer to the input sequence of corresponding values
        AggregatesOutputIteratorT   d_aggregates_out,           ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
        NumRunsOutputIteratorT      d_num_runs_out,             ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
        EqualityOpT                 equality_op,                ///< [in] KeyT equality operator
        ReductionOpT                reduction_op,               ///< [in] ValueT reduction operator
        OffsetT                     num_items,                  ///< [in] Total number of items to select from
        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous,          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
        int                         /*ptx_version*/,            ///< [in] PTX version of dispatch kernels
        ScanInitKernelT            	init_kernel,                ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
        ReduceByKeyKernelT         	reduce_by_key_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel
        KernelConfig                reduce_by_key_config)       ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for
    {

#ifndef CUB_RUNTIME_ENABLED
      (void)d_temp_storage;
      (void)temp_storage_bytes;
      (void)d_keys_in;
      (void)d_unique_out;
      (void)d_values_in;
      (void)d_aggregates_out;
      (void)d_num_runs_out;
      (void)equality_op;
      (void)reduction_op;
      (void)num_items;
      (void)stream;
      (void)debug_synchronous;
      (void)init_kernel;
      (void)reduce_by_key_kernel;
      (void)reduce_by_key_config;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else

        cudaError error = cudaSuccess;

xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh  view on Meta::CPAN

                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }

            // Construct the tile status interface
            ScanTileStateT tile_state;
            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

            // Log init_kernel configuration
            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

            // Invoke init_kernel to initialize tile descriptors
            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
                tile_state,
                num_tiles,
                d_num_runs_out);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Return if empty problem
            if (num_items == 0)
                break;

            // Get SM occupancy for reduce_by_key_kernel
            int reduce_by_key_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                reduce_by_key_sm_occupancy,            // out
                reduce_by_key_kernel,

xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh  view on Meta::CPAN


            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;

            // Run grids in epochs (in case number of tiles exceeds max x-dimension
            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
            {
                // Log reduce_by_key_kernel configuration
                if (debug_synchronous) _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                    start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);

                // Invoke reduce_by_key_kernel
                reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
                    d_keys_in,
                    d_unique_out,
                    d_values_in,
                    d_aggregates_out,
                    d_num_runs_out,
                    tile_state,
                    start_tile,
                    equality_op,
                    reduction_op,
                    num_items);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }
        }
        while (0);

        return error;

#endif  // CUB_RUNTIME_ENABLED
    }


xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh  view on Meta::CPAN

        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        KeysInputIteratorT          d_keys_in,                      ///< [in] Pointer to the input sequence of keys
        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
        ValuesInputIteratorT        d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
        EqualityOpT                 equality_op,                    ///< [in] KeyT equality operator
        ReductionOpT                reduction_op,                   ///< [in] ValueT reduction operator
        OffsetT                     num_items,                      ///< [in] Total number of items to select from
        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous)              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p fals...
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;

xgboost/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh  view on Meta::CPAN

                temp_storage_bytes,
                d_keys_in,
                d_unique_out,
                d_values_in,
                d_aggregates_out,
                d_num_runs_out,
                equality_op,
                reduction_op,
                num_items,
                stream,
                debug_synchronous,
                ptx_version,
                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
                DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT>,
                reduce_by_key_config))) break;
        }
        while (0);

        return error;
    }
};

xgboost/cub/cub/device/dispatch/dispatch_rle.cuh  view on Meta::CPAN

    static cudaError_t Dispatch(
        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        OffsetsOutputIteratorT      d_offsets_out,                  ///< [out] Pointer to the output sequence of run-offsets
        LengthsOutputIteratorT      d_lengths_out,                  ///< [out] Pointer to the output sequence of run-lengths
        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out)
        EqualityOpT                 equality_op,                    ///< [in] Equality operator for input items
        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p fals...
        int                         ptx_version,                    ///< [in] PTX version of dispatch kernels
        DeviceScanInitKernelPtr     device_scan_init_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
        DeviceRleSweepKernelPtr     device_rle_sweep_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
        KernelConfig                device_rle_config)              ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for
    {

#ifndef CUB_RUNTIME_ENABLED

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

xgboost/cub/cub/device/dispatch/dispatch_rle.cuh  view on Meta::CPAN

                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }

            // Construct the tile status interface
            ScanTileStateT tile_status;
            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

            // Log device_scan_init_kernel configuration
            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
            if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

            // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
            device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
                tile_status,
                num_tiles,
                d_num_runs_out);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Return if empty problem
            if (num_items == 0)
                break;

            // Get SM occupancy for device_rle_sweep_kernel
            int device_rle_kernel_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                device_rle_kernel_sm_occupancy,            // out
                device_rle_sweep_kernel,

xgboost/cub/cub/device/dispatch/dispatch_rle.cuh  view on Meta::CPAN

            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;

            // Get grid size for scanning tiles
            dim3 scan_grid_size;
            scan_grid_size.z = 1;
            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);

            // Log device_rle_sweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);

            // Invoke device_rle_sweep_kernel
            device_rle_sweep_kernel<<<scan_grid_size, device_rle_config.block_threads, 0, stream>>>(
                d_in,
                d_offsets_out,
                d_lengths_out,
                d_num_runs_out,
                tile_status,
                equality_op,
                num_items,
                num_tiles);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

        }
        while (0);

        return error;

#endif  // CUB_RUNTIME_ENABLED
    }


xgboost/cub/cub/device/dispatch/dispatch_rle.cuh  view on Meta::CPAN

    static cudaError_t Dispatch(
        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to input sequence of data items
        OffsetsOutputIteratorT      d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets
        LengthsOutputIteratorT      d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths
        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
        EqualityOpT                 equality_op,                    ///< [in] Equality operator for input items
        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;

xgboost/cub/cub/device/dispatch/dispatch_rle.cuh  view on Meta::CPAN

            if (CubDebug(error = Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_in,
                d_offsets_out,
                d_lengths_out,
                d_num_runs_out,
                equality_op,
                num_items,
                stream,
                debug_synchronous,
                ptx_version,
                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
                DeviceRleSweepKernel<PtxRleSweepPolicy, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, OffsetT>,
                device_rle_config))) break;
        }
        while (0);

        return error;
    }
};

xgboost/cub/cub/device/dispatch/dispatch_scan.cuh  view on Meta::CPAN


#pragma once

#include <stdio.h>
#include <iterator>

#include "../../agent/agent_scan.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_arch.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


xgboost/cub/cub/device/dispatch/dispatch_scan.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*               d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&             temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT      d_in,                   ///< [in] Pointer to the input sequence of data items
        OutputIteratorT     d_out,                  ///< [out] Pointer to the output sequence of data items
        ScanOpT             scan_op,                ///< [in] Binary scan functor 
        InitValueT          init_value,             ///< [in] Initial value to seed the exclusive scan
        OffsetT             num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t        stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                debug_synchronous,      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
        int                 /*ptx_version*/,        ///< [in] PTX version of dispatch kernels
        ScanInitKernelPtrT  init_kernel,            ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
        ScanSweepKernelPtrT scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel
        KernelConfig        scan_kernel_config)     ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
    {

#ifndef CUB_RUNTIME_ENABLED
        (void)d_temp_storage;
        (void)temp_storage_bytes;
        (void)d_in;
        (void)d_out;
        (void)scan_op;
        (void)init_value;
        (void)num_items;
        (void)stream;
        (void)debug_synchronous;
        (void)init_kernel;
        (void)scan_kernel;
        (void)scan_kernel_config;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else
        cudaError error = cudaSuccess;
        do

xgboost/cub/cub/device/dispatch/dispatch_scan.cuh  view on Meta::CPAN

            // Return if empty problem
            if (num_items == 0)
                break;

            // Construct the tile status interface
            ScanTileStateT tile_state;
            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

            // Log init_kernel configuration
            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

            // Invoke init_kernel to initialize tile descriptors
            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
                tile_state,
                num_tiles);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Get SM occupancy for scan_kernel
            int scan_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                scan_sm_occupancy,            // out
                scan_kernel,
                scan_kernel_config.block_threads))) break;

            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;

            // Run grids in epochs (in case number of tiles exceeds max x-dimension
            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
            {
                // Log scan_kernel configuration
                if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                    start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy);

                // Invoke scan_kernel
                scan_kernel<<<scan_grid_size, scan_kernel_config.block_threads, 0, stream>>>(
                    d_in,
                    d_out,
                    tile_state,
                    start_tile,
                    scan_op,
                    init_value,
                    num_items);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }
        }
        while (0);

        return error;

#endif  // CUB_RUNTIME_ENABLED
    }


xgboost/cub/cub/device/dispatch/dispatch_scan.cuh  view on Meta::CPAN

    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*           d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&         temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                  ///< [out] Pointer to the output sequence of data items
        ScanOpT         scan_op,                ///< [in] Binary scan functor 
        InitValueT      init_value,             ///< [in] Initial value to seed the exclusive scan
        OffsetT         num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t    stream,                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool            debug_synchronous)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Get kernel kernel dispatch configurations
            KernelConfig scan_kernel_config;

xgboost/cub/cub/device/dispatch/dispatch_scan.cuh  view on Meta::CPAN

            // Dispatch
            if (CubDebug(error = Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_in,
                d_out,
                scan_op,
                init_value,
                num_items,
                stream,
                debug_synchronous,
                ptx_version,
                DeviceScanInitKernel<ScanTileStateT>,
                DeviceScanKernel<PtxAgentScanPolicy, InputIteratorT, OutputIteratorT, ScanTileStateT, ScanOpT, InitValueT, OffsetT>,
                scan_kernel_config))) break;
        }
        while (0);

        return error;
    }
};

xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh  view on Meta::CPAN

        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        FlagsInputIteratorT         d_flags,                        ///< [in] Pointer to the input sequence of selection flags (if applicable)
        SelectedOutputIteratorT     d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
        NumSelectedIteratorT        d_num_selected_out,             ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
        SelectOpT                   select_op,                      ///< [in] Selection operator
        EqualityOpT                 equality_op,                    ///< [in] Equality operator
        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p fals...
        int                         /*ptx_version*/,                ///< [in] PTX version of dispatch kernels
        ScanInitKernelPtrT          scan_init_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
        SelectIfKernelPtrT          select_if_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
        KernelConfig                select_if_config)               ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for
    {

#ifndef CUB_RUNTIME_ENABLED
        (void)d_temp_storage;
        (void)temp_storage_bytes;
        (void)d_in;
        (void)d_flags;
        (void)d_selected_out;
        (void)d_num_selected_out;
        (void)select_op;
        (void)equality_op;
        (void)num_items;
        (void)stream;
        (void)debug_synchronous;
        (void)scan_init_kernel;
        (void)select_if_kernel;
        (void)select_if_config;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else

        cudaError error = cudaSuccess;

xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh  view on Meta::CPAN

                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }

            // Construct the tile status interface
            ScanTileStateT tile_status;
            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

            // Log scan_init_kernel configuration
            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
            if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

            // Invoke scan_init_kernel to initialize tile descriptors
            scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
                tile_status,
                num_tiles,
                d_num_selected_out);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Return if empty problem
            if (num_items == 0)
                break;

            // Get SM occupancy for select_if_kernel
            int range_select_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                range_select_sm_occupancy,            // out
                select_if_kernel,

xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh  view on Meta::CPAN

            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;

            // Get grid size for scanning tiles
            dim3 scan_grid_size;
            scan_grid_size.z = 1;
            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);

            // Log select_if_kernel configuration
            if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);

            // Invoke select_if_kernel
            select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
                d_in,
                d_flags,
                d_selected_out,
                d_num_selected_out,
                tile_status,
                select_op,
                equality_op,
                num_items,
                num_tiles);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        while (0);

        return error;

#endif  // CUB_RUNTIME_ENABLED
    }


    /**

xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh  view on Meta::CPAN

        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
        FlagsInputIteratorT         d_flags,                        ///< [in] Pointer to the input sequence of selection flags (if applicable)
        SelectedOutputIteratorT     d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
        NumSelectedIteratorT        d_num_selected_out,             ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
        SelectOpT                   select_op,                      ///< [in] Selection operator
        EqualityOpT                 equality_op,                    ///< [in] Equality operator
        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  ...
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;

xgboost/cub/cub/device/dispatch/dispatch_select_if.cuh  view on Meta::CPAN

                d_temp_storage,
                temp_storage_bytes,
                d_in,
                d_flags,
                d_selected_out,
                d_num_selected_out,
                select_op,
                equality_op,
                num_items,
                stream,
                debug_synchronous,
                ptx_version,
                DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
                DeviceSelectSweepKernel<PtxSelectIfPolicyT, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, SelectOpT, EqualityOpT, OffsetT, KEEP_REJECTS>,
                select_if_config))) break;
        }
        while (0);

        return error;
    }
};

xgboost/cub/cub/device/dispatch/dispatch_spmv_csrt.cuh  view on Meta::CPAN

 */

#pragma once

#include <stdio.h>
#include <iterator>

#include "dispatch_scan.cuh"
#include "../../agent/agent_spmv_orig.cuh"
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


xgboost/cub/cub/device/dispatch/dispatch_spmv_csrt.cuh  view on Meta::CPAN

     * kernel invocations.
     */
    template <
        typename                SpmvKernelT>                        ///< Function type of cub::AgentSpmvKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
        cudaStream_t            stream,                             ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous,                  ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p fals...
        SpmvKernelT             spmv_kernel,                        ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
        KernelConfig            spmv_config)                        ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
    {
#ifndef CUB_RUNTIME_ENABLED

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported );

#else
        cudaError error = cudaSuccess;

xgboost/cub/cub/device/dispatch/dispatch_spmv_csrt.cuh  view on Meta::CPAN

            void* allocations[1];
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                return cudaSuccess;
            }
            KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[0];  // Agent carry-out pairs

            // Log spmv_kernel configuration
            if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                spmv_grid_size, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);

            // Invoke spmv_kernel
            spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
                spmv_params,
                merge_items_per_block,
                d_tile_carry_pairs);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }


    /**
     * Internal dispatch routine for computing a device-wide reduction
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
        cudaStream_t            stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;

xgboost/cub/cub/device/dispatch/dispatch_spmv_csrt.cuh  view on Meta::CPAN


            // Get kernel kernel dispatch configurations
            KernelConfig spmv_config;
            InitConfigs(ptx_version, spmv_config);

            if (CubDebug(error = Dispatch(
                d_temp_storage, 
                temp_storage_bytes, 
                spmv_params, 
                stream, 
                debug_synchronous,
                DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, false, false>,
                spmv_config))) break;

        }
        while (0);

        return error;
    }
};

xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh  view on Meta::CPAN


#pragma once

#include <stdio.h>
#include <iterator>

#include "../../agent/single_pass_scan_operators.cuh"
#include "../../agent/agent_segment_fixup.cuh"
#include "../../agent/agent_spmv_orig.cuh"
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh  view on Meta::CPAN

        typename                Spmv1ColKernelT,                    ///< Function type of cub::DeviceSpmv1ColKernel
        typename                SpmvSearchKernelT,                  ///< Function type of cub::AgentSpmvSearchKernel
        typename                SpmvKernelT,                        ///< Function type of cub::AgentSpmvKernel
        typename                SegmentFixupKernelT>                 ///< Function type of cub::DeviceSegmentFixupKernelT
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
        cudaStream_t            stream,                             ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous,                  ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p fals...
        Spmv1ColKernelT         spmv_1col_kernel,                   ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
        SpmvSearchKernelT       spmv_search_kernel,                 ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel
        SpmvKernelT             spmv_kernel,                        ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
        SegmentFixupKernelT     segment_fixup_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
        KernelConfig            spmv_config,                        ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
        KernelConfig            segment_fixup_config)               ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for
    {
#ifndef CUB_RUNTIME_ENABLED

        // Kernel launch not supported from this device

xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh  view on Meta::CPAN

                {
                    // Return if the caller is simply requesting the size of the storage allocation
                    temp_storage_bytes = 1;
                    break;
                }

                // Get search/init grid dims
                int degen_col_kernel_block_size     = INIT_KERNEL_THREADS;
                int degen_col_kernel_grid_size      = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;

                if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
                    degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);

                // Invoke spmv_search_kernel
                spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
                    spmv_params);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

                break;
            }

            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;

xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh  view on Meta::CPAN

//            if (num_merge_tiles < spmv_sm_occupancy * sm_count)
            {
                // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords
                d_tile_coordinates = NULL;
            }
            else
            {
                // Use separate search kernel if we have enough spmv tiles to saturate the device

                // Log spmv_search_kernel configuration
                if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
                    search_grid_size, search_block_size, (long long) stream);

                // Invoke spmv_search_kernel
                spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
                    num_merge_tiles,
                    d_tile_coordinates,
                    spmv_params);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }

            // Log spmv_kernel configuration
            if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);

            // Invoke spmv_kernel
            spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
                spmv_params,
                d_tile_coordinates,
                d_tile_carry_pairs,
                num_merge_tiles,
                tile_state,
                num_segment_fixup_tiles);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Run reduce-by-key fixup if necessary
            if (num_merge_tiles > 1)
            {
                // Log segment_fixup_kernel configuration
                if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                    segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);

                // Invoke segment_fixup_kernel
                segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>(
                    d_tile_carry_pairs,
                    spmv_params.d_vector_y,
                    num_merge_tiles,
                    num_segment_fixup_tiles,
                    tile_state);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }

#if (CUB_PTX_ARCH == 0)
            // Free textures
            if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
#endif
        }
        while (0);

        return error;

xgboost/cub/cub/device/dispatch/dispatch_spmv_orig.cuh  view on Meta::CPAN


    /**
     * Internal dispatch routine for computing a device-wide reduction
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
        cudaStream_t            stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel kernel dispatch configurations
            KernelConfig spmv_config, segment_fixup_config;
            InitConfigs(ptx_version, spmv_config, segment_fixup_config);

            if (CubDebug(error = Dispatch(
                d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
                DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
                DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
                DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
                DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                spmv_config, segment_fixup_config))) break;

/*
            // Dispatch
            if (spmv_params.beta == 0.0)
            {
                if (spmv_params.alpha == 1.0)
                {
                    // Dispatch y = A*x
                    if (CubDebug(error = Dispatch(
                        d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
                        DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
                        DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
                        DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
                        DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                        spmv_config, segment_fixup_config))) break;
                }
                else
                {
                    // Dispatch y = alpha*A*x
                    if (CubDebug(error = Dispatch(
                        d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
                        DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
                        DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true, false>,
                        DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                        spmv_config, segment_fixup_config))) break;
                }
            }
            else
            {
                if (spmv_params.alpha == 1.0)
                {
                    // Dispatch y = A*x + beta*y
                    if (CubDebug(error = Dispatch(
                        d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
                        DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
                        DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, false, true>,
                        DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                        spmv_config, segment_fixup_config))) break;
                }
                else
                {
                    // Dispatch y = alpha*A*x + beta*y
                    if (CubDebug(error = Dispatch(
                        d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
                        DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
                        DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true, true>,
                        DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                        spmv_config, segment_fixup_config))) break;
                }
            }
*/
        }
        while (0);

xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh  view on Meta::CPAN


#pragma once

#include <stdio.h>
#include <iterator>

#include "../../agent/single_pass_scan_operators.cuh"
#include "../../agent/agent_segment_fixup.cuh"
#include "../../agent/agent_spmv_row_based.cuh"
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh  view on Meta::CPAN

//        typename                Spmv1ColKernelT,                    ///< Function type of cub::DeviceSpmv1ColKernel
//        typename                SpmvSearchKernelT,                  ///< Function type of cub::AgentSpmvSearchKernel
        typename                SpmvKernelT>                        ///< Function type of cub::AgentSpmvKernel
//        typename                SegmentFixupKernelT>                 ///< Function type of cub::DeviceSegmentFixupKernelT
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
        cudaStream_t            stream,                             ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous,                  ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p fals...
//        Spmv1ColKernelT         spmv_1col_kernel,                   ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
//        SpmvSearchKernelT       spmv_search_kernel,                 ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel
        SpmvKernelT             spmv_kernel,                        ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
//        SegmentFixupKernelT     fixup_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
        KernelConfig            spmv_config,                        ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
        KernelConfig            fixup_config)               ///< [in] Dispatch parameters that match the policy that \p fixup_kernel was compiled for
    {
#ifndef CUB_RUNTIME_ENABLED

        // Kernel launch not supported from this device

xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh  view on Meta::CPAN

                {
                    // Return if the caller is simply requesting the size of the storage allocation
                    temp_storage_bytes = 1;
                    return cudaSuccess;
                }

                // Get search/init grid dims
                int degen_col_kernel_block_size     = INIT_KERNEL_THREADS;
                int degen_col_kernel_grid_size      = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;

                if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
                    degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);

                // Invoke spmv_search_kernel
                spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
                    spmv_params);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

                break;
            }
*/
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;

xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh  view on Meta::CPAN


                // Decrease rows per tile if needed to accomodate minimum parallelism
                unsigned int spmv_device_occupancy = sm_count * 2;
//                unsigned int spmv_device_occupancy = sm_count * ((spmv_sm_occupancy + 1) / 2);
                if (spmv_params.num_rows < spmv_device_occupancy * rows_per_tile)
                    rows_per_tile = (spmv_params.num_rows) / spmv_device_occupancy;
            }

            rows_per_tile = CUB_MAX(rows_per_tile, 2);

            if (debug_synchronous) _CubLog("Rows per tile: %d\n", rows_per_tile);

            // Number of tiles for kernels
            unsigned int num_spmv_tiles     = (spmv_params.num_rows + rows_per_tile - 1) / rows_per_tile;
//            unsigned int num_fixup_tiles    = (num_spmv_tiles + fixup_tile_size - 1) / fixup_tile_size;

            // Get grid dimensions
            dim3 spmv_grid_size(
                CUB_MIN(num_spmv_tiles, max_dim_x),
                (num_spmv_tiles + max_dim_x - 1) / max_dim_x,
                1);

xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh  view on Meta::CPAN

            if (search_grid_size < sm_count)
            {
                // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords
                d_tile_coordinates = NULL;
            }
            else
            {
                // Use separate search kernel if we have enough spmv tiles to saturate the device

                // Log spmv_search_kernel configuration
                if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
                    search_grid_size, search_block_size, (long long) stream);

                // Invoke spmv_search_kernel
                spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
                    num_spmv_tiles,
                    d_tile_coordinates,
                    spmv_params);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }
*/
            // Log spmv_kernel configuration
            if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);

            // Invoke spmv_kernel
            spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
                spmv_params,
//                d_tile_coordinates,
//                d_tile_carry_pairs,
//                num_spmv_tiles,
//                tile_state,
//                num_fixup_tiles,
                rows_per_tile);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
/*
            // Run reduce-by-key fixup if necessary
            if (num_spmv_tiles > 1)
            {
                // Log fixup_kernel configuration
                if (debug_synchronous) _CubLog("Invoking fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                    fixup_grid_size.x, fixup_grid_size.y, fixup_grid_size.z, fixup_config.block_threads, (long long) stream, fixup_config.items_per_thread, fixup_sm_occupancy);

                // Invoke fixup_kernel
                fixup_kernel<<<fixup_grid_size, fixup_config.block_threads, 0, stream>>>(
                    d_tile_carry_pairs,
                    spmv_params.d_vector_y,
                    num_spmv_tiles,
                    num_fixup_tiles,
                    tile_state);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }
*/
#if (CUB_PTX_ARCH == 0)
            // Free textures
//            if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
#endif
        }
        while (0);

        return error;

xgboost/cub/cub/device/dispatch/dispatch_spmv_row_based.cuh  view on Meta::CPAN


    /**
     * Internal dispatch routine for computing a device-wide reduction
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
        cudaStream_t            stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
    #if (CUB_PTX_ARCH == 0)
            if (CubDebug(error = PtxVersion(ptx_version))) break;
    #else
            ptx_version = CUB_PTX_ARCH;
    #endif

            // Get kernel kernel dispatch configurations
            KernelConfig spmv_config, fixup_config;
            InitConfigs(ptx_version, spmv_config, fixup_config);

            if (CubDebug(error = Dispatch(
                d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
//                DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
//                DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
                DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
//                DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                spmv_config, fixup_config))) break;

/*
            // Dispatch
            if (spmv_params.beta == 0.0)
            {
                if (spmv_params.alpha == 1.0)
                {
                    // Dispatch y = A*x
                    if (CubDebug(error = Dispatch(
                        d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
                        DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
                        DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
                        DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
                        DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                        spmv_config, fixup_config))) break;
                }
                else
                {
                    // Dispatch y = alpha*A*x
                    if (CubDebug(error = Dispatch(
                        d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
                        DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
                        DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true, false>,
                        DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                        spmv_config, fixup_config))) break;
                }
            }
            else
            {
                if (spmv_params.alpha == 1.0)
                {
                    // Dispatch y = A*x + beta*y
                    if (CubDebug(error = Dispatch(
                        d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
                        DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
                        DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, false, true>,
                        DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                        spmv_config, fixup_config))) break;
                }
                else
                {
                    // Dispatch y = alpha*A*x + beta*y
                    if (CubDebug(error = Dispatch(
                        d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
                        DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
                        DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true, true>,
                        DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                        spmv_config, fixup_config))) break;
                }
            }
*/
        }
        while (0);

xgboost/cub/cub/grid/grid_barrier.cuh  view on Meta::CPAN

 *
 ******************************************************************************/

/**
 * \file
 * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
 */

#pragma once

#include "../util_debug.cuh"
#include "../util_namespace.cuh"
#include "../thread/thread_load.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


xgboost/cub/cub/grid/grid_queue.cuh  view on Meta::CPAN

 ******************************************************************************/

/**
 * \file
 * cub::GridQueue is a descriptor utility for dynamic queue management.
 */

#pragma once

#include "../util_namespace.cuh"
#include "../util_debug.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup GridModule

xgboost/cub/cub/iterator/tex_obj_input_iterator.cuh  view on Meta::CPAN

 */

#pragma once

#include <iterator>
#include <iostream>

#include "../thread/thread_load.cuh"
#include "../thread/thread_store.cuh"
#include "../util_device.cuh"
#include "../util_debug.cuh"
#include "../util_namespace.cuh"

#if (THRUST_VERSION >= 100700)
    // This iterator is compatible with Thrust API 1.7 and newer
    #include <thrust/iterator/iterator_facade.h>
    #include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION


/// Optional outer namespace(s)

xgboost/cub/cub/iterator/tex_ref_input_iterator.cuh  view on Meta::CPAN

 */

#pragma once

#include <iterator>
#include <iostream>

#include "../thread/thread_load.cuh"
#include "../thread/thread_store.cuh"
#include "../util_device.cuh"
#include "../util_debug.cuh"
#include "../util_namespace.cuh"

#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE)  // This iterator is compatible with CUDA 5.5 and newer

#if (THRUST_VERSION >= 100700)    // This iterator is compatible with Thrust API 1.7 and newer
    #include <thrust/iterator/iterator_facade.h>
    #include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION




( run in 1.210 second using v1.01-cache-2.11-cpan-49f99fa48dc )