Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version),
        is_overwrite_okay(is_overwrite_okay)
    {}


    //------------------------------------------------------------------------------
    // Small-problem (single tile) invocation
    //------------------------------------------------------------------------------

    /// Invoke a single block to sort in-core
    template <
        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
        typename                SingleTileKernelT>      ///< Function type of cub::DeviceRadixSortSingleTileKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokeSingleTile(
        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)single_tile_kernel;
        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported );
#else
        cudaError error = cudaSuccess;
        do
        {
            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                temp_storage_bytes = 1;
                break;
            }

            // Return if empty problem
            if (num_items == 0)
                break;

            // Log single_tile_kernel configuration
            if (debug_synchronous)
                _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                    1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
                    ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);

            // Invoke upsweep_kernel with same grid size as downsweep_kernel
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_keys.Current(),
                d_keys.Alternate(),
                d_values.Current(),
                d_values.Alternate(),
                num_items,
                begin_bit,
                end_bit);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Update selector
            d_keys.selector ^= 1;
            d_values.selector ^= 1;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }


    //------------------------------------------------------------------------------
    // Normal problem size invocation
    //------------------------------------------------------------------------------

    /**
     * Invoke a three-kernel sorting pass at the current bit.
     */
    template <typename PassConfigT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePass(
        const KeyT      *d_keys_in,
        KeyT            *d_keys_out,
        const ValueT    *d_values_in,
        ValueT          *d_values_out,
        OffsetT         *d_spine,
        int             spine_length,
        int             &current_bit,
        PassConfigT     &pass_config)
    {
        cudaError error = cudaSuccess;
        do
        {
            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));

            // Log upsweep_kernel configuration
            if (debug_synchronous)
                _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
                pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);

            // Invoke upsweep_kernel with same grid size as downsweep_kernel
            pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
                d_keys_in,
                d_spine,
                num_items,
                current_bit,
                pass_bits,
                pass_config.even_share);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log scan_kernel configuration
            if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
                1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);

            // Invoke scan_kernel
            pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

                    typename ActivePolicyT::ScanPolicy, 
                    typename ActivePolicyT::DownsweepPolicy>(
                upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break;

            if ((error = alt_pass_config.template InitPassConfig<
                    typename ActivePolicyT::AltUpsweepPolicy, 
                    typename ActivePolicyT::ScanPolicy, 
                    typename ActivePolicyT::AltDownsweepPolicy>(
                alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break;

            // Get maximum spine length
            int max_grid_size       = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
            int spine_length        = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;

            // Temporary storage allocation requirements
            void* allocations[3];
            size_t allocation_sizes[3] =
            {
                spine_length * sizeof(OffsetT),                                         // bytes needed for privatized block digit histograms
                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                     // bytes needed for 3rd keys buffer
                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),    // bytes needed for 3rd values buffer
            };

            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;

            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
                return cudaSuccess;

            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
            int num_bits            = end_bit - begin_bit;
            int num_passes          = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
            bool is_num_passes_odd  = num_passes & 1;
            int max_alt_passes      = (num_passes * pass_config.radix_bits) - num_bits;
            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));

            // Alias the temporary storage allocations
            OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);

            DoubleBuffer<KeyT> d_keys_remaining_passes(
                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());

            DoubleBuffer<ValueT> d_values_remaining_passes(
                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());

            // Run first pass, consuming from the input's current buffers
            int current_bit = begin_bit;
            if (CubDebug(error = InvokePass(
                d_keys.Current(), d_keys_remaining_passes.Current(),
                d_values.Current(), d_values_remaining_passes.Current(),
                d_spine, spine_length, current_bit,
                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;

            // Run remaining passes
            while (current_bit < end_bit)
            {
                if (CubDebug(error = InvokePass(
                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    d_spine, spine_length, current_bit,
                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;;

                // Invert selectors
                d_keys_remaining_passes.selector ^= 1;
                d_values_remaining_passes.selector ^= 1;
            }

            // Update selector
            if (!is_overwrite_okay) {
                num_passes = 1; // Sorted data always ends up in the other vector
            }

            d_keys.selector = (d_keys.selector + num_passes) & 1;
            d_values.selector = (d_values.selector + num_passes) & 1;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }


    //------------------------------------------------------------------------------
    // Chained policy invocation
    //------------------------------------------------------------------------------

    /// Invocation
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename DispatchRadixSort::MaxPolicy       MaxPolicyT;
        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;

        // Force kernel code-generation in all compiler passes
        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
        {
            // Small, single tile size
            return InvokeSingleTile<ActivePolicyT>(
                DeviceRadixSortSingleTileKernel<MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>);
        }
        else
        {
            // Regular size
            return InvokePasses<ActivePolicyT>(
                DeviceRadixSortUpsweepKernel<   MaxPolicyT, false,   IS_DESCENDING, KeyT, OffsetT>,
                DeviceRadixSortUpsweepKernel<   MaxPolicyT, true,    IS_DESCENDING, KeyT, OffsetT>,
                RadixSortScanBinsKernel<        MaxPolicyT, OffsetT>,
                DeviceRadixSortDownsweepKernel< MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetT>,
                DeviceRadixSortDownsweepKernel< MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetT>);
        }
    }


    //------------------------------------------------------------------------------
    // Dispatch entrypoints
    //------------------------------------------------------------------------------

    /**
     * Internal dispatch routine
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
        OffsetT                 num_items,              ///< [in] Number of items to sort
        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.

xgboost/cub/cub/device/dispatch/dispatch_radix_sort.cuh  view on Meta::CPAN

        return CubDebug(cudaErrorNotSupported );
#else

        cudaError error = cudaSuccess;
        do
        {
            // Init regular and alternate kernel configurations
            PassConfig<SegmentedKernelT> pass_config, alt_pass_config;
            if ((error = pass_config.template       InitPassConfig<typename ActivePolicyT::SegmentedPolicy>(segmented_kernel))) break;
            if ((error = alt_pass_config.template   InitPassConfig<typename ActivePolicyT::AltSegmentedPolicy>(alt_segmented_kernel))) break;

            // Temporary storage allocation requirements
            void* allocations[2];
            size_t allocation_sizes[2] =
            {
                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                      // bytes needed for 3rd keys buffer
                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),     // bytes needed for 3rd values buffer
            };

            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;

            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                if (temp_storage_bytes == 0)
                    temp_storage_bytes = 1;
                return cudaSuccess;
            }

            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
            int radix_bits          = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
            int alt_radix_bits      = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
            int num_bits            = end_bit - begin_bit;
            int num_passes          = (num_bits + radix_bits - 1) / radix_bits;
            bool is_num_passes_odd  = num_passes & 1;
            int max_alt_passes      = (num_passes * radix_bits) - num_bits;
            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));

            DoubleBuffer<KeyT> d_keys_remaining_passes(
                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[0]) : d_keys.Alternate());

            DoubleBuffer<ValueT> d_values_remaining_passes(
                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[1]),
                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[1]) : d_values.Alternate());

            // Run first pass, consuming from the input's current buffers
            int current_bit = begin_bit;

            if (CubDebug(error = InvokePass(
                d_keys.Current(), d_keys_remaining_passes.Current(),
                d_values.Current(), d_values_remaining_passes.Current(),
                current_bit,
                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;

            // Run remaining passes
            while (current_bit < end_bit)
            {
                if (CubDebug(error = InvokePass(
                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    current_bit,
                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;

                // Invert selectors and update current bit
                d_keys_remaining_passes.selector ^= 1;
                d_values_remaining_passes.selector ^= 1;
            }

            // Update selector
            if (!is_overwrite_okay) {
                num_passes = 1; // Sorted data always ends up in the other vector
            }

            d_keys.selector = (d_keys.selector + num_passes) & 1;
            d_values.selector = (d_values.selector + num_passes) & 1;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }


    //------------------------------------------------------------------------------
    // Chained policy invocation
    //------------------------------------------------------------------------------

    /// Invocation
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;

        // Force kernel code-generation in all compiler passes
        return InvokePasses<ActivePolicyT>(
            DeviceSegmentedRadixSortKernel<MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetT>,
            DeviceSegmentedRadixSortKernel<MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetT>);
    }


    //------------------------------------------------------------------------------
    // Dispatch entrypoints
    //------------------------------------------------------------------------------


    /// Internal dispatch routine
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
        int                     num_items,              ///< [in] Number of items to sort
        int                     num_segments,           ///< [in] The number of segments that comprise the sorting data
        const int               *d_begin_offsets,       ///< [in] %Device-accessible pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> dat...
        const int               *d_end_offsets,         ///< [in] %Device-accessible pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data se...
        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
    {
        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;

        cudaError_t error;
        do {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchSegmentedRadixSort dispatch(
                d_temp_storage, temp_storage_bytes,



( run in 1.647 second using v1.01-cache-2.11-cpan-13bb782fe5a )