iterations results from the CPAN

iterations

Alien-XGBoost

view release on metacpan or search on metacpan

xgboost/R-package/R/callbacks.R view on Meta::CPAN

#' Callback closures for booster training.
#'
#' These are used to perform various service tasks either during boosting iterations or at the end.
#' This approach helps to modularize many of such tasks without bloating the main training methods, 
#' and it offers .
#' 
#' @details
#' By default, a callback function is run after each boosting iteration.
#' An R-attribute \code{is_pre_iteration} could be set for a callback to define a pre-iteration function.
#' 
#' When a callback function has \code{finalize} parameter, its finalizer part will also be run after 
#' the boosting is completed.
#'

xgboost/R-package/R/callbacks.R view on Meta::CPAN

#
# Callbacks -------------------------------------------------------------------
# 

#' Callback closure for printing the result of evaluation
#' 
#' @param period  results would be printed every number of periods
#' @param showsd  whether standard deviations should be printed (when available)
#' 
#' @details
#' The callback function prints the result of evaluation at every \code{period} iterations.
#' The initial and the last iteration's evaluations are always printed.
#' 
#' Callback function expects the following values to be set in its calling frame:
#' \code{bst_evaluation} (also \code{bst_evaluation_err} when available),
#' \code{iteration},
#' \code{begin_iteration},
#' \code{end_iteration}.
#' 
#' @seealso
#' \code{\link{callbacks}}

xgboost/R-package/R/callbacks.R view on Meta::CPAN

  }
  attr(callback, 'call') <- match.call()
  attr(callback, 'name') <- 'cb.early.stop'
  callback
}


#' Callback closure for saving a model file.
#' 
#' @param save_period save the model to disk after every 
#'        \code{save_period} iterations; 0 means save the model at the end.
#' @param save_name the name or path for the saved model file.
#'        It can contain a \code{\link[base]{sprintf}} formatting specifier 
#'        to include the integer iteration number in the file name.
#'        E.g., with \code{save_name} = 'xgboost_%04d.model', 
#'        the file saved at iteration 50 would be named "xgboost_0050.model".
#' 
#' @details 
#' This callback function allows to save an xgb-model file, either periodically after each \code{save_period}'s or at the end.
#' 
#' Callback function expects the following values to be set in its calling frame:

xgboost/R-package/R/xgb.Booster.R view on Meta::CPAN


#' Predict method for eXtreme Gradient Boosting model
#' 
#' Predicted values based on either xgboost model or model handle object.
#' 
#' @param object Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.
#' @param missing Missing is only used when input is dense matrix. Pick a float value that represents
#'        missing values in data (e.g., sometimes 0 or some other extreme value is used).
#' @param outputmargin whether the prediction should be returned in the for of original untransformed 
#'        sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for 
#'        logistic regression would result in predictions for log-odds instead of probabilities.
#' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
#'        It will use all the trees by default (\code{NULL} value).
#' @param predleaf whether predict leaf index instead.
#' @param predcontrib whether to return feature contributions to individual predictions instead (see Details).
#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several 
#'        prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.
#' @param ... Parameters passed to \code{predict.xgb.Booster}
#' 
#' @details  
#' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
#' and it is not necessarily equal to the number of trees in a model.
#' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
#' But for multiclass classification, while there are multiple trees per iteration, 
#' \code{ntreelimit} limits the number of boosting iterations.
#' 
#' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear, 
#' since gblinear doesn't keep its boosting history.
#' 
#' One possible practical applications of the \code{predleaf} option is to use the model 
#' as a generator of new features which capture non-linearity and interactions, 
#' e.g., as implemented in \code{\link{xgb.create.features}}.
#' 
#' Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to
#' individual predictions. For "gblinear" booster, feature contributions are simply linear terms

xgboost/R-package/R/xgb.Booster.R view on Meta::CPAN

#' sum(pred_labels != lb)/length(lb)
#' 
#' # compare that to the predictions from softmax:
#' set.seed(11)
#' bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
#'                max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
#'                objective = "multi:softmax", num_class = num_class)
#' pred <- predict(bst, as.matrix(iris[, -5]))
#' str(pred)
#' all.equal(pred, pred_labels)
#' # prediction from using only 5 iterations should result 
#' # in the same error as seen in iteration 5:
#' pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
#' sum(pred5 != lb)/length(lb)
#' 
#' 
#' ## random forest-like model of 25 trees for binary classification:
#' 
#' set.seed(11)
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
#'                nthread = 2, nrounds = 1, objective = "binary:logistic",

xgboost/R-package/R/xgb.cv.R view on Meta::CPAN

#'     \item \code{binary:logistic} logistic regression for classification
#'   }
#'   \item \code{eta} step size of each boosting step
#'   \item \code{max_depth} maximum depth of the tree
#'   \item \code{nthread} number of thread used in training, if not set, all threads are used
#' }
#'
#'   See \code{\link{xgb.train}} for further details.
#'   See also demo/ for walkthrough example in R.
#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.
#' @param nrounds the max number of iterations
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. 
#' @param label vector of response values. Should be provided only when data is an R-matrix.
#' @param missing is only used when input is a dense matrix. By default is set to NA, which means 
#'        that NA values should be considered as 'missing' by the algorithm. 
#'        Sometimes, 0 or other extreme value might be used to represent missing values.
#' @param prediction A logical value indicating whether to return the test fold predictions 
#'        from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback.
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in cross validation,
#'   when it is not specified, the evaluation metric is chosen according to objective function.

xgboost/R-package/R/xgb.cv.R view on Meta::CPAN

#' \itemize{
#'   \item \code{call} a function call.
#'   \item \code{params} parameters that were passed to the xgboost library. Note that it does not 
#'         capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
#'   \item \code{callbacks} callback functions that were either automatically assigned or 
#'         explicitely passed.
#'   \item \code{evaluation_log} evaluation history storead as a \code{data.table} with the
#'         first column corresponding to iteration number and the rest corresponding to the 
#'         CV-based evaluation means and standard deviations for the training and test CV-sets.
#'         It is created by the \code{\link{cb.evaluation.log}} callback.
#'   \item \code{niter} number of boosting iterations.
#'   \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds} 
#'         parameter or randomly generated.
#'   \item \code{best_iteration} iteration number with the best evaluation metric value
#'         (only available with early stopping).
#'   \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration, 
#'         which could further be used in \code{predict} method
#'         (only available with early stopping).
#'   \item \code{pred} CV prediction values available when \code{prediction} is set. 
#'         It is either vector or matrix (see \code{\link{cb.cv.predict}}).
#'   \item \code{models} a liost of the CV folds' models. It is only available with the explicit

xgboost/R-package/R/xgb.cv.R view on Meta::CPAN

  dall <- xgb.get.DMatrix(data, label, missing)
  bst_folds <- lapply(seq_along(folds), function(k) {
    dtest  <- slice(dall, folds[[k]])
    dtrain <- slice(dall, unlist(folds[-k]))
    handle <- xgb.Booster.handle(params, list(dtrain, dtest))
    list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test=dtest), index = folds[[k]])
  })
  # a "basket" to collect some results from callbacks
  basket <- list()

  # extract parameters that can affect the relationship b/w #trees and #iterations
  num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
  num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)

  # those are fixed for CV (no training continuation)
  begin_iteration <- 1
  end_iteration <- nrounds
  
  # synchronous CV boosting: run CV folds' models within each iteration
  for (iteration in begin_iteration:end_iteration) {

xgboost/R-package/R/xgb.plot.deepness.R view on Meta::CPAN

#' \itemize{
#'  \item the distribution of the number of leafs in a tree model at a certain depth;
#'  \item the distribution of average weighted number of observations ("cover") 
#'        ending up in leafs at certain depth.
#' }
#' Those could be helpful in determining sensible ranges of the \code{max_depth} 
#' and \code{min_child_weight} parameters.
#' 
#' When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth
#' per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how
#' a tree's median absolute leaf weight changes through the iterations.
#'
#' This function was inspired by the blog post
#' \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}.
#' 
#' @return
#' 
#' Other than producing plots (when \code{plot=TRUE}), the \code{xgb.plot.deepness} function
#' silently returns a processed data.table where each row corresponds to a terminal leaf in a tree model,
#' and contains information about leaf's depth, cover, and weight (which is used in calculating predictions).
#'

xgboost/R-package/R/xgb.train.R view on Meta::CPAN

#'     \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
#'     \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each cl...
#'     \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
#'   }
#'   \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
#'   \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ...
#' }
#' 
#' @param data training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
#'        \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.
#' @param nrounds max number of boosting iterations.
#' @param watchlist named list of xgb.DMatrix datasets to use for evaluating model performance.
#'        Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
#'        of these datasets during each boosting iteration, and stored in the end as a field named 
#'        \code{evaluation_log} in the resulting object. When either \code{verbose>=1} or 
#'        \code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously
#'        printed out during the training. 
#'        E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows to track
#'        the performance of each round's model on mat1 and mat2.
#' @param obj customized objective function. Returns gradient and second order 
#'        gradient with given prediction and dtrain.

xgboost/R-package/R/xgb.train.R view on Meta::CPAN

#'   \item \code{cb.evaluation.log} is on when \code{watchlist} is present.
#'   \item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
#'   \item \code{cb.save.model}: when \code{save_period > 0} is set.
#' }
#' 
#' @return 
#' An object of class \code{xgb.Booster} with the following elements:
#' \itemize{
#'   \item \code{handle} a handle (pointer) to the xgboost model in memory.
#'   \item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type.
#'   \item \code{niter} number of boosting iterations.
#'   \item \code{evaluation_log} evaluation history storead as a \code{data.table} with the
#'         first column corresponding to iteration number and the rest corresponding to evaluation
#'         metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback.
#'   \item \code{call} a function call.
#'   \item \code{params} parameters that were passed to the xgboost library. Note that it does not 
#'         capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
#'   \item \code{callbacks} callback functions that were either automatically assigned or 
#'         explicitely passed.
#'   \item \code{best_iteration} iteration number with the best evaluation metric value
#'         (only available with early stopping).

xgboost/R-package/R/xgb.train.R view on Meta::CPAN

  # Sort the callbacks into categories
  cb <- categorize.callbacks(callbacks)

  # The tree updating process would need slightly different handling
  is_update <- NVL(params[['process_type']], '.') == 'update'

  # Construct a booster (either a new one or load from xgb_model)
  handle <- xgb.Booster.handle(params, append(watchlist, dtrain), xgb_model)
  bst <- xgb.handleToBooster(handle)

  # extract parameters that can affect the relationship b/w #trees and #iterations
  num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
  num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)

  # When the 'xgb_model' was set, find out how many boosting iterations it has
  niter_init <- 0
  if (!is.null(xgb_model)) {
    niter_init <- as.numeric(xgb.attr(bst, 'niter')) + 1
    if (length(niter_init) == 0) {
      niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class)
    }
  }
  if(is_update && nrounds > niter_init)
    stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)")

  # TODO: distributed code
  rank <- 0
  
  niter_skip <- ifelse(is_update, 0, niter_init)
  begin_iteration <- niter_skip + 1
  end_iteration <- niter_skip + nrounds
  
  # the main loop for boosting iterations
  for (iteration in begin_iteration:end_iteration) {
    
    for (f in cb$pre_iter) f()
    
    xgb.iter.update(bst$handle, dtrain, iteration - 1, obj)
    
    bst_evaluation <- numeric(0)
    if (length(watchlist) > 0)
      bst_evaluation <- xgb.iter.eval(bst$handle, watchlist, iteration - 1, feval)
    
    xgb.attr(bst$handle, 'niter') <- iteration - 1

    for (f in cb$post_iter) f()

    if (stop_condition) break
  }
  for (f in cb$finalize) f(finalize = TRUE)
  
  bst <- xgb.Booster.complete(bst, saveraw = TRUE)
  
  # store the total number of boosting iterations
  bst$niter = end_iteration

  # store the evaluation results
  if (length(evaluation_log) > 0 &&
      nrow(evaluation_log) > 0) {
    # include the previous compatible history when available
    if (inherits(xgb_model, 'xgb.Booster') &&
        !is_update &&
        !is.null(xgb_model$evaluation_log) &&
        all.equal(colnames(evaluation_log),

xgboost/R-package/man/callbacks.Rd view on Meta::CPAN

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/callbacks.R
\name{callbacks}
\alias{callbacks}
\title{Callback closures for booster training.}
\description{
These are used to perform various service tasks either during boosting iterations or at the end.
This approach helps to modularize many of such tasks without bloating the main training methods, 
and it offers .
}
\details{
By default, a callback function is run after each boosting iteration.
An R-attribute \code{is_pre_iteration} could be set for a callback to define a pre-iteration function.

When a callback function has \code{finalize} parameter, its finalizer part will also be run after 
the boosting is completed.

xgboost/R-package/man/cb.print.evaluation.Rd view on Meta::CPAN

}
\arguments{
\item{period}{results would be printed every number of periods}

\item{showsd}{whether standard deviations should be printed (when available)}
}
\description{
Callback closure for printing the result of evaluation
}
\details{
The callback function prints the result of evaluation at every \code{period} iterations.
The initial and the last iteration's evaluations are always printed.

Callback function expects the following values to be set in its calling frame:
\code{bst_evaluation} (also \code{bst_evaluation_err} when available),
\code{iteration},
\code{begin_iteration},
\code{end_iteration}.
}
\seealso{
\code{\link{callbacks}}

xgboost/R-package/man/cb.save.model.Rd view on Meta::CPAN

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/callbacks.R
\name{cb.save.model}
\alias{cb.save.model}
\title{Callback closure for saving a model file.}
\usage{
cb.save.model(save_period = 0, save_name = "xgboost.model")
}
\arguments{
\item{save_period}{save the model to disk after every 
\code{save_period} iterations; 0 means save the model at the end.}

\item{save_name}{the name or path for the saved model file.
It can contain a \code{\link[base]{sprintf}} formatting specifier 
to include the integer iteration number in the file name.
E.g., with \code{save_name} = 'xgboost_%04d.model', 
the file saved at iteration 50 would be named "xgboost_0050.model".}
}
\description{
Callback closure for saving a model file.
}

xgboost/R-package/man/predict.xgb.Booster.Rd view on Meta::CPAN

}
\arguments{
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}}

\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.}

\item{missing}{Missing is only used when input is dense matrix. Pick a float value that represents
missing values in data (e.g., sometimes 0 or some other extreme value is used).}

\item{outputmargin}{whether the prediction should be returned in the for of original untransformed 
sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for 
logistic regression would result in predictions for log-odds instead of probabilities.}

\item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
It will use all the trees by default (\code{NULL} value).}

\item{predleaf}{whether predict leaf index instead.}

\item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).}

\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several 
prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}

\item{...}{Parameters passed to \code{predict.xgb.Booster}}

xgboost/R-package/man/predict.xgb.Booster.Rd view on Meta::CPAN

When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with
\code{num_features + 1} columns. The last "+ 1" column in a matrix corresponds to bias.
For a multiclass case, a list of \code{num_class} elements is returned, where each element is
such a matrix. The contribution values are on the scale of untransformed margin 
(e.g., for binary classification would mean that the contributions are log-odds deviations from bias).
}
\description{
Predicted values based on either xgboost model or model handle object.
}
\details{
Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
and it is not necessarily equal to the number of trees in a model.
E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
But for multiclass classification, while there are multiple trees per iteration, 
\code{ntreelimit} limits the number of boosting iterations.

Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear, 
since gblinear doesn't keep its boosting history.

One possible practical applications of the \code{predleaf} option is to use the model 
as a generator of new features which capture non-linearity and interactions, 
e.g., as implemented in \code{\link{xgb.create.features}}.

Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to
individual predictions. For "gblinear" booster, feature contributions are simply linear terms

xgboost/R-package/man/predict.xgb.Booster.Rd view on Meta::CPAN

sum(pred_labels != lb)/length(lb)

# compare that to the predictions from softmax:
set.seed(11)
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
               max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
               objective = "multi:softmax", num_class = num_class)
pred <- predict(bst, as.matrix(iris[, -5]))
str(pred)
all.equal(pred, pred_labels)
# prediction from using only 5 iterations should result 
# in the same error as seen in iteration 5:
pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
sum(pred5 != lb)/length(lb)


## random forest-like model of 25 trees for binary classification:

set.seed(11)
bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
               nthread = 2, nrounds = 1, objective = "binary:logistic",

xgboost/R-package/man/xgb.cv.Rd view on Meta::CPAN

  \item \code{eta} step size of each boosting step
  \item \code{max_depth} maximum depth of the tree
  \item \code{nthread} number of thread used in training, if not set, all threads are used
}

  See \code{\link{xgb.train}} for further details.
  See also demo/ for walkthrough example in R.}

\item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.}

\item{nrounds}{the max number of iterations}

\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}

\item{label}{vector of response values. Should be provided only when data is an R-matrix.}

\item{missing}{is only used when input is a dense matrix. By default is set to NA, which means 
that NA values should be considered as 'missing' by the algorithm. 
Sometimes, 0 or other extreme value might be used to represent missing values.}

\item{prediction}{A logical value indicating whether to return the test fold predictions

xgboost/R-package/man/xgb.cv.Rd view on Meta::CPAN

\itemize{
  \item \code{call} a function call.
  \item \code{params} parameters that were passed to the xgboost library. Note that it does not 
        capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
  \item \code{callbacks} callback functions that were either automatically assigned or 
        explicitely passed.
  \item \code{evaluation_log} evaluation history storead as a \code{data.table} with the
        first column corresponding to iteration number and the rest corresponding to the 
        CV-based evaluation means and standard deviations for the training and test CV-sets.
        It is created by the \code{\link{cb.evaluation.log}} callback.
  \item \code{niter} number of boosting iterations.
  \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds} 
        parameter or randomly generated.
  \item \code{best_iteration} iteration number with the best evaluation metric value
        (only available with early stopping).
  \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration, 
        which could further be used in \code{predict} method
        (only available with early stopping).
  \item \code{pred} CV prediction values available when \code{prediction} is set. 
        It is either vector or matrix (see \code{\link{cb.cv.predict}}).
  \item \code{models} a liost of the CV folds' models. It is only available with the explicit

xgboost/R-package/man/xgb.plot.deepness.Rd view on Meta::CPAN

\itemize{
 \item the distribution of the number of leafs in a tree model at a certain depth;
 \item the distribution of average weighted number of observations ("cover") 
       ending up in leafs at certain depth.
}
Those could be helpful in determining sensible ranges of the \code{max_depth} 
and \code{min_child_weight} parameters.

When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth
per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how
a tree's median absolute leaf weight changes through the iterations.

This function was inspired by the blog post
\url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}.
}
\examples{

data(agaricus.train, package='xgboost')

# Change max_depth to a higher number to get a more significant result
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,

xgboost/R-package/man/xgb.train.Rd view on Meta::CPAN

    \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class...
    \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
  }
  \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
  \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ran...
}}

\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.}

\item{nrounds}{max number of boosting iterations.}

\item{watchlist}{named list of xgb.DMatrix datasets to use for evaluating model performance.
Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
of these datasets during each boosting iteration, and stored in the end as a field named 
\code{evaluation_log} in the resulting object. When either \code{verbose>=1} or 
\code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously
printed out during the training. 
E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows to track
the performance of each round's model on mat1 and mat2.}

xgboost/R-package/man/xgb.train.Rd view on Meta::CPAN

by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
This parameter is only used when input is a dense matrix.}

\item{weight}{a vector indicating the weight for each row of the input.}
}
\value{
An object of class \code{xgb.Booster} with the following elements:
\itemize{
  \item \code{handle} a handle (pointer) to the xgboost model in memory.
  \item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type.
  \item \code{niter} number of boosting iterations.
  \item \code{evaluation_log} evaluation history storead as a \code{data.table} with the
        first column corresponding to iteration number and the rest corresponding to evaluation
        metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback.
  \item \code{call} a function call.
  \item \code{params} parameters that were passed to the xgboost library. Note that it does not 
        capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
  \item \code{callbacks} callback functions that were either automatically assigned or 
        explicitely passed.
  \item \code{best_iteration} iteration number with the best evaluation metric value
        (only available with early stopping).

xgboost/R-package/tests/testthat/test_basic.R view on Meta::CPAN

test_that("train and predict RF with softprob", {
  lb <- as.numeric(iris$Species) - 1
  nrounds <- 15
  set.seed(11)
  bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
                 max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds,
                 objective = "multi:softprob", num_class=3, verbose = 0,
                 num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5)
  expect_equal(bst$niter, 15)
  expect_equal(xgb.ntree(bst), 15*3*4)
  # predict for all iterations:
  pred <- predict(bst, as.matrix(iris[, -5]), reshape=TRUE)
  expect_equal(dim(pred), c(nrow(iris), 3))
  pred_labels <- max.col(pred) - 1
  err <- sum(pred_labels != lb)/length(lb)
  expect_equal(bst$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
  # predict for 7 iterations and adjust for 4 parallel trees per iteration
  pred <- predict(bst, as.matrix(iris[, -5]), reshape=TRUE, ntreelimit = 7 * 4)
  err <- sum((max.col(pred) - 1) != lb)/length(lb)
  expect_equal(bst$evaluation_log[7, train_merror], err, tolerance = 5e-6)
})

test_that("use of multiple eval metrics works", {
  expect_output(
    bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
                  eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic",
                  eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")

xgboost/R-package/tests/testthat/test_basic.R view on Meta::CPAN

  expect_equal(dim(bst$evaluation_log), c(2, 4))
  expect_equal(colnames(bst$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
})


test_that("training continuation works", {
  dtrain <- xgb.DMatrix(train$data, label = train$label)
  watchlist = list(train=dtrain)
  param <- list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2)

  # for the reference, use 4 iterations at once:
  set.seed(11)
  bst <- xgb.train(param, dtrain, nrounds = 4, watchlist, verbose = 0)
  # first two iterations:
  set.seed(11)
  bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
  # continue for two more:
  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
  if (!windows_flag)
    expect_equal(bst$raw, bst2$raw)
  expect_false(is.null(bst2$evaluation_log))
  expect_equal(dim(bst2$evaluation_log), c(4, 2))
  expect_equal(bst2$evaluation_log, bst$evaluation_log)
  # test continuing from raw model data

xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN


using namespace cub;

//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

/// Verbose output
bool g_verbose = false;

/// Timing iterations
int g_timing_iterations = 100;

/// Default grid size
int g_grid_size = 1;

/// Uniform key samples
bool g_uniform_keys;


//---------------------------------------------------------------------
// Kernels

xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN

        printf("\n\n");
    }

    // Kernel props
    int max_sm_occupancy;
    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD>, BLOCK_THREADS));

    // Copy problem to device
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(Key) * TILE_SIZE * g_grid_size, cudaMemcpyHostToDevice));

    printf("BlockRadixSort %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
        TILE_SIZE * g_grid_size, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
    fflush(stdout);

    // Run kernel once to prime caches and check result
    BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
        d_in,
        d_out,
        d_elapsed);

    // Check for kernel errors and STDIO from the kernel, if any
    CubDebugExit(cudaPeekAtLastError());

xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN

    int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);
    fflush(stdout);

    // Run this several times and average the performance results
    GpuTimer            timer;
    float               elapsed_millis          = 0.0;
    unsigned long long  elapsed_clocks          = 0;

    for (int i = 0; i < g_timing_iterations; ++i)
    {
        timer.Start();

        // Run kernel
        BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
            d_in,
            d_out,
            d_elapsed);

        timer.Stop();

xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN

        // Copy clocks from device
        CubDebugExit(cudaMemcpy(h_elapsed, d_elapsed, sizeof(clock_t) * g_grid_size, cudaMemcpyDeviceToHost));
        for (int i = 0; i < g_grid_size; i++)
            elapsed_clocks += h_elapsed[i];
    }

    // Check for kernel errors and STDIO from the kernel, if any
    CubDebugExit(cudaDeviceSynchronize());

    // Display timing results
    float avg_millis            = elapsed_millis / g_timing_iterations;
    float avg_items_per_sec     = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
    double avg_clocks           = double(elapsed_clocks) / g_timing_iterations / g_grid_size;
    double avg_clocks_per_item  = avg_clocks / TILE_SIZE;

    printf("\tAverage BlockRadixSort::SortBlocked clocks: %.3f\n", avg_clocks);
    printf("\tAverage BlockRadixSort::SortBlocked clocks per item: %.3f\n", avg_clocks_per_item);
    printf("\tAverage kernel millis: %.4f\n", avg_millis);
    printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
    fflush(stdout);

    // Cleanup
    if (h_in) delete[] h_in;

xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN


/**
 * Main
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    g_uniform_keys = args.CheckCmdLineFlag("uniform");
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("grid-size", g_grid_size);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>] "
            "[--i=<timing iterations (default:%d)>]"
            "[--grid-size=<grid size (default:%d)>]"
            "[--v] "
            "\n", argv[0], g_timing_iterations, g_grid_size);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());
    fflush(stdout);

    // Run tests
    printf("\nuint32:\n"); fflush(stdout);
    Test<unsigned int, 128, 13>();

xgboost/cub/examples/block/example_block_reduce.cu view on Meta::CPAN


using namespace cub;

//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

/// Verbose output
bool g_verbose = false;

/// Timing iterations
int g_timing_iterations = 100;

/// Default grid size
int g_grid_size = 1;



//---------------------------------------------------------------------
// Kernels
//---------------------------------------------------------------------

xgboost/cub/examples/block/example_block_reduce.cu view on Meta::CPAN

        printf("\n\n");
    }

    // Kernel props
    int max_sm_occupancy;
    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));

    // Copy problem to device
    cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);

    printf("BlockReduce algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : "BLOCK_REDUCE_WARP_REDUCTIONS",
        TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);

    // Run aggregate/prefix kernel
    BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
        d_in,
        d_out,
        d_elapsed);

    // Check total aggregate
    printf("\tAggregate: ");
    int compare = CompareDeviceResults(&h_aggregate, d_out, 1, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    // Run this several times and average the performance results
    GpuTimer    timer;
    float       elapsed_millis          = 0.0;
    clock_t     elapsed_clocks          = 0;

    for (int i = 0; i < g_timing_iterations; ++i)
    {
        // Copy problem to device
        cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);

        timer.Start();

        // Run aggregate/prefix kernel
        BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
            d_in,
            d_out,

xgboost/cub/examples/block/example_block_reduce.cu view on Meta::CPAN

        CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
        elapsed_clocks += clocks;

    }

    // Check for kernel errors and STDIO from the kernel, if any
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());

    // Display timing results
    float avg_millis            = elapsed_millis / g_timing_iterations;
    float avg_items_per_sec     = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
    float avg_clocks            = float(elapsed_clocks) / g_timing_iterations;
    float avg_clocks_per_item   = avg_clocks / TILE_SIZE;

    printf("\tAverage BlockReduce::Sum clocks: %.3f\n", avg_clocks);
    printf("\tAverage BlockReduce::Sum clocks per item: %.3f\n", avg_clocks_per_item);
    printf("\tAverage kernel millis: %.4f\n", avg_millis);
    printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);

    // Cleanup
    if (h_in) delete[] h_in;
    if (h_gpu) delete[] h_gpu;

xgboost/cub/examples/block/example_block_reduce.cu view on Meta::CPAN



/**
 * Main
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("grid-size", g_grid_size);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>] "
            "[--i=<timing iterations>] "
            "[--grid-size=<grid size>] "
            "[--v] "
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Run tests

xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN


using namespace cub;

//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

/// Verbose output
bool g_verbose = false;

/// Timing iterations
int g_timing_iterations = 100;

/// Default grid size
int g_grid_size = 1;



//---------------------------------------------------------------------
// Kernels
//---------------------------------------------------------------------

xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN

        printf("\n\n");
    }

    // Kernel props
    int max_sm_occupancy;
    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));

    // Copy problem to device
    cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);

    printf("BlockScan algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
        (ALGORITHM == BLOCK_SCAN_RAKING) ? "BLOCK_SCAN_RAKING" : (ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE) ? "BLOCK_SCAN_RAKING_MEMOIZE" : "BLOCK_SCAN_WARP_SCANS",
        TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);

    // Run aggregate/prefix kernel
    BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
        d_in,
        d_out,
        d_elapsed);

    // Check results
    printf("\tOutput items: ");
    int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);

xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN

    printf("\tAggregate: ");
    compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    // Run this several times and average the performance results
    GpuTimer    timer;
    float       elapsed_millis          = 0.0;
    clock_t     elapsed_clocks          = 0;

    for (int i = 0; i < g_timing_iterations; ++i)
    {
        // Copy problem to device
        cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);

        timer.Start();

        // Run aggregate/prefix kernel
        BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
            d_in,
            d_out,

xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN

        CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
        elapsed_clocks += clocks;

    }

    // Check for kernel errors and STDIO from the kernel, if any
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());

    // Display timing results
    float avg_millis            = elapsed_millis / g_timing_iterations;
    float avg_items_per_sec     = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
    float avg_clocks            = float(elapsed_clocks) / g_timing_iterations;
    float avg_clocks_per_item   = avg_clocks / TILE_SIZE;

    printf("\tAverage BlockScan::Sum clocks: %.3f\n", avg_clocks);
    printf("\tAverage BlockScan::Sum clocks per item: %.3f\n", avg_clocks_per_item);
    printf("\tAverage kernel millis: %.4f\n", avg_millis);
    printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);

    // Cleanup
    if (h_in) delete[] h_in;
    if (h_reference) delete[] h_reference;

xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN



/**
 * Main
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("grid-size", g_grid_size);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>] "
            "[--i=<timing iterations (default:%d)>]"
            "[--grid-size=<grid size (default:%d)>]"
            "[--v] "
            "\n", argv[0], g_timing_iterations, g_grid_size);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Run tests
    Test<1024, 1, BLOCK_SCAN_RAKING>();
    Test<512, 2, BLOCK_SCAN_RAKING>();
    Test<256, 4, BLOCK_SCAN_RAKING>();

xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN

//---------------------------------------------------------------------

/**
 * Main
 */
int main(int argc, char** argv)
{
    typedef unsigned int    Key;
    typedef int             Value;

    int timing_iterations   = 0;
    int num_items           = 40;
    Key max_key             = 20;       // Max item

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_items);
    args.GetCmdLineArgument("maxkey", max_key);
    args.GetCmdLineArgument("i", timing_iterations);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>] "
            "[--i=<timing iterations> "
            "[--n=<input items, default 40> "
            "[--maxkey=<max key, default 20 (use -1 to test only unique keys)>]"
            "[--v] "
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN

    int num_runs = Solve(h_keys, h_values, num_items, h_offsets_reference, h_lengths_reference);

    printf("%d non-trivial runs\n", num_runs);
    fflush(stdout);

    // Repeat for performance timing
    GpuTimer gpu_timer;
    GpuTimer gpu_rle_timer;
    float elapsed_millis = 0.0;
    float elapsed_rle_millis = 0.0;
    for (int i = 0; i <= timing_iterations; ++i)
    {

        // Allocate and initialize device arrays for sorting
        DoubleBuffer<Key>       d_keys;
        DoubleBuffer<Value>     d_values;
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(Key) * num_items));
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(Key) * num_items));
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(Value) * num_items));
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(Value) * num_items));

xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN

    }

    // Host cleanup
    if (h_keys) delete[] h_keys;
    if (h_values) delete[] h_values;
    if (h_offsets_reference) delete[] h_offsets_reference;
    if (h_lengths_reference) delete[] h_lengths_reference;

    printf("\n\n");

    if (timing_iterations > 0)
    {
        printf("%d timing iterations, average time to sort and isolate non-trivial duplicates: %.3f ms (%.3f ms spent in RLE isolation)\n",
            timing_iterations,
            elapsed_millis / timing_iterations,
            elapsed_rle_millis / timing_iterations);
    }

    return 0;
}

xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN



/******************************************************************************
 * Globals, constants, and typedefs
 ******************************************************************************/

typedef int         VertexId;   // uint32s as vertex ids
typedef double      Value;      // double-precision floating point values

bool                    g_verbose       = false;
int                     g_timing_iterations    = 1;
CachingDeviceAllocator  g_allocator;


/******************************************************************************
 * Texture referencing
 ******************************************************************************/

/**
 * Templated texture reference type for multiplicand vector
 */

xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN

    {
        printf("CooFinalizeKernel<<<1, %d>>>(...)\n", FINALIZE_BLOCK_THREADS);
    }
    fflush(stdout);

    CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));

    // Run kernel (always run one iteration without timing)
    GpuTimer gpu_timer;
    float elapsed_millis = 0.0;
    for (int i = 0; i <= g_timing_iterations; i++)
    {
        gpu_timer.Start();

        // Initialize output
        CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));

        // Run the COO kernel
        CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
            even_share,
            d_block_partials,

xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN


        if (i > 0)
            elapsed_millis += gpu_timer.ElapsedMillis();
    }

    // Force any kernel stdio to screen
    CubDebugExit(cudaThreadSynchronize());
    fflush(stdout);

    // Display timing
    if (g_timing_iterations > 0)
    {
        float avg_elapsed = elapsed_millis / g_timing_iterations;
        int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim);
        printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n",
            g_timing_iterations,
            avg_elapsed,
            total_bytes / avg_elapsed / 1000.0 / 1000.0,
            num_edges * 2 / avg_elapsed / 1000.0 / 1000.0);
    }

    // Check results
    int compare = CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, true, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN



/**
 * Main
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("i", g_timing_iterations);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s\n [--device=<device-id>] [--v] [--iterations=<test iterations>] [--grid-size=<grid-size>]\n"
            "\t--type=wheel --spokes=<spokes>\n"
            "\t--type=grid2d --width=<width> [--no-self-loops]\n"
            "\t--type=grid3d --width=<width> [--no-self-loops]\n"
            "\t--type=market --file=<file>\n"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

xgboost/cub/experimental/defunct/test_device_seg_reduce.cu view on Meta::CPAN


using namespace cub;
using namespace std;


/******************************************************************************
 * Globals, constants, and typedefs
 ******************************************************************************/

bool                    g_verbose           = false;
int                     g_timing_iterations = 1;
CachingDeviceAllocator  g_allocator(true);


/******************************************************************************
 * Utility routines
 ******************************************************************************/


/**
 * An pair of index offsets

xgboost/cub/experimental/defunct/test_device_seg_reduce.cu view on Meta::CPAN

    int compare = CompareDeviceResults(h_reference, d_output, num_segments, true, g_verbose);
    printf("\t%s", compare ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    for (int i = 0; i < g_timing_iterations; ++i)
    {
        CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
    }
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_values) / avg_millis / 1000.0 / 1000.0;
        float giga_bandwidth = giga_rate *
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }

    // Device cleanup
    if (d_values) CubDebugExit(g_allocator.DeviceFree(d_values));
    if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
    if (d_output) CubDebugExit(g_allocator.DeviceFree(d_output));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

xgboost/cub/experimental/defunct/test_device_seg_reduce.cu view on Meta::CPAN

int main(int argc, char** argv)
{
    int num_values          = 32 * 1024 * 1024;
    int avg_segment_size    = 500;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_values);
    args.GetCmdLineArgument("ss", avg_segment_size);
    args.GetCmdLineArgument("i", g_timing_iterations);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>] "
            "[--v] "
            "[--i=<timing iterations>] "
            "[--n=<input samples>]\n"
            "[--ss=<average segment size>]\n"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

    Test<false>((int) num_values, avg_segment_size, Sum(), (long long) 0, CUB_TYPE_STRING(long long));

xgboost/cub/experimental/histogram_compare.cu view on Meta::CPAN

    int         ACTIVE_CHANNELS,
    int         NUM_BINS,
    typename    PixelType>
void RunTest(
    std::vector<std::pair<std::string, double> >&   timings,
    PixelType*                                      d_pixels,
    const int                                       width,
    const int                                       height,
    unsigned int *                                  d_hist,
    unsigned int *                                  h_hist,
    int                                             timing_iterations,
    const char *                                    long_name,
    const char *                                    short_name,
    double (*f)(PixelType*, int, int, unsigned int*, bool))
{
    if (!g_report) printf("%s ", long_name); fflush(stdout);

    // Run single test to verify (and code cache)
    (*f)(d_pixels, width, height, d_hist, !g_report);

    int compare = CompareDeviceResults(h_hist, d_hist, ACTIVE_CHANNELS * NUM_BINS, true, g_verbose);
    if (!g_report) printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);

    double elapsed_ms = 0;
    for (int i = 0; i < timing_iterations; i++)
    {
        elapsed_ms += (*f)(d_pixels, width, height, d_hist, false);
    }
    double avg_us = (elapsed_ms / timing_iterations) * 1000;    // average in us
    timings.push_back(std::pair<std::string, double>(short_name, avg_us));

    if (!g_report)
    {
        printf("Avg time %.3f us (%d iterations)\n", avg_us, timing_iterations); fflush(stdout);
    }
    else
    {
        printf("%.3f, ", avg_us); fflush(stdout);
    }

    AssertEquals(0, compare);
}

xgboost/cub/experimental/histogram_compare.cu view on Meta::CPAN

 */
template <
    int         NUM_CHANNELS,
    int         ACTIVE_CHANNELS,
    int         NUM_BINS,
    typename    PixelType>
void TestMethods(
    PixelType*  h_pixels,
    int         height,
    int         width,
    int         timing_iterations,
    double      bandwidth_GBs)
{
    // Copy data to gpu
    PixelType* d_pixels;
    size_t pixel_bytes = width * height * sizeof(PixelType);
    CubDebugExit(g_allocator.DeviceAllocate((void**) &d_pixels, pixel_bytes));
    CubDebugExit(cudaMemcpy(d_pixels, h_pixels, pixel_bytes, cudaMemcpyHostToDevice));

    if (g_report) printf("%.3f, ", double(pixel_bytes) / bandwidth_GBs / 1000);

xgboost/cub/experimental/histogram_compare.cu view on Meta::CPAN

    h_hist = (unsigned int *) malloc(histogram_bytes);
    g_allocator.DeviceAllocate((void **) &d_hist, histogram_bytes);

    // Compute reference cpu histogram
    HistogramGold<ACTIVE_CHANNELS, NUM_BINS>(h_pixels, width, height, h_hist);

    // Store timings
    std::vector<std::pair<std::string, double> > timings;

    // Run experiments
    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
        "CUB", "CUB", run_cub_histogram<NUM_CHANNELS, ACTIVE_CHANNELS, NUM_BINS, PixelType>);
    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
        "Shared memory atomics", "smem atomics", run_smem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
        "Global memory atomics", "gmem atomics", run_gmem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);

    // Report timings
    if (!g_report)
    {
        std::sort(timings.begin(), timings.end(), less_than_value());
        printf("Timings (us):\n");
        for (int i = 0; i < timings.size(); i++)
        {
            double bandwidth = height * width * sizeof(PixelType) / timings[i].second / 1000;

xgboost/cub/experimental/histogram_compare.cu view on Meta::CPAN

}


/**
 * Test different problem genres
 */
void TestGenres(
    uchar4*     uchar4_pixels,
    int         height,
    int         width,
    int         timing_iterations,
    double      bandwidth_GBs)
{
    int num_pixels = width * height;

    {
        if (!g_report) printf("1 channel uchar1 tests (256-bin):\n\n"); fflush(stdout);

        size_t      image_bytes     = num_pixels * sizeof(uchar1);
        uchar1*     uchar1_pixels   = (uchar1*) malloc(image_bytes);

        // Convert to 1-channel (averaging first 3 channels)
        for (int i = 0; i < num_pixels; ++i)
        {
            uchar1_pixels[i].x = (unsigned char)
                (((unsigned int) uchar4_pixels[i].x +
                  (unsigned int) uchar4_pixels[i].y +
                  (unsigned int) uchar4_pixels[i].z) / 3);
        }

        TestMethods<1, 1, 256>(uchar1_pixels, width, height, timing_iterations, bandwidth_GBs);
        free(uchar1_pixels);
        if (g_report) printf(", ");
    }

    {
        if (!g_report) printf("3/4 channel uchar4 tests (256-bin):\n\n"); fflush(stdout);
        TestMethods<4, 3, 256>(uchar4_pixels, width, height, timing_iterations, bandwidth_GBs);
        if (g_report) printf(", ");
    }

    {
        if (!g_report) printf("3/4 channel float4 tests (256-bin):\n\n"); fflush(stdout);
        size_t      image_bytes     = num_pixels * sizeof(float4);
        float4*     float4_pixels   = (float4*) malloc(image_bytes);

        // Convert to float4 with range [0.0, 1.0)
        for (int i = 0; i < num_pixels; ++i)
        {
            float4_pixels[i].x = float(uchar4_pixels[i].x) / 256;
            float4_pixels[i].y = float(uchar4_pixels[i].y) / 256;
            float4_pixels[i].z = float(uchar4_pixels[i].z) / 256;
            float4_pixels[i].w = float(uchar4_pixels[i].w) / 256;
        }
        TestMethods<4, 3, 256>(float4_pixels, width, height, timing_iterations, bandwidth_GBs);
        free(float4_pixels);
        if (g_report) printf("\n");
    }
}


/**
 * Main
 */
int main(int argc, char **argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    if (args.CheckCmdLineFlag("help"))
    {
        printf(
            "%s "
            "[--device=<device-id>] "
            "[--v] "
            "[--i=<timing iterations>] "
            "\n\t"
                "--file=<.tga filename> "
            "\n\t"
                "--entropy=<-1 (0%), 0 (100%), 1 (81%), 2 (54%), 3 (34%), 4 (20%), ..."
                "[--height=<default: 1080>] "
                "[--width=<default: 1920>] "
            "\n", argv[0]);
        exit(0);
    }

    std::string         filename;
    int                 timing_iterations   = 100;
    int                 entropy_reduction   = 0;
    int                 height              = 1080;
    int                 width               = 1920;

    g_verbose = args.CheckCmdLineFlag("v");
    g_report = args.CheckCmdLineFlag("report");
    args.GetCmdLineArgument("i", timing_iterations);
    args.GetCmdLineArgument("file", filename);
    args.GetCmdLineArgument("height", height);
    args.GetCmdLineArgument("width", width);
    args.GetCmdLineArgument("entropy", entropy_reduction);

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Get GPU device bandwidth (GB/s)
    int device_ordinal, bus_width, mem_clock_khz;

xgboost/cub/experimental/histogram_compare.cu view on Meta::CPAN

            ReadTga(uchar4_pixels, width, height, filename.c_str());
            printf("File %s: width(%d) height(%d)\n\n", filename.c_str(), width, height); fflush(stdout);
        }
        else
        {
            // Generate image
            GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
            printf("Random image: entropy-reduction(%d) width(%d) height(%d)\n\n", entropy_reduction, width, height); fflush(stdout);
        }

        TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
    }
    else
    {
        // Run test suite
        printf("Test, MIN, RLE CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM\n");

        // Entropy reduction tests
        for (entropy_reduction = 0; entropy_reduction < 5; ++entropy_reduction)
        {
            printf("entropy reduction %d, ", entropy_reduction);
            GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
            TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
        }
        printf("entropy reduction -1, ");
        GenerateRandomImage(uchar4_pixels, width, height, -1);
        TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
        printf("\n");

        // File image tests
        std::vector<std::string> file_tests;
        file_tests.push_back("animals");
        file_tests.push_back("apples");
        file_tests.push_back("sunset");
        file_tests.push_back("cheetah");
        file_tests.push_back("nature");
        file_tests.push_back("operahouse");
        file_tests.push_back("austin");
        file_tests.push_back("cityscape");

        for (int i = 0; i < file_tests.size(); ++i)
        {
            printf("%s, ", file_tests[i].c_str());
            std::string filename = std::string("histogram/benchmark/") + file_tests[i] + ".tga";
            ReadTga(uchar4_pixels, width, height, filename.c_str());
            TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
        }
    }

    free(uchar4_pixels);

    CubDebugExit(cudaDeviceSynchronize());
    printf("\n\n");

    return 0;
}

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN



/**
 * Run GPU I/O proxy
 */
template <
    typename ValueT,
    typename OffsetT>
float TestGpuCsrIoProxy(
    SpmvParams<ValueT, OffsetT>&    params,
    int                             timing_iterations)
{
    enum {
        BLOCK_THREADS       = 128,
        ITEMS_PER_THREAD    = 7,
        TILE_SIZE           = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

//    size_t smem = 1024 * 16;
    size_t smem = 1024 * 0;

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

    NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);

    // Check for failures
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(SyncStream(0));

    // Timing
    GpuTimer timer;
    float elapsed_millis = 0.0;
    timer.Start();
    for (int it = 0; it < timing_iterations; ++it)
    {
        NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    CubDebugExit(x_itr.UnbindTexture());

    return elapsed_millis / timing_iterations;
}



//---------------------------------------------------------------------
// cuSparse HybMV
//---------------------------------------------------------------------

/**
 * Run cuSparse HYB SpMV (specialized for fp32)
 */
template <
    typename OffsetT>
float TestCusparseHybmv(
    float*                          vector_y_in,
    float*                          reference_vector_y_out,
    SpmvParams<float, OffsetT>&     params,
    int                             timing_iterations,
    cusparseHandle_t                cusparse)
{
    CpuTimer cpu_timer;
    cpu_timer.Start();

    // Construct Hyb matrix
    cusparseMatDescr_t mat_desc;
    cusparseHybMat_t hyb_desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    elapsed_millis    = 0.0;
    GpuTimer timer;

    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
            cusparse,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            &params.alpha, mat_desc,
            hyb_desc,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    // Cleanup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));

    return elapsed_millis / timing_iterations;
}


/**
 * Run cuSparse HYB SpMV (specialized for fp64)
 */
template <
    typename OffsetT>
float TestCusparseHybmv(
    double*                         vector_y_in,
    double*                         reference_vector_y_out,
    SpmvParams<double, OffsetT>&    params,
    int                             timing_iterations,
    cusparseHandle_t                cusparse)
{
    CpuTimer cpu_timer;
    cpu_timer.Start();

    // Construct Hyb matrix
    cusparseMatDescr_t mat_desc;
    cusparseHybMat_t hyb_desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    elapsed_millis    = 0.0;
    GpuTimer timer;

    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
            cusparse,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            &params.alpha, mat_desc,
            hyb_desc,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    // Cleanup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));

    return elapsed_millis / timing_iterations;
}



//---------------------------------------------------------------------
// cuSparse CsrMV
//---------------------------------------------------------------------

/**
 * Run cuSparse SpMV (specialized for fp32)
 */
template <
    typename OffsetT>
float TestCusparseCsrmv(
    float*                          vector_y_in,
    float*                          reference_vector_y_out,
    SpmvParams<float, OffsetT>&     params,
    int                             timing_iterations,
    cusparseHandle_t                cusparse)
{
    cusparseMatDescr_t desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    float elapsed_millis    = 0.0;
    GpuTimer timer;

    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
    return elapsed_millis / timing_iterations;
}


/**
 * Run cuSparse SpMV (specialized for fp64)
 */
template <
    typename OffsetT>
float TestCusparseCsrmv(
    double*                         vector_y_in,
    double*                         reference_vector_y_out,
    SpmvParams<double, OffsetT>&    params,
    int                             timing_iterations,
    cusparseHandle_t                cusparse)
{
    cusparseMatDescr_t desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    float elapsed_millis = 0.0;
    GpuTimer timer;
    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, &params.beta, params.d_vector_y));

    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
    return elapsed_millis / timing_iterations;
}

//---------------------------------------------------------------------
// GPU Merge-based SpMV
//---------------------------------------------------------------------

/**
 * Run CUB SpMV
 */
template <
    typename ValueT,
    typename OffsetT>
float TestGpuMergeCsrmv(
    ValueT*                         vector_y_in,
    ValueT*                         reference_vector_y_out,
    SpmvParams<ValueT, OffsetT>&    params,
    int                             timing_iterations)
{
    // Allocate temporary storage
    size_t temp_storage_bytes = 0;
    void *d_temp_storage = NULL;

    // Get amount of temporary storage needed
    CubDebugExit(DeviceSpmv::CsrMV(
        d_temp_storage, temp_storage_bytes,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, params.d_vector_y,

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    GpuTimer timer;
    float elapsed_millis = 0.0;

    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        CubDebugExit(DeviceSpmv::CsrMV(
            d_temp_storage, temp_storage_bytes,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, params.d_vector_y,
            params.num_rows, params.num_cols, params.num_nonzeros, 
// params.alpha, params.beta,
            (cudaStream_t) 0, false));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    return elapsed_millis / timing_iterations;
}

//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------

/**
 * Display perf
 */
template <typename ValueT, typename OffsetT>

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

 * Run tests
 */
template <
    typename ValueT,
    typename OffsetT>
void RunTest(
    bool                        rcm_relabel,
    ValueT                      alpha,
    ValueT                      beta,
    CooMatrix<ValueT, OffsetT>& coo_matrix,
    int                         timing_iterations,
    CommandLineArgs&            args)
{
    // Adaptive timing iterations: run 16 billion nonzeros through
    if (timing_iterations == -1)
        timing_iterations = std::min(50000ull, std::max(100ull, ((16ull << 30) / coo_matrix.num_nonzeros)));

    if (!g_quiet)
        printf("\t%d timing iterations\n", timing_iterations);

    // Convert to CSR
    CsrMatrix<ValueT, OffsetT> csr_matrix;
    csr_matrix.FromCoo(coo_matrix);
    if (!args.CheckCmdLineFlag("csrmv"))
        coo_matrix.Clear();

    // Relabel
    if (rcm_relabel)
    {

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

    params.alpha            = alpha;
    params.beta             = beta;

    CubDebugExit(cudaMemcpy(params.d_values,            csr_matrix.values,          sizeof(ValueT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(params.d_row_end_offsets,   csr_matrix.row_offsets,     sizeof(OffsetT) * (csr_matrix.num_rows + 1), cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(params.d_column_indices,    csr_matrix.column_indices,  sizeof(OffsetT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(params.d_vector_x,          vector_x,                   sizeof(ValueT) * csr_matrix.num_cols, cudaMemcpyHostToDevice));

    if (!g_quiet) printf("\n\n");
    printf("GPU CSR I/O Prox, "); fflush(stdout);
    avg_millis = TestGpuCsrIoProxy(params, timing_iterations);
    DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);

    if (args.CheckCmdLineFlag("csrmv"))
    {
        if (!g_quiet) printf("\n\n");
        printf("CUB, "); fflush(stdout);
        avg_millis = TestGpuMergeCsrmv(vector_y_in, vector_y_out, params, timing_iterations);
        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
    }

    // Initialize cuSparse
    cusparseHandle_t cusparse;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreate(&cusparse));

    if (args.CheckCmdLineFlag("csrmv"))
    {
        if (!g_quiet) printf("\n\n");
        printf("Cusparse CsrMV, "); fflush(stdout);
        avg_millis = TestCusparseCsrmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
    }

    if (args.CheckCmdLineFlag("hybmv"))
    {
        if (!g_quiet) printf("\n\n");
        printf("Cusparse HybMV, "); fflush(stdout);

        avg_millis = TestCusparseHybmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
    }


    // Cleanup
    if (params.d_values)            CubDebugExit(g_allocator.DeviceFree(params.d_values));
    if (params.d_row_end_offsets)   CubDebugExit(g_allocator.DeviceFree(params.d_row_end_offsets));
    if (params.d_column_indices)    CubDebugExit(g_allocator.DeviceFree(params.d_column_indices));
    if (params.d_vector_x)          CubDebugExit(g_allocator.DeviceFree(params.d_vector_x));
    if (params.d_vector_y)          CubDebugExit(g_allocator.DeviceFree(params.d_vector_y));

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

    typename OffsetT>
void RunTests(
    bool                rcm_relabel,
    ValueT              alpha,
    ValueT              beta,
    const std::string&  mtx_filename,
    int                 grid2d,
    int                 grid3d,
    int                 wheel,
    int                 dense,
    int                 timing_iterations,
    CommandLineArgs&    args)
{
    // Initialize matrix in COO form
    CooMatrix<ValueT, OffsetT> coo_matrix;

    if (!mtx_filename.empty())
    {
        // Parse matrix market file
        printf("%s, ", mtx_filename.c_str()); fflush(stdout);
        coo_matrix.InitMarket(mtx_filename, 1.0, !g_quiet);

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

    {
        fprintf(stderr, "No graph type specified.\n");
        exit(1);
    }

    RunTest(
        rcm_relabel,
        alpha,
        beta,
        coo_matrix,
        timing_iterations,
        args);
}



/**
 * Main
 */
int main(int argc, char **argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    if (args.CheckCmdLineFlag("help"))
    {
        printf(
            "%s "
            "[--csrmv | --hybmv | --bsrmv ] "
            "[--device=<device-id>] "
            "[--quiet] "
            "[--v] "
            "[--i=<timing iterations>] "
            "[--fp64] "
            "[--rcm] "
            "[--alpha=<alpha scalar (default: 1.0)>] "
            "[--beta=<beta scalar (default: 0.0)>] "
            "\n\t"
                "--mtx=<matrix market file> "
            "\n\t"
                "--dense=<cols>"
            "\n\t"
                "--grid2d=<width>"

xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN

        exit(0);
    }

    bool                fp64;
    bool                rcm_relabel;
    std::string         mtx_filename;
    int                 grid2d              = -1;
    int                 grid3d              = -1;
    int                 wheel               = -1;
    int                 dense               = -1;
    int                 timing_iterations   = -1;
    float               alpha               = 1.0;
    float               beta                = 0.0;

    g_verbose = args.CheckCmdLineFlag("v");
    g_verbose2 = args.CheckCmdLineFlag("v2");
    g_quiet = args.CheckCmdLineFlag("quiet");
    fp64 = args.CheckCmdLineFlag("fp64");
    rcm_relabel = args.CheckCmdLineFlag("rcm");
    args.GetCmdLineArgument("i", timing_iterations);
    args.GetCmdLineArgument("mtx", mtx_filename);
    args.GetCmdLineArgument("grid2d", grid2d);
    args.GetCmdLineArgument("grid3d", grid3d);
    args.GetCmdLineArgument("wheel", wheel);
    args.GetCmdLineArgument("dense", dense);
    args.GetCmdLineArgument("alpha", alpha);
    args.GetCmdLineArgument("beta", beta);

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Run test(s)
    if (fp64)
    {
        RunTests<double, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
    }
    else
    {
        RunTests<float, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
    }

    CubDebugExit(cudaDeviceSynchronize());
    printf("\n");

    return 0;
}

xgboost/cub/test/test_allocator.cu view on Meta::CPAN

{
    // Initialize command line
    CommandLineArgs args(argc, argv);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>]"
            "[--bytes=<timing bytes>]"
            "[--i=<timing iterations>]"
            "\n", argv[0]);
        exit(0);
    }

#if (CUB_PTX_ARCH == 0)

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Get number of GPUs and current GPU
    int num_gpus;
    int initial_gpu;
    int timing_iterations           = 10000;
    int timing_bytes                = 1024 * 1024;

    if (CubDebug(cudaGetDeviceCount(&num_gpus))) exit(1);
    if (CubDebug(cudaGetDevice(&initial_gpu))) exit(1);
    args.GetCmdLineArgument("i", timing_iterations);
    args.GetCmdLineArgument("bytes", timing_bytes);

    // Create default allocator (caches up to 6MB in device allocations per GPU)
    CachingDeviceAllocator allocator;
    allocator.debug = true;

    printf("Running single-gpu tests...\n"); fflush(stdout);

    //
    // Test0

xgboost/cub/test/test_allocator.cu view on Meta::CPAN


        // Check that that still we have 0 live block across all GPUs
        AssertEquals(allocator.live_blocks.size(), 0);
    }
#endif  // CUB_CDP

    //
    // Performance
    //

    printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
    fflush(stdout); fflush(stderr);

    // CPU performance comparisons vs cached.  Allocate and free a 1MB block 2000 times
    CpuTimer    cpu_timer;
    char        *d_1024MB                       = NULL;
    allocator.debug                             = false;

    // Prime the caching allocator and the kernel
    CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
    CubDebugExit(allocator.DeviceFree(d_1024MB));
    cub::EmptyKernel<void><<<1, 32>>>();

    // CUDA
    cpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
        CubDebugExit(cudaFree(d_1024MB));
    }
    cpu_timer.Stop();
    float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();

    // CUB
    cpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
        CubDebugExit(allocator.DeviceFree(d_1024MB));
    }
    cpu_timer.Stop();
    float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();

    printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
        cuda_malloc_elapsed_millis / timing_iterations,
        cub_calloc_elapsed_millis / timing_iterations);

    // GPU performance comparisons.  Allocate and free a 1MB block 2000 times
    GpuTimer gpu_timer;

    printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
    fflush(stdout); fflush(stderr);

    // Kernel-only
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        cub::EmptyKernel<void><<<1, 32>>>();
    }
    gpu_timer.Stop();
    float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();

    // CUDA
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
        cub::EmptyKernel<void><<<1, 32>>>();
        CubDebugExit(cudaFree(d_1024MB));
    }
    gpu_timer.Stop();
    cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;

    // CUB
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
        cub::EmptyKernel<void><<<1, 32>>>();
        CubDebugExit(allocator.DeviceFree(d_1024MB));
    }
    gpu_timer.Stop();
    cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;

    printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
        cuda_malloc_elapsed_millis / timing_iterations,
        cub_calloc_elapsed_millis / timing_iterations);


#endif

    printf("Success\n");

    return 0;
}

xgboost/cub/test/test_block_histogram.cu view on Meta::CPAN

#include "test_util.h"

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose           = false;
int                     g_timing_iterations = 0;
int                     g_repeat            = 0;
CachingDeviceAllocator  g_allocator(true);


//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------

/**
 * BlockHistogram test kernel.

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN

enum Backend
{
    CUB,        // CUB method
    NPP,        // NPP method
    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
};


bool                    g_verbose_input     = false;
bool                    g_verbose           = false;
int                     g_timing_iterations = 0;
int                     g_repeat            = 0;
CachingDeviceAllocator  g_allocator(true);




//---------------------------------------------------------------------
// Dispatch to NPP histogram
//---------------------------------------------------------------------

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN


/**
 * Dispatch to single-channel 8b NPP histo-even
 */
template <typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
    Int2Type<1>             num_channels,
    Int2Type<1>             num_active_channels,
    Int2Type<NPP>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    unsigned char       *d_samples,               ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists ...
    CounterT            *d_histogram[1],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_leve...
    int                 num_levels[1],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
    LevelT              lower_level[1],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
    LevelT              upper_level[1],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN

    };

    if (d_temp_storage_bytes == NULL)
    {
        int nDeviceBufferSize;
        nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, num_levels[0] ,&nDeviceBufferSize);
        temp_storage_bytes = nDeviceBufferSize;
    }
    else
    {
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            // compute the histogram
            nppiHistogramEven_8u_C1R(
                d_samples,
                row_stride_bytes,
                oSizeROI,
                d_histogram[0],
                num_levels[0],
                lower_level[0],
                upper_level[0],

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN


/**
 * Dispatch to 3/4 8b NPP histo-even
 */
template <typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
    Int2Type<4>          num_channels,
    Int2Type<3>   num_active_channels,
    Int2Type<NPP>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    unsigned char       *d_samples,               ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists ...
    CounterT            *d_histogram[3],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_leve...
    int                 num_levels[3],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
    LevelT              lower_level[3],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
    LevelT              upper_level[3],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN

    };

    if (d_temp_storage_bytes == NULL)
    {
        int nDeviceBufferSize;
        nppiHistogramEvenGetBufferSize_8u_AC4R(oSizeROI, num_levels ,&nDeviceBufferSize);
        temp_storage_bytes = nDeviceBufferSize;
    }
    else
    {
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            // compute the histogram
            nppiHistogramEven_8u_AC4R(
                d_samples,
                row_stride_bytes,
                oSizeROI,
                d_histogram,
                num_levels,
                lower_level,
                upper_level,

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN


/**
 * Dispatch to CUB single histogram-even entrypoint
 */
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
    Int2Type<1>             num_channels,
    Int2Type<1>             num_active_channels,
    Int2Type<CUB>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
    CounterT            *d_histogram[1],                            ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
    int                 num_levels[1],                              ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
    LevelT              lower_level[1],                             ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
    LevelT              upper_level[1],                             ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceHistogram::HistogramEven(
            d_temp_storage,
            temp_storage_bytes,
            (const SampleT *) d_samples,
            d_histogram[0],
            num_levels[0],
            lower_level[0],
            upper_level[0],
            num_row_pixels,

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN


/**
 * Dispatch to CUB multi histogram-even entrypoint
 */
template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
    Int2Type<NUM_CHANNELS>          num_channels,
    Int2Type<NUM_ACTIVE_CHANNELS>   num_active_channels,
    Int2Type<CUB>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
    CounterT            *d_histogram[NUM_ACTIVE_CHANNELS],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
    int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
    LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
    LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
            d_temp_storage,
            temp_storage_bytes,
            (const SampleT *) d_samples,
            d_histogram,
            num_levels,
            lower_level,
            upper_level,
            num_row_pixels,

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN


/**
 * Dispatch to CUB single histogram-range entrypoint
 */
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchRange(
    Int2Type<1>             num_channels,
    Int2Type<1>             num_active_channels,
    Int2Type<CUB>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
    CounterT            *d_histogram[1],                            ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
    int                 num_levels[1],                              ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
    LevelT              *d_levels[1],                               ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceHistogram::HistogramRange(
            d_temp_storage,
            temp_storage_bytes,
            (const SampleT *) d_samples,
            d_histogram[0],
            num_levels[0],
            d_levels[0],
            num_row_pixels,
            num_rows,

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN


/**
 * Dispatch to CUB multi histogram-range entrypoint
 */
template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchRange(
    Int2Type<NUM_CHANNELS>          num_channels,
    Int2Type<NUM_ACTIVE_CHANNELS>   num_active_channels,
    Int2Type<CUB>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
    CounterT            *d_histogram[NUM_ACTIVE_CHANNELS],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
    int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
    LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;

    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
            d_temp_storage,
            temp_storage_bytes,
            (const SampleT *) d_samples,
            d_histogram,
            num_levels,
            d_levels,
            num_row_pixels,
            num_rows,

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN

//---------------------------------------------------------------------
// CUDA nested-parallelism test kernel
//---------------------------------------------------------------------

/**
 * Simple wrapper kernel to invoke DeviceHistogram
 * /
template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
__global__ void CnpDispatchKernel(
    Int2Type<ALGORITHM> algorithm,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t              temp_storage_bytes,
    SampleT             *d_samples,
    SampleIteratorT      d_sample_itr,
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_out_histograms,
    int                 num_samples,
    bool                debug_synchronous)
{
#ifndef CUB_CDP
    *d_cdp_error = cudaErrorNotSupported;
#else
    *d_cdp_error = Dispatch<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(algorithm, Int2Type<false>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_out_histograms.array, num_s...
    *d_temp_storage_bytes = temp_storage_bytes;
#endif
}


/ **
 * Dispatch to CDP kernel
 * /
template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
cudaError_t Dispatch(
    Int2Type<ALGORITHM> algorithm,
    Int2Type<true>      use_cdp,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    SampleT             *d_samples,
    SampleIteratorT      d_sample_itr,
    CounterT        *d_histograms[NUM_ACTIVE_CHANNELS],
    int                 num_samples,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_histo_wrapper;
    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
        d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];

    // Invoke kernel to invoke device-side dispatch
    CnpDispatchKernel<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, ALGORITHM><<<1,1>>>(algorithm, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_histo...

    // Copy out temp_storage_bytes
    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

    // Copy out error
    cudaError_t retval;
    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
    return retval;
}
*/

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN

        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
        error |= channel_error;
    }

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();

    DispatchEven(
        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes,
        0, false);

    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(SampleT);
        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
            avg_millis,
            giga_rate,
            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
            giga_rate / NUM_CHANNELS,
            giga_bandwidth);
    }

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN

        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
        error |= channel_error;
    }

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();

    DispatchRange(
        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, d_levels,
        num_row_pixels, num_rows, row_stride_bytes,
        0, false);

    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(SampleT);
        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
            avg_millis,
            giga_rate,
            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
            giga_rate / NUM_CHANNELS,
            giga_bandwidth);
    }

xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    g_verbose_input = args.CheckCmdLineFlag("v2");
    args.GetCmdLineArgument("n", num_row_pixels);

    int row_stride_pixels = num_row_pixels;

    args.GetCmdLineArgument("rows", num_rows);
    args.GetCmdLineArgument("stride", row_stride_pixels);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);
    args.GetCmdLineArgument("entropy", entropy_reduction);

    bool compare_npp = args.CheckCmdLineFlag("npp");


    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<pixels per row> "
            "[--rows=<number of rows> "
            "[--stride=<row stride in pixels> "
            "[--i=<timing iterations> "
            "[--device=<device-id>] "
            "[--repeat=<repetitions of entire test suite>]"
            "[--entropy=<entropy-reduction factor (default 0)>]"
            "[--v] "
            "[--cdp]"
            "[--npp]"
            "\n", argv[0]);
        exit(0);
    }

xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN

#include "test_util.h"

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose           = false;
int                     g_timing_iterations = 0;
int                     g_repeat            = 0;
CachingDeviceAllocator  g_allocator(true);

// Dispatch types
enum Backend
{
    CUB,                        // CUB method (allows overwriting of input)
    CUB_NO_OVERWRITE,           // CUB method (disallows overwriting of input)

    CUB_SEGMENTED,              // CUB method (allows overwriting of input)

xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN

    }
    if (BACKEND == CUB_NO_OVERWRITE)
    {
        // Check that input isn't overwritten
        int input_compare = CompareDeviceResults(h_keys, d_keys.d_buffers[0], num_items, true, g_verbose);
        compare |= input_compare;
        printf("\t Compare input keys: %s ", input_compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Performance
    if (g_timing_iterations)
        printf("\nPerforming timing iterations:\n"); fflush(stdout);

    GpuTimer gpu_timer;
    float elapsed_millis = 0.0f;
    for (int i = 0; i < g_timing_iterations; ++i)
    {
        // Initialize/clear device arrays
        CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
        CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
        if (!KEYS_ONLY)
        {
            CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
            CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
        }

xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN

        CubDebugExit(Dispatch(
            Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
            mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
            num_items, num_segments, d_segment_offsets,
            begin_bit, end_bit, 0, false));
        gpu_timer.Stop();
        elapsed_millis += gpu_timer.ElapsedMillis();
    }

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = (KEYS_ONLY) ?
            giga_rate * sizeof(KeyT) * 2 :
            giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
        printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth);
    }

    printf("\n\n");

    // Cleanup

xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN

    int bits = -1;
    int num_items = -1;
    int num_segments = -1;
    int entropy_reduction = 0;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_items);
    args.GetCmdLineArgument("s", num_segments);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);
    args.GetCmdLineArgument("bits", bits);
    args.GetCmdLineArgument("entropy", entropy_reduction);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--bits=<valid key bits>]"
            "[--n=<input items> "
            "[--s=<num segments> "
            "[--i=<timing iterations> "
            "[--device=<device-id>] "
            "[--repeat=<repetitions of entire test suite>]"
            "[--v] "
            "[--entropy=<entropy-reduction factor (default 0)>]"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN



//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

int                     g_ptx_version;
int                     g_sm_count;
bool                    g_verbose           = false;
bool                    g_verbose_input     = false;
int                     g_timing_iterations = 0;
int                     g_repeat            = 0;
CachingDeviceAllocator  g_allocator(true);


// Dispatch types
enum Backend
{
    CUB,            // CUB method
    CUB_SEGMENTED,  // CUB segmented method
    CUB_CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method

xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN

// Dispatch to different CUB DeviceReduce entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to reduce entrypoint (custom-max)
 */
template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,

xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN

    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    // Max-identity
    OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent

    // Invoke kernel to device reduction directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, reduction_op, identity,
            stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to sum entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,
    cub::Sum            reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke kernel to device reduction directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to min entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,
    cub::Min            reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke kernel to device reduction directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to max entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,
    cub::Max            reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke kernel to device reduction directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to argmin entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,
    cub::ArgMin         reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke kernel to device reduction directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to argmax entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,
    cub::ArgMax         reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke kernel to device reduction directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
    }
    return error;
}


//---------------------------------------------------------------------
// Dispatch to different CUB DeviceSegmentedReduce entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to reduce entrypoint (custom-max)
 */
template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,

xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN

    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    // Max-identity
    OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent

    // Invoke kernel to device reduction directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1, reduction_op, identity,
            stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to sum entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,
    cub::Sum            reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke kernel to device reduction directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to min entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,
    cub::Min            reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke kernel to device reduction directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to max entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,
    cub::Max            reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke kernel to device reduction directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to argmin entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,
    cub::ArgMin         reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke kernel to device reduction directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return error;
}

/**
 * Dispatch to argmax entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,
    cub::ArgMax         reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke kernel to device reduction directly
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
            stream, debug_synchronous);
    }
    return error;
}


//---------------------------------------------------------------------
// Dispatch to different Thrust entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to reduction entrypoint (min or max specialization)
 */
template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT>
cudaError_t Dispatch(
    Int2Type<THRUST>    dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,

xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN

    {
        temp_storage_bytes = 1;
    }
    else
    {
        OutputT init;
        CubDebugExit(cudaMemcpy(&init, d_in + 0, sizeof(OutputT), cudaMemcpyDeviceToHost));

        thrust::device_ptr<OutputT> d_in_wrapper(d_in);
        OutputT retval;
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items, init, reduction_op);
        }

        if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
            CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}

/**
 * Dispatch to reduction entrypoint (sum specialization)
 */
template <typename InputIteratorT, typename OutputIteratorT>
cudaError_t Dispatch(
    Int2Type<THRUST>    dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,

xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN

        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    if (d_temp_storage == 0)
    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<OutputT> d_in_wrapper(d_in);
        OutputT retval;
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items);
        }

        if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
            CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}

xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN

//---------------------------------------------------------------------

/**
 * Simple wrapper kernel to invoke DeviceReduce
 */
template <
    typename            InputIteratorT,
    typename            OutputIteratorT,
    typename            ReductionOpT>
__global__ void CnpDispatchKernel(
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t              temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,
    ReductionOpT         reduction_op,
    bool                debug_synchronous)
{
#ifndef CUB_CDP
    *d_cdp_error = cudaErrorNotSupported;
#else
    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, 0, debug_synchronous);
    *d_temp_storage_bytes = temp_storage_bytes;
#endif
}


/**
 * Dispatch to CUB_CDP kernel
 */
template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB_CDP>       dispatch_to,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    int                 num_items,
    int                 max_segments,
    int                 *d_segment_offsets,
    ReductionOpT         reduction_op,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke kernel to invoke device-side dispatch
    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, debug_synchronous);

    // Copy out temp_storage_bytes
    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

    // Copy out error
    cudaError_t retval;
    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
    return retval;
}

xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN


    // Check for correctness (and display results, if specified)
    int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose);
    printf("\t%s", compare ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    if (g_timing_iterations > 0)
    {
        GpuTimer gpu_timer;
        gpu_timer.Start();

        CubDebugExit(Dispatch(backend, g_timing_iterations,
            d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, num_segments, d_segment_offsets,
            reduction_op, 0, false));

        gpu_timer.Stop();
        float elapsed_millis = gpu_timer.ElapsedMillis();

        // Display performance
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(InputT);
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }

    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN

        //
        // Black-box testing on all backends
        //

        // Test 0, 1, many
        TestByGenMode<InputT, OutputT>(0,           max_segments);
        TestByGenMode<InputT, OutputT>(1,           max_segments);
        TestByGenMode<InputT, OutputT>(max_items,   max_segments);

        // Test random problem sizes from a log-distribution [8, max_items-ish)
        int     num_iterations = 8;
        double  max_exp = log(double(max_items)) / log(double(2.0));
        for (int i = 0; i < num_iterations; ++i)
        {
            int num_items = (int) pow(2.0, RandomValue(max_exp - 3.0) + 3.0);
            TestByGenMode<InputT, OutputT>(num_items, max_segments);
        }

        //
        // White-box testing of single-segment problems around specific sizes
        //

        // Tile-boundaries: multiple blocks, one tile per block

xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN

{
    int max_items      = 27000000;
    int max_segments   = 34000;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    g_verbose_input = args.CheckCmdLineFlag("v2");
    args.GetCmdLineArgument("n", max_items);
    args.GetCmdLineArgument("s", max_segments);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<input items> "
            "[--s=<num segments> "
            "[--i=<timing iterations> "
            "[--device=<device-id>] "
            "[--repeat=<repetitions of entire test suite>]"
            "[--v] "
            "[--cdp]"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN

#include "test_util.h"

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose           = false;
int                     g_timing_iterations = 0;
int                     g_repeat            = 0;
CachingDeviceAllocator  g_allocator(true);

// Dispatch types
enum Backend
{
    CUB,        // CUB method
    THRUST,     // Thrust method
    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
};

xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN

    typename                    KeyOutputIteratorT,
    typename                    ValueInputIteratorT,
    typename                    ValueOutputIteratorT,
    typename                    NumRunsIteratorT,
    typename                    EqualityOpT,
    typename                    ReductionOpT,
    typename                    OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>               dispatch_to,
    int                         timing_timing_iterations,
    size_t                      *d_temp_storage_bytes,
    cudaError_t                 *d_cdp_error,

    void                        *d_temp_storage,
    size_t                      &temp_storage_bytes,
    KeyInputIteratorT           d_keys_in,
    KeyOutputIteratorT          d_keys_out,
    ValueInputIteratorT         d_values_in,
    ValueOutputIteratorT        d_values_out,
    NumRunsIteratorT            d_num_runs,
    EqualityOpT                  equality_op,
    ReductionOpT                 reduction_op,
    OffsetT                     num_items,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceReduce::ReduceByKey(
            d_temp_storage,
            temp_storage_bytes,
            d_keys_in,
            d_keys_out,
            d_values_in,
            d_values_out,
            d_num_runs,
            reduction_op,

xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN

    typename                    KeyInputIteratorT,
    typename                    KeyOutputIteratorT,
    typename                    ValueInputIteratorT,
    typename                    ValueOutputIteratorT,
    typename                    NumRunsIteratorT,
    typename                    EqualityOpT,
    typename                    ReductionOpT,
    typename                    OffsetT>
cudaError_t Dispatch(
    Int2Type<THRUST>            dispatch_to,
    int                         timing_timing_iterations,
    size_t                      *d_temp_storage_bytes,
    cudaError_t                 *d_cdp_error,

    void                        *d_temp_storage,
    size_t                      &temp_storage_bytes,
    KeyInputIteratorT           d_keys_in,
    KeyOutputIteratorT          d_keys_out,
    ValueInputIteratorT         d_values_in,
    ValueOutputIteratorT        d_values_out,
    NumRunsIteratorT            d_num_runs,

xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN

    else
    {
        thrust::device_ptr<KeyInputT> d_keys_in_wrapper(d_keys_in);
        thrust::device_ptr<KeyOutputT> d_keys_out_wrapper(d_keys_out);

        thrust::device_ptr<ValueInputT> d_values_in_wrapper(d_values_in);
        thrust::device_ptr<ValueOuputT> d_values_out_wrapper(d_values_out);

        thrust::pair<thrust::device_ptr<KeyOutputT>, thrust::device_ptr<ValueOuputT> > d_out_ends;

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_ends = thrust::reduce_by_key(
                d_keys_in_wrapper,
                d_keys_in_wrapper + num_items,
                d_values_in_wrapper,
                d_keys_out_wrapper,
                d_values_out_wrapper);
        }

        OffsetT num_segments = d_out_ends.first - d_keys_out_wrapper;

xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN

template <
    typename                    KeyInputIteratorT,
    typename                    KeyOutputIteratorT,
    typename                    ValueInputIteratorT,
    typename                    ValueOutputIteratorT,
    typename                    NumRunsIteratorT,
    typename                    EqualityOpT,
    typename                    ReductionOpT,
    typename                    OffsetT>
__global__ void CnpDispatchKernel(
    int                         timing_timing_iterations,
    size_t                      *d_temp_storage_bytes,
    cudaError_t                 *d_cdp_error,

    void                        *d_temp_storage,
    size_t                      temp_storage_bytes,
    KeyInputIteratorT           d_keys_in,
    KeyOutputIteratorT          d_keys_out,
    ValueInputIteratorT         d_values_in,
    ValueOutputIteratorT        d_values_out,
    NumRunsIteratorT            d_num_runs,
    EqualityOpT                 equality_op,
    ReductionOpT                reduction_op,
    OffsetT                     num_items,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{

#ifndef CUB_CDP
    *d_cdp_error = cudaErrorNotSupported;
#else
    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);

    *d_temp_storage_bytes = temp_storage_bytes;
#endif
}


/**
 * Dispatch to CDP kernel
 */

xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN

    typename                    KeyOutputIteratorT,
    typename                    ValueInputIteratorT,
    typename                    ValueOutputIteratorT,
    typename                    NumRunsIteratorT,
    typename                    EqualityOpT,
    typename                    ReductionOpT,
    typename                    OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CDP>               dispatch_to,
    int                         timing_timing_iterations,
    size_t                      *d_temp_storage_bytes,
    cudaError_t                 *d_cdp_error,

    void                        *d_temp_storage,
    size_t                      &temp_storage_bytes,
    KeyInputIteratorT           d_keys_in,
    KeyOutputIteratorT          d_keys_out,
    ValueInputIteratorT         d_values_in,
    ValueOutputIteratorT        d_values_out,
    NumRunsIteratorT            d_num_runs,
    EqualityOpT                 equality_op,
    ReductionOpT                reduction_op,
    OffsetT                     num_items,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    // Invoke kernel to invoke device-side dispatch
    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);

    // Copy out temp_storage_bytes
    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

    // Copy out error
    cudaError_t retval;
    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
    return retval;
}

xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN

    int compare3 = CompareDeviceResults(&num_segments, d_num_runs, 1, true, g_verbose);
    printf("\t Count %s ", compare3 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, false));
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float   avg_millis  = elapsed_millis / g_timing_iterations;
        float   giga_rate   = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        int     bytes_moved = ((num_items + num_segments) * sizeof(KeyT)) + ((num_items + num_segments) * sizeof(ValueT));
        float   giga_bandwidth  = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }
    printf("\n\n");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN

int main(int argc, char** argv)
{
    int num_items           = -1;
    int entropy_reduction   = 0;
    int maxseg              = 1000;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_items);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);
    args.GetCmdLineArgument("maxseg", maxseg);
    args.GetCmdLineArgument("entropy", entropy_reduction);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<input items> "
            "[--i=<timing iterations> "
            "[--device=<device-id>] "
            "[--maxseg=<max segment length>]"
            "[--entropy=<segment length bit entropy reduction rounds>]"
            "[--repeat=<repetitions of entire test suite>]"
            "[--v] "
            "[--cdp]"
            "\n", argv[0]);
        exit(0);
    }

xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN

#include "test_util.h"

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose           = false;
int                     g_timing_iterations = 0;
int                     g_repeat            = 0;
CachingDeviceAllocator  g_allocator(true);

// Dispatch types
enum Backend
{
    CUB,        // CUB method
    THRUST,     // Thrust method
    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
};

xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN

    typename                    InputIteratorT,
    typename                    UniqueOutputIteratorT,
    typename                    OffsetsOutputIteratorT,
    typename                    LengthsOutputIteratorT,
    typename                    NumRunsIterator,
    typename                    OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<RLE>               method,
    Int2Type<CUB>               dispatch_to,
    int                         timing_timing_iterations,
    size_t                      *d_temp_storage_bytes,
    cudaError_t                 *d_cdp_error,

    void*               d_temp_storage,
    size_t                      &temp_storage_bytes,
    InputIteratorT              d_in,
    UniqueOutputIteratorT       d_unique_out,
    OffsetsOutputIteratorT      d_offsets_out,
    LengthsOutputIteratorT      d_lengths_out,
    NumRunsIterator             d_num_runs,
    cub::Equality               equality_op,
    OffsetT                     num_items,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceRunLengthEncode::Encode(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_unique_out,
            d_lengths_out,
            d_num_runs,
            num_items,
            stream,

xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN

    typename                    InputIteratorT,
    typename                    UniqueOutputIteratorT,
    typename                    OffsetsOutputIteratorT,
    typename                    LengthsOutputIteratorT,
    typename                    NumRunsIterator,
    typename                    OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<NON_TRIVIAL>       method,
    Int2Type<CUB>               dispatch_to,
    int                         timing_timing_iterations,
    size_t                      *d_temp_storage_bytes,
    cudaError_t                 *d_cdp_error,

    void*               d_temp_storage,
    size_t                      &temp_storage_bytes,
    InputIteratorT              d_in,
    UniqueOutputIteratorT       d_unique_out,
    OffsetsOutputIteratorT      d_offsets_out,
    LengthsOutputIteratorT      d_lengths_out,
    NumRunsIterator             d_num_runs,
    cub::Equality               equality_op,
    OffsetT                     num_items,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceRunLengthEncode::NonTrivialRuns(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_offsets_out,
            d_lengths_out,
            d_num_runs,
            num_items,
            stream,

xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN

template <
    typename                    InputIteratorT,
    typename                    UniqueOutputIteratorT,
    typename                    OffsetsOutputIteratorT,
    typename                    LengthsOutputIteratorT,
    typename                    NumRunsIterator,
    typename                    OffsetT>
cudaError_t Dispatch(
    Int2Type<RLE>               method,
    Int2Type<THRUST>            dispatch_to,
    int                         timing_timing_iterations,
    size_t                      *d_temp_storage_bytes,
    cudaError_t                 *d_cdp_error,

    void                        *d_temp_storage,
    size_t                      &temp_storage_bytes,
    InputIteratorT              d_in,
    UniqueOutputIteratorT       d_unique_out,
    OffsetsOutputIteratorT      d_offsets_out,
    LengthsOutputIteratorT      d_lengths_out,
    NumRunsIterator             d_num_runs,

xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN

        thrust::device_ptr<InputT>      d_in_wrapper(d_in);
        thrust::device_ptr<UniqueT>     d_unique_out_wrapper(d_unique_out);
        thrust::device_ptr<LengthT>     d_lengths_out_wrapper(d_lengths_out);

        thrust::pair<thrust::device_ptr<UniqueT>, thrust::device_ptr<LengthT> > d_out_ends;

        LengthT one_val;
        InitValue(INTEGER_SEED, one_val, 1);
        thrust::constant_iterator<LengthT> constant_one(one_val);

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_ends = thrust::reduce_by_key(
                d_in_wrapper,
                d_in_wrapper + num_items,
                constant_one,
                d_unique_out_wrapper,
                d_lengths_out_wrapper);
        }

        OffsetT num_runs = d_out_ends.first - d_unique_out_wrapper;

xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN

    int                         RLE_METHOD,
    typename                    InputIteratorT,
    typename                    UniqueOutputIteratorT,
    typename                    OffsetsOutputIteratorT,
    typename                    LengthsOutputIteratorT,
    typename                    NumRunsIterator,
    typename                    EqualityOp,
    typename                    OffsetT>
__global__ void CnpDispatchKernel(
    Int2Type<RLE_METHOD>            method,
    int                         timing_timing_iterations,
    size_t                      *d_temp_storage_bytes,
    cudaError_t                 *d_cdp_error,

    void*               d_temp_storage,
    size_t                      temp_storage_bytes,
    InputIteratorT              d_in,
    UniqueOutputIteratorT       d_unique_out,
    OffsetsOutputIteratorT      d_offsets_out,
    LengthsOutputIteratorT      d_lengths_out,
    NumRunsIterator             d_num_runs,
    cub::Equality               equality_op,
    OffsetT                     num_items,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{

#ifndef CUB_CDP
    *d_cdp_error = cudaErrorNotSupported;
#else
    *d_cdp_error = Dispatch(method, Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);

    *d_temp_storage_bytes = temp_storage_bytes;
#endif
}


/**
 * Dispatch to CDP kernel
 */

xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN

    typename                    UniqueOutputIteratorT,
    typename                    OffsetsOutputIteratorT,
    typename                    LengthsOutputIteratorT,
    typename                    NumRunsIterator,
    typename                    EqualityOp,
    typename                    OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<RLE_METHOD>        method,
    Int2Type<CDP>               dispatch_to,
    int                         timing_timing_iterations,
    size_t                      *d_temp_storage_bytes,
    cudaError_t                 *d_cdp_error,

    void*               d_temp_storage,
    size_t                      &temp_storage_bytes,
    InputIteratorT              d_in,
    UniqueOutputIteratorT       d_unique_out,
    OffsetsOutputIteratorT      d_offsets_out,
    LengthsOutputIteratorT      d_lengths_out,
    NumRunsIterator             d_num_runs,
    EqualityOp                  equality_op,
    OffsetT                     num_items,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    // Invoke kernel to invoke device-side dispatch
    CnpDispatchKernel<<<1,1>>>(method, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);

    // Copy out temp_storage_bytes
    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

    // Copy out error
    cudaError_t retval;
    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
    return retval;
}

xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN

    compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
    printf("\t Count %s\n", compare3 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0...
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        int bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT)));
        float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
    }
    printf("\n\n");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN

int main(int argc, char** argv)
{
    int num_items           = -1;
    int entropy_reduction   = 0;
    int max_segment              = 1000;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_items);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);
    args.GetCmdLineArgument("maxseg", max_segment);
    args.GetCmdLineArgument("entropy", entropy_reduction);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<input items> "
            "[--i=<timing iterations> "
            "[--device=<device-id>] "
            "[--maxseg=<max segment length>]"
            "[--entropy=<segment length bit entropy reduction rounds>]"
            "[--repeat=<repetitions of entire test suite>]"
            "[--v] "
            "[--cdp]"
            "\n", argv[0]);
        exit(0);
    }

xgboost/cub/test/test_device_scan.cu view on Meta::CPAN

#include "test_util.h"

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose           = false;
int                     g_timing_iterations = 0;
int                     g_repeat            = 0;
double                  g_device_giga_bandwidth;
CachingDeviceAllocator  g_allocator(true);

// Dispatch types
enum Backend
{
    CUB,        // CUB method
    THRUST,     // Thrust method
    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method

xgboost/cub/test/test_device_scan.cu view on Meta::CPAN

//---------------------------------------------------------------------

/**
 * Dispatch to exclusive scan entrypoint
 */
template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       dispatch_to,
    IsPrimitiveT        is_primitive,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    ScanOpT             scan_op,
    InitialValueT       initial_value,
    OffsetT             num_items,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items, stream, debug_synchronous);
    }
    return error;
}


/**
 * Dispatch to exclusive sum entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename InitialValueT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       dispatch_to,
    Int2Type<true>      is_primitive,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    Sum                 scan_op,
    InitialValueT       initial_value,
    OffsetT             num_items,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
    }
    return error;
}


/**
 * Dispatch to inclusive scan entrypoint
 */
template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       dispatch_to,
    IsPrimitiveT        is_primitive,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    ScanOpT             scan_op,
    NullType            initial_value,
    OffsetT             num_items,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream, debug_synchronous);
    }
    return error;
}


/**
 * Dispatch to inclusive sum entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>       dispatch_to,
    Int2Type<true>      is_primitive,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    Sum                 scan_op,
    NullType            initial_value,
    OffsetT             num_items,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
    }
    return error;
}

//---------------------------------------------------------------------
// Dispatch to different Thrust entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to exclusive scan entrypoint
 */
template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
cudaError_t Dispatch(
    Int2Type<THRUST>    dispatch_to,
    IsPrimitiveT        is_primitive,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    ScanOpT             scan_op,
    InitialValueT       initial_value,
    OffsetT             num_items,

xgboost/cub/test/test_device_scan.cu view on Meta::CPAN

        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    if (d_temp_storage == 0)
    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<InputT> d_in_wrapper(d_in);
        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            thrust::exclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, initial_value, scan_op);
        }
    }

    return cudaSuccess;
}


/**
 * Dispatch to exclusive sum entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename InitialValueT, typename OffsetT>
cudaError_t Dispatch(
    Int2Type<THRUST>    dispatch_to,
    Int2Type<true>      is_primitive,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    Sum                 scan_op,
    InitialValueT       initial_value,
    OffsetT             num_items,

xgboost/cub/test/test_device_scan.cu view on Meta::CPAN

        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    if (d_temp_storage == 0)
    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<InputT> d_in_wrapper(d_in);
        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            thrust::exclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper);
        }
    }

    return cudaSuccess;
}


/**
 * Dispatch to inclusive scan entrypoint
 */
template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
cudaError_t Dispatch(
    Int2Type<THRUST>    dispatch_to,
    IsPrimitiveT        is_primitive,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    ScanOpT             scan_op,
    NullType            initial_value,
    OffsetT             num_items,

xgboost/cub/test/test_device_scan.cu view on Meta::CPAN

        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    if (d_temp_storage == 0)
    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<InputT> d_in_wrapper(d_in);
        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            thrust::inclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, scan_op);
        }
    }

    return cudaSuccess;
}


/**
 * Dispatch to inclusive sum entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
cudaError_t Dispatch(
    Int2Type<THRUST>    dispatch_to,
    Int2Type<true>      is_primitive,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    Sum                 scan_op,
    NullType            initial_value,
    OffsetT             num_items,

xgboost/cub/test/test_device_scan.cu view on Meta::CPAN

        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    if (d_temp_storage == 0)
    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<InputT> d_in_wrapper(d_in);
        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            thrust::inclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper);
        }
    }

    return cudaSuccess;
}



//---------------------------------------------------------------------
// CUDA Nested Parallelism Test Kernel
//---------------------------------------------------------------------

/**
 * Simple wrapper kernel to invoke DeviceScan
 */
template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
__global__ void CnpDispatchKernel(
    IsPrimitiveT        is_primitive,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t              temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    ScanOpT             scan_op,
    InitialValueT       initial_value,
    OffsetT             num_items,
    bool                debug_synchronous)
{
#ifndef CUB_CDP
    *d_cdp_error = cudaErrorNotSupported;
#else
    *d_cdp_error = Dispatch(
        Int2Type<CUB>(),
        is_primitive,
        timing_timing_iterations,
        d_temp_storage_bytes,
        d_cdp_error,
        d_temp_storage,
        temp_storage_bytes,
        d_in,
        d_out,
        scan_op,
        initial_value,
        num_items,
        0,

xgboost/cub/test/test_device_scan.cu view on Meta::CPAN

}


/**
 * Dispatch to CDP kernel
 */
template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
cudaError_t Dispatch(
    Int2Type<CDP>       dispatch_to,
    IsPrimitiveT        is_primitive,
    int                 timing_timing_iterations,
    size_t              *d_temp_storage_bytes,
    cudaError_t         *d_cdp_error,

    void*               d_temp_storage,
    size_t&             temp_storage_bytes,
    InputIteratorT      d_in,
    OutputIteratorT     d_out,
    ScanOpT             scan_op,
    InitialValueT       initial_value,
    OffsetT             num_items,
    cudaStream_t        stream,
    bool                debug_synchronous)
{
    // Invoke kernel to invoke device-side dispatch
    CnpDispatchKernel<<<1,1>>>(
        is_primitive,
        timing_timing_iterations,
        d_temp_storage_bytes,
        d_cdp_error,
        d_temp_storage,
        temp_storage_bytes,
        d_in,
        d_out,
        scan_op,
        initial_value,
        num_items,
        debug_synchronous);

xgboost/cub/test/test_device_scan.cu view on Meta::CPAN


    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<BACKEND>(),
        Int2Type<Traits<OutputT>::PRIMITIVE>(),
        g_timing_iterations,
        d_temp_storage_bytes,
        d_cdp_error,
        d_temp_storage,
        temp_storage_bytes,
        d_in,
        d_out,
        scan_op,
        initial_value,
        num_items,
        0,
        false));
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT));
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
    }

    printf("\n\n");

    // Cleanup
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));

xgboost/cub/test/test_device_scan.cu view on Meta::CPAN

 * Main
 */
int main(int argc, char** argv)
{
    int num_items = -1;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_items);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<input items> "
            "[--i=<timing iterations> "
            "[--device=<device-id>] "
            "[--repeat=<repetitions of entire test suite>]"
            "[--v] "
            "[--cdp]"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN

#include "test_util.h"

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose               = false;
int                     g_timing_iterations     = 0;
int                     g_repeat                = 0;
float                   g_device_giga_bandwidth;
CachingDeviceAllocator  g_allocator(true);

// Dispatch types
enum Backend
{
    CUB,        // CUB method
    THRUST,     // Thrust method
    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN


/**
 * Dispatch to select if entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>               dispatch_to,
    Int2Type<false>             is_flagged,
    Int2Type<false>             is_partition,
    int                         timing_timing_iterations,
    size_t*                     d_temp_storage_bytes,
    cudaError_t*                d_cdp_error,

    void*                       d_temp_storage,
    size_t&                     temp_storage_bytes,
    InputIteratorT              d_in,
    FlagIteratorT               d_flags,
    OutputIteratorT             d_out,
    NumSelectedIteratorT        d_num_selected_out,
    OffsetT                     num_items,
    SelectOpT                   select_op,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
    }
    return error;
}


/**
 * Dispatch to partition if entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>               dispatch_to,
    Int2Type<false>             is_flagged,
    Int2Type<true>              is_partition,
    int                         timing_timing_iterations,
    size_t*                     d_temp_storage_bytes,
    cudaError_t*                d_cdp_error,

    void*                       d_temp_storage,
    size_t&                     temp_storage_bytes,
    InputIteratorT              d_in,
    FlagIteratorT               d_flags,
    OutputIteratorT             d_out,
    NumSelectedIteratorT        d_num_selected_out,
    OffsetT                     num_items,
    SelectOpT                   select_op,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
    }
    return error;
}


/**
 * Dispatch to select flagged entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>               dispatch_to,
    Int2Type<true>              is_flagged,
    Int2Type<false>             partition,
    int                         timing_timing_iterations,
    size_t*                     d_temp_storage_bytes,
    cudaError_t*                d_cdp_error,

    void*                       d_temp_storage,
    size_t&                     temp_storage_bytes,
    InputIteratorT              d_in,
    FlagIteratorT               d_flags,
    OutputIteratorT             d_out,
    NumSelectedIteratorT        d_num_selected_out,
    OffsetT                     num_items,
    SelectOpT                   select_op,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
    }
    return error;
}


/**
 * Dispatch to partition flagged entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>               dispatch_to,
    Int2Type<true>              is_flagged,
    Int2Type<true>              partition,
    int                         timing_timing_iterations,
    size_t*                     d_temp_storage_bytes,
    cudaError_t*                d_cdp_error,

    void*                       d_temp_storage,
    size_t&                     temp_storage_bytes,
    InputIteratorT              d_in,
    FlagIteratorT               d_flags,
    OutputIteratorT             d_out,
    NumSelectedIteratorT        d_num_selected_out,
    OffsetT                     num_items,
    SelectOpT                   select_op,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
    }
    return error;
}


//---------------------------------------------------------------------
// Dispatch to different Thrust entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to select if entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
__host__ __forceinline__
cudaError_t Dispatch(
    Int2Type<THRUST>            dispatch_to,
    Int2Type<false>             is_flagged,
    Int2Type<false>             is_partition,
    int                         timing_timing_iterations,
    size_t*                     d_temp_storage_bytes,
    cudaError_t*                d_cdp_error,

    void*                       d_temp_storage,
    size_t&                     temp_storage_bytes,
    InputIteratorT              d_in,
    FlagIteratorT               d_flags,
    OutputIteratorT             d_out,
    NumSelectedIteratorT        d_num_selected_out,
    OffsetT                     num_items,

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN

    if (d_temp_storage == 0)
    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<OutputT>         d_out_wrapper_end;
        thrust::device_ptr<InputT>          d_in_wrapper(d_in);
        thrust::device_ptr<OutputT>         d_out_wrapper(d_out);

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, select_op);
        }

        OffsetT num_selected = d_out_wrapper_end - d_out_wrapper;
        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN


/**
 * Dispatch to partition if entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
__host__ __forceinline__
cudaError_t Dispatch(
    Int2Type<THRUST>            dispatch_to,
    Int2Type<false>             is_flagged,
    Int2Type<true>              is_partition,
    int                         timing_timing_iterations,
    size_t*                     d_temp_storage_bytes,
    cudaError_t*                d_cdp_error,

    void*                       d_temp_storage,
    size_t&                     temp_storage_bytes,
    InputIteratorT              d_in,
    FlagIteratorT               d_flags,
    OutputIteratorT             d_out,
    NumSelectedIteratorT        d_num_selected_out,
    OffsetT                     num_items,

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN

    }
    else
    {
        thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;

        thrust::device_ptr<InputT>       d_in_wrapper(d_in);
        thrust::device_ptr<OutputT>       d_out_wrapper(d_out);

        ReverseOutputIteratorT d_out_unselected(d_out_wrapper + num_items);

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_wrapper_end = thrust::partition_copy(
                d_in_wrapper,
                d_in_wrapper + num_items,
                d_out_wrapper,
                d_out_unselected,
                select_op);
        }

        OffsetT num_selected = d_out_wrapper_end.first - d_out_wrapper;

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN


/**
 * Dispatch to select flagged entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
__host__ __forceinline__
cudaError_t Dispatch(
    Int2Type<THRUST>            dispatch_to,
    Int2Type<true>              is_flagged,
    Int2Type<false>             is_partition,
    int                         timing_timing_iterations,
    size_t*                     d_temp_storage_bytes,
    cudaError_t*                d_cdp_error,

    void*                       d_temp_storage,
    size_t&                     temp_storage_bytes,
    InputIteratorT              d_in,
    FlagIteratorT               d_flags,
    OutputIteratorT             d_out,
    NumSelectedIteratorT        d_num_selected_out,
    OffsetT                     num_items,

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN

    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<OutputT>     d_out_wrapper_end;
        thrust::device_ptr<InputT>      d_in_wrapper(d_in);
        thrust::device_ptr<OutputT>     d_out_wrapper(d_out);
        thrust::device_ptr<FlagT>       d_flags_wrapper(d_flags);

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_flags_wrapper, d_out_wrapper, Cast<bool>());
        }

        OffsetT num_selected = d_out_wrapper_end - d_out_wrapper;
        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN


/**
 * Dispatch to partition flagged entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
__host__ __forceinline__
cudaError_t Dispatch(
    Int2Type<THRUST>            dispatch_to,
    Int2Type<true>              is_flagged,
    Int2Type<true>              is_partition,
    int                         timing_timing_iterations,
    size_t*                     d_temp_storage_bytes,
    cudaError_t*                d_cdp_error,

    void*                       d_temp_storage,
    size_t&                     temp_storage_bytes,
    InputIteratorT              d_in,
    FlagIteratorT               d_flags,
    OutputIteratorT             d_out,
    NumSelectedIteratorT        d_num_selected_out,
    OffsetT                     num_items,

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN

    }
    else
    {
        thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;

        thrust::device_ptr<InputT>  d_in_wrapper(d_in);
        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
        thrust::device_ptr<FlagT>   d_flags_wrapper(d_flags);
        ReverseOutputIteratorT      d_out_unselected(d_out_wrapper + num_items);

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_wrapper_end = thrust::partition_copy(
                d_in_wrapper,
                d_in_wrapper + num_items,
                d_flags_wrapper,
                d_out_wrapper,
                d_out_unselected,
                Cast<bool>());
        }

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN

// CUDA Nested Parallelism Test Kernel
//---------------------------------------------------------------------

/**
 * Simple wrapper kernel to invoke DeviceSelect
 */
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
__global__ void CnpDispatchKernel(
    IsFlaggedTag                is_flagged,
    IsPartitionTag              is_partition,
    int                         timing_timing_iterations,
    size_t*                     d_temp_storage_bytes,
    cudaError_t*                d_cdp_error,

    void*                       d_temp_storage,
    size_t                      temp_storage_bytes,
    InputIteratorT              d_in,
    FlagIteratorT               d_flags,
    OutputIteratorT             d_out,
    NumSelectedIteratorT        d_num_selected_out,
    OffsetT                     num_items,
    SelectOpT                   select_op,
    bool                        debug_synchronous)
{

#ifndef CUB_CDP
    *d_cdp_error = cudaErrorNotSupported;
#else
    *d_cdp_error = Dispatch(Int2Type<CUB>(), is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, debug_synchronous);
    *d_temp_storage_bytes = temp_storage_bytes;
#endif
}


/**
 * Dispatch to CDP kernel
 */
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
cudaError_t Dispatch(
    Int2Type<CDP>               dispatch_to,
    IsFlaggedTag                is_flagged,
    IsPartitionTag              is_partition,
    int                         timing_timing_iterations,
    size_t*                     d_temp_storage_bytes,
    cudaError_t*                d_cdp_error,

    void*                       d_temp_storage,
    size_t&                     temp_storage_bytes,
    InputIteratorT              d_in,
    FlagIteratorT               d_flags,
    OutputIteratorT             d_out,
    NumSelectedIteratorT        d_num_selected_out,
    OffsetT                     num_items,
    SelectOpT                   select_op,
    cudaStream_t                stream,
    bool                        debug_synchronous)
{
    // Invoke kernel to invoke device-side dispatch
    CnpDispatchKernel<<<1,1>>>(is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, debug_synchronous);

    // Copy out temp_storage_bytes
    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

    // Copy out error
    cudaError_t retval;
    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
    return retval;
}

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN

    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
    printf("\t Count %s\n", compare2 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, false));
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float   avg_millis          = elapsed_millis / g_timing_iterations;
        float   giga_rate           = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        int     num_output_items    = (IS_PARTITION) ? num_items : num_selected;
        int     num_flag_items      = (IS_FLAGGED) ? num_items : 0;
        size_t  num_bytes           = sizeof(T) * (num_items + num_output_items) + sizeof(FlagT) * num_flag_items;
        float   giga_bandwidth      = float(num_bytes) / avg_millis / 1000.0f / 1000.0f;

        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
    }
    printf("\n\n");

xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN

 */
int main(int argc, char** argv)
{
    int num_items           = -1;
    float select_ratio      = 0.5;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_items);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);
    args.GetCmdLineArgument("ratio", select_ratio);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<input items> "
            "[--i=<timing iterations> "
            "[--device=<device-id>] "
            "[--ratio=<selection ratio, default 0.5>] "
            "[--repeat=<repetitions of entire test suite>] "
            "[--v] "
            "[--cdp] "
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device

( run in 1.846 second using v1.01-cache-2.11-cpan-96521ef73a4 )