view release on metacpan or search on metacpan
xgboost/R-package/R/callbacks.R view on Meta::CPAN
#' Callback closures for booster training.
#'
#' These are used to perform various service tasks either during boosting iterations or at the end.
#' This approach helps to modularize many of such tasks without bloating the main training methods,
#' and it offers .
#'
#' @details
#' By default, a callback function is run after each boosting iteration.
#' An R-attribute \code{is_pre_iteration} could be set for a callback to define a pre-iteration function.
#'
#' When a callback function has \code{finalize} parameter, its finalizer part will also be run after
#' the boosting is completed.
#'
xgboost/R-package/R/callbacks.R view on Meta::CPAN
#
# Callbacks -------------------------------------------------------------------
#
#' Callback closure for printing the result of evaluation
#'
#' @param period results would be printed every number of periods
#' @param showsd whether standard deviations should be printed (when available)
#'
#' @details
#' The callback function prints the result of evaluation at every \code{period} iterations.
#' The initial and the last iteration's evaluations are always printed.
#'
#' Callback function expects the following values to be set in its calling frame:
#' \code{bst_evaluation} (also \code{bst_evaluation_err} when available),
#' \code{iteration},
#' \code{begin_iteration},
#' \code{end_iteration}.
#'
#' @seealso
#' \code{\link{callbacks}}
xgboost/R-package/R/callbacks.R view on Meta::CPAN
}
attr(callback, 'call') <- match.call()
attr(callback, 'name') <- 'cb.early.stop'
callback
}
#' Callback closure for saving a model file.
#'
#' @param save_period save the model to disk after every
#' \code{save_period} iterations; 0 means save the model at the end.
#' @param save_name the name or path for the saved model file.
#' It can contain a \code{\link[base]{sprintf}} formatting specifier
#' to include the integer iteration number in the file name.
#' E.g., with \code{save_name} = 'xgboost_%04d.model',
#' the file saved at iteration 50 would be named "xgboost_0050.model".
#'
#' @details
#' This callback function allows to save an xgb-model file, either periodically after each \code{save_period}'s or at the end.
#'
#' Callback function expects the following values to be set in its calling frame:
xgboost/R-package/R/xgb.Booster.R view on Meta::CPAN
#' Predict method for eXtreme Gradient Boosting model
#'
#' Predicted values based on either xgboost model or model handle object.
#'
#' @param object Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.
#' @param missing Missing is only used when input is dense matrix. Pick a float value that represents
#' missing values in data (e.g., sometimes 0 or some other extreme value is used).
#' @param outputmargin whether the prediction should be returned in the for of original untransformed
#' sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
#' logistic regression would result in predictions for log-odds instead of probabilities.
#' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
#' It will use all the trees by default (\code{NULL} value).
#' @param predleaf whether predict leaf index instead.
#' @param predcontrib whether to return feature contributions to individual predictions instead (see Details).
#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several
#' prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.
#' @param ... Parameters passed to \code{predict.xgb.Booster}
#'
#' @details
#' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
#' and it is not necessarily equal to the number of trees in a model.
#' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
#' But for multiclass classification, while there are multiple trees per iteration,
#' \code{ntreelimit} limits the number of boosting iterations.
#'
#' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
#' since gblinear doesn't keep its boosting history.
#'
#' One possible practical applications of the \code{predleaf} option is to use the model
#' as a generator of new features which capture non-linearity and interactions,
#' e.g., as implemented in \code{\link{xgb.create.features}}.
#'
#' Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to
#' individual predictions. For "gblinear" booster, feature contributions are simply linear terms
xgboost/R-package/R/xgb.Booster.R view on Meta::CPAN
#' sum(pred_labels != lb)/length(lb)
#'
#' # compare that to the predictions from softmax:
#' set.seed(11)
#' bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
#' max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
#' objective = "multi:softmax", num_class = num_class)
#' pred <- predict(bst, as.matrix(iris[, -5]))
#' str(pred)
#' all.equal(pred, pred_labels)
#' # prediction from using only 5 iterations should result
#' # in the same error as seen in iteration 5:
#' pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
#' sum(pred5 != lb)/length(lb)
#'
#'
#' ## random forest-like model of 25 trees for binary classification:
#'
#' set.seed(11)
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
#' nthread = 2, nrounds = 1, objective = "binary:logistic",
xgboost/R-package/R/xgb.cv.R view on Meta::CPAN
#' \item \code{binary:logistic} logistic regression for classification
#' }
#' \item \code{eta} step size of each boosting step
#' \item \code{max_depth} maximum depth of the tree
#' \item \code{nthread} number of thread used in training, if not set, all threads are used
#' }
#'
#' See \code{\link{xgb.train}} for further details.
#' See also demo/ for walkthrough example in R.
#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.
#' @param nrounds the max number of iterations
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label vector of response values. Should be provided only when data is an R-matrix.
#' @param missing is only used when input is a dense matrix. By default is set to NA, which means
#' that NA values should be considered as 'missing' by the algorithm.
#' Sometimes, 0 or other extreme value might be used to represent missing values.
#' @param prediction A logical value indicating whether to return the test fold predictions
#' from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback.
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in cross validation,
#' when it is not specified, the evaluation metric is chosen according to objective function.
xgboost/R-package/R/xgb.cv.R view on Meta::CPAN
#' \itemize{
#' \item \code{call} a function call.
#' \item \code{params} parameters that were passed to the xgboost library. Note that it does not
#' capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
#' \item \code{callbacks} callback functions that were either automatically assigned or
#' explicitely passed.
#' \item \code{evaluation_log} evaluation history storead as a \code{data.table} with the
#' first column corresponding to iteration number and the rest corresponding to the
#' CV-based evaluation means and standard deviations for the training and test CV-sets.
#' It is created by the \code{\link{cb.evaluation.log}} callback.
#' \item \code{niter} number of boosting iterations.
#' \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds}
#' parameter or randomly generated.
#' \item \code{best_iteration} iteration number with the best evaluation metric value
#' (only available with early stopping).
#' \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
#' which could further be used in \code{predict} method
#' (only available with early stopping).
#' \item \code{pred} CV prediction values available when \code{prediction} is set.
#' It is either vector or matrix (see \code{\link{cb.cv.predict}}).
#' \item \code{models} a liost of the CV folds' models. It is only available with the explicit
xgboost/R-package/R/xgb.cv.R view on Meta::CPAN
dall <- xgb.get.DMatrix(data, label, missing)
bst_folds <- lapply(seq_along(folds), function(k) {
dtest <- slice(dall, folds[[k]])
dtrain <- slice(dall, unlist(folds[-k]))
handle <- xgb.Booster.handle(params, list(dtrain, dtest))
list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test=dtest), index = folds[[k]])
})
# a "basket" to collect some results from callbacks
basket <- list()
# extract parameters that can affect the relationship b/w #trees and #iterations
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
# those are fixed for CV (no training continuation)
begin_iteration <- 1
end_iteration <- nrounds
# synchronous CV boosting: run CV folds' models within each iteration
for (iteration in begin_iteration:end_iteration) {
xgboost/R-package/R/xgb.plot.deepness.R view on Meta::CPAN
#' \itemize{
#' \item the distribution of the number of leafs in a tree model at a certain depth;
#' \item the distribution of average weighted number of observations ("cover")
#' ending up in leafs at certain depth.
#' }
#' Those could be helpful in determining sensible ranges of the \code{max_depth}
#' and \code{min_child_weight} parameters.
#'
#' When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth
#' per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how
#' a tree's median absolute leaf weight changes through the iterations.
#'
#' This function was inspired by the blog post
#' \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}.
#'
#' @return
#'
#' Other than producing plots (when \code{plot=TRUE}), the \code{xgb.plot.deepness} function
#' silently returns a processed data.table where each row corresponds to a terminal leaf in a tree model,
#' and contains information about leaf's depth, cover, and weight (which is used in calculating predictions).
#'
xgboost/R-package/R/xgb.train.R view on Meta::CPAN
#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
#' \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each cl...
#' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
#' }
#' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
#' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ...
#' }
#'
#' @param data training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
#' \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.
#' @param nrounds max number of boosting iterations.
#' @param watchlist named list of xgb.DMatrix datasets to use for evaluating model performance.
#' Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
#' of these datasets during each boosting iteration, and stored in the end as a field named
#' \code{evaluation_log} in the resulting object. When either \code{verbose>=1} or
#' \code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously
#' printed out during the training.
#' E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows to track
#' the performance of each round's model on mat1 and mat2.
#' @param obj customized objective function. Returns gradient and second order
#' gradient with given prediction and dtrain.
xgboost/R-package/R/xgb.train.R view on Meta::CPAN
#' \item \code{cb.evaluation.log} is on when \code{watchlist} is present.
#' \item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
#' \item \code{cb.save.model}: when \code{save_period > 0} is set.
#' }
#'
#' @return
#' An object of class \code{xgb.Booster} with the following elements:
#' \itemize{
#' \item \code{handle} a handle (pointer) to the xgboost model in memory.
#' \item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type.
#' \item \code{niter} number of boosting iterations.
#' \item \code{evaluation_log} evaluation history storead as a \code{data.table} with the
#' first column corresponding to iteration number and the rest corresponding to evaluation
#' metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback.
#' \item \code{call} a function call.
#' \item \code{params} parameters that were passed to the xgboost library. Note that it does not
#' capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
#' \item \code{callbacks} callback functions that were either automatically assigned or
#' explicitely passed.
#' \item \code{best_iteration} iteration number with the best evaluation metric value
#' (only available with early stopping).
xgboost/R-package/R/xgb.train.R view on Meta::CPAN
# Sort the callbacks into categories
cb <- categorize.callbacks(callbacks)
# The tree updating process would need slightly different handling
is_update <- NVL(params[['process_type']], '.') == 'update'
# Construct a booster (either a new one or load from xgb_model)
handle <- xgb.Booster.handle(params, append(watchlist, dtrain), xgb_model)
bst <- xgb.handleToBooster(handle)
# extract parameters that can affect the relationship b/w #trees and #iterations
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
# When the 'xgb_model' was set, find out how many boosting iterations it has
niter_init <- 0
if (!is.null(xgb_model)) {
niter_init <- as.numeric(xgb.attr(bst, 'niter')) + 1
if (length(niter_init) == 0) {
niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class)
}
}
if(is_update && nrounds > niter_init)
stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)")
# TODO: distributed code
rank <- 0
niter_skip <- ifelse(is_update, 0, niter_init)
begin_iteration <- niter_skip + 1
end_iteration <- niter_skip + nrounds
# the main loop for boosting iterations
for (iteration in begin_iteration:end_iteration) {
for (f in cb$pre_iter) f()
xgb.iter.update(bst$handle, dtrain, iteration - 1, obj)
bst_evaluation <- numeric(0)
if (length(watchlist) > 0)
bst_evaluation <- xgb.iter.eval(bst$handle, watchlist, iteration - 1, feval)
xgb.attr(bst$handle, 'niter') <- iteration - 1
for (f in cb$post_iter) f()
if (stop_condition) break
}
for (f in cb$finalize) f(finalize = TRUE)
bst <- xgb.Booster.complete(bst, saveraw = TRUE)
# store the total number of boosting iterations
bst$niter = end_iteration
# store the evaluation results
if (length(evaluation_log) > 0 &&
nrow(evaluation_log) > 0) {
# include the previous compatible history when available
if (inherits(xgb_model, 'xgb.Booster') &&
!is_update &&
!is.null(xgb_model$evaluation_log) &&
all.equal(colnames(evaluation_log),
xgboost/R-package/man/callbacks.Rd view on Meta::CPAN
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/callbacks.R
\name{callbacks}
\alias{callbacks}
\title{Callback closures for booster training.}
\description{
These are used to perform various service tasks either during boosting iterations or at the end.
This approach helps to modularize many of such tasks without bloating the main training methods,
and it offers .
}
\details{
By default, a callback function is run after each boosting iteration.
An R-attribute \code{is_pre_iteration} could be set for a callback to define a pre-iteration function.
When a callback function has \code{finalize} parameter, its finalizer part will also be run after
the boosting is completed.
xgboost/R-package/man/cb.print.evaluation.Rd view on Meta::CPAN
}
\arguments{
\item{period}{results would be printed every number of periods}
\item{showsd}{whether standard deviations should be printed (when available)}
}
\description{
Callback closure for printing the result of evaluation
}
\details{
The callback function prints the result of evaluation at every \code{period} iterations.
The initial and the last iteration's evaluations are always printed.
Callback function expects the following values to be set in its calling frame:
\code{bst_evaluation} (also \code{bst_evaluation_err} when available),
\code{iteration},
\code{begin_iteration},
\code{end_iteration}.
}
\seealso{
\code{\link{callbacks}}
xgboost/R-package/man/cb.save.model.Rd view on Meta::CPAN
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/callbacks.R
\name{cb.save.model}
\alias{cb.save.model}
\title{Callback closure for saving a model file.}
\usage{
cb.save.model(save_period = 0, save_name = "xgboost.model")
}
\arguments{
\item{save_period}{save the model to disk after every
\code{save_period} iterations; 0 means save the model at the end.}
\item{save_name}{the name or path for the saved model file.
It can contain a \code{\link[base]{sprintf}} formatting specifier
to include the integer iteration number in the file name.
E.g., with \code{save_name} = 'xgboost_%04d.model',
the file saved at iteration 50 would be named "xgboost_0050.model".}
}
\description{
Callback closure for saving a model file.
}
xgboost/R-package/man/predict.xgb.Booster.Rd view on Meta::CPAN
}
\arguments{
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}}
\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.}
\item{missing}{Missing is only used when input is dense matrix. Pick a float value that represents
missing values in data (e.g., sometimes 0 or some other extreme value is used).}
\item{outputmargin}{whether the prediction should be returned in the for of original untransformed
sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
logistic regression would result in predictions for log-odds instead of probabilities.}
\item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
It will use all the trees by default (\code{NULL} value).}
\item{predleaf}{whether predict leaf index instead.}
\item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).}
\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}
\item{...}{Parameters passed to \code{predict.xgb.Booster}}
xgboost/R-package/man/predict.xgb.Booster.Rd view on Meta::CPAN
When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with
\code{num_features + 1} columns. The last "+ 1" column in a matrix corresponds to bias.
For a multiclass case, a list of \code{num_class} elements is returned, where each element is
such a matrix. The contribution values are on the scale of untransformed margin
(e.g., for binary classification would mean that the contributions are log-odds deviations from bias).
}
\description{
Predicted values based on either xgboost model or model handle object.
}
\details{
Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
and it is not necessarily equal to the number of trees in a model.
E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
But for multiclass classification, while there are multiple trees per iteration,
\code{ntreelimit} limits the number of boosting iterations.
Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
since gblinear doesn't keep its boosting history.
One possible practical applications of the \code{predleaf} option is to use the model
as a generator of new features which capture non-linearity and interactions,
e.g., as implemented in \code{\link{xgb.create.features}}.
Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to
individual predictions. For "gblinear" booster, feature contributions are simply linear terms
xgboost/R-package/man/predict.xgb.Booster.Rd view on Meta::CPAN
sum(pred_labels != lb)/length(lb)
# compare that to the predictions from softmax:
set.seed(11)
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
objective = "multi:softmax", num_class = num_class)
pred <- predict(bst, as.matrix(iris[, -5]))
str(pred)
all.equal(pred, pred_labels)
# prediction from using only 5 iterations should result
# in the same error as seen in iteration 5:
pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
sum(pred5 != lb)/length(lb)
## random forest-like model of 25 trees for binary classification:
set.seed(11)
bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
nthread = 2, nrounds = 1, objective = "binary:logistic",
xgboost/R-package/man/xgb.cv.Rd view on Meta::CPAN
\item \code{eta} step size of each boosting step
\item \code{max_depth} maximum depth of the tree
\item \code{nthread} number of thread used in training, if not set, all threads are used
}
See \code{\link{xgb.train}} for further details.
See also demo/ for walkthrough example in R.}
\item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.}
\item{nrounds}{the max number of iterations}
\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
\item{label}{vector of response values. Should be provided only when data is an R-matrix.}
\item{missing}{is only used when input is a dense matrix. By default is set to NA, which means
that NA values should be considered as 'missing' by the algorithm.
Sometimes, 0 or other extreme value might be used to represent missing values.}
\item{prediction}{A logical value indicating whether to return the test fold predictions
xgboost/R-package/man/xgb.cv.Rd view on Meta::CPAN
\itemize{
\item \code{call} a function call.
\item \code{params} parameters that were passed to the xgboost library. Note that it does not
capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
\item \code{callbacks} callback functions that were either automatically assigned or
explicitely passed.
\item \code{evaluation_log} evaluation history storead as a \code{data.table} with the
first column corresponding to iteration number and the rest corresponding to the
CV-based evaluation means and standard deviations for the training and test CV-sets.
It is created by the \code{\link{cb.evaluation.log}} callback.
\item \code{niter} number of boosting iterations.
\item \code{folds} the list of CV folds' indices - either those passed through the \code{folds}
parameter or randomly generated.
\item \code{best_iteration} iteration number with the best evaluation metric value
(only available with early stopping).
\item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
which could further be used in \code{predict} method
(only available with early stopping).
\item \code{pred} CV prediction values available when \code{prediction} is set.
It is either vector or matrix (see \code{\link{cb.cv.predict}}).
\item \code{models} a liost of the CV folds' models. It is only available with the explicit
xgboost/R-package/man/xgb.plot.deepness.Rd view on Meta::CPAN
\itemize{
\item the distribution of the number of leafs in a tree model at a certain depth;
\item the distribution of average weighted number of observations ("cover")
ending up in leafs at certain depth.
}
Those could be helpful in determining sensible ranges of the \code{max_depth}
and \code{min_child_weight} parameters.
When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth
per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how
a tree's median absolute leaf weight changes through the iterations.
This function was inspired by the blog post
\url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}.
}
\examples{
data(agaricus.train, package='xgboost')
# Change max_depth to a higher number to get a more significant result
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,
xgboost/R-package/man/xgb.train.Rd view on Meta::CPAN
\item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class...
\item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
}
\item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
\item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ran...
}}
\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.}
\item{nrounds}{max number of boosting iterations.}
\item{watchlist}{named list of xgb.DMatrix datasets to use for evaluating model performance.
Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
of these datasets during each boosting iteration, and stored in the end as a field named
\code{evaluation_log} in the resulting object. When either \code{verbose>=1} or
\code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously
printed out during the training.
E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows to track
the performance of each round's model on mat1 and mat2.}
xgboost/R-package/man/xgb.train.Rd view on Meta::CPAN
by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
This parameter is only used when input is a dense matrix.}
\item{weight}{a vector indicating the weight for each row of the input.}
}
\value{
An object of class \code{xgb.Booster} with the following elements:
\itemize{
\item \code{handle} a handle (pointer) to the xgboost model in memory.
\item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type.
\item \code{niter} number of boosting iterations.
\item \code{evaluation_log} evaluation history storead as a \code{data.table} with the
first column corresponding to iteration number and the rest corresponding to evaluation
metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback.
\item \code{call} a function call.
\item \code{params} parameters that were passed to the xgboost library. Note that it does not
capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
\item \code{callbacks} callback functions that were either automatically assigned or
explicitely passed.
\item \code{best_iteration} iteration number with the best evaluation metric value
(only available with early stopping).
xgboost/R-package/tests/testthat/test_basic.R view on Meta::CPAN
test_that("train and predict RF with softprob", {
lb <- as.numeric(iris$Species) - 1
nrounds <- 15
set.seed(11)
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds,
objective = "multi:softprob", num_class=3, verbose = 0,
num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5)
expect_equal(bst$niter, 15)
expect_equal(xgb.ntree(bst), 15*3*4)
# predict for all iterations:
pred <- predict(bst, as.matrix(iris[, -5]), reshape=TRUE)
expect_equal(dim(pred), c(nrow(iris), 3))
pred_labels <- max.col(pred) - 1
err <- sum(pred_labels != lb)/length(lb)
expect_equal(bst$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
# predict for 7 iterations and adjust for 4 parallel trees per iteration
pred <- predict(bst, as.matrix(iris[, -5]), reshape=TRUE, ntreelimit = 7 * 4)
err <- sum((max.col(pred) - 1) != lb)/length(lb)
expect_equal(bst$evaluation_log[7, train_merror], err, tolerance = 5e-6)
})
test_that("use of multiple eval metrics works", {
expect_output(
bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic",
eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")
xgboost/R-package/tests/testthat/test_basic.R view on Meta::CPAN
expect_equal(dim(bst$evaluation_log), c(2, 4))
expect_equal(colnames(bst$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
})
test_that("training continuation works", {
dtrain <- xgb.DMatrix(train$data, label = train$label)
watchlist = list(train=dtrain)
param <- list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2)
# for the reference, use 4 iterations at once:
set.seed(11)
bst <- xgb.train(param, dtrain, nrounds = 4, watchlist, verbose = 0)
# first two iterations:
set.seed(11)
bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
# continue for two more:
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
if (!windows_flag)
expect_equal(bst$raw, bst2$raw)
expect_false(is.null(bst2$evaluation_log))
expect_equal(dim(bst2$evaluation_log), c(4, 2))
expect_equal(bst2$evaluation_log, bst$evaluation_log)
# test continuing from raw model data
xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
/// Verbose output
bool g_verbose = false;
/// Timing iterations
int g_timing_iterations = 100;
/// Default grid size
int g_grid_size = 1;
/// Uniform key samples
bool g_uniform_keys;
//---------------------------------------------------------------------
// Kernels
xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN
printf("\n\n");
}
// Kernel props
int max_sm_occupancy;
CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD>, BLOCK_THREADS));
// Copy problem to device
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(Key) * TILE_SIZE * g_grid_size, cudaMemcpyHostToDevice));
printf("BlockRadixSort %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
TILE_SIZE * g_grid_size, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
fflush(stdout);
// Run kernel once to prime caches and check result
BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
// Check for kernel errors and STDIO from the kernel, if any
CubDebugExit(cudaPeekAtLastError());
xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN
int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
fflush(stdout);
// Run this several times and average the performance results
GpuTimer timer;
float elapsed_millis = 0.0;
unsigned long long elapsed_clocks = 0;
for (int i = 0; i < g_timing_iterations; ++i)
{
timer.Start();
// Run kernel
BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
timer.Stop();
xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN
// Copy clocks from device
CubDebugExit(cudaMemcpy(h_elapsed, d_elapsed, sizeof(clock_t) * g_grid_size, cudaMemcpyDeviceToHost));
for (int i = 0; i < g_grid_size; i++)
elapsed_clocks += h_elapsed[i];
}
// Check for kernel errors and STDIO from the kernel, if any
CubDebugExit(cudaDeviceSynchronize());
// Display timing results
float avg_millis = elapsed_millis / g_timing_iterations;
float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
double avg_clocks = double(elapsed_clocks) / g_timing_iterations / g_grid_size;
double avg_clocks_per_item = avg_clocks / TILE_SIZE;
printf("\tAverage BlockRadixSort::SortBlocked clocks: %.3f\n", avg_clocks);
printf("\tAverage BlockRadixSort::SortBlocked clocks per item: %.3f\n", avg_clocks_per_item);
printf("\tAverage kernel millis: %.4f\n", avg_millis);
printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
fflush(stdout);
// Cleanup
if (h_in) delete[] h_in;
xgboost/cub/examples/block/example_block_radix_sort.cu view on Meta::CPAN
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
g_uniform_keys = args.CheckCmdLineFlag("uniform");
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("grid-size", g_grid_size);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--i=<timing iterations (default:%d)>]"
"[--grid-size=<grid size (default:%d)>]"
"[--v] "
"\n", argv[0], g_timing_iterations, g_grid_size);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
fflush(stdout);
// Run tests
printf("\nuint32:\n"); fflush(stdout);
Test<unsigned int, 128, 13>();
xgboost/cub/examples/block/example_block_reduce.cu view on Meta::CPAN
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
/// Verbose output
bool g_verbose = false;
/// Timing iterations
int g_timing_iterations = 100;
/// Default grid size
int g_grid_size = 1;
//---------------------------------------------------------------------
// Kernels
//---------------------------------------------------------------------
xgboost/cub/examples/block/example_block_reduce.cu view on Meta::CPAN
printf("\n\n");
}
// Kernel props
int max_sm_occupancy;
CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));
// Copy problem to device
cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
printf("BlockReduce algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
(ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : "BLOCK_REDUCE_WARP_REDUCTIONS",
TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
// Run aggregate/prefix kernel
BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
// Check total aggregate
printf("\tAggregate: ");
int compare = CompareDeviceResults(&h_aggregate, d_out, 1, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Run this several times and average the performance results
GpuTimer timer;
float elapsed_millis = 0.0;
clock_t elapsed_clocks = 0;
for (int i = 0; i < g_timing_iterations; ++i)
{
// Copy problem to device
cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
timer.Start();
// Run aggregate/prefix kernel
BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
xgboost/cub/examples/block/example_block_reduce.cu view on Meta::CPAN
CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
elapsed_clocks += clocks;
}
// Check for kernel errors and STDIO from the kernel, if any
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
// Display timing results
float avg_millis = elapsed_millis / g_timing_iterations;
float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
float avg_clocks = float(elapsed_clocks) / g_timing_iterations;
float avg_clocks_per_item = avg_clocks / TILE_SIZE;
printf("\tAverage BlockReduce::Sum clocks: %.3f\n", avg_clocks);
printf("\tAverage BlockReduce::Sum clocks per item: %.3f\n", avg_clocks_per_item);
printf("\tAverage kernel millis: %.4f\n", avg_millis);
printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
// Cleanup
if (h_in) delete[] h_in;
if (h_gpu) delete[] h_gpu;
xgboost/cub/examples/block/example_block_reduce.cu view on Meta::CPAN
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("grid-size", g_grid_size);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--i=<timing iterations>] "
"[--grid-size=<grid size>] "
"[--v] "
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// Run tests
xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
/// Verbose output
bool g_verbose = false;
/// Timing iterations
int g_timing_iterations = 100;
/// Default grid size
int g_grid_size = 1;
//---------------------------------------------------------------------
// Kernels
//---------------------------------------------------------------------
xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN
printf("\n\n");
}
// Kernel props
int max_sm_occupancy;
CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));
// Copy problem to device
cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
printf("BlockScan algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
(ALGORITHM == BLOCK_SCAN_RAKING) ? "BLOCK_SCAN_RAKING" : (ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE) ? "BLOCK_SCAN_RAKING_MEMOIZE" : "BLOCK_SCAN_WARP_SCANS",
TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
// Run aggregate/prefix kernel
BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
// Check results
printf("\tOutput items: ");
int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN
printf("\tAggregate: ");
compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Run this several times and average the performance results
GpuTimer timer;
float elapsed_millis = 0.0;
clock_t elapsed_clocks = 0;
for (int i = 0; i < g_timing_iterations; ++i)
{
// Copy problem to device
cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
timer.Start();
// Run aggregate/prefix kernel
BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN
CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
elapsed_clocks += clocks;
}
// Check for kernel errors and STDIO from the kernel, if any
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
// Display timing results
float avg_millis = elapsed_millis / g_timing_iterations;
float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
float avg_clocks = float(elapsed_clocks) / g_timing_iterations;
float avg_clocks_per_item = avg_clocks / TILE_SIZE;
printf("\tAverage BlockScan::Sum clocks: %.3f\n", avg_clocks);
printf("\tAverage BlockScan::Sum clocks per item: %.3f\n", avg_clocks_per_item);
printf("\tAverage kernel millis: %.4f\n", avg_millis);
printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
xgboost/cub/examples/block/example_block_scan.cu view on Meta::CPAN
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("grid-size", g_grid_size);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--i=<timing iterations (default:%d)>]"
"[--grid-size=<grid size (default:%d)>]"
"[--v] "
"\n", argv[0], g_timing_iterations, g_grid_size);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// Run tests
Test<1024, 1, BLOCK_SCAN_RAKING>();
Test<512, 2, BLOCK_SCAN_RAKING>();
Test<256, 4, BLOCK_SCAN_RAKING>();
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN
//---------------------------------------------------------------------
/**
* Main
*/
int main(int argc, char** argv)
{
typedef unsigned int Key;
typedef int Value;
int timing_iterations = 0;
int num_items = 40;
Key max_key = 20; // Max item
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("n", num_items);
args.GetCmdLineArgument("maxkey", max_key);
args.GetCmdLineArgument("i", timing_iterations);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--i=<timing iterations> "
"[--n=<input items, default 40> "
"[--maxkey=<max key, default 20 (use -1 to test only unique keys)>]"
"[--v] "
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN
int num_runs = Solve(h_keys, h_values, num_items, h_offsets_reference, h_lengths_reference);
printf("%d non-trivial runs\n", num_runs);
fflush(stdout);
// Repeat for performance timing
GpuTimer gpu_timer;
GpuTimer gpu_rle_timer;
float elapsed_millis = 0.0;
float elapsed_rle_millis = 0.0;
for (int i = 0; i <= timing_iterations; ++i)
{
// Allocate and initialize device arrays for sorting
DoubleBuffer<Key> d_keys;
DoubleBuffer<Value> d_values;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(Key) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(Key) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(Value) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(Value) * num_items));
xgboost/cub/examples/device/example_device_sort_find_non_trivial_runs.cu view on Meta::CPAN
}
// Host cleanup
if (h_keys) delete[] h_keys;
if (h_values) delete[] h_values;
if (h_offsets_reference) delete[] h_offsets_reference;
if (h_lengths_reference) delete[] h_lengths_reference;
printf("\n\n");
if (timing_iterations > 0)
{
printf("%d timing iterations, average time to sort and isolate non-trivial duplicates: %.3f ms (%.3f ms spent in RLE isolation)\n",
timing_iterations,
elapsed_millis / timing_iterations,
elapsed_rle_millis / timing_iterations);
}
return 0;
}
xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN
/******************************************************************************
* Globals, constants, and typedefs
******************************************************************************/
typedef int VertexId; // uint32s as vertex ids
typedef double Value; // double-precision floating point values
bool g_verbose = false;
int g_timing_iterations = 1;
CachingDeviceAllocator g_allocator;
/******************************************************************************
* Texture referencing
******************************************************************************/
/**
* Templated texture reference type for multiplicand vector
*/
xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN
{
printf("CooFinalizeKernel<<<1, %d>>>(...)\n", FINALIZE_BLOCK_THREADS);
}
fflush(stdout);
CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
// Run kernel (always run one iteration without timing)
GpuTimer gpu_timer;
float elapsed_millis = 0.0;
for (int i = 0; i <= g_timing_iterations; i++)
{
gpu_timer.Start();
// Initialize output
CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));
// Run the COO kernel
CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
even_share,
d_block_partials,
xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN
if (i > 0)
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Force any kernel stdio to screen
CubDebugExit(cudaThreadSynchronize());
fflush(stdout);
// Display timing
if (g_timing_iterations > 0)
{
float avg_elapsed = elapsed_millis / g_timing_iterations;
int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim);
printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n",
g_timing_iterations,
avg_elapsed,
total_bytes / avg_elapsed / 1000.0 / 1000.0,
num_edges * 2 / avg_elapsed / 1000.0 / 1000.0);
}
// Check results
int compare = CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, true, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
xgboost/cub/experimental/defunct/example_coo_spmv.cu view on Meta::CPAN
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("i", g_timing_iterations);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s\n [--device=<device-id>] [--v] [--iterations=<test iterations>] [--grid-size=<grid-size>]\n"
"\t--type=wheel --spokes=<spokes>\n"
"\t--type=grid2d --width=<width> [--no-self-loops]\n"
"\t--type=grid3d --width=<width> [--no-self-loops]\n"
"\t--type=market --file=<file>\n"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
xgboost/cub/experimental/defunct/test_device_seg_reduce.cu view on Meta::CPAN
using namespace cub;
using namespace std;
/******************************************************************************
* Globals, constants, and typedefs
******************************************************************************/
bool g_verbose = false;
int g_timing_iterations = 1;
CachingDeviceAllocator g_allocator(true);
/******************************************************************************
* Utility routines
******************************************************************************/
/**
* An pair of index offsets
xgboost/cub/experimental/defunct/test_device_seg_reduce.cu view on Meta::CPAN
int compare = CompareDeviceResults(h_reference, d_output, num_segments, true, g_verbose);
printf("\t%s", compare ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
for (int i = 0; i < g_timing_iterations; ++i)
{
CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
}
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_values) / avg_millis / 1000.0 / 1000.0;
float giga_bandwidth = giga_rate *
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
}
// Device cleanup
if (d_values) CubDebugExit(g_allocator.DeviceFree(d_values));
if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
if (d_output) CubDebugExit(g_allocator.DeviceFree(d_output));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
xgboost/cub/experimental/defunct/test_device_seg_reduce.cu view on Meta::CPAN
int main(int argc, char** argv)
{
int num_values = 32 * 1024 * 1024;
int avg_segment_size = 500;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("n", num_values);
args.GetCmdLineArgument("ss", avg_segment_size);
args.GetCmdLineArgument("i", g_timing_iterations);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--v] "
"[--i=<timing iterations>] "
"[--n=<input samples>]\n"
"[--ss=<average segment size>]\n"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
Test<false>((int) num_values, avg_segment_size, Sum(), (long long) 0, CUB_TYPE_STRING(long long));
xgboost/cub/experimental/histogram_compare.cu view on Meta::CPAN
int ACTIVE_CHANNELS,
int NUM_BINS,
typename PixelType>
void RunTest(
std::vector<std::pair<std::string, double> >& timings,
PixelType* d_pixels,
const int width,
const int height,
unsigned int * d_hist,
unsigned int * h_hist,
int timing_iterations,
const char * long_name,
const char * short_name,
double (*f)(PixelType*, int, int, unsigned int*, bool))
{
if (!g_report) printf("%s ", long_name); fflush(stdout);
// Run single test to verify (and code cache)
(*f)(d_pixels, width, height, d_hist, !g_report);
int compare = CompareDeviceResults(h_hist, d_hist, ACTIVE_CHANNELS * NUM_BINS, true, g_verbose);
if (!g_report) printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
double elapsed_ms = 0;
for (int i = 0; i < timing_iterations; i++)
{
elapsed_ms += (*f)(d_pixels, width, height, d_hist, false);
}
double avg_us = (elapsed_ms / timing_iterations) * 1000; // average in us
timings.push_back(std::pair<std::string, double>(short_name, avg_us));
if (!g_report)
{
printf("Avg time %.3f us (%d iterations)\n", avg_us, timing_iterations); fflush(stdout);
}
else
{
printf("%.3f, ", avg_us); fflush(stdout);
}
AssertEquals(0, compare);
}
xgboost/cub/experimental/histogram_compare.cu view on Meta::CPAN
*/
template <
int NUM_CHANNELS,
int ACTIVE_CHANNELS,
int NUM_BINS,
typename PixelType>
void TestMethods(
PixelType* h_pixels,
int height,
int width,
int timing_iterations,
double bandwidth_GBs)
{
// Copy data to gpu
PixelType* d_pixels;
size_t pixel_bytes = width * height * sizeof(PixelType);
CubDebugExit(g_allocator.DeviceAllocate((void**) &d_pixels, pixel_bytes));
CubDebugExit(cudaMemcpy(d_pixels, h_pixels, pixel_bytes, cudaMemcpyHostToDevice));
if (g_report) printf("%.3f, ", double(pixel_bytes) / bandwidth_GBs / 1000);
xgboost/cub/experimental/histogram_compare.cu view on Meta::CPAN
h_hist = (unsigned int *) malloc(histogram_bytes);
g_allocator.DeviceAllocate((void **) &d_hist, histogram_bytes);
// Compute reference cpu histogram
HistogramGold<ACTIVE_CHANNELS, NUM_BINS>(h_pixels, width, height, h_hist);
// Store timings
std::vector<std::pair<std::string, double> > timings;
// Run experiments
RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
"CUB", "CUB", run_cub_histogram<NUM_CHANNELS, ACTIVE_CHANNELS, NUM_BINS, PixelType>);
RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
"Shared memory atomics", "smem atomics", run_smem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
"Global memory atomics", "gmem atomics", run_gmem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
// Report timings
if (!g_report)
{
std::sort(timings.begin(), timings.end(), less_than_value());
printf("Timings (us):\n");
for (int i = 0; i < timings.size(); i++)
{
double bandwidth = height * width * sizeof(PixelType) / timings[i].second / 1000;
xgboost/cub/experimental/histogram_compare.cu view on Meta::CPAN
}
/**
* Test different problem genres
*/
void TestGenres(
uchar4* uchar4_pixels,
int height,
int width,
int timing_iterations,
double bandwidth_GBs)
{
int num_pixels = width * height;
{
if (!g_report) printf("1 channel uchar1 tests (256-bin):\n\n"); fflush(stdout);
size_t image_bytes = num_pixels * sizeof(uchar1);
uchar1* uchar1_pixels = (uchar1*) malloc(image_bytes);
// Convert to 1-channel (averaging first 3 channels)
for (int i = 0; i < num_pixels; ++i)
{
uchar1_pixels[i].x = (unsigned char)
(((unsigned int) uchar4_pixels[i].x +
(unsigned int) uchar4_pixels[i].y +
(unsigned int) uchar4_pixels[i].z) / 3);
}
TestMethods<1, 1, 256>(uchar1_pixels, width, height, timing_iterations, bandwidth_GBs);
free(uchar1_pixels);
if (g_report) printf(", ");
}
{
if (!g_report) printf("3/4 channel uchar4 tests (256-bin):\n\n"); fflush(stdout);
TestMethods<4, 3, 256>(uchar4_pixels, width, height, timing_iterations, bandwidth_GBs);
if (g_report) printf(", ");
}
{
if (!g_report) printf("3/4 channel float4 tests (256-bin):\n\n"); fflush(stdout);
size_t image_bytes = num_pixels * sizeof(float4);
float4* float4_pixels = (float4*) malloc(image_bytes);
// Convert to float4 with range [0.0, 1.0)
for (int i = 0; i < num_pixels; ++i)
{
float4_pixels[i].x = float(uchar4_pixels[i].x) / 256;
float4_pixels[i].y = float(uchar4_pixels[i].y) / 256;
float4_pixels[i].z = float(uchar4_pixels[i].z) / 256;
float4_pixels[i].w = float(uchar4_pixels[i].w) / 256;
}
TestMethods<4, 3, 256>(float4_pixels, width, height, timing_iterations, bandwidth_GBs);
free(float4_pixels);
if (g_report) printf("\n");
}
}
/**
* Main
*/
int main(int argc, char **argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
if (args.CheckCmdLineFlag("help"))
{
printf(
"%s "
"[--device=<device-id>] "
"[--v] "
"[--i=<timing iterations>] "
"\n\t"
"--file=<.tga filename> "
"\n\t"
"--entropy=<-1 (0%), 0 (100%), 1 (81%), 2 (54%), 3 (34%), 4 (20%), ..."
"[--height=<default: 1080>] "
"[--width=<default: 1920>] "
"\n", argv[0]);
exit(0);
}
std::string filename;
int timing_iterations = 100;
int entropy_reduction = 0;
int height = 1080;
int width = 1920;
g_verbose = args.CheckCmdLineFlag("v");
g_report = args.CheckCmdLineFlag("report");
args.GetCmdLineArgument("i", timing_iterations);
args.GetCmdLineArgument("file", filename);
args.GetCmdLineArgument("height", height);
args.GetCmdLineArgument("width", width);
args.GetCmdLineArgument("entropy", entropy_reduction);
// Initialize device
CubDebugExit(args.DeviceInit());
// Get GPU device bandwidth (GB/s)
int device_ordinal, bus_width, mem_clock_khz;
xgboost/cub/experimental/histogram_compare.cu view on Meta::CPAN
ReadTga(uchar4_pixels, width, height, filename.c_str());
printf("File %s: width(%d) height(%d)\n\n", filename.c_str(), width, height); fflush(stdout);
}
else
{
// Generate image
GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
printf("Random image: entropy-reduction(%d) width(%d) height(%d)\n\n", entropy_reduction, width, height); fflush(stdout);
}
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
}
else
{
// Run test suite
printf("Test, MIN, RLE CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM\n");
// Entropy reduction tests
for (entropy_reduction = 0; entropy_reduction < 5; ++entropy_reduction)
{
printf("entropy reduction %d, ", entropy_reduction);
GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
}
printf("entropy reduction -1, ");
GenerateRandomImage(uchar4_pixels, width, height, -1);
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
printf("\n");
// File image tests
std::vector<std::string> file_tests;
file_tests.push_back("animals");
file_tests.push_back("apples");
file_tests.push_back("sunset");
file_tests.push_back("cheetah");
file_tests.push_back("nature");
file_tests.push_back("operahouse");
file_tests.push_back("austin");
file_tests.push_back("cityscape");
for (int i = 0; i < file_tests.size(); ++i)
{
printf("%s, ", file_tests[i].c_str());
std::string filename = std::string("histogram/benchmark/") + file_tests[i] + ".tga";
ReadTga(uchar4_pixels, width, height, filename.c_str());
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
}
}
free(uchar4_pixels);
CubDebugExit(cudaDeviceSynchronize());
printf("\n\n");
return 0;
}
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
/**
* Run GPU I/O proxy
*/
template <
typename ValueT,
typename OffsetT>
float TestGpuCsrIoProxy(
SpmvParams<ValueT, OffsetT>& params,
int timing_iterations)
{
enum {
BLOCK_THREADS = 128,
ITEMS_PER_THREAD = 7,
TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD,
};
// size_t smem = 1024 * 16;
size_t smem = 1024 * 0;
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
// Check for failures
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(SyncStream(0));
// Timing
GpuTimer timer;
float elapsed_millis = 0.0;
timer.Start();
for (int it = 0; it < timing_iterations; ++it)
{
NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
CubDebugExit(x_itr.UnbindTexture());
return elapsed_millis / timing_iterations;
}
//---------------------------------------------------------------------
// cuSparse HybMV
//---------------------------------------------------------------------
/**
* Run cuSparse HYB SpMV (specialized for fp32)
*/
template <
typename OffsetT>
float TestCusparseHybmv(
float* vector_y_in,
float* reference_vector_y_out,
SpmvParams<float, OffsetT>& params,
int timing_iterations,
cusparseHandle_t cusparse)
{
CpuTimer cpu_timer;
cpu_timer.Start();
// Construct Hyb matrix
cusparseMatDescr_t mat_desc;
cusparseHybMat_t hyb_desc;
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
elapsed_millis = 0.0;
GpuTimer timer;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
cusparse,
CUSPARSE_OPERATION_NON_TRANSPOSE,
¶ms.alpha, mat_desc,
hyb_desc,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
// Cleanup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));
return elapsed_millis / timing_iterations;
}
/**
* Run cuSparse HYB SpMV (specialized for fp64)
*/
template <
typename OffsetT>
float TestCusparseHybmv(
double* vector_y_in,
double* reference_vector_y_out,
SpmvParams<double, OffsetT>& params,
int timing_iterations,
cusparseHandle_t cusparse)
{
CpuTimer cpu_timer;
cpu_timer.Start();
// Construct Hyb matrix
cusparseMatDescr_t mat_desc;
cusparseHybMat_t hyb_desc;
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
elapsed_millis = 0.0;
GpuTimer timer;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
cusparse,
CUSPARSE_OPERATION_NON_TRANSPOSE,
¶ms.alpha, mat_desc,
hyb_desc,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
// Cleanup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));
return elapsed_millis / timing_iterations;
}
//---------------------------------------------------------------------
// cuSparse CsrMV
//---------------------------------------------------------------------
/**
* Run cuSparse SpMV (specialized for fp32)
*/
template <
typename OffsetT>
float TestCusparseCsrmv(
float* vector_y_in,
float* reference_vector_y_out,
SpmvParams<float, OffsetT>& params,
int timing_iterations,
cusparseHandle_t cusparse)
{
cusparseMatDescr_t desc;
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));
// Reset input/output vector y
CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
// Warmup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
float elapsed_millis = 0.0;
GpuTimer timer;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
params.num_rows, params.num_cols, params.num_nonzeros, ¶ms.alpha, desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
return elapsed_millis / timing_iterations;
}
/**
* Run cuSparse SpMV (specialized for fp64)
*/
template <
typename OffsetT>
float TestCusparseCsrmv(
double* vector_y_in,
double* reference_vector_y_out,
SpmvParams<double, OffsetT>& params,
int timing_iterations,
cusparseHandle_t cusparse)
{
cusparseMatDescr_t desc;
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));
// Reset input/output vector y
CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
// Warmup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
if (!g_quiet)
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
float elapsed_millis = 0.0;
GpuTimer timer;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
params.num_rows, params.num_cols, params.num_nonzeros, ¶ms.alpha, desc,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, ¶ms.beta, params.d_vector_y));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
return elapsed_millis / timing_iterations;
}
//---------------------------------------------------------------------
// GPU Merge-based SpMV
//---------------------------------------------------------------------
/**
* Run CUB SpMV
*/
template <
typename ValueT,
typename OffsetT>
float TestGpuMergeCsrmv(
ValueT* vector_y_in,
ValueT* reference_vector_y_out,
SpmvParams<ValueT, OffsetT>& params,
int timing_iterations)
{
// Allocate temporary storage
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
// Get amount of temporary storage needed
CubDebugExit(DeviceSpmv::CsrMV(
d_temp_storage, temp_storage_bytes,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, params.d_vector_y,
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
{
int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Timing
GpuTimer timer;
float elapsed_millis = 0.0;
timer.Start();
for(int it = 0; it < timing_iterations; ++it)
{
CubDebugExit(DeviceSpmv::CsrMV(
d_temp_storage, temp_storage_bytes,
params.d_values, params.d_row_end_offsets, params.d_column_indices,
params.d_vector_x, params.d_vector_y,
params.num_rows, params.num_cols, params.num_nonzeros,
// params.alpha, params.beta,
(cudaStream_t) 0, false));
}
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
return elapsed_millis / timing_iterations;
}
//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------
/**
* Display perf
*/
template <typename ValueT, typename OffsetT>
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
* Run tests
*/
template <
typename ValueT,
typename OffsetT>
void RunTest(
bool rcm_relabel,
ValueT alpha,
ValueT beta,
CooMatrix<ValueT, OffsetT>& coo_matrix,
int timing_iterations,
CommandLineArgs& args)
{
// Adaptive timing iterations: run 16 billion nonzeros through
if (timing_iterations == -1)
timing_iterations = std::min(50000ull, std::max(100ull, ((16ull << 30) / coo_matrix.num_nonzeros)));
if (!g_quiet)
printf("\t%d timing iterations\n", timing_iterations);
// Convert to CSR
CsrMatrix<ValueT, OffsetT> csr_matrix;
csr_matrix.FromCoo(coo_matrix);
if (!args.CheckCmdLineFlag("csrmv"))
coo_matrix.Clear();
// Relabel
if (rcm_relabel)
{
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
params.alpha = alpha;
params.beta = beta;
CubDebugExit(cudaMemcpy(params.d_values, csr_matrix.values, sizeof(ValueT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(params.d_row_end_offsets, csr_matrix.row_offsets, sizeof(OffsetT) * (csr_matrix.num_rows + 1), cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(params.d_column_indices, csr_matrix.column_indices, sizeof(OffsetT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(params.d_vector_x, vector_x, sizeof(ValueT) * csr_matrix.num_cols, cudaMemcpyHostToDevice));
if (!g_quiet) printf("\n\n");
printf("GPU CSR I/O Prox, "); fflush(stdout);
avg_millis = TestGpuCsrIoProxy(params, timing_iterations);
DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
if (args.CheckCmdLineFlag("csrmv"))
{
if (!g_quiet) printf("\n\n");
printf("CUB, "); fflush(stdout);
avg_millis = TestGpuMergeCsrmv(vector_y_in, vector_y_out, params, timing_iterations);
DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
}
// Initialize cuSparse
cusparseHandle_t cusparse;
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreate(&cusparse));
if (args.CheckCmdLineFlag("csrmv"))
{
if (!g_quiet) printf("\n\n");
printf("Cusparse CsrMV, "); fflush(stdout);
avg_millis = TestCusparseCsrmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
}
if (args.CheckCmdLineFlag("hybmv"))
{
if (!g_quiet) printf("\n\n");
printf("Cusparse HybMV, "); fflush(stdout);
avg_millis = TestCusparseHybmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
}
// Cleanup
if (params.d_values) CubDebugExit(g_allocator.DeviceFree(params.d_values));
if (params.d_row_end_offsets) CubDebugExit(g_allocator.DeviceFree(params.d_row_end_offsets));
if (params.d_column_indices) CubDebugExit(g_allocator.DeviceFree(params.d_column_indices));
if (params.d_vector_x) CubDebugExit(g_allocator.DeviceFree(params.d_vector_x));
if (params.d_vector_y) CubDebugExit(g_allocator.DeviceFree(params.d_vector_y));
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
typename OffsetT>
void RunTests(
bool rcm_relabel,
ValueT alpha,
ValueT beta,
const std::string& mtx_filename,
int grid2d,
int grid3d,
int wheel,
int dense,
int timing_iterations,
CommandLineArgs& args)
{
// Initialize matrix in COO form
CooMatrix<ValueT, OffsetT> coo_matrix;
if (!mtx_filename.empty())
{
// Parse matrix market file
printf("%s, ", mtx_filename.c_str()); fflush(stdout);
coo_matrix.InitMarket(mtx_filename, 1.0, !g_quiet);
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
{
fprintf(stderr, "No graph type specified.\n");
exit(1);
}
RunTest(
rcm_relabel,
alpha,
beta,
coo_matrix,
timing_iterations,
args);
}
/**
* Main
*/
int main(int argc, char **argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
if (args.CheckCmdLineFlag("help"))
{
printf(
"%s "
"[--csrmv | --hybmv | --bsrmv ] "
"[--device=<device-id>] "
"[--quiet] "
"[--v] "
"[--i=<timing iterations>] "
"[--fp64] "
"[--rcm] "
"[--alpha=<alpha scalar (default: 1.0)>] "
"[--beta=<beta scalar (default: 0.0)>] "
"\n\t"
"--mtx=<matrix market file> "
"\n\t"
"--dense=<cols>"
"\n\t"
"--grid2d=<width>"
xgboost/cub/experimental/spmv_compare.cu view on Meta::CPAN
exit(0);
}
bool fp64;
bool rcm_relabel;
std::string mtx_filename;
int grid2d = -1;
int grid3d = -1;
int wheel = -1;
int dense = -1;
int timing_iterations = -1;
float alpha = 1.0;
float beta = 0.0;
g_verbose = args.CheckCmdLineFlag("v");
g_verbose2 = args.CheckCmdLineFlag("v2");
g_quiet = args.CheckCmdLineFlag("quiet");
fp64 = args.CheckCmdLineFlag("fp64");
rcm_relabel = args.CheckCmdLineFlag("rcm");
args.GetCmdLineArgument("i", timing_iterations);
args.GetCmdLineArgument("mtx", mtx_filename);
args.GetCmdLineArgument("grid2d", grid2d);
args.GetCmdLineArgument("grid3d", grid3d);
args.GetCmdLineArgument("wheel", wheel);
args.GetCmdLineArgument("dense", dense);
args.GetCmdLineArgument("alpha", alpha);
args.GetCmdLineArgument("beta", beta);
// Initialize device
CubDebugExit(args.DeviceInit());
// Run test(s)
if (fp64)
{
RunTests<double, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
}
else
{
RunTests<float, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
}
CubDebugExit(cudaDeviceSynchronize());
printf("\n");
return 0;
}
xgboost/cub/test/test_allocator.cu view on Meta::CPAN
{
// Initialize command line
CommandLineArgs args(argc, argv);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>]"
"[--bytes=<timing bytes>]"
"[--i=<timing iterations>]"
"\n", argv[0]);
exit(0);
}
#if (CUB_PTX_ARCH == 0)
// Initialize device
CubDebugExit(args.DeviceInit());
// Get number of GPUs and current GPU
int num_gpus;
int initial_gpu;
int timing_iterations = 10000;
int timing_bytes = 1024 * 1024;
if (CubDebug(cudaGetDeviceCount(&num_gpus))) exit(1);
if (CubDebug(cudaGetDevice(&initial_gpu))) exit(1);
args.GetCmdLineArgument("i", timing_iterations);
args.GetCmdLineArgument("bytes", timing_bytes);
// Create default allocator (caches up to 6MB in device allocations per GPU)
CachingDeviceAllocator allocator;
allocator.debug = true;
printf("Running single-gpu tests...\n"); fflush(stdout);
//
// Test0
xgboost/cub/test/test_allocator.cu view on Meta::CPAN
// Check that that still we have 0 live block across all GPUs
AssertEquals(allocator.live_blocks.size(), 0);
}
#endif // CUB_CDP
//
// Performance
//
printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
fflush(stdout); fflush(stderr);
// CPU performance comparisons vs cached. Allocate and free a 1MB block 2000 times
CpuTimer cpu_timer;
char *d_1024MB = NULL;
allocator.debug = false;
// Prime the caching allocator and the kernel
CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
CubDebugExit(allocator.DeviceFree(d_1024MB));
cub::EmptyKernel<void><<<1, 32>>>();
// CUDA
cpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
CubDebugExit(cudaFree(d_1024MB));
}
cpu_timer.Stop();
float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();
// CUB
cpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
CubDebugExit(allocator.DeviceFree(d_1024MB));
}
cpu_timer.Stop();
float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();
printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
cuda_malloc_elapsed_millis / timing_iterations,
cub_calloc_elapsed_millis / timing_iterations);
// GPU performance comparisons. Allocate and free a 1MB block 2000 times
GpuTimer gpu_timer;
printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
fflush(stdout); fflush(stderr);
// Kernel-only
gpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
cub::EmptyKernel<void><<<1, 32>>>();
}
gpu_timer.Stop();
float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();
// CUDA
gpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
cub::EmptyKernel<void><<<1, 32>>>();
CubDebugExit(cudaFree(d_1024MB));
}
gpu_timer.Stop();
cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
// CUB
gpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
cub::EmptyKernel<void><<<1, 32>>>();
CubDebugExit(allocator.DeviceFree(d_1024MB));
}
gpu_timer.Stop();
cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
cuda_malloc_elapsed_millis / timing_iterations,
cub_calloc_elapsed_millis / timing_iterations);
#endif
printf("Success\n");
return 0;
}
xgboost/cub/test/test_block_histogram.cu view on Meta::CPAN
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
int g_timing_iterations = 0;
int g_repeat = 0;
CachingDeviceAllocator g_allocator(true);
//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------
/**
* BlockHistogram test kernel.
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
enum Backend
{
CUB, // CUB method
NPP, // NPP method
CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
};
bool g_verbose_input = false;
bool g_verbose = false;
int g_timing_iterations = 0;
int g_repeat = 0;
CachingDeviceAllocator g_allocator(true);
//---------------------------------------------------------------------
// Dispatch to NPP histogram
//---------------------------------------------------------------------
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
/**
* Dispatch to single-channel 8b NPP histo-even
*/
template <typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
Int2Type<1> num_channels,
Int2Type<1> num_active_channels,
Int2Type<NPP> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
unsigned char *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists ...
CounterT *d_histogram[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_leve...
int num_levels[1], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT lower_level[1], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[1], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
};
if (d_temp_storage_bytes == NULL)
{
int nDeviceBufferSize;
nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, num_levels[0] ,&nDeviceBufferSize);
temp_storage_bytes = nDeviceBufferSize;
}
else
{
for (int i = 0; i < timing_timing_iterations; ++i)
{
// compute the histogram
nppiHistogramEven_8u_C1R(
d_samples,
row_stride_bytes,
oSizeROI,
d_histogram[0],
num_levels[0],
lower_level[0],
upper_level[0],
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
/**
* Dispatch to 3/4 8b NPP histo-even
*/
template <typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
Int2Type<4> num_channels,
Int2Type<3> num_active_channels,
Int2Type<NPP> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
unsigned char *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists ...
CounterT *d_histogram[3], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_leve...
int num_levels[3], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
LevelT lower_level[3], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[3], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
};
if (d_temp_storage_bytes == NULL)
{
int nDeviceBufferSize;
nppiHistogramEvenGetBufferSize_8u_AC4R(oSizeROI, num_levels ,&nDeviceBufferSize);
temp_storage_bytes = nDeviceBufferSize;
}
else
{
for (int i = 0; i < timing_timing_iterations; ++i)
{
// compute the histogram
nppiHistogramEven_8u_AC4R(
d_samples,
row_stride_bytes,
oSizeROI,
d_histogram,
num_levels,
lower_level,
upper_level,
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
/**
* Dispatch to CUB single histogram-even entrypoint
*/
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
Int2Type<1> num_channels,
Int2Type<1> num_active_channels,
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
CounterT *d_histogram[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
int num_levels[1], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
LevelT lower_level[1], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[1], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceHistogram::HistogramEven(
d_temp_storage,
temp_storage_bytes,
(const SampleT *) d_samples,
d_histogram[0],
num_levels[0],
lower_level[0],
upper_level[0],
num_row_pixels,
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
/**
* Dispatch to CUB multi histogram-even entrypoint
*/
template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchEven(
Int2Type<NUM_CHANNELS> num_channels,
Int2Type<NUM_ACTIVE_CHANNELS> num_active_channels,
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
CounterT *d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
(const SampleT *) d_samples,
d_histogram,
num_levels,
lower_level,
upper_level,
num_row_pixels,
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
/**
* Dispatch to CUB single histogram-range entrypoint
*/
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchRange(
Int2Type<1> num_channels,
Int2Type<1> num_active_channels,
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
CounterT *d_histogram[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
int num_levels[1], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
LevelT *d_levels[1], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceHistogram::HistogramRange(
d_temp_storage,
temp_storage_bytes,
(const SampleT *) d_samples,
d_histogram[0],
num_levels[0],
d_levels[0],
num_row_pixels,
num_rows,
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
/**
* Dispatch to CUB multi histogram-range entrypoint
*/
template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t DispatchRange(
Int2Type<NUM_CHANNELS> num_channels,
Int2Type<NUM_ACTIVE_CHANNELS> num_active_channels,
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where ea...
CounterT *d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> shou...
int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_lev...
LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are ...
OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest
OffsetT num_rows, ///< [in] The number of rows in the region of interest
OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest
cudaStream_t stream,
bool debug_synchronous)
{
typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
(const SampleT *) d_samples,
d_histogram,
num_levels,
d_levels,
num_row_pixels,
num_rows,
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
//---------------------------------------------------------------------
// CUDA nested-parallelism test kernel
//---------------------------------------------------------------------
/**
* Simple wrapper kernel to invoke DeviceHistogram
* /
template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
__global__ void CnpDispatchKernel(
Int2Type<ALGORITHM> algorithm,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t temp_storage_bytes,
SampleT *d_samples,
SampleIteratorT d_sample_itr,
ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_out_histograms,
int num_samples,
bool debug_synchronous)
{
#ifndef CUB_CDP
*d_cdp_error = cudaErrorNotSupported;
#else
*d_cdp_error = Dispatch<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(algorithm, Int2Type<false>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_out_histograms.array, num_s...
*d_temp_storage_bytes = temp_storage_bytes;
#endif
}
/ **
* Dispatch to CDP kernel
* /
template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
cudaError_t Dispatch(
Int2Type<ALGORITHM> algorithm,
Int2Type<true> use_cdp,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
SampleT *d_samples,
SampleIteratorT d_sample_itr,
CounterT *d_histograms[NUM_ACTIVE_CHANNELS],
int num_samples,
cudaStream_t stream,
bool debug_synchronous)
{
// Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_histo_wrapper;
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
// Invoke kernel to invoke device-side dispatch
CnpDispatchKernel<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, ALGORITHM><<<1,1>>>(algorithm, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_histo...
// Copy out temp_storage_bytes
CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
// Copy out error
cudaError_t retval;
CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
return retval;
}
*/
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
error |= channel_error;
}
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
DispatchEven(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, lower_level, upper_level,
num_row_pixels, num_rows, row_stride_bytes,
0, false);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * sizeof(SampleT);
printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
avg_millis,
giga_rate,
giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
giga_rate / NUM_CHANNELS,
giga_bandwidth);
}
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
error |= channel_error;
}
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
DispatchRange(
Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes,
d_samples, d_histogram, num_levels, d_levels,
num_row_pixels, num_rows, row_stride_bytes,
0, false);
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * sizeof(SampleT);
printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
avg_millis,
giga_rate,
giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
giga_rate / NUM_CHANNELS,
giga_bandwidth);
}
xgboost/cub/test/test_device_histogram.cu view on Meta::CPAN
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
g_verbose_input = args.CheckCmdLineFlag("v2");
args.GetCmdLineArgument("n", num_row_pixels);
int row_stride_pixels = num_row_pixels;
args.GetCmdLineArgument("rows", num_rows);
args.GetCmdLineArgument("stride", row_stride_pixels);
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("repeat", g_repeat);
args.GetCmdLineArgument("entropy", entropy_reduction);
bool compare_npp = args.CheckCmdLineFlag("npp");
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--n=<pixels per row> "
"[--rows=<number of rows> "
"[--stride=<row stride in pixels> "
"[--i=<timing iterations> "
"[--device=<device-id>] "
"[--repeat=<repetitions of entire test suite>]"
"[--entropy=<entropy-reduction factor (default 0)>]"
"[--v] "
"[--cdp]"
"[--npp]"
"\n", argv[0]);
exit(0);
}
xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
int g_timing_iterations = 0;
int g_repeat = 0;
CachingDeviceAllocator g_allocator(true);
// Dispatch types
enum Backend
{
CUB, // CUB method (allows overwriting of input)
CUB_NO_OVERWRITE, // CUB method (disallows overwriting of input)
CUB_SEGMENTED, // CUB method (allows overwriting of input)
xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN
}
if (BACKEND == CUB_NO_OVERWRITE)
{
// Check that input isn't overwritten
int input_compare = CompareDeviceResults(h_keys, d_keys.d_buffers[0], num_items, true, g_verbose);
compare |= input_compare;
printf("\t Compare input keys: %s ", input_compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Performance
if (g_timing_iterations)
printf("\nPerforming timing iterations:\n"); fflush(stdout);
GpuTimer gpu_timer;
float elapsed_millis = 0.0f;
for (int i = 0; i < g_timing_iterations; ++i)
{
// Initialize/clear device arrays
CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
if (!KEYS_ONLY)
{
CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
}
xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN
CubDebugExit(Dispatch(
Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
num_items, num_segments, d_segment_offsets,
begin_bit, end_bit, 0, false));
gpu_timer.Stop();
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = (KEYS_ONLY) ?
giga_rate * sizeof(KeyT) * 2 :
giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth);
}
printf("\n\n");
// Cleanup
xgboost/cub/test/test_device_radix_sort.cu view on Meta::CPAN
int bits = -1;
int num_items = -1;
int num_segments = -1;
int entropy_reduction = 0;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("n", num_items);
args.GetCmdLineArgument("s", num_segments);
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("repeat", g_repeat);
args.GetCmdLineArgument("bits", bits);
args.GetCmdLineArgument("entropy", entropy_reduction);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--bits=<valid key bits>]"
"[--n=<input items> "
"[--s=<num segments> "
"[--i=<timing iterations> "
"[--device=<device-id>] "
"[--repeat=<repetitions of entire test suite>]"
"[--v] "
"[--entropy=<entropy-reduction factor (default 0)>]"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
int g_ptx_version;
int g_sm_count;
bool g_verbose = false;
bool g_verbose_input = false;
int g_timing_iterations = 0;
int g_repeat = 0;
CachingDeviceAllocator g_allocator(true);
// Dispatch types
enum Backend
{
CUB, // CUB method
CUB_SEGMENTED, // CUB segmented method
CUB_CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN
// Dispatch to different CUB DeviceReduce entrypoints
//---------------------------------------------------------------------
/**
* Dispatch to reduce entrypoint (custom-max)
*/
template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
// Max-identity
OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
// Invoke kernel to device reduction directly
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items, reduction_op, identity,
stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to sum entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
cub::Sum reduction_op,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to device reduction directly
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to min entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
cub::Min reduction_op,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to device reduction directly
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to max entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
cub::Max reduction_op,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to device reduction directly
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to argmin entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
cub::ArgMin reduction_op,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to device reduction directly
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to argmax entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
cub::ArgMax reduction_op,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to device reduction directly
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
}
return error;
}
//---------------------------------------------------------------------
// Dispatch to different CUB DeviceSegmentedReduce entrypoints
//---------------------------------------------------------------------
/**
* Dispatch to reduce entrypoint (custom-max)
*/
template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB_SEGMENTED> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN
// The output value type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
// Max-identity
OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
// Invoke kernel to device reduction directly
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1, reduction_op, identity,
stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to sum entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB_SEGMENTED> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
cub::Sum reduction_op,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to device reduction directly
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to min entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB_SEGMENTED> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
cub::Min reduction_op,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to device reduction directly
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes,
d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to max entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB_SEGMENTED> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
cub::Max reduction_op,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to device reduction directly
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes,
d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to argmin entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB_SEGMENTED> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
cub::ArgMin reduction_op,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to device reduction directly
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to argmax entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB_SEGMENTED> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
cub::ArgMax reduction_op,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to device reduction directly
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
stream, debug_synchronous);
}
return error;
}
//---------------------------------------------------------------------
// Dispatch to different Thrust entrypoints
//---------------------------------------------------------------------
/**
* Dispatch to reduction entrypoint (min or max specialization)
*/
template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT>
cudaError_t Dispatch(
Int2Type<THRUST> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN
{
temp_storage_bytes = 1;
}
else
{
OutputT init;
CubDebugExit(cudaMemcpy(&init, d_in + 0, sizeof(OutputT), cudaMemcpyDeviceToHost));
thrust::device_ptr<OutputT> d_in_wrapper(d_in);
OutputT retval;
for (int i = 0; i < timing_timing_iterations; ++i)
{
retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items, init, reduction_op);
}
if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
}
return cudaSuccess;
}
/**
* Dispatch to reduction entrypoint (sum specialization)
*/
template <typename InputIteratorT, typename OutputIteratorT>
cudaError_t Dispatch(
Int2Type<THRUST> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
if (d_temp_storage == 0)
{
temp_storage_bytes = 1;
}
else
{
thrust::device_ptr<OutputT> d_in_wrapper(d_in);
OutputT retval;
for (int i = 0; i < timing_timing_iterations; ++i)
{
retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items);
}
if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
}
return cudaSuccess;
}
xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN
//---------------------------------------------------------------------
/**
* Simple wrapper kernel to invoke DeviceReduce
*/
template <
typename InputIteratorT,
typename OutputIteratorT,
typename ReductionOpT>
__global__ void CnpDispatchKernel(
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
ReductionOpT reduction_op,
bool debug_synchronous)
{
#ifndef CUB_CDP
*d_cdp_error = cudaErrorNotSupported;
#else
*d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, 0, debug_synchronous);
*d_temp_storage_bytes = temp_storage_bytes;
#endif
}
/**
* Dispatch to CUB_CDP kernel
*/
template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB_CDP> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
int max_segments,
int *d_segment_offsets,
ReductionOpT reduction_op,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to invoke device-side dispatch
CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, debug_synchronous);
// Copy out temp_storage_bytes
CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
// Copy out error
cudaError_t retval;
CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
return retval;
}
xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN
// Check for correctness (and display results, if specified)
int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose);
printf("\t%s", compare ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
if (g_timing_iterations > 0)
{
GpuTimer gpu_timer;
gpu_timer.Start();
CubDebugExit(Dispatch(backend, g_timing_iterations,
d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items, num_segments, d_segment_offsets,
reduction_op, 0, false));
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * sizeof(InputT);
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
}
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN
//
// Black-box testing on all backends
//
// Test 0, 1, many
TestByGenMode<InputT, OutputT>(0, max_segments);
TestByGenMode<InputT, OutputT>(1, max_segments);
TestByGenMode<InputT, OutputT>(max_items, max_segments);
// Test random problem sizes from a log-distribution [8, max_items-ish)
int num_iterations = 8;
double max_exp = log(double(max_items)) / log(double(2.0));
for (int i = 0; i < num_iterations; ++i)
{
int num_items = (int) pow(2.0, RandomValue(max_exp - 3.0) + 3.0);
TestByGenMode<InputT, OutputT>(num_items, max_segments);
}
//
// White-box testing of single-segment problems around specific sizes
//
// Tile-boundaries: multiple blocks, one tile per block
xgboost/cub/test/test_device_reduce.cu view on Meta::CPAN
{
int max_items = 27000000;
int max_segments = 34000;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
g_verbose_input = args.CheckCmdLineFlag("v2");
args.GetCmdLineArgument("n", max_items);
args.GetCmdLineArgument("s", max_segments);
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("repeat", g_repeat);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--n=<input items> "
"[--s=<num segments> "
"[--i=<timing iterations> "
"[--device=<device-id>] "
"[--repeat=<repetitions of entire test suite>]"
"[--v] "
"[--cdp]"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
int g_timing_iterations = 0;
int g_repeat = 0;
CachingDeviceAllocator g_allocator(true);
// Dispatch types
enum Backend
{
CUB, // CUB method
THRUST, // Thrust method
CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
};
xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN
typename KeyOutputIteratorT,
typename ValueInputIteratorT,
typename ValueOutputIteratorT,
typename NumRunsIteratorT,
typename EqualityOpT,
typename ReductionOpT,
typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void *d_temp_storage,
size_t &temp_storage_bytes,
KeyInputIteratorT d_keys_in,
KeyOutputIteratorT d_keys_out,
ValueInputIteratorT d_values_in,
ValueOutputIteratorT d_values_out,
NumRunsIteratorT d_num_runs,
EqualityOpT equality_op,
ReductionOpT reduction_op,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceReduce::ReduceByKey(
d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_keys_out,
d_values_in,
d_values_out,
d_num_runs,
reduction_op,
xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN
typename KeyInputIteratorT,
typename KeyOutputIteratorT,
typename ValueInputIteratorT,
typename ValueOutputIteratorT,
typename NumRunsIteratorT,
typename EqualityOpT,
typename ReductionOpT,
typename OffsetT>
cudaError_t Dispatch(
Int2Type<THRUST> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void *d_temp_storage,
size_t &temp_storage_bytes,
KeyInputIteratorT d_keys_in,
KeyOutputIteratorT d_keys_out,
ValueInputIteratorT d_values_in,
ValueOutputIteratorT d_values_out,
NumRunsIteratorT d_num_runs,
xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN
else
{
thrust::device_ptr<KeyInputT> d_keys_in_wrapper(d_keys_in);
thrust::device_ptr<KeyOutputT> d_keys_out_wrapper(d_keys_out);
thrust::device_ptr<ValueInputT> d_values_in_wrapper(d_values_in);
thrust::device_ptr<ValueOuputT> d_values_out_wrapper(d_values_out);
thrust::pair<thrust::device_ptr<KeyOutputT>, thrust::device_ptr<ValueOuputT> > d_out_ends;
for (int i = 0; i < timing_timing_iterations; ++i)
{
d_out_ends = thrust::reduce_by_key(
d_keys_in_wrapper,
d_keys_in_wrapper + num_items,
d_values_in_wrapper,
d_keys_out_wrapper,
d_values_out_wrapper);
}
OffsetT num_segments = d_out_ends.first - d_keys_out_wrapper;
xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN
template <
typename KeyInputIteratorT,
typename KeyOutputIteratorT,
typename ValueInputIteratorT,
typename ValueOutputIteratorT,
typename NumRunsIteratorT,
typename EqualityOpT,
typename ReductionOpT,
typename OffsetT>
__global__ void CnpDispatchKernel(
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void *d_temp_storage,
size_t temp_storage_bytes,
KeyInputIteratorT d_keys_in,
KeyOutputIteratorT d_keys_out,
ValueInputIteratorT d_values_in,
ValueOutputIteratorT d_values_out,
NumRunsIteratorT d_num_runs,
EqualityOpT equality_op,
ReductionOpT reduction_op,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
#ifndef CUB_CDP
*d_cdp_error = cudaErrorNotSupported;
#else
*d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
*d_temp_storage_bytes = temp_storage_bytes;
#endif
}
/**
* Dispatch to CDP kernel
*/
xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN
typename KeyOutputIteratorT,
typename ValueInputIteratorT,
typename ValueOutputIteratorT,
typename NumRunsIteratorT,
typename EqualityOpT,
typename ReductionOpT,
typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CDP> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void *d_temp_storage,
size_t &temp_storage_bytes,
KeyInputIteratorT d_keys_in,
KeyOutputIteratorT d_keys_out,
ValueInputIteratorT d_values_in,
ValueOutputIteratorT d_values_out,
NumRunsIteratorT d_num_runs,
EqualityOpT equality_op,
ReductionOpT reduction_op,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to invoke device-side dispatch
CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
// Copy out temp_storage_bytes
CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
// Copy out error
cudaError_t retval;
CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
return retval;
}
xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN
int compare3 = CompareDeviceResults(&num_segments, d_num_runs, 1, true, g_verbose);
printf("\t Count %s ", compare3 ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, false));
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
int bytes_moved = ((num_items + num_segments) * sizeof(KeyT)) + ((num_items + num_segments) * sizeof(ValueT));
float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
}
printf("\n\n");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
xgboost/cub/test/test_device_reduce_by_key.cu view on Meta::CPAN
int main(int argc, char** argv)
{
int num_items = -1;
int entropy_reduction = 0;
int maxseg = 1000;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("n", num_items);
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("repeat", g_repeat);
args.GetCmdLineArgument("maxseg", maxseg);
args.GetCmdLineArgument("entropy", entropy_reduction);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--n=<input items> "
"[--i=<timing iterations> "
"[--device=<device-id>] "
"[--maxseg=<max segment length>]"
"[--entropy=<segment length bit entropy reduction rounds>]"
"[--repeat=<repetitions of entire test suite>]"
"[--v] "
"[--cdp]"
"\n", argv[0]);
exit(0);
}
xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
int g_timing_iterations = 0;
int g_repeat = 0;
CachingDeviceAllocator g_allocator(true);
// Dispatch types
enum Backend
{
CUB, // CUB method
THRUST, // Thrust method
CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
};
xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN
typename InputIteratorT,
typename UniqueOutputIteratorT,
typename OffsetsOutputIteratorT,
typename LengthsOutputIteratorT,
typename NumRunsIterator,
typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<RLE> method,
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
UniqueOutputIteratorT d_unique_out,
OffsetsOutputIteratorT d_offsets_out,
LengthsOutputIteratorT d_lengths_out,
NumRunsIterator d_num_runs,
cub::Equality equality_op,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceRunLengthEncode::Encode(
d_temp_storage,
temp_storage_bytes,
d_in,
d_unique_out,
d_lengths_out,
d_num_runs,
num_items,
stream,
xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN
typename InputIteratorT,
typename UniqueOutputIteratorT,
typename OffsetsOutputIteratorT,
typename LengthsOutputIteratorT,
typename NumRunsIterator,
typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<NON_TRIVIAL> method,
Int2Type<CUB> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
UniqueOutputIteratorT d_unique_out,
OffsetsOutputIteratorT d_offsets_out,
LengthsOutputIteratorT d_lengths_out,
NumRunsIterator d_num_runs,
cub::Equality equality_op,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceRunLengthEncode::NonTrivialRuns(
d_temp_storage,
temp_storage_bytes,
d_in,
d_offsets_out,
d_lengths_out,
d_num_runs,
num_items,
stream,
xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN
template <
typename InputIteratorT,
typename UniqueOutputIteratorT,
typename OffsetsOutputIteratorT,
typename LengthsOutputIteratorT,
typename NumRunsIterator,
typename OffsetT>
cudaError_t Dispatch(
Int2Type<RLE> method,
Int2Type<THRUST> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
UniqueOutputIteratorT d_unique_out,
OffsetsOutputIteratorT d_offsets_out,
LengthsOutputIteratorT d_lengths_out,
NumRunsIterator d_num_runs,
xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN
thrust::device_ptr<InputT> d_in_wrapper(d_in);
thrust::device_ptr<UniqueT> d_unique_out_wrapper(d_unique_out);
thrust::device_ptr<LengthT> d_lengths_out_wrapper(d_lengths_out);
thrust::pair<thrust::device_ptr<UniqueT>, thrust::device_ptr<LengthT> > d_out_ends;
LengthT one_val;
InitValue(INTEGER_SEED, one_val, 1);
thrust::constant_iterator<LengthT> constant_one(one_val);
for (int i = 0; i < timing_timing_iterations; ++i)
{
d_out_ends = thrust::reduce_by_key(
d_in_wrapper,
d_in_wrapper + num_items,
constant_one,
d_unique_out_wrapper,
d_lengths_out_wrapper);
}
OffsetT num_runs = d_out_ends.first - d_unique_out_wrapper;
xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN
int RLE_METHOD,
typename InputIteratorT,
typename UniqueOutputIteratorT,
typename OffsetsOutputIteratorT,
typename LengthsOutputIteratorT,
typename NumRunsIterator,
typename EqualityOp,
typename OffsetT>
__global__ void CnpDispatchKernel(
Int2Type<RLE_METHOD> method,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t temp_storage_bytes,
InputIteratorT d_in,
UniqueOutputIteratorT d_unique_out,
OffsetsOutputIteratorT d_offsets_out,
LengthsOutputIteratorT d_lengths_out,
NumRunsIterator d_num_runs,
cub::Equality equality_op,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
#ifndef CUB_CDP
*d_cdp_error = cudaErrorNotSupported;
#else
*d_cdp_error = Dispatch(method, Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);
*d_temp_storage_bytes = temp_storage_bytes;
#endif
}
/**
* Dispatch to CDP kernel
*/
xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN
typename UniqueOutputIteratorT,
typename OffsetsOutputIteratorT,
typename LengthsOutputIteratorT,
typename NumRunsIterator,
typename EqualityOp,
typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<RLE_METHOD> method,
Int2Type<CDP> dispatch_to,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
UniqueOutputIteratorT d_unique_out,
OffsetsOutputIteratorT d_offsets_out,
LengthsOutputIteratorT d_lengths_out,
NumRunsIterator d_num_runs,
EqualityOp equality_op,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to invoke device-side dispatch
CnpDispatchKernel<<<1,1>>>(method, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);
// Copy out temp_storage_bytes
CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
// Copy out error
cudaError_t retval;
CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
return retval;
}
xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN
compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
printf("\t Count %s\n", compare3 ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0...
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
int bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT)));
float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
}
printf("\n\n");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
xgboost/cub/test/test_device_run_length_encode.cu view on Meta::CPAN
int main(int argc, char** argv)
{
int num_items = -1;
int entropy_reduction = 0;
int max_segment = 1000;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("n", num_items);
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("repeat", g_repeat);
args.GetCmdLineArgument("maxseg", max_segment);
args.GetCmdLineArgument("entropy", entropy_reduction);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--n=<input items> "
"[--i=<timing iterations> "
"[--device=<device-id>] "
"[--maxseg=<max segment length>]"
"[--entropy=<segment length bit entropy reduction rounds>]"
"[--repeat=<repetitions of entire test suite>]"
"[--v] "
"[--cdp]"
"\n", argv[0]);
exit(0);
}
xgboost/cub/test/test_device_scan.cu view on Meta::CPAN
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
int g_timing_iterations = 0;
int g_repeat = 0;
double g_device_giga_bandwidth;
CachingDeviceAllocator g_allocator(true);
// Dispatch types
enum Backend
{
CUB, // CUB method
THRUST, // Thrust method
CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
xgboost/cub/test/test_device_scan.cu view on Meta::CPAN
//---------------------------------------------------------------------
/**
* Dispatch to exclusive scan entrypoint
*/
template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
IsPrimitiveT is_primitive,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
InitialValueT initial_value,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items, stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to exclusive sum entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT, typename InitialValueT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
Int2Type<true> is_primitive,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
Sum scan_op,
InitialValueT initial_value,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to inclusive scan entrypoint
*/
template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
IsPrimitiveT is_primitive,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
NullType initial_value,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to inclusive sum entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
Int2Type<true> is_primitive,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
Sum scan_op,
NullType initial_value,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
}
return error;
}
//---------------------------------------------------------------------
// Dispatch to different Thrust entrypoints
//---------------------------------------------------------------------
/**
* Dispatch to exclusive scan entrypoint
*/
template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
cudaError_t Dispatch(
Int2Type<THRUST> dispatch_to,
IsPrimitiveT is_primitive,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
InitialValueT initial_value,
OffsetT num_items,
xgboost/cub/test/test_device_scan.cu view on Meta::CPAN
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
if (d_temp_storage == 0)
{
temp_storage_bytes = 1;
}
else
{
thrust::device_ptr<InputT> d_in_wrapper(d_in);
thrust::device_ptr<OutputT> d_out_wrapper(d_out);
for (int i = 0; i < timing_timing_iterations; ++i)
{
thrust::exclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, initial_value, scan_op);
}
}
return cudaSuccess;
}
/**
* Dispatch to exclusive sum entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT, typename InitialValueT, typename OffsetT>
cudaError_t Dispatch(
Int2Type<THRUST> dispatch_to,
Int2Type<true> is_primitive,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
Sum scan_op,
InitialValueT initial_value,
OffsetT num_items,
xgboost/cub/test/test_device_scan.cu view on Meta::CPAN
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
if (d_temp_storage == 0)
{
temp_storage_bytes = 1;
}
else
{
thrust::device_ptr<InputT> d_in_wrapper(d_in);
thrust::device_ptr<OutputT> d_out_wrapper(d_out);
for (int i = 0; i < timing_timing_iterations; ++i)
{
thrust::exclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper);
}
}
return cudaSuccess;
}
/**
* Dispatch to inclusive scan entrypoint
*/
template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
cudaError_t Dispatch(
Int2Type<THRUST> dispatch_to,
IsPrimitiveT is_primitive,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
NullType initial_value,
OffsetT num_items,
xgboost/cub/test/test_device_scan.cu view on Meta::CPAN
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
if (d_temp_storage == 0)
{
temp_storage_bytes = 1;
}
else
{
thrust::device_ptr<InputT> d_in_wrapper(d_in);
thrust::device_ptr<OutputT> d_out_wrapper(d_out);
for (int i = 0; i < timing_timing_iterations; ++i)
{
thrust::inclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, scan_op);
}
}
return cudaSuccess;
}
/**
* Dispatch to inclusive sum entrypoint
*/
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
cudaError_t Dispatch(
Int2Type<THRUST> dispatch_to,
Int2Type<true> is_primitive,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
Sum scan_op,
NullType initial_value,
OffsetT num_items,
xgboost/cub/test/test_device_scan.cu view on Meta::CPAN
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
if (d_temp_storage == 0)
{
temp_storage_bytes = 1;
}
else
{
thrust::device_ptr<InputT> d_in_wrapper(d_in);
thrust::device_ptr<OutputT> d_out_wrapper(d_out);
for (int i = 0; i < timing_timing_iterations; ++i)
{
thrust::inclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper);
}
}
return cudaSuccess;
}
//---------------------------------------------------------------------
// CUDA Nested Parallelism Test Kernel
//---------------------------------------------------------------------
/**
* Simple wrapper kernel to invoke DeviceScan
*/
template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
__global__ void CnpDispatchKernel(
IsPrimitiveT is_primitive,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
InitialValueT initial_value,
OffsetT num_items,
bool debug_synchronous)
{
#ifndef CUB_CDP
*d_cdp_error = cudaErrorNotSupported;
#else
*d_cdp_error = Dispatch(
Int2Type<CUB>(),
is_primitive,
timing_timing_iterations,
d_temp_storage_bytes,
d_cdp_error,
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
initial_value,
num_items,
0,
xgboost/cub/test/test_device_scan.cu view on Meta::CPAN
}
/**
* Dispatch to CDP kernel
*/
template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
cudaError_t Dispatch(
Int2Type<CDP> dispatch_to,
IsPrimitiveT is_primitive,
int timing_timing_iterations,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
InitialValueT initial_value,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to invoke device-side dispatch
CnpDispatchKernel<<<1,1>>>(
is_primitive,
timing_timing_iterations,
d_temp_storage_bytes,
d_cdp_error,
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
initial_value,
num_items,
debug_synchronous);
xgboost/cub/test/test_device_scan.cu view on Meta::CPAN
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
CubDebugExit(Dispatch(Int2Type<BACKEND>(),
Int2Type<Traits<OutputT>::PRIMITIVE>(),
g_timing_iterations,
d_temp_storage_bytes,
d_cdp_error,
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
initial_value,
num_items,
0,
false));
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT));
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
}
printf("\n\n");
// Cleanup
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
xgboost/cub/test/test_device_scan.cu view on Meta::CPAN
* Main
*/
int main(int argc, char** argv)
{
int num_items = -1;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("n", num_items);
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("repeat", g_repeat);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--n=<input items> "
"[--i=<timing iterations> "
"[--device=<device-id>] "
"[--repeat=<repetitions of entire test suite>]"
"[--v] "
"[--cdp]"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
int g_timing_iterations = 0;
int g_repeat = 0;
float g_device_giga_bandwidth;
CachingDeviceAllocator g_allocator(true);
// Dispatch types
enum Backend
{
CUB, // CUB method
THRUST, // Thrust method
CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
/**
* Dispatch to select if entrypoint
*/
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
Int2Type<false> is_flagged,
Int2Type<false> is_partition,
int timing_timing_iterations,
size_t* d_temp_storage_bytes,
cudaError_t* d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
FlagIteratorT d_flags,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
OffsetT num_items,
SelectOpT select_op,
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to partition if entrypoint
*/
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
Int2Type<false> is_flagged,
Int2Type<true> is_partition,
int timing_timing_iterations,
size_t* d_temp_storage_bytes,
cudaError_t* d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
FlagIteratorT d_flags,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
OffsetT num_items,
SelectOpT select_op,
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to select flagged entrypoint
*/
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
Int2Type<true> is_flagged,
Int2Type<false> partition,
int timing_timing_iterations,
size_t* d_temp_storage_bytes,
cudaError_t* d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
FlagIteratorT d_flags,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
OffsetT num_items,
SelectOpT select_op,
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
}
return error;
}
/**
* Dispatch to partition flagged entrypoint
*/
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
Int2Type<CUB> dispatch_to,
Int2Type<true> is_flagged,
Int2Type<true> partition,
int timing_timing_iterations,
size_t* d_temp_storage_bytes,
cudaError_t* d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
FlagIteratorT d_flags,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
OffsetT num_items,
SelectOpT select_op,
cudaStream_t stream,
bool debug_synchronous)
{
cudaError_t error = cudaSuccess;
for (int i = 0; i < timing_timing_iterations; ++i)
{
error = DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
}
return error;
}
//---------------------------------------------------------------------
// Dispatch to different Thrust entrypoints
//---------------------------------------------------------------------
/**
* Dispatch to select if entrypoint
*/
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
__host__ __forceinline__
cudaError_t Dispatch(
Int2Type<THRUST> dispatch_to,
Int2Type<false> is_flagged,
Int2Type<false> is_partition,
int timing_timing_iterations,
size_t* d_temp_storage_bytes,
cudaError_t* d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
FlagIteratorT d_flags,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
OffsetT num_items,
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
if (d_temp_storage == 0)
{
temp_storage_bytes = 1;
}
else
{
thrust::device_ptr<OutputT> d_out_wrapper_end;
thrust::device_ptr<InputT> d_in_wrapper(d_in);
thrust::device_ptr<OutputT> d_out_wrapper(d_out);
for (int i = 0; i < timing_timing_iterations; ++i)
{
d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, select_op);
}
OffsetT num_selected = d_out_wrapper_end - d_out_wrapper;
CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
}
return cudaSuccess;
}
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
/**
* Dispatch to partition if entrypoint
*/
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
__host__ __forceinline__
cudaError_t Dispatch(
Int2Type<THRUST> dispatch_to,
Int2Type<false> is_flagged,
Int2Type<true> is_partition,
int timing_timing_iterations,
size_t* d_temp_storage_bytes,
cudaError_t* d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
FlagIteratorT d_flags,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
OffsetT num_items,
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
}
else
{
thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;
thrust::device_ptr<InputT> d_in_wrapper(d_in);
thrust::device_ptr<OutputT> d_out_wrapper(d_out);
ReverseOutputIteratorT d_out_unselected(d_out_wrapper + num_items);
for (int i = 0; i < timing_timing_iterations; ++i)
{
d_out_wrapper_end = thrust::partition_copy(
d_in_wrapper,
d_in_wrapper + num_items,
d_out_wrapper,
d_out_unselected,
select_op);
}
OffsetT num_selected = d_out_wrapper_end.first - d_out_wrapper;
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
/**
* Dispatch to select flagged entrypoint
*/
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
__host__ __forceinline__
cudaError_t Dispatch(
Int2Type<THRUST> dispatch_to,
Int2Type<true> is_flagged,
Int2Type<false> is_partition,
int timing_timing_iterations,
size_t* d_temp_storage_bytes,
cudaError_t* d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
FlagIteratorT d_flags,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
OffsetT num_items,
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
{
temp_storage_bytes = 1;
}
else
{
thrust::device_ptr<OutputT> d_out_wrapper_end;
thrust::device_ptr<InputT> d_in_wrapper(d_in);
thrust::device_ptr<OutputT> d_out_wrapper(d_out);
thrust::device_ptr<FlagT> d_flags_wrapper(d_flags);
for (int i = 0; i < timing_timing_iterations; ++i)
{
d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_flags_wrapper, d_out_wrapper, Cast<bool>());
}
OffsetT num_selected = d_out_wrapper_end - d_out_wrapper;
CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
}
return cudaSuccess;
}
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
/**
* Dispatch to partition flagged entrypoint
*/
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
__host__ __forceinline__
cudaError_t Dispatch(
Int2Type<THRUST> dispatch_to,
Int2Type<true> is_flagged,
Int2Type<true> is_partition,
int timing_timing_iterations,
size_t* d_temp_storage_bytes,
cudaError_t* d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
FlagIteratorT d_flags,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
OffsetT num_items,
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
}
else
{
thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;
thrust::device_ptr<InputT> d_in_wrapper(d_in);
thrust::device_ptr<OutputT> d_out_wrapper(d_out);
thrust::device_ptr<FlagT> d_flags_wrapper(d_flags);
ReverseOutputIteratorT d_out_unselected(d_out_wrapper + num_items);
for (int i = 0; i < timing_timing_iterations; ++i)
{
d_out_wrapper_end = thrust::partition_copy(
d_in_wrapper,
d_in_wrapper + num_items,
d_flags_wrapper,
d_out_wrapper,
d_out_unselected,
Cast<bool>());
}
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
// CUDA Nested Parallelism Test Kernel
//---------------------------------------------------------------------
/**
* Simple wrapper kernel to invoke DeviceSelect
*/
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
__global__ void CnpDispatchKernel(
IsFlaggedTag is_flagged,
IsPartitionTag is_partition,
int timing_timing_iterations,
size_t* d_temp_storage_bytes,
cudaError_t* d_cdp_error,
void* d_temp_storage,
size_t temp_storage_bytes,
InputIteratorT d_in,
FlagIteratorT d_flags,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
OffsetT num_items,
SelectOpT select_op,
bool debug_synchronous)
{
#ifndef CUB_CDP
*d_cdp_error = cudaErrorNotSupported;
#else
*d_cdp_error = Dispatch(Int2Type<CUB>(), is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, debug_synchronous);
*d_temp_storage_bytes = temp_storage_bytes;
#endif
}
/**
* Dispatch to CDP kernel
*/
template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
cudaError_t Dispatch(
Int2Type<CDP> dispatch_to,
IsFlaggedTag is_flagged,
IsPartitionTag is_partition,
int timing_timing_iterations,
size_t* d_temp_storage_bytes,
cudaError_t* d_cdp_error,
void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
FlagIteratorT d_flags,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
OffsetT num_items,
SelectOpT select_op,
cudaStream_t stream,
bool debug_synchronous)
{
// Invoke kernel to invoke device-side dispatch
CnpDispatchKernel<<<1,1>>>(is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, debug_synchronous);
// Copy out temp_storage_bytes
CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
// Copy out error
cudaError_t retval;
CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
return retval;
}
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
printf("\t Count %s\n", compare2 ? "FAIL" : "PASS");
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Performance
GpuTimer gpu_timer;
gpu_timer.Start();
CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, false));
gpu_timer.Stop();
float elapsed_millis = gpu_timer.ElapsedMillis();
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
int num_output_items = (IS_PARTITION) ? num_items : num_selected;
int num_flag_items = (IS_FLAGGED) ? num_items : 0;
size_t num_bytes = sizeof(T) * (num_items + num_output_items) + sizeof(FlagT) * num_flag_items;
float giga_bandwidth = float(num_bytes) / avg_millis / 1000.0f / 1000.0f;
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
}
printf("\n\n");
xgboost/cub/test/test_device_select_if.cu view on Meta::CPAN
*/
int main(int argc, char** argv)
{
int num_items = -1;
float select_ratio = 0.5;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("n", num_items);
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("repeat", g_repeat);
args.GetCmdLineArgument("ratio", select_ratio);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--n=<input items> "
"[--i=<timing iterations> "
"[--device=<device-id>] "
"[--ratio=<selection ratio, default 0.5>] "
"[--repeat=<repetitions of entire test suite>] "
"[--v] "
"[--cdp] "
"\n", argv[0]);
exit(0);
}
// Initialize device