Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/R-package/R/callbacks.R  view on Meta::CPAN

    not_allowed <- pnames %in% 
      c('num_class', 'num_output_group', 'size_leaf_vector', 'updater_seq')
    if (any(not_allowed))
      stop('Parameters ', paste(pnames[not_allowed]), " cannot be changed during boosting.")
    
    for (n in pnames) {
      p <- new_params[[n]]
      if (is.function(p)) {
        if (length(formals(p)) != 2)
          stop("Parameter '", n, "' is a function but not of two arguments")
      } else if (is.numeric(p) || is.character(p)) {
        if (length(p) != nrounds)
          stop("Length of '", n, "' has to be equal to 'nrounds'")
      } else {
        stop("Parameter '", n, "' is not a function or a vector")
      }
    }
  }
  
  callback <- function(env = parent.frame()) {
    if (is.null(nrounds))
      init(env)
    
    i <- env$iteration
    pars <- lapply(new_params, function(p) {
      if (is.function(p))
        return(p(i, nrounds))
      p[i]
    })
    
    if (!is.null(env$bst)) {
      xgb.parameters(env$bst$handle) <- pars
    } else {
      for (fd in env$bst_folds)
        xgb.parameters(fd$bst) <- pars
    }
  }
  attr(callback, 'is_pre_iteration') <- TRUE
  attr(callback, 'call') <- match.call()
  attr(callback, 'name') <- 'cb.reset.parameters'
  callback
}


#' Callback closure to activate the early stopping.
#' 
#' @param stopping_rounds The number of rounds with no improvement in 
#'        the evaluation metric in order to stop the training.
#' @param maximize whether to maximize the evaluation metric
#' @param metric_name the name of an evaluation column to use as a criteria for early
#'        stopping. If not set, the last column would be used.
#'        Let's say the test data in \code{watchlist} was labelled as \code{dtest}, 
#'        and one wants to use the AUC in test data for early stopping regardless of where 
#'        it is in the \code{watchlist}, then one of the following would need to be set:
#'        \code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
#'        All dash '-' characters in metric names are considered equivalent to '_'.
#' @param verbose whether to print the early stopping information.
#' 
#' @details
#' This callback function determines the condition for early stopping 
#' by setting the \code{stop_condition = TRUE} flag in its calling frame.
#' 
#' The following additional fields are assigned to the model's R object:
#' \itemize{
#' \item \code{best_score} the evaluation score at the best iteration
#' \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
#' \item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
#'      It differs from \code{best_iteration} in multiclass or random forest settings.
#' }
#' 
#' The Same values are also stored as xgb-attributes:
#' \itemize{
#' \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)
#' \item \code{best_msg} message string is also stored.
#' }
#' 
#' At least one data element is required in the evaluation watchlist for early stopping to work.
#'
#' Callback function expects the following values to be set in its calling frame:
#' \code{stop_condition},
#' \code{bst_evaluation},
#' \code{rank},
#' \code{bst} (or \code{bst_folds} and \code{basket}),
#' \code{iteration},
#' \code{begin_iteration},
#' \code{end_iteration},
#' \code{num_parallel_tree}.
#' 
#' @seealso
#' \code{\link{callbacks}},
#' \code{\link{xgb.attr}}
#' 
#' @export
cb.early.stop <- function(stopping_rounds, maximize = FALSE, 
                          metric_name = NULL, verbose = TRUE) {
  # state variables
  best_iteration <- -1
  best_ntreelimit <- -1
  best_score <- Inf
  best_msg <- NULL
  metric_idx <- 1
  
  init <- function(env) {
    if (length(env$bst_evaluation) == 0)
      stop("For early stopping, watchlist must have at least one element")
    
    eval_names <- gsub('-', '_', names(env$bst_evaluation))
    if (!is.null(metric_name)) {
      metric_idx <<- which(gsub('-', '_', metric_name) == eval_names)
      if (length(metric_idx) == 0)
        stop("'metric_name' for early stopping is not one of the following:\n",
             paste(eval_names, collapse = ' '), '\n')
    }
    if (is.null(metric_name) &&
        length(env$bst_evaluation) > 1) {
      metric_idx <<- length(eval_names)
      if (verbose)
        cat('Multiple eval metrics are present. Will use ', 
            eval_names[metric_idx], ' for early stopping.\n', sep = '')
    }
    

xgboost/R-package/R/callbacks.R  view on Meta::CPAN

          best_msg = best_msg,
          best_ntreelimit = best_ntreelimit)
      }
    } else if (i - best_iteration >= stopping_rounds) {
      env$stop_condition <- TRUE
      env$end_iteration <- i
      if (verbose && NVL(env$rank, 0) == 0)
        cat("Stopping. Best iteration:\n", best_msg, "\n\n", sep = '')
    }
  }
  attr(callback, 'call') <- match.call()
  attr(callback, 'name') <- 'cb.early.stop'
  callback
}


#' Callback closure for saving a model file.
#' 
#' @param save_period save the model to disk after every 
#'        \code{save_period} iterations; 0 means save the model at the end.
#' @param save_name the name or path for the saved model file.
#'        It can contain a \code{\link[base]{sprintf}} formatting specifier 
#'        to include the integer iteration number in the file name.
#'        E.g., with \code{save_name} = 'xgboost_%04d.model', 
#'        the file saved at iteration 50 would be named "xgboost_0050.model".
#' 
#' @details 
#' This callback function allows to save an xgb-model file, either periodically after each \code{save_period}'s or at the end.
#' 
#' Callback function expects the following values to be set in its calling frame:
#' \code{bst},
#' \code{iteration},
#' \code{begin_iteration},
#' \code{end_iteration}.
#' 
#' @seealso
#' \code{\link{callbacks}}
#' 
#' @export
cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
  
  if (save_period < 0)
    stop("'save_period' cannot be negative")

  callback <- function(env = parent.frame()) {
    if (is.null(env$bst))
      stop("'save_model' callback requires the 'bst' booster object in its calling frame")
    
    if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) ||
        (save_period == 0 && env$iteration == env$end_iteration))
      xgb.save(env$bst, sprintf(save_name, env$iteration))
  }
  attr(callback, 'call') <- match.call()
  attr(callback, 'name') <- 'cb.save.model'
  callback
}


#' Callback closure for returning cross-validation based predictions.
#' 
#' @param save_models a flag for whether to save the folds' models.
#' 
#' @details 
#' This callback function saves predictions for all of the test folds,
#' and also allows to save the folds' models.
#' 
#' It is a "finalizer" callback and it uses early stopping information whenever it is available,
#' thus it must be run after the early stopping callback if the early stopping is used.
#' 
#' Callback function expects the following values to be set in its calling frame:
#' \code{bst_folds},
#' \code{basket},
#' \code{data},
#' \code{end_iteration},
#' \code{params},
#' \code{num_parallel_tree},
#' \code{num_class}.
#' 
#' @return 
#' Predictions are returned inside of the \code{pred} element, which is either a vector or a matrix,
#' depending on the number of prediction outputs per data row. The order of predictions corresponds 
#' to the order of rows in the original dataset. Note that when a custom \code{folds} list is 
#' provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a 
#' non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be 
#' meaningful when user-profided folds have overlapping indices as in, e.g., random sampling splits.
#' When some of the indices in the training dataset are not included into user-provided \code{folds},
#' their prediction value would be \code{NA}.
#' 
#' @seealso
#' \code{\link{callbacks}}
#' 
#' @export
cb.cv.predict <- function(save_models = FALSE) {
  
  finalizer <- function(env) {
    if (is.null(env$basket) || is.null(env$bst_folds))
      stop("'cb.cv.predict' callback requires 'basket' and 'bst_folds' lists in its calling frame")
    
    N <- nrow(env$data)
    pred <- 
      if (env$num_class > 1) {
        matrix(NA_real_, N, env$num_class)
      } else {
        rep(NA_real_, N)
      }

    ntreelimit <- NVL(env$basket$best_ntreelimit, 
                      env$end_iteration * env$num_parallel_tree)
    if (NVL(env$params[['booster']], '') == 'gblinear') {
      ntreelimit <- 0 # must be 0 for gblinear
    }
    for (fd in env$bst_folds) {
      pr <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit, reshape = TRUE)
      if (is.matrix(pred)) {
        pred[fd$index,] <- pr
      } else {
        pred[fd$index] <- pr
      }
    }
    env$basket$pred <- pred
    if (save_models) {



( run in 1.005 second using v1.01-cache-2.11-cpan-140bd7fdf52 )