t/10-dynamic_lib_xgboost.t
use Test2::V0;
use Test::Alien;
use Alien::XGBoost;
alien_ok 'Alien::XGBoost';
# XGBoost C API lacks XGBVersion or similar
ffi_ok { symbols => [qw(XGBGetLastError XGDMatrixCreateFromMat)] }, with_subtest {
my ($ffi) = @_;
my $create_from_matrix =
$ffi->function( XGDMatrixCreateFromMat => [qw(float[] uint64 uint64 float opaque*)] => 'int' );
my $matrix = 0;
my $return_code = $create_from_matrix->call( [ 1, 1 ], 1, 2, "NaN", \$matrix );
is $return_code, 0, 'XGDMatrixCreateFromMat returns 0 on success';
};
done_testing;
xgboost/cub/test/test_util.h
};
/******************************************************************************
* Random bits generator
******************************************************************************/
int g_num_rand_samples = 0;
template <typename T>
bool IsNaN(T val) { return false; }
template<>
__noinline__ bool IsNaN<float>(float val)
{
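// NaN iff the exponent bits are all ones and the mantissa is non-zero:
// bit patterns in [0x7F800001, 0x7FFFFFFF] (sign bit clear) or
// [0xFF800001, 0xFFFFFFFF] (sign bit set).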
volatile unsigned int bits = reinterpret_cast<unsigned int &>(val);
return (((bits >= 0x7F800001) && (bits <= 0x7FFFFFFF)) ||
((bits >= 0xFF800001) && (bits <= 0xFFFFFFFF)));
}
template<>
__noinline__ bool IsNaN<float1>(float1 val)
{
return (IsNaN(val.x));
}
template<>
__noinline__ bool IsNaN<float2>(float2 val)
{
return (IsNaN(val.y) || IsNaN(val.x));
}
template<>
__noinline__ bool IsNaN<float3>(float3 val)
{
return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x));
}
template<>
__noinline__ bool IsNaN<float4>(float4 val)
{
return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z));
}
template<>
__noinline__ bool IsNaN<double>(double val)
{
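// Same test for the 64-bit pattern: exponent all ones, non-zero mantissa, either sign.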
volatile unsigned long long bits = *reinterpret_cast<unsigned long long *>(&val);
return (((bits >= 0x7FF0000000000001) && (bits <= 0x7FFFFFFFFFFFFFFF)) ||
((bits >= 0xFFF0000000000001) && (bits <= 0xFFFFFFFFFFFFFFFF)));
}
template<>
__noinline__ bool IsNaN<double1>(double1 val)
{
return (IsNaN(val.x));
}
template<>
__noinline__ bool IsNaN<double2>(double2 val)
{
return (IsNaN(val.y) || IsNaN(val.x));
}
template<>
__noinline__ bool IsNaN<double3>(double3 val)
{
return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x));
}
template<>
__noinline__ bool IsNaN<double4>(double4 val)
{
return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z));
}
/**
* Generates random keys.
*
* We always take the second-order byte from rand() because the higher-order
* bits returned by rand() are commonly considered more uniformly distributed
* than the lower-order bits.
*
xgboost/cub/test/test_util.h
word &= mersenne::genrand_int32();
g_num_rand_samples++;
}
word_buff[j] = word;
}
memcpy(&key, word_buff, sizeof(K));
K copy = key;
if (!IsNaN(copy))
break; // avoids NaNs when generating random floating point numbers
}
}
/// Randomly select a number in the half-open range [0, max)
template <typename T>
T RandomValue(T max)
{
unsigned int bits;
unsigned int max_int = (unsigned int) -1;
do {
xgboost/include/xgboost/tree_model.h
* when flag == -1, this indicates the value is missing
*/
union Entry {
bst_float fvalue;
int flag;
};
std::vector<Entry> data;
};
/*!
* \brief get the leaf index
* \param feat dense feature vector; if a feature is missing, the corresponding field is set to NaN
* \param root_id starting root index of the instance
* \return the leaf index for the given feature vector
*/
inline int GetLeafIndex(const FVec& feat, unsigned root_id = 0) const;
/*!
* \brief get the prediction of regression tree, only accepts dense feature vector
* \param feat dense feature vector; if a feature is missing, the corresponding field is set to NaN
* \param root_id starting root index of the instance
* \return the predicted value for the given feature vector
*/
inline bst_float Predict(const FVec& feat, unsigned root_id = 0) const;
/*!
* \brief calculate the feature contributions for the given root
* \param feat dense feature vector; if a feature is missing, the corresponding field is set to NaN
* \param root_id starting root index of the instance
* \param out_contribs output vector to hold the contributions
*/
inline void CalculateContributions(const RegTree::FVec& feat, unsigned root_id,
bst_float *out_contribs) const;
/*!
* \brief get next position of the tree given current pid
* \param pid Current node id.
* \param fvalue feature value if not missing.
* \param is_unknown Whether current required feature is missing.
xgboost/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkWithRDD.scala
MLLabeledPoint(lp.label, new MLDenseVector(lp.features.toArray)))
val testSet = MLUtils.loadLibSVMFile(sc, inputTestPath)
.map(lp => new MLDenseVector(lp.features.toArray))
// training parameters
val paramMap = List(
"eta" -> 0.1f,
"max_depth" -> 2,
"objective" -> "binary:logistic").toMap
val xgboostModel = XGBoost.trainWithRDD(trainRDD, paramMap, numRound, nWorkers = args(1).toInt,
useExternalMemory = true)
xgboostModel.predict(testSet, missingValue = Float.NaN)
// save model to HDFS path
xgboostModel.saveModelAsHadoopFile(outputModelPath)
}
}
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
* The Scala implementation is currently experimental; use at your own risk.
*/
case class TrackerConf(workerConnectionTimeout: Long, trackerImpl: String)
object XGBoost extends Serializable {
private val logger = LogFactory.getLog("XGBoostSpark")
private def fromDenseToSparseLabeledPoints(
denseLabeledPoints: Iterator[XGBLabeledPoint],
missing: Float): Iterator[XGBLabeledPoint] = {
if (!missing.isNaN) {
denseLabeledPoints.map { labeledPoint =>
val indicesBuilder = new mutable.ArrayBuilder.ofInt()
val valuesBuilder = new mutable.ArrayBuilder.ofFloat()
for ((value, i) <- labeledPoint.values.zipWithIndex if value != missing) {
indicesBuilder += (if (labeledPoint.indices == null) i else labeledPoint.indices(i))
valuesBuilder += value
}
labeledPoint.copy(indices = indicesBuilder.result(), values = valuesBuilder.result())
}
} else {
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
}
}
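The filter above only drops values when `missing` is an ordinary number (a NaN `missing` never compares equal to anything, hence the `isNaN` guard). A self-contained sketch of the same index/value filtering, using a hypothetical `toSparse` helper over bare arrays:
def toSparse(values: Array[Float], missing: Float): (Array[Int], Array[Float]) = {
  // Keep entries that differ from `missing`; survivors keep their column index.
  val kept = values.zipWithIndex.filter { case (v, _) => v != missing }
  (kept.map(_._2), kept.map(_._1))
}
val (idx, vals) = toSparse(Array(0.0f, 1.5f, 0.0f, 2.0f), missing = 0.0f)
// idx == Array(1, 3), vals == Array(1.5f, 2.0f)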
private def fromBaseMarginsToArray(baseMargins: Iterator[Float]): Option[Array[Float]] = {
val builder = new mutable.ArrayBuilder.ofFloat()
var nTotal = 0
var nUndefined = 0
while (baseMargins.hasNext) {
nTotal += 1
val baseMargin = baseMargins.next()
if (baseMargin.isNaN) {
nUndefined += 1 // don't waste space for all-NaNs.
} else {
builder += baseMargin
}
}
if (nUndefined == nTotal) {
None
} else if (nUndefined == 0) {
Some(builder.result())
} else {
throw new IllegalArgumentException(
s"Encountered a partition with $nUndefined NaN base margin values. " +
"If you want to specify base margin, ensure all values are non-NaN.")
}
}
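A minimal runnable mirror of this all-or-nothing contract (the method itself is private, so the name and helper below are illustrative):
def marginsToOption(ms: Seq[Float]): Option[Array[Float]] = {
  val undefined = ms.count(_.isNaN)
  if (undefined == ms.size) None                  // all NaN: no base margin
  else if (undefined == 0) Some(ms.toArray)       // all defined: keep them
  else throw new IllegalArgumentException(s"$undefined of ${ms.size} base margins are NaN")
}
assert(marginsToOption(Seq(Float.NaN, Float.NaN)).isEmpty)
assert(marginsToOption(Seq(0.5f, -0.25f)).get.sameElements(Array(0.5f, -0.25f)))
// marginsToOption(Seq(0.5f, Float.NaN)) throws: mixed margins are rejected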
private[spark] def buildDistributedBoosters(
trainingSet: RDD[XGBLabeledPoint],
params: Map[String, Any],
rabitEnv: java.util.Map[String, String],
numWorkers: Int,
round: Int,
obj: ObjectiveTrait,
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
*/
@throws(classOf[XGBoostError])
def trainWithDataFrame(
trainingData: Dataset[_],
params: Map[String, Any],
round: Int,
nWorkers: Int,
obj: ObjectiveTrait = null,
eval: EvalTrait = null,
useExternalMemory: Boolean = false,
missing: Float = Float.NaN,
featureCol: String = "features",
labelCol: String = "label"): XGBoostModel = {
require(nWorkers > 0, "you must specify more than 0 workers")
val estimator = new XGBoostEstimator(params)
// assigning general parameters
estimator.
set(estimator.useExternalMemory, useExternalMemory).
set(estimator.round, round).
set(estimator.nWorkers, nWorkers).
set(estimator.customObj, obj).
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
*/
@deprecated("Use XGBoost.trainWithRDD instead.")
def train(
trainingData: RDD[MLLabeledPoint],
params: Map[String, Any],
round: Int,
nWorkers: Int,
obj: ObjectiveTrait = null,
eval: EvalTrait = null,
useExternalMemory: Boolean = false,
missing: Float = Float.NaN): XGBoostModel = {
trainWithRDD(trainingData, params, round, nWorkers, obj, eval, useExternalMemory,
missing)
}
private def overrideParamsAccordingToTaskCPUs(
params: Map[String, Any],
sc: SparkContext): Map[String, Any] = {
val coresPerTask = sc.getConf.getInt("spark.task.cpus", 1)
var overridedParams = params
if (overridedParams.contains("nthread")) {
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
*/
@throws(classOf[XGBoostError])
def trainWithRDD(
trainingData: RDD[MLLabeledPoint],
params: Map[String, Any],
round: Int,
nWorkers: Int,
obj: ObjectiveTrait = null,
eval: EvalTrait = null,
useExternalMemory: Boolean = false,
missing: Float = Float.NaN): XGBoostModel = {
import DataUtils._
val xgbTrainingData = trainingData.map { case MLLabeledPoint(label, features) =>
features.asXGB.copy(label = label.toFloat)
}
trainDistributed(xgbTrainingData, params, round, nWorkers, obj, eval,
useExternalMemory, missing)
}
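For orientation, a minimal call site (illustrative values; `sc` and `trainRDD: RDD[MLLabeledPoint]` are assumed to exist, as in the SparkWithRDD example earlier in this listing):
val paramMap = Map(
  "eta" -> 0.1f,
  "max_depth" -> 2,
  "objective" -> "binary:logistic")
val model = XGBoost.trainWithRDD(trainRDD, paramMap, round = 5, nWorkers = 2,
  useExternalMemory = true)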
@throws(classOf[XGBoostError])
private[spark] def trainDistributed(
trainingData: RDD[XGBLabeledPoint],
params: Map[String, Any],
round: Int,
nWorkers: Int,
obj: ObjectiveTrait = null,
eval: EvalTrait = null,
useExternalMemory: Boolean = false,
missing: Float = Float.NaN): XGBoostModel = {
if (params.contains("tree_method")) {
require(params("tree_method") != "hist", "xgboost4j-spark does not support fast histogram" +
" for now")
}
require(nWorkers > 0, "you must specify more than 0 workers")
if (obj != null) {
require(params.get("obj_type").isDefined, "parameter \"obj_type\" is not defined," +
" you have to specify the objective type as classification or regression with a" +
" customized objective function")
}
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala
r - "num_class"
} else {
r
}
}
private def ensureColumns(trainingSet: Dataset[_]): Dataset[_] = {
if (trainingSet.columns.contains($(baseMarginCol))) {
trainingSet
} else {
trainingSet.withColumn($(baseMarginCol), lit(Float.NaN))
}
}
/**
* produce an XGBoostModel by fitting the given dataset
*/
override def train(trainingSet: Dataset[_]): XGBoostModel = {
val instances = ensureColumns(trainingSet).select(
col($(featuresCol)),
col($(labelCol)).cast(FloatType),
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala
val customObj = new CustomObjParam(this, "custom_obj", "customized objective function " +
"provided by user")
/**
* customized evaluation function provided by user. default: null
*/
val customEval = new CustomEvalParam(this, "custom_eval", "customized evaluation function " +
"provided by user")
/**
* the value treated as missing. default: Float.NaN
*/
val missing = new FloatParam(this, "missing", "the value treated as missing")
/**
* Rabit tracker configurations. The parameter must be provided as an instance of the
* TrackerConf class, which has the following definition:
*
* case class TrackerConf(workerConnectionTimeout: Long, trackerImpl: String)
*
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala
* perform data transformation before calling XGBoost.train(), so that this timeout truly
* reflects the connection delay. Set a reasonable timeout value to prevent model
* training/testing from hanging indefinitely, possibly due to network issues.
* Note that a zero timeout value means waiting indefinitely (equivalent to Duration.Inf).
* Ignored if the tracker implementation is "python".
*/
val trackerConf = new TrackerConfParam(this, "tracker_conf", "Rabit tracker configurations")
setDefault(round -> 1, nWorkers -> 1, numThreadPerTask -> 1,
useExternalMemory -> false, silent -> 0,
customObj -> null, customEval -> null, missing -> Float.NaN,
trackerConf -> TrackerConf()
)
}
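A hedged sketch of overriding these defaults through the params map consumed by the train methods (the 60000 ms timeout and the "scala" tracker implementation are illustrative; `TrackerConf` is the case class defined in XGBoost.scala above, with the timeout as a Long in milliseconds and 0 meaning wait forever):
val params = Map(
  "eta" -> 0.1,
  "objective" -> "binary:logistic",
  "tracker_conf" -> TrackerConf(60000L, "scala"))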
xgboost/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
}
test("build RDD containing boosters with the specified worker number") {
val trainingRDD = sc.parallelize(Classification.train)
val boosterRDD = XGBoost.buildDistributedBoosters(
trainingRDD,
List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic").toMap,
new java.util.HashMap[String, String](),
numWorkers = 2, round = 5, eval = null, obj = null, useExternalMemory = true,
missing = Float.NaN)
val boosterCount = boosterRDD.count()
assert(boosterCount === 2)
}
test("training with external memory cache") {
import DataUtils._
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
val testSetDMatrix = new DMatrix(Classification.test.iterator)
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
xgboost/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/LabeledPoint.scala
/** Label of this point. */
label: Float,
/** Feature indices of this point or `null` if the data is dense. */
indices: Array[Int],
/** Feature values of this point. */
values: Array[Float],
/** Weight of this point. */
weight: Float = 1.0f,
/** Group of this point (used for ranking) or -1. */
group: Int = -1,
/** Initial prediction on this point or `Float.NaN`. */
baseMargin: Float = Float.NaN
) extends Serializable {
require(indices == null || indices.length == values.length,
"indices and values must have the same number of elements")
def this(label: Float, indices: Array[Int], values: Array[Float]) = {
// [[weight]] default duplicated to disambiguate the constructor call.
this(label, indices, values, 1.0f)
}
}
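Construction for both layouts, for orientation (illustrative values, relying on the case-class apply and the defaults shown above; the `copy` call earlier in XGBoost.scala confirms this is a case class):
import ml.dmlc.xgboost4j.LabeledPoint
// Dense point: indices == null, one value per feature.
val dense = LabeledPoint(1.0f, null, Array(0.5f, 0.0f, 1.2f))
// Sparse point: indices and values must have equal length (enforced by require).
val sparse = LabeledPoint(0.0f, Array(0, 2), Array(0.5f, 1.2f))
// weight, group, and baseMargin fall back to 1.0f, -1, and Float.NaN.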
xgboost/src/c_api/c_api.cc
if (common::CheckNAN(data[ncol*i + j]) && !nan_missing) {
badnan[ithread] = 1;
} else if (common::CheckNAN(data[ncol * i + j])) {
} else if (nan_missing || data[ncol * i + j] != missing) {
++nelem;
}
}
mat.row_ptr_[i+1] = nelem;
}
}
// Inform about any NaNs and resize data matrix
for (int i = 0; i < nthread; i++) {
CHECK(!badnan[i]) << "There are NaN values in the matrix, but missing was not set to NaN";
}
// do cumulative sum (to avoid otherwise need to copy)
prefixsum_inplace(&mat.row_ptr_[0], mat.row_ptr_.size());
mat.row_data_.resize(mat.row_data_.size() + mat.row_ptr_.back());
// Fill data matrix (now that we know the size, there is no need for slow push_back())
#pragma omp parallel num_threads(nthread)
xgboost/tests/cpp/c_api/test_c_api.cc
#include <xgboost/data.h>
TEST(c_api, XGDMatrixCreateFromMat_omp) {
std::vector<int> num_rows = {100, 11374, 15000};
for (auto row : num_rows) {
int num_cols = 50;
int num_missing = 5;
DMatrixHandle handle;
std::vector<float> data(num_cols * row, 1.5);
for (int i = 0; i < num_missing; i++) {
data[i] = std::numeric_limits<float>::quiet_NaN();
}
XGDMatrixCreateFromMat_omp(data.data(), row, num_cols,
std::numeric_limits<float>::quiet_NaN(), &handle,
0);
std::shared_ptr<xgboost::DMatrix> dmat =
*static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
xgboost::MetaInfo &info = dmat->info();
ASSERT_EQ(info.num_col, num_cols);
ASSERT_EQ(info.num_row, row);
ASSERT_EQ(info.num_nonzero, num_cols * row - num_missing);
auto iter = dmat->RowIterator();