Alien-XGBoost


t/10-dynamic_lib_xgboost.t

use Test2::V0;
use Test::Alien;
use Alien::XGBoost;

alien_ok 'Alien::XGBoost';

# The XGBoost C API lacks an XGBVersion function or similar version probe
ffi_ok { symbols => [qw(XGBGetLastError XGDMatrixCreateFromMat)] }, with_subtest {
    my ($ffi) = @_;
    my $create_from_matrix =
      $ffi->function( XGDMatrixCreateFromMat => [qw(float[] uint64 uint64 float opaque*)] => 'int' );
    my $matrix = 0;
    # Create a 1x2 matrix; "NaN" (numified by Perl) is the missing-value marker
    my $return_code = $create_from_matrix->call( [ 1, 1 ], 1, 2, "NaN", \$matrix );
    is $return_code, 0, 'XGDMatrixCreateFromMat returns success';
};

done_testing;

xgboost/cub/test/test_util.h

};

/******************************************************************************
 * Random bits generator
 ******************************************************************************/

int g_num_rand_samples = 0;


template <typename T>
bool IsNaN(T val) { return false; }

template<>
__noinline__ bool IsNaN<float>(float val)
{
    volatile unsigned int bits = reinterpret_cast<unsigned int &>(val);

    return (((bits >= 0x7F800001) && (bits <= 0x7FFFFFFF)) || 
        ((bits >= 0xFF800001) && (bits <= 0xFFFFFFFF)));
}

template<>
__noinline__ bool IsNaN<float1>(float1 val)
{
    return (IsNaN(val.x));
}

template<>
__noinline__ bool IsNaN<float2>(float2 val)
{
    return (IsNaN(val.y) || IsNaN(val.x));
}

template<>
__noinline__ bool IsNaN<float3>(float3 val)
{
    return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x));
}

template<>
__noinline__ bool IsNaN<float4>(float4 val)
{
    return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z));
}

template<>
__noinline__ bool IsNaN<double>(double val)
{
    volatile unsigned long long bits = *reinterpret_cast<unsigned long long *>(&val);

    return (((bits >= 0x7FF0000000000001) && (bits <= 0x7FFFFFFFFFFFFFFF)) || 
        ((bits >= 0xFFF0000000000001) && (bits <= 0xFFFFFFFFFFFFFFFF)));
}

template<>
__noinline__ bool IsNaN<double1>(double1 val)
{
    return (IsNaN(val.x));
}

template<>
__noinline__ bool IsNaN<double2>(double2 val)
{
    return (IsNaN(val.y) || IsNaN(val.x));
}

template<>
__noinline__ bool IsNaN<double3>(double3 val)
{
    return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x));
}

template<>
__noinline__ bool IsNaN<double4>(double4 val)
{
    return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z));
}
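
All of the specializations above reduce to one raw-bit test: ignoring the sign bit, a floating-point value is NaN exactly when its bits sit strictly above the infinity pattern (exponent all ones, mantissa nonzero). A minimal Scala sketch of the float case, using the JVM's floatToRawIntBits (the object name is illustrative):

object NaNBits {
  // Same test as IsNaN<float> above: drop the sign bit, then require the
  // remaining bits to lie strictly above the +Inf pattern 0x7F800000.
  def isNaNBits(f: Float): Boolean =
    (java.lang.Float.floatToRawIntBits(f) & 0x7FFFFFFF) >= 0x7F800001

  def main(args: Array[String]): Unit = {
    println(isNaNBits(Float.NaN))              // true
    println(isNaNBits(Float.PositiveInfinity)) // false: mantissa is zero
    println(isNaNBits(1.5f))                   // false
  }
}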


/**
 * Generates random keys.
 *
 * We always take the second-order byte from rand() because the higher-order
 * bits returned by rand() are commonly considered more uniformly distributed
 * than the lower-order bits.
 *

xgboost/cub/test/test_util.h

                word &= mersenne::genrand_int32();
                g_num_rand_samples++;                
            }

            word_buff[j] = word;
        }

        memcpy(&key, word_buff, sizeof(K));

        K copy = key;
        if (!IsNaN(copy))
            break;          // avoids NaNs when generating random floating point numbers
    }
}
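
The loop above assembles a key from random 32-bit words and retries whenever the assembled bits happen to decode as NaN. A rough Scala analogue for float keys, with scala.util.Random standing in for the Mersenne generator:

import scala.util.Random

// Draw random bits until they decode to a non-NaN float, mirroring the
// retry loop above.
def randomFloatKey(rng: Random): Float = {
  var key = java.lang.Float.intBitsToFloat(rng.nextInt())
  while (key.isNaN) {
    key = java.lang.Float.intBitsToFloat(rng.nextInt())
  }
  key
}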

/// Randomly select a number in the range [0, max)
template <typename T>
T RandomValue(T max)
{
    unsigned int bits;
    unsigned int max_int = (unsigned int) -1;
    do {

xgboost/include/xgboost/tree_model.h

     *  when flag == -1, this indicates the value is missing
     */
    union Entry {
      bst_float fvalue;
      int flag;
    };
    std::vector<Entry> data;
  };
  /*!
   * \brief get the leaf index
   * \param feat dense feature vector, if the feature is missing the field is set to NaN
   * \param root_id starting root index of the instance
   * \return the leaf index for the given feature vector
   */
  inline int GetLeafIndex(const FVec& feat, unsigned root_id = 0) const;
  /*!
   * \brief get the prediction of regression tree, only accepts dense feature vector
   * \param feat dense feature vector, if the feature is missing the field is set to NaN
   * \param root_id starting root index of the instance
   * \return the prediction of the regression tree
   */
  inline bst_float Predict(const FVec& feat, unsigned root_id = 0) const;
  /*!
   * \brief calculate the feature contributions for the given root
   * \param feat dense feature vector, if the feature is missing the field is set to NaN
   * \param root_id starting root index of the instance
   * \param out_contribs output vector to hold the contributions
   */
  inline void CalculateContributions(const RegTree::FVec& feat, unsigned root_id,
                                     bst_float *out_contribs) const;
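
All three entry points share the dense FVec convention: one slot per feature, NaN where the feature is absent. As a toy illustration of how a traversal can route missing (NaN) features down a default branch, here is a hypothetical Scala sketch; the Node layout is invented for the example and is not xgboost's actual representation:

// Hypothetical node layout, for illustration only.
case class Node(feature: Int, threshold: Float, left: Int, right: Int,
                defaultLeft: Boolean, leaf: Option[Float] = None)

def leafIndex(nodes: Array[Node], feat: Array[Float]): Int = {
  var nid = 0
  while (nodes(nid).leaf.isEmpty) {
    val n = nodes(nid)
    val v = feat(n.feature)
    nid =
      if (v.isNaN) { if (n.defaultLeft) n.left else n.right } // missing feature
      else if (v < n.threshold) n.left
      else n.right
  }
  nid
}
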
  /*!
   * \brief get next position of the tree given current pid
   * \param pid Current node id.
   * \param fvalue feature value if not missing.
   * \param is_unknown Whether current required feature is missing.

xgboost/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkWithRDD.scala

      MLLabeledPoint(lp.label, new MLDenseVector(lp.features.toArray)))
    val testSet = MLUtils.loadLibSVMFile(sc, inputTestPath)
        .map(lp => new MLDenseVector(lp.features.toArray))
    // training parameters
    val paramMap = List(
      "eta" -> 0.1f,
      "max_depth" -> 2,
      "objective" -> "binary:logistic").toMap
    val xgboostModel = XGBoost.trainWithRDD(trainRDD, paramMap, numRound, nWorkers = args(1).toInt,
      useExternalMemory = true)
    xgboostModel.predict(testSet, missingValue = Float.NaN)
    // save model to HDFS path
    xgboostModel.saveModelAsHadoopFile(outputModelPath)
  }
}

xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala

  *                    The Scala implementation is currently experimental, use at your own risk.
  */
case class TrackerConf(workerConnectionTimeout: Long, trackerImpl: String)

object XGBoost extends Serializable {
  private val logger = LogFactory.getLog("XGBoostSpark")

  private def fromDenseToSparseLabeledPoints(
      denseLabeledPoints: Iterator[XGBLabeledPoint],
      missing: Float): Iterator[XGBLabeledPoint] = {
    if (!missing.isNaN) {
      denseLabeledPoints.map { labeledPoint =>
        val indicesBuilder = new mutable.ArrayBuilder.ofInt()
        val valuesBuilder = new mutable.ArrayBuilder.ofFloat()
        for ((value, i) <- labeledPoint.values.zipWithIndex if value != missing) {
          indicesBuilder += (if (labeledPoint.indices == null) i else labeledPoint.indices(i))
          valuesBuilder += value
        }
        labeledPoint.copy(indices = indicesBuilder.result(), values = valuesBuilder.result())
      }
    } else {

xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala

    }
  }
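
The isNaN branch above is a correctness requirement, not an optimization: NaN compares unequal to everything, itself included, so the value != missing filter could never drop NaN-encoded missing values. A self-contained Scala check:

object NaNFilter extends App {
  val missing = Float.NaN
  val values  = Array(1.0f, Float.NaN, 3.0f)

  println(values.count(_ != missing)) // 3: NaN slips through the != filter
  println(values.count(!_.isNaN))     // 2: isNaN catches it
}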

  private def fromBaseMarginsToArray(baseMargins: Iterator[Float]): Option[Array[Float]] = {
    val builder = new mutable.ArrayBuilder.ofFloat()
    var nTotal = 0
    var nUndefined = 0
    while (baseMargins.hasNext) {
      nTotal += 1
      val baseMargin = baseMargins.next()
      if (baseMargin.isNaN) {
        nUndefined += 1  // don't waste space for all-NaNs.
      } else {
        builder += baseMargin
      }
    }
    if (nUndefined == nTotal) {
      None
    } else if (nUndefined == 0) {
      Some(builder.result())
    } else {
      throw new IllegalArgumentException(
        s"Encountered a partition with $nUndefined NaN base margin values. " +
            "If you want to specify base margin, ensure all values are non-NaN.")
    }
  }
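
The contract is all-or-nothing: a margin column that is entirely NaN yields None, entirely defined yields Some(array), and a mixture throws. A compact Scala re-statement of the same contract, handy for experimenting outside Spark:

def baseMargins(ms: Iterator[Float]): Option[Array[Float]] = {
  val (nans, defined) = ms.toArray.partition(_.isNaN)
  if (defined.isEmpty) None            // all NaN (or empty): margin never set
  else if (nans.isEmpty) Some(defined) // all defined
  else throw new IllegalArgumentException(
    s"${nans.length} NaN base margin values; all must be non-NaN")
}

assert(baseMargins(Iterator(Float.NaN, Float.NaN)).isEmpty)
assert(baseMargins(Iterator(0.5f, 0.25f)).get.sameElements(Array(0.5f, 0.25f)))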

  private[spark] def buildDistributedBoosters(
      trainingSet: RDD[XGBLabeledPoint],
      params: Map[String, Any],
      rabitEnv: java.util.Map[String, String],
      numWorkers: Int,
      round: Int,
      obj: ObjectiveTrait,

xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala

   */
  @throws(classOf[XGBoostError])
  def trainWithDataFrame(
      trainingData: Dataset[_],
      params: Map[String, Any],
      round: Int,
      nWorkers: Int,
      obj: ObjectiveTrait = null,
      eval: EvalTrait = null,
      useExternalMemory: Boolean = false,
      missing: Float = Float.NaN,
      featureCol: String = "features",
      labelCol: String = "label"): XGBoostModel = {
    require(nWorkers > 0, "you must specify more than 0 workers")
    val estimator = new XGBoostEstimator(params)
    // assigning general parameters
    estimator.
      set(estimator.useExternalMemory, useExternalMemory).
      set(estimator.round, round).
      set(estimator.nWorkers, nWorkers).
      set(estimator.customObj, obj).

xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala

   */
  @deprecated("Use XGBoost.trainWithRDD instead.")
  def train(
      trainingData: RDD[MLLabeledPoint],
      params: Map[String, Any],
      round: Int,
      nWorkers: Int,
      obj: ObjectiveTrait = null,
      eval: EvalTrait = null,
      useExternalMemory: Boolean = false,
      missing: Float = Float.NaN): XGBoostModel = {
    trainWithRDD(trainingData, params, round, nWorkers, obj, eval, useExternalMemory,
      missing)
  }

  private def overrideParamsAccordingToTaskCPUs(
      params: Map[String, Any],
      sc: SparkContext): Map[String, Any] = {
    val coresPerTask = sc.getConf.getInt("spark.task.cpus", 1)
    var overridedParams = params
    if (overridedParams.contains("nthread")) {

xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala

   */
  @throws(classOf[XGBoostError])
  def trainWithRDD(
      trainingData: RDD[MLLabeledPoint],
      params: Map[String, Any],
      round: Int,
      nWorkers: Int,
      obj: ObjectiveTrait = null,
      eval: EvalTrait = null,
      useExternalMemory: Boolean = false,
      missing: Float = Float.NaN): XGBoostModel = {
    import DataUtils._
    val xgbTrainingData = trainingData.map { case MLLabeledPoint(label, features) =>
      features.asXGB.copy(label = label.toFloat)
    }
    trainDistributed(xgbTrainingData, params, round, nWorkers, obj, eval,
      useExternalMemory, missing)
  }

  @throws(classOf[XGBoostError])
  private[spark] def trainDistributed(
      trainingData: RDD[XGBLabeledPoint],
      params: Map[String, Any],
      round: Int,
      nWorkers: Int,
      obj: ObjectiveTrait = null,
      eval: EvalTrait = null,
      useExternalMemory: Boolean = false,
      missing: Float = Float.NaN): XGBoostModel = {
    if (params.contains("tree_method")) {
      require(params("tree_method") != "hist", "xgboost4j-spark does not support fast histogram" +
          " for now")
    }
    require(nWorkers > 0, "you must specify more than 0 workers")
    if (obj != null) {
      require(params.get("obj_type").isDefined, "parameter \"obj_type\" is not defined," +
          " you have to specify the objective type as classification or regression with a" +
          " customized objective function")
    }
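
Taken together, these preconditions shape the parameter map: when a custom objective is passed, obj_type must be present, and tree_method may not be "hist". A sketch of a map that satisfies all three checks (values are illustrative):

val params: Map[String, Any] = Map(
  "eta"         -> 0.1,
  "max_depth"   -> 6,
  "obj_type"    -> "classification", // required whenever obj != null
  "tree_method" -> "approx"          // "hist" is rejected above
)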

xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala

      r - "num_class"
    } else {
      r
    }
  }

  private def ensureColumns(trainingSet: Dataset[_]): Dataset[_] = {
    if (trainingSet.columns.contains($(baseMarginCol))) {
      trainingSet
    } else {
      trainingSet.withColumn($(baseMarginCol), lit(Float.NaN))
    }
  }
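
So a DataFrame that never had a base-margin column gets one filled with NaN, which fromBaseMarginsToArray (shown earlier) collapses to None. The same pattern written out in user code (a sketch; df and the column name "baseMargin" are assumptions):

import org.apache.spark.sql.functions.lit

val withMargin =
  if (df.columns.contains("baseMargin")) df
  else df.withColumn("baseMargin", lit(Float.NaN))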

  /**
   * produce an XGBoostModel by fitting the given dataset
   */
  override def train(trainingSet: Dataset[_]): XGBoostModel = {
    val instances = ensureColumns(trainingSet).select(
      col($(featuresCol)),
      col($(labelCol)).cast(FloatType),

xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala

  val customObj = new CustomObjParam(this, "custom_obj", "customized objective function " +
    "provided by user")

  /**
   * customized evaluation function provided by user. default: null
   */
  val customEval = new CustomEvalParam(this, "custom_eval", "customized evaluation function " +
    "provided by user")

  /**
   * the value treated as missing. default: Float.NaN
   */
  val missing = new FloatParam(this, "missing", "the value treated as missing")

  /**
    * Rabit tracker configurations. The parameter must be provided as an instance of the
    * TrackerConf class, which has the following definition:
    *
    *     case class TrackerConf(workerConnectionTimeout: Long, trackerImpl: String)
    *

xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala

    *        perform data transformation before calling XGBoost.train(), so that this timeout truly
    *        reflects the connection delay. Set a reasonable timeout value to prevent model
    *        training/testing from hanging indefinitely, possibly due to network issues.
    *        Note that zero timeout value means to wait indefinitely (equivalent to Duration.Inf).
    *        Ignored if the tracker implementation is "python".
    */
  val trackerConf = new TrackerConfParam(this, "tracker_conf", "Rabit tracker configurations")

  setDefault(round -> 1, nWorkers -> 1, numThreadPerTask -> 1,
    useExternalMemory -> false, silent -> 0,
    customObj -> null, customEval -> null, missing -> Float.NaN,
    trackerConf -> TrackerConf()
  )
}
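
Given the TrackerConf case class shown earlier, a tracker configuration that waits at most one minute for workers to connect might look like this (a sketch; the implementation names are assumed from the surrounding docs, and zero means wait indefinitely):

// Timeout in milliseconds; trackerImpl selects the tracker implementation.
val conf = TrackerConf(workerConnectionTimeout = 60000L, trackerImpl = "scala")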

xgboost/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala

  }

  test("build RDD containing boosters with the specified worker number") {
    val trainingRDD = sc.parallelize(Classification.train)
    val boosterRDD = XGBoost.buildDistributedBoosters(
      trainingRDD,
      List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
        "objective" -> "binary:logistic").toMap,
      new java.util.HashMap[String, String](),
      numWorkers = 2, round = 5, eval = null, obj = null, useExternalMemory = true,
      missing = Float.NaN)
    val boosterCount = boosterRDD.count()
    assert(boosterCount === 2)
  }

  test("training with external memory cache") {
    import DataUtils._
    val eval = new EvalError()
    val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
    val testSetDMatrix = new DMatrix(Classification.test.iterator)
    val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",

xgboost/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/LabeledPoint.scala

    /** Label of this point. */
    label: Float,
    /** Feature indices of this point or `null` if the data is dense. */
    indices: Array[Int],
    /** Feature values of this point. */
    values: Array[Float],
    /** Weight of this point. */
    weight: Float = 1.0f,
    /** Group of this point (used for ranking) or -1. */
    group: Int = -1,
    /** Initial prediction on this point or `Float.NaN`. */
    baseMargin: Float = Float.NaN
) extends Serializable {
  require(indices == null || indices.length == values.length,
    "indices and values must have the same number of elements")

  def this(label: Float, indices: Array[Int], values: Array[Float]) = {
    // [[weight]] default duplicated to disambiguate the constructor call.
    this(label, indices, values, 1.0f)
  }
}
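
Constructing points under these conventions, with NaN in its usual missing-value role (a sketch using the constructor shown above):

// Dense: indices == null, one slot per feature, NaN where a value is missing.
val dense  = new LabeledPoint(1.0f, null, Array(1.0f, Float.NaN, 3.0f))
// Sparse: indices and values must have equal length (enforced by the require).
val sparse = new LabeledPoint(0.0f, Array(0, 2), Array(1.0f, 3.0f))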

xgboost/src/c_api/c_api.cc

        if (common::CheckNAN(data[ncol*i + j]) && !nan_missing) {
          badnan[ithread] = 1;
        } else if (common::CheckNAN(data[ncol * i + j])) {
        } else if (nan_missing || data[ncol * i + j] != missing) {
          ++nelem;
        }
      }
      mat.row_ptr_[i+1] = nelem;
    }
  }
  // Inform about any NaNs and resize data matrix
  for (int i = 0; i < nthread; i++) {
    CHECK(!badnan[i]) << "There are NaN values in the matrix, but missing was not set to NaN";
  }

  // do cumulative sum (to avoid otherwise need to copy)
  prefixsum_inplace(&mat.row_ptr_[0], mat.row_ptr_.size());
  mat.row_data_.resize(mat.row_data_.size() + mat.row_ptr_.back());

  // Fill data matrix (now that know size, no need for slow push_back())
#pragma omp parallel num_threads(nthread)
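
The two-pass scheme, counting each row's stored entries and then prefix-summing the row pointer in place, translates directly. A small Scala sketch of the same NaN-aware counting for a dense 2x3 matrix with missing = NaN:

object DenseToCsr extends App {
  val missing    = Float.NaN
  val nanMissing = missing.isNaN
  val ncol = 3
  val data = Array(1.0f, Float.NaN, 2.0f,      // row 0: two stored entries
                   Float.NaN, Float.NaN, 5.0f) // row 1: one stored entry
  val nrow = data.length / ncol

  val rowPtr = new Array[Long](nrow + 1)
  for (i <- 0 until nrow) {
    var nelem = 0L
    for (j <- 0 until ncol) {
      val v = data(ncol * i + j)
      if (v.isNaN && !nanMissing) sys.error("NaN in matrix but missing != NaN")
      else if (!v.isNaN && (nanMissing || v != missing)) nelem += 1
    }
    rowPtr(i + 1) = nelem
  }
  for (i <- 1 to nrow) rowPtr(i) += rowPtr(i - 1) // cumulative sum, as above
  println(rowPtr.mkString(", ")) // 0, 2, 3
}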

xgboost/tests/cpp/c_api/test_c_api.cc

#include <xgboost/data.h>

TEST(c_api, XGDMatrixCreateFromMat_omp) {
  std::vector<int> num_rows = {100, 11374, 15000};
  for (auto row : num_rows) {
    int num_cols = 50;
    int num_missing = 5;
    DMatrixHandle handle;
    std::vector<float> data(num_cols * row, 1.5);
    for (int i = 0; i < num_missing; i++) {
      data[i] = std::numeric_limits<float>::quiet_NaN();
    }

    XGDMatrixCreateFromMat_omp(data.data(), row, num_cols,
                               std::numeric_limits<float>::quiet_NaN(), &handle,
                               0);

    std::shared_ptr<xgboost::DMatrix> dmat =
        *static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
    xgboost::MetaInfo &info = dmat->info();
    ASSERT_EQ(info.num_col, num_cols);
    ASSERT_EQ(info.num_row, row);
    ASSERT_EQ(info.num_nonzero, num_cols * row - num_missing);

    auto iter = dmat->RowIterator();


