Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/src/tree/updater_colmaker.cc view on Meta::CPAN
// training parameter
TrainParam param;
// data structure
/*! \brief per thread x per node entry to store tmp data */
struct ThreadEntry {
/*! \brief statistics of data */
TStats stats;
/*! \brief extra statistics of data */
TStats stats_extra;
/*! \brief last feature value scanned */
bst_float last_fvalue;
/*! \brief first feature value scanned */
bst_float first_fvalue;
/*! \brief current best solution */
SplitEntry best;
// constructor
explicit ThreadEntry(const TrainParam ¶m)
: stats(param), stats_extra(param) {
}
};
struct NodeEntry {
/*! \brief statics for node entry */
TStats stats;
/*! \brief loss of this node, without split */
bst_float root_gain;
/*! \brief weight calculated related to current data */
bst_float weight;
/*! \brief current best solution */
SplitEntry best;
// constructor
explicit NodeEntry(const TrainParam& param)
: stats(param), root_gain(0.0f), weight(0.0f){
}
};
// actual builder that runs the algorithm
struct Builder {
public:
// constructor
explicit Builder(const TrainParam& param) : param(param), nthread(omp_get_max_threads()) {}
// update one tree, growing
virtual void Update(const std::vector<bst_gpair>& gpair,
DMatrix* p_fmat,
RegTree* p_tree) {
this->InitData(gpair, *p_fmat, *p_tree);
this->InitNewNode(qexpand_, gpair, *p_fmat, *p_tree);
for (int depth = 0; depth < param.max_depth; ++depth) {
this->FindSplit(depth, qexpand_, gpair, p_fmat, p_tree);
this->ResetPosition(qexpand_, p_fmat, *p_tree);
this->UpdateQueueExpand(*p_tree, &qexpand_);
this->InitNewNode(qexpand_, gpair, *p_fmat, *p_tree);
// if nothing left to be expand, break
if (qexpand_.size() == 0) break;
}
// set all the rest expanding nodes to leaf
for (size_t i = 0; i < qexpand_.size(); ++i) {
const int nid = qexpand_[i];
(*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate);
}
// remember auxiliary statistics in the tree node
for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
p_tree->stat(nid).loss_chg = snode[nid].best.loss_chg;
p_tree->stat(nid).base_weight = snode[nid].weight;
p_tree->stat(nid).sum_hess = static_cast<float>(snode[nid].stats.sum_hess);
snode[nid].stats.SetLeafVec(param, p_tree->leafvec(nid));
}
}
protected:
// initialize temp data structure
inline void InitData(const std::vector<bst_gpair>& gpair,
const DMatrix& fmat,
const RegTree& tree) {
CHECK_EQ(tree.param.num_nodes, tree.param.num_roots)
<< "ColMaker: can only grow new tree";
const std::vector<unsigned>& root_index = fmat.info().root_index;
const RowSet& rowset = fmat.buffered_rowset();
{
// setup position
position.resize(gpair.size());
if (root_index.size() == 0) {
for (size_t i = 0; i < rowset.size(); ++i) {
position[rowset[i]] = 0;
}
} else {
for (size_t i = 0; i < rowset.size(); ++i) {
const bst_uint ridx = rowset[i];
position[ridx] = root_index[ridx];
CHECK_LT(root_index[ridx], (unsigned)tree.param.num_roots);
}
}
// mark delete for the deleted datas
for (size_t i = 0; i < rowset.size(); ++i) {
const bst_uint ridx = rowset[i];
if (gpair[ridx].hess < 0.0f) position[ridx] = ~position[ridx];
}
// mark subsample
if (param.subsample < 1.0f) {
std::bernoulli_distribution coin_flip(param.subsample);
auto& rnd = common::GlobalRandom();
for (size_t i = 0; i < rowset.size(); ++i) {
const bst_uint ridx = rowset[i];
if (gpair[ridx].hess < 0.0f) continue;
if (!coin_flip(rnd)) position[ridx] = ~position[ridx];
}
}
}
{
// initialize feature index
unsigned ncol = static_cast<unsigned>(fmat.info().num_col);
for (unsigned i = 0; i < ncol; ++i) {
if (fmat.GetColSize(i) != 0) {
feat_index.push_back(i);
}
}
unsigned n = std::max(static_cast<unsigned>(1),
static_cast<unsigned>(param.colsample_bytree * feat_index.size()));
std::shuffle(feat_index.begin(), feat_index.end(), common::GlobalRandom());
CHECK_GT(param.colsample_bytree, 0U)
<< "colsample_bytree cannot be zero.";
feat_index.resize(n);
}
{
// setup temp space for each thread
( run in 1.629 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )