Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/src/tree/updater_fast_hist.cc view on Meta::CPAN
bst_float root_gain;
/*! \brief weight calculated related to current data */
float weight;
/*! \brief current best solution */
SplitEntry best;
// constructor
explicit NodeEntry(const TrainParam& param)
: stats(param), root_gain(0.0f), weight(0.0f) {
}
};
// actual builder that runs the algorithm
struct Builder {
public:
// constructor
explicit Builder(const TrainParam& param,
const FastHistParam& fhparam,
std::unique_ptr<TreeUpdater> pruner)
: param(param), fhparam(fhparam), pruner_(std::move(pruner)),
p_last_tree_(nullptr), p_last_fmat_(nullptr) {}
// update one tree, growing
virtual void Update(const GHistIndexMatrix& gmat,
const GHistIndexBlockMatrix& gmatb,
const ColumnMatrix& column_matrix,
const std::vector<bst_gpair>& gpair,
DMatrix* p_fmat,
RegTree* p_tree) {
double gstart = dmlc::GetTime();
int num_leaves = 0;
unsigned timestamp = 0;
double tstart;
double time_init_data = 0;
double time_init_new_node = 0;
double time_build_hist = 0;
double time_evaluate_split = 0;
double time_apply_split = 0;
tstart = dmlc::GetTime();
this->InitData(gmat, gpair, *p_fmat, *p_tree);
std::vector<bst_uint> feat_set = feat_index;
time_init_data = dmlc::GetTime() - tstart;
// FIXME(hcho3): this code is broken when param.num_roots > 1. Please fix it
CHECK_EQ(p_tree->param.num_roots, 1)
<< "tree_method=hist does not support multiple roots at this moment";
for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {
tstart = dmlc::GetTime();
hist_.AddHistRow(nid);
BuildHist(gpair, row_set_collection_[nid], gmat, gmatb, feat_set, hist_[nid]);
time_build_hist += dmlc::GetTime() - tstart;
tstart = dmlc::GetTime();
this->InitNewNode(nid, gmat, gpair, *p_fmat, *p_tree);
time_init_new_node += dmlc::GetTime() - tstart;
tstart = dmlc::GetTime();
this->EvaluateSplit(nid, gmat, hist_, *p_fmat, *p_tree, feat_set);
time_evaluate_split += dmlc::GetTime() - tstart;
qexpand_->push(ExpandEntry(nid, p_tree->GetDepth(nid),
snode[nid].best.loss_chg,
timestamp++));
++num_leaves;
}
while (!qexpand_->empty()) {
const ExpandEntry candidate = qexpand_->top();
const int nid = candidate.nid;
qexpand_->pop();
if (candidate.loss_chg <= rt_eps
|| (param.max_depth > 0 && candidate.depth == param.max_depth)
|| (param.max_leaves > 0 && num_leaves == param.max_leaves) ) {
(*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate);
} else {
tstart = dmlc::GetTime();
this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree);
time_apply_split += dmlc::GetTime() - tstart;
tstart = dmlc::GetTime();
const int cleft = (*p_tree)[nid].cleft();
const int cright = (*p_tree)[nid].cright();
hist_.AddHistRow(cleft);
hist_.AddHistRow(cright);
if (row_set_collection_[cleft].size() < row_set_collection_[cright].size()) {
BuildHist(gpair, row_set_collection_[cleft], gmat, gmatb, feat_set, hist_[cleft]);
SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
} else {
BuildHist(gpair, row_set_collection_[cright], gmat, gmatb, feat_set, hist_[cright]);
SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
}
time_build_hist += dmlc::GetTime() - tstart;
tstart = dmlc::GetTime();
this->InitNewNode(cleft, gmat, gpair, *p_fmat, *p_tree);
this->InitNewNode(cright, gmat, gpair, *p_fmat, *p_tree);
time_init_new_node += dmlc::GetTime() - tstart;
tstart = dmlc::GetTime();
this->EvaluateSplit(cleft, gmat, hist_, *p_fmat, *p_tree, feat_set);
this->EvaluateSplit(cright, gmat, hist_, *p_fmat, *p_tree, feat_set);
time_evaluate_split += dmlc::GetTime() - tstart;
qexpand_->push(ExpandEntry(cleft, p_tree->GetDepth(cleft),
snode[cleft].best.loss_chg,
timestamp++));
qexpand_->push(ExpandEntry(cright, p_tree->GetDepth(cright),
snode[cright].best.loss_chg,
timestamp++));
++num_leaves; // give two and take one, as parent is no longer a leaf
}
}
// set all the rest expanding nodes to leaf
// This post condition is not needed in current code, but may be necessary
// when there are stopping rule that leaves qexpand non-empty
while (!qexpand_->empty()) {
const int nid = qexpand_->top().nid;
qexpand_->pop();
(*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate);
}
// remember auxiliary statistics in the tree node
for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
p_tree->stat(nid).loss_chg = snode[nid].best.loss_chg;
p_tree->stat(nid).base_weight = snode[nid].weight;
p_tree->stat(nid).sum_hess = static_cast<float>(snode[nid].stats.sum_hess);
snode[nid].stats.SetLeafVec(param, p_tree->leafvec(nid));
}
pruner_->Update(gpair, p_fmat, std::vector<RegTree*>{p_tree});
if (param.debug_verbose > 0) {
double total_time = dmlc::GetTime() - gstart;
LOG(INFO) << "\nInitData: "
<< std::fixed << std::setw(6) << std::setprecision(4) << time_init_data
<< " (" << std::fixed << std::setw(5) << std::setprecision(2)
<< time_init_data / total_time * 100 << "%)\n"
<< "InitNewNode: "
<< std::fixed << std::setw(6) << std::setprecision(4) << time_init_new_node
<< " (" << std::fixed << std::setw(5) << std::setprecision(2)
<< time_init_new_node / total_time * 100 << "%)\n"
<< "BuildHist: "
<< std::fixed << std::setw(6) << std::setprecision(4) << time_build_hist
<< " (" << std::fixed << std::setw(5) << std::setprecision(2)
<< time_build_hist / total_time * 100 << "%)\n"
<< "EvaluateSplit: "
<< std::fixed << std::setw(6) << std::setprecision(4) << time_evaluate_split
<< " (" << std::fixed << std::setw(5) << std::setprecision(2)
<< time_evaluate_split / total_time * 100 << "%)\n"
<< "ApplySplit: "
<< std::fixed << std::setw(6) << std::setprecision(4) << time_apply_split
<< " (" << std::fixed << std::setw(5) << std::setprecision(2)
<< time_apply_split / total_time * 100 << "%)\n"
<< "========================================\n"
<< "Total: "
<< std::fixed << std::setw(6) << std::setprecision(4) << total_time;
}
}
inline void BuildHist(const std::vector<bst_gpair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const GHistIndexBlockMatrix& gmatb,
const std::vector<bst_uint>& feat_set,
GHistRow hist) {
if (fhparam.enable_feature_grouping > 0) {
hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, feat_set, hist);
} else {
hist_builder_.BuildHist(gpair, row_indices, gmat, feat_set, hist);
}
}
inline void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
hist_builder_.SubtractionTrick(self, sibling, parent);
}
inline bool UpdatePredictionCache(const DMatrix* data,
std::vector<bst_float>* p_out_preds) {
std::vector<bst_float>& out_preds = *p_out_preds;
xgboost/src/tree/updater_fast_hist.cc view on Meta::CPAN
if (nrow * ncol == nnz) {
// dense data with zero-based indexing
data_layout_ = kDenseDataZeroBased;
} else if (nbins_f0 == 0 && nrow * (ncol - 1) == nnz) {
// dense data with one-based indexing
data_layout_ = kDenseDataOneBased;
} else {
// sparse data
data_layout_ = kSparseData;
}
}
{
// store a pointer to the tree
p_last_tree_ = &tree;
// store a pointer to training data
p_last_fmat_ = &fmat;
// initialize feature index
bst_uint ncol = static_cast<bst_uint>(info.num_col);
feat_index.clear();
if (data_layout_ == kDenseDataOneBased) {
for (bst_uint i = 1; i < ncol; ++i) {
feat_index.push_back(i);
}
} else {
for (bst_uint i = 0; i < ncol; ++i) {
feat_index.push_back(i);
}
}
bst_uint n = std::max(static_cast<bst_uint>(1),
static_cast<bst_uint>(param.colsample_bytree * feat_index.size()));
std::shuffle(feat_index.begin(), feat_index.end(), common::GlobalRandom());
CHECK_GT(param.colsample_bytree, 0U)
<< "colsample_bytree cannot be zero.";
feat_index.resize(n);
}
if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
/* specialized code for dense data:
choose the column that has a least positive number of discrete bins.
For dense data (with no missing value),
the sum of gradient histogram is equal to snode[nid] */
const std::vector<uint32_t>& row_ptr = gmat.cut->row_ptr;
const bst_uint nfeature = static_cast<bst_uint>(row_ptr.size() - 1);
uint32_t min_nbins_per_feature = 0;
for (bst_uint i = 0; i < nfeature; ++i) {
const uint32_t nbins = row_ptr[i + 1] - row_ptr[i];
if (nbins > 0) {
if (min_nbins_per_feature == 0 || min_nbins_per_feature > nbins) {
min_nbins_per_feature = nbins;
fid_least_bins_ = i;
}
}
}
CHECK_GT(min_nbins_per_feature, 0U);
}
{
snode.reserve(256);
snode.clear();
}
{
if (param.grow_policy == TrainParam::kLossGuide) {
qexpand_.reset(new ExpandQueue(loss_guide));
} else {
qexpand_.reset(new ExpandQueue(depth_wise));
}
}
}
inline void EvaluateSplit(int nid,
const GHistIndexMatrix& gmat,
const HistCollection& hist,
const DMatrix& fmat,
const RegTree& tree,
const std::vector<bst_uint>& feat_set) {
// start enumeration
const MetaInfo& info = fmat.info();
const bst_uint nfeature = static_cast<bst_uint>(feat_set.size());
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread);
best_split_tloc_.resize(nthread);
#pragma omp parallel for schedule(static) num_threads(nthread)
for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
best_split_tloc_[tid] = snode[nid].best;
}
#pragma omp parallel for schedule(dynamic) num_threads(nthread)
for (bst_omp_uint i = 0; i < nfeature; ++i) {
const bst_uint fid = feat_set[i];
const unsigned tid = omp_get_thread_num();
this->EnumerateSplit(-1, gmat, hist[nid], snode[nid], constraints_[nid], info,
&best_split_tloc_[tid], fid);
this->EnumerateSplit(+1, gmat, hist[nid], snode[nid], constraints_[nid], info,
&best_split_tloc_[tid], fid);
}
for (unsigned tid = 0; tid < nthread; ++tid) {
snode[nid].best.Update(best_split_tloc_[tid]);
}
}
inline void ApplySplit(int nid,
const GHistIndexMatrix& gmat,
const ColumnMatrix& column_matrix,
const HistCollection& hist,
const DMatrix& fmat,
RegTree* p_tree) {
XGBOOST_TYPE_SWITCH(column_matrix.dtype, {
ApplySplit_<DType>(nid, gmat, column_matrix, hist, fmat, p_tree);
});
}
template <typename T>
inline void ApplySplit_(int nid,
const GHistIndexMatrix& gmat,
const ColumnMatrix& column_matrix,
const HistCollection& hist,
const DMatrix& fmat,
RegTree* p_tree) {
// TODO(hcho3): support feature sampling by levels
/* 1. Create child nodes */
NodeEntry& e = snode[nid];
p_tree->AddChilds(nid);
(*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
// mark right child as 0, to indicate fresh leaf
int cleft = (*p_tree)[nid].cleft();
xgboost/src/tree/updater_fast_hist.cc view on Meta::CPAN
/* tree growing policies */
struct ExpandEntry {
int nid;
int depth;
bst_float loss_chg;
unsigned timestamp;
ExpandEntry(int nid, int depth, bst_float loss_chg, unsigned tstmp)
: nid(nid), depth(depth), loss_chg(loss_chg), timestamp(tstmp) {}
};
inline static bool depth_wise(ExpandEntry lhs, ExpandEntry rhs) {
if (lhs.depth == rhs.depth) {
return lhs.timestamp > rhs.timestamp; // favor small timestamp
} else {
return lhs.depth > rhs.depth; // favor small depth
}
}
inline static bool loss_guide(ExpandEntry lhs, ExpandEntry rhs) {
if (lhs.loss_chg == rhs.loss_chg) {
return lhs.timestamp > rhs.timestamp; // favor small timestamp
} else {
return lhs.loss_chg < rhs.loss_chg; // favor large loss_chg
}
}
// --data fields--
const TrainParam& param;
const FastHistParam& fhparam;
// number of omp thread used during training
int nthread;
// Per feature: shuffle index of each feature index
std::vector<bst_uint> feat_index;
// the internal row sets
RowSetCollection row_set_collection_;
// the temp space for split
std::vector<RowSetCollection::Split> row_split_tloc_;
std::vector<SplitEntry> best_split_tloc_;
/*! \brief TreeNode Data: statistics for each constructed node */
std::vector<NodeEntry> snode;
/*! \brief culmulative histogram of gradients. */
HistCollection hist_;
/*! \brief feature with least # of bins. to be used for dense specialization
of InitNewNode() */
uint32_t fid_least_bins_;
/*! \brief local prediction cache; maps node id to leaf value */
std::vector<float> leaf_value_cache_;
GHistBuilder hist_builder_;
std::unique_ptr<TreeUpdater> pruner_;
// back pointers to tree and data matrix
const RegTree* p_last_tree_;
const DMatrix* p_last_fmat_;
// constraint value
std::vector<TConstraint> constraints_;
typedef std::priority_queue<ExpandEntry,
std::vector<ExpandEntry>,
std::function<bool(ExpandEntry, ExpandEntry)>> ExpandQueue;
std::unique_ptr<ExpandQueue> qexpand_;
enum DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData };
DataLayout data_layout_;
};
std::unique_ptr<Builder> builder_;
std::unique_ptr<TreeUpdater> pruner_;
};
XGBOOST_REGISTER_TREE_UPDATER(FastHistMaker, "grow_fast_histmaker")
.describe("Grow tree using quantized histogram.")
.set_body([]() {
return new FastHistMaker<GradStats, NoConstraint>();
});
} // namespace tree
} // namespace xgboost
( run in 0.674 second using v1.01-cache-2.11-cpan-5b529ec07f3 )