Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/src/gbm/gbtree.cc view on Meta::CPAN
// gradient boosted trees
class GBTree : public GradientBooster {
public:
// Construct a tree booster; base_margin is forwarded to the model_ constructor.
explicit GBTree(bst_float base_margin) : model_(base_margin) {}
// Store a copy of the given matrix list in cache_.
// Configure() later passes cache_ to the predictor's Init().
void InitCache(const std::vector<std::shared_ptr<DMatrix> > &cache) {
cache_ = cache;
}
void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
this->cfg = cfg;
model_.Configure(cfg);
// initialize the updaters only when needed.
std::string updater_seq = tparam.updater_seq;
tparam.InitAllowUnknown(cfg);
if (updater_seq != tparam.updater_seq) updaters.clear();
for (const auto& up : updaters) {
up->Init(cfg);
}
// for the 'update' process_type, move trees into trees_to_update
if (tparam.process_type == kUpdate) {
model_.InitTreesToUpdate();
}
// configure predictor
predictor = std::unique_ptr<Predictor>(Predictor::Create(tparam.predictor));
predictor->Init(cfg, cache_);
}
// Restore the model from a stream, then reset the stored configuration so
// that it contains only the "num_feature" entry taken from the loaded model.
void Load(dmlc::Stream* fi) override {
  model_.Load(fi);

  const std::string key("num_feature");
  const std::string value = common::ToString(model_.param.num_feature);

  this->cfg.clear();
  this->cfg.push_back(std::make_pair(key, value));
}
// Write the booster to the stream by delegating to the model's Save().
void Save(dmlc::Stream* fo) const override {
model_.Save(fo);
}
// Returns true when the model has exactly one output group, or when the
// configured updater sequence mentions "distcol".
bool AllowLazyCheckPoint() const override {
  if (model_.param.num_output_group == 1) {
    return true;
  }
  const std::string& seq = tparam.updater_seq;
  return seq.find("distcol") != std::string::npos;
}
// Run one boosting iteration.
//   p_fmat   - training matrix handed to the tree builders
//   in_gpair - gradient pairs; with several output groups they are laid out
//              row-major, i.e. entry [row * ngroup + gid] belongs to group gid
//   obj      - not used by this implementation
// New trees are first built for every output group and only afterwards
// committed to the model, one group at a time.
void DoBoost(DMatrix* p_fmat,
             std::vector<bst_gpair>* in_gpair,
             ObjFunction* obj) override {
  const std::vector<bst_gpair>& all_gpair = *in_gpair;
  const int ngroup = model_.param.num_output_group;
  std::vector<std::vector<std::unique_ptr<RegTree> > > trees_per_group;

  if (ngroup != 1) {
    CHECK_EQ(all_gpair.size() % ngroup, 0U)
        << "must have exactly ngroup*nrow gpairs";
    // Scratch buffer holding the gradient pairs of a single group.
    std::vector<bst_gpair> group_gpair(all_gpair.size() / ngroup);
    const bst_omp_uint nrow = static_cast<bst_omp_uint>(group_gpair.size());
    for (int gid = 0; gid < ngroup; ++gid) {
      // Gather the strided entries of group gid into the dense buffer.
#pragma omp parallel for schedule(static)
      for (bst_omp_uint row = 0; row < nrow; ++row) {
        group_gpair[row] = all_gpair[row * ngroup + gid];
      }
      std::vector<std::unique_ptr<RegTree> > built;
      BoostNewTrees(group_gpair, p_fmat, gid, &built);
      trees_per_group.push_back(std::move(built));
    }
  } else {
    // Single output group: the gradient pairs can be used as they are.
    std::vector<std::unique_ptr<RegTree> > built;
    BoostNewTrees(all_gpair, p_fmat, 0, &built);
    trees_per_group.push_back(std::move(built));
  }

  const double commit_start = dmlc::GetTime();
  for (int gid = 0; gid < ngroup; ++gid) {
    this->CommitModel(std::move(trees_per_group[gid]), gid);
  }
  if (tparam.debug_verbose > 0) {
    LOG(INFO) << "CommitModel(): " << dmlc::GetTime() - commit_start << " sec";
  }
}
// Batch prediction: delegates to the configured predictor with model_.
// The literal 0 is passed between model_ and ntree_limit
// (presumably the index of the first tree to use -- TODO confirm against
// Predictor::PredictBatch).
void PredictBatch(DMatrix* p_fmat,
std::vector<bst_float>* out_preds,
unsigned ntree_limit) override {
predictor->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit);
}
// Single-instance prediction: forwards inst, out_preds, model_, ntree_limit
// and root_index unchanged to the configured predictor.
void PredictInstance(const SparseBatch::Inst& inst,
std::vector<bst_float>* out_preds,
unsigned ntree_limit,
unsigned root_index) override {
predictor->PredictInstance(inst, out_preds, model_,
ntree_limit, root_index);
}
// Leaf prediction: forwards the matrix, output vector, model_ and
// ntree_limit to the configured predictor's PredictLeaf().
void PredictLeaf(DMatrix* p_fmat,
std::vector<bst_float>* out_preds,
unsigned ntree_limit) override {
predictor->PredictLeaf(p_fmat, out_preds, model_, ntree_limit);
}
// Contribution prediction: forwards the matrix, output vector, model_ and
// ntree_limit to the configured predictor's PredictContribution().
void PredictContribution(DMatrix* p_fmat,
std::vector<bst_float>* out_contribs,
unsigned ntree_limit) override {
predictor->PredictContribution(p_fmat, out_contribs, model_, ntree_limit);
}
// Dump the model as a list of strings by delegating to model_.DumpModel()
// with the same feature map, stats flag and format.
std::vector<std::string> DumpModel(const FeatureMap& fmap,
bool with_stats,
std::string format) const override {
return model_.DumpModel(fmap, with_stats, format);
}
protected:
// initialize updater before using them
inline void InitUpdater() {
if (updaters.size() != 0) return;
std::string tval = tparam.updater_seq;
std::vector<std::string> ups = common::Split(tval, ',');
for (const std::string& pstr : ups) {
std::unique_ptr<TreeUpdater> up(TreeUpdater::Create(pstr.c_str()));
up->Init(this->cfg);
updaters.push_back(std::move(up));
}
}
// boost new trees for one specific output group
inline void
BoostNewTrees(const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat,
int bst_group,
std::vector<std::unique_ptr<RegTree> >* ret) {
this->InitUpdater();
std::vector<RegTree*> new_trees;
ret->clear();
// create the trees
xgboost/src/gbm/gbtree.cc view on Meta::CPAN
// ----training fields----
// NOTE(review): part of the file is elided just above; these are presumably
// the GBTree members used by the methods earlier in this file -- confirm.
// configurations for tree
// (a list of name/value string pairs)
std::vector<std::pair<std::string, std::string> > cfg;
// the updaters that can be applied to each of tree
std::vector<std::unique_ptr<TreeUpdater>> updaters;
// Cached matrices
std::vector<std::shared_ptr<DMatrix>> cache_;
// owning pointer to the Predictor instance
std::unique_ptr<Predictor> predictor;
};
// dart
class Dart : public GBTree {
public:
// Construct a Dart booster; base_margin is forwarded to the GBTree constructor.
explicit Dart(bst_float base_margin) : GBTree(base_margin) {}
// Configure the base booster, then initialize the Dart parameters from the
// same settings -- but only while the model does not contain any tree yet.
void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
  GBTree::Configure(cfg);
  const bool model_has_trees = !model_.trees.empty();
  if (model_has_trees) {
    return;
  }
  dparam.InitAllowUnknown(cfg);
}
// Load the base booster, size weight_drop to the number of trees in the
// loaded model and, when there is at least one tree, read the per-tree
// weights from the stream.
void Load(dmlc::Stream* fi) override {
  GBTree::Load(fi);
  weight_drop.resize(model_.param.num_trees);
  const bool has_trees = (model_.param.num_trees != 0);
  if (!has_trees) {
    return;
  }
  fi->Read(&weight_drop);
}
// Save the base booster, followed by the per-tree weights when any exist.
void Save(dmlc::Stream* fo) const override {
  GBTree::Save(fo);
  if (weight_drop.empty()) {
    return;
  }
  fo->Write(weight_drop);
}
// predict the leaf scores with dropout if ntree_limit = 0
// Batch prediction for Dart.
// First lets DropTrees() choose the dropped trees based on ntree_limit, then
// runs the Dart prediction loop starting at tree 0 with init_out_preds = true,
// so out_preds is resized and initialized before scores are accumulated.
void PredictBatch(DMatrix* p_fmat,
std::vector<bst_float>* out_preds,
unsigned ntree_limit) override {
DropTrees(ntree_limit);
PredLoopInternal<Dart>(p_fmat, out_preds, 0, ntree_limit, true);
}
// Predict a single instance.
//   inst        - the instance to score
//   out_preds   - resized to one entry per output group and overwritten
//   ntree_limit - number of boosting rounds to use; 0 (or a value beyond the
//                 model size) means all trees
//   root_index  - passed through to PredValue()
// Every output entry is the PredValue() sum for that group plus the model's
// base margin.
void PredictInstance(const SparseBatch::Inst& inst,
                     std::vector<bst_float>* out_preds,
                     unsigned ntree_limit,
                     unsigned root_index) override {
  // Called with a non-zero argument; the visible part of DropTrees() samples
  // dropped trees only when its argument is 0.
  DropTrees(1);

  // Make sure one scratch feature vector is available.
  if (thread_temp.empty()) {
    thread_temp.resize(1, RegTree::FVec());
    thread_temp[0].Init(model_.param.num_feature);
  }

  out_preds->resize(model_.param.num_output_group);

  // Convert the round limit into a tree count and clamp it to the model.
  ntree_limit *= model_.param.num_output_group;
  const bool use_all_trees =
      (ntree_limit == 0) || (ntree_limit > model_.trees.size());
  if (use_all_trees) {
    ntree_limit = static_cast<unsigned>(model_.trees.size());
  }

  std::vector<bst_float>& scores = *out_preds;
  for (int gid = 0; gid < model_.param.num_output_group; ++gid) {
    scores[gid] = PredValue(inst, gid, root_index,
                            &thread_temp[0], 0, ntree_limit) + model_.base_margin;
  }
}
protected:
friend class GBTree;
// internal prediction loop
// add predictions to out_preds
// Accumulate tree predictions for every row of p_fmat into out_preds.
//   p_fmat         - matrix to predict on
//   out_preds      - output scores, one entry per (row, output group)
//   tree_begin     - index of the first tree to evaluate
//   ntree_limit    - number of boosting rounds to use; it is multiplied by the
//                    number of output groups to obtain a tree count, and
//                    0 or an out-of-range value selects all trees
//   init_out_preds - when true, out_preds is resized to
//                    num_output_group * num_row and initialized either from
//                    the matrix's base_margin (if non-empty) or with the
//                    model's scalar base_margin
// Aborts via CHECK_EQ when a non-empty base_margin does not contain exactly
// num_output_group * num_row entries.
template<typename Derived>
inline void PredLoopInternal(
    DMatrix* p_fmat,
    std::vector<bst_float>* out_preds,
    unsigned tree_begin,
    unsigned ntree_limit,
    bool init_out_preds) {
  const int num_group = model_.param.num_output_group;
  ntree_limit *= num_group;
  if (ntree_limit == 0 || ntree_limit > model_.trees.size()) {
    ntree_limit = static_cast<unsigned>(model_.trees.size());
  }
  if (init_out_preds) {
    const size_t n = num_group * p_fmat->info().num_row;
    const std::vector<bst_float>& base_margin = p_fmat->info().base_margin;
    out_preds->resize(n);
    if (base_margin.size() != 0) {
      // Validate the source of the copy: out_preds was just resized to n, so
      // checking its size could never fail, while a base_margin of a different
      // length would overrun out_preds or leave entries uninitialized by it.
      CHECK_EQ(base_margin.size(), n)
          << "base_margin must have num_output_group * num_row entries";
      std::copy(base_margin.begin(), base_margin.end(), out_preds->begin());
    } else {
      std::fill(out_preds->begin(), out_preds->end(), model_.base_margin);
    }
  }
  // The single-group and multi-group paths invoke the same routine with the
  // same arguments, so no branching on num_group is needed.
  PredLoopSpecalize<Derived>(p_fmat, out_preds, num_group,
                             tree_begin, ntree_limit);
}
// Core prediction loop: for every row of every batch in p_fmat, add the
// Derived::PredValue() sum over trees [tree_begin, tree_end) to the matching
// entry of out_preds.
//   p_fmat     - matrix whose row batches are iterated
//   out_preds  - must already hold num_row * num_group entries (checked);
//                entry [row * num_group + gid] receives the score of group gid
//   num_group  - must equal the model's num_output_group (checked)
//   tree_begin - first tree index passed to PredValue()
//   tree_end   - one-past-last tree index passed to PredValue()
// Rows are processed in chunks of K = 8 inside an OpenMP parallel loop; the
// rows left over after the last full chunk are handled serially afterwards.
template<typename Derived>
inline void PredLoopSpecalize(
DMatrix* p_fmat,
std::vector<bst_float>* out_preds,
int num_group,
unsigned tree_begin,
unsigned tree_end) {
const MetaInfo& info = p_fmat->info();
const int nthread = omp_get_max_threads();
CHECK_EQ(num_group, model_.param.num_output_group);
// called with the maximum thread count; the loops below index thread_temp
// by thread id
InitThreadTemp(nthread);
std::vector<bst_float>& preds = *out_preds;
CHECK_EQ(model_.param.size_leaf_vector, 0)
<< "size_leaf_vector is enforced to 0 so far";
CHECK_EQ(preds.size(), p_fmat->info().num_row * num_group);
// start collecting the prediction
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
// PredValue() is invoked through the Derived type
Derived* self = static_cast<Derived*>(this);
iter->BeforeFirst();
while (iter->Next()) {
const RowBatch &batch = iter->Value();
// parallel over local batch
const int K = 8;
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
// number of trailing rows that do not fill a complete chunk of K
const bst_omp_uint rest = nsize % K;
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize - rest; i += K) {
// each thread works with its own scratch feature vector
const int tid = omp_get_thread_num();
RegTree::FVec& feats = thread_temp[tid];
int64_t ridx[K];
RowBatch::Inst inst[K];
// global row ids of the K rows in this chunk
for (int k = 0; k < K; ++k) {
ridx[k] = static_cast<int64_t>(batch.base_rowid + i + k);
}
// the K row instances themselves
for (int k = 0; k < K; ++k) {
inst[k] = batch[i + k];
}
// accumulate the score of every (row, group) pair of the chunk
for (int k = 0; k < K; ++k) {
for (int gid = 0; gid < num_group; ++gid) {
const size_t offset = ridx[k] * num_group + gid;
preds[offset] +=
self->PredValue(inst[k], gid, info.GetRoot(ridx[k]),
&feats, tree_begin, tree_end);
}
}
}
// remaining rows: handled outside the parallel loop with scratch slot 0
for (bst_omp_uint i = nsize - rest; i < nsize; ++i) {
RegTree::FVec& feats = thread_temp[0];
const int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
const RowBatch::Inst inst = batch[i];
for (int gid = 0; gid < num_group; ++gid) {
const size_t offset = ridx * num_group + gid;
preds[offset] +=
self->PredValue(inst, gid, info.GetRoot(ridx),
&feats, tree_begin, tree_end);
}
}
}
}
// commit new trees all at once
void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
int bst_group) override {
for (size_t i = 0; i < new_trees.size(); ++i) {
model_.trees.push_back(std::move(new_trees[i]));
model_.tree_info.push_back(bst_group);
}
model_.param.num_trees += static_cast<int>(new_trees.size());
size_t num_drop = NormalizeTrees(new_trees.size());
if (dparam.silent != 1) {
LOG(INFO) << "drop " << num_drop << " trees, "
<< "weight = " << weight_drop.back();
}
}
// predict the leaf scores without dropped trees
// Weighted sum of leaf values for one instance and one output group.
//   inst       - instance to score
//   bst_group  - only trees whose tree_info equals this group contribute
//   root_index - passed to GetLeafIndex() of every evaluated tree
//   p_feats    - scratch feature vector; filled from inst on entry and
//                released again with Drop(inst) before returning
//   tree_begin - first tree index considered
//   tree_end   - one-past-last tree index considered
// Trees whose index is found in idx_drop (looked up by binary search) are
// skipped; every other matching tree adds weight_drop[i] * leaf value.
inline bst_float PredValue(const RowBatch::Inst &inst,
                           int bst_group,
                           unsigned root_index,
                           RegTree::FVec *p_feats,
                           unsigned tree_begin,
                           unsigned tree_end) {
  p_feats->Fill(inst);
  bst_float psum = 0.0f;
  for (size_t i = tree_begin; i < tree_end; ++i) {
    if (model_.tree_info[i] != bst_group) {
      continue;
    }
    if (std::binary_search(idx_drop.begin(), idx_drop.end(), i)) {
      continue;
    }
    const int leaf = model_.trees[i]->GetLeafIndex(*p_feats, root_index);
    psum += weight_drop[i] * (*model_.trees[i])[leaf].leaf_value();
  }
  p_feats->Drop(inst);
  return psum;
}
// select dropped trees
inline void DropTrees(unsigned ntree_limit_drop) {
std::uniform_real_distribution<> runif(0.0, 1.0);
auto& rnd = common::GlobalRandom();
// reset
idx_drop.clear();
// sample dropped trees
bool skip = false;
if (dparam.skip_drop > 0.0) skip = (runif(rnd) < dparam.skip_drop);
if (ntree_limit_drop == 0 && !skip) {
if (dparam.sample_type == 1) {
bst_float sum_weight = 0.0;
for (size_t i = 0; i < weight_drop.size(); ++i) {
sum_weight += weight_drop[i];
}
for (size_t i = 0; i < weight_drop.size(); ++i) {
if (runif(rnd) < dparam.rate_drop * weight_drop.size() * weight_drop[i] / sum_weight) {
idx_drop.push_back(i);
}
( run in 0.621 second using v1.01-cache-2.11-cpan-ceb78f64989 )