view release on metacpan or search on metacpan
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
#include "dmlc/timer.h"
#include "gpu_hist_builder.cuh"
namespace xgboost {
namespace tree {
void DeviceGMat::Init(int device_idx, const common::GHistIndexMatrix& gmat,
bst_ulong element_begin, bst_ulong element_end,
bst_ulong row_begin, bst_ulong row_end, int n_bins) {
dh::safe_cuda(cudaSetDevice(device_idx));
CHECK(gidx_buffer.size()) << "gidx_buffer must be externally allocated";
CHECK_EQ(row_ptr.size(), (row_end - row_begin) + 1)
<< "row_ptr must be externally allocated";
common::CompressedBufferWriter cbw(n_bins);
std::vector<common::compressed_byte_t> host_buffer(gidx_buffer.size());
cbw.Write(host_buffer.data(), gmat.index.begin() + element_begin,
gmat.index.begin() + element_end);
gidx_buffer = host_buffer;
gidx = common::CompressedIterator<uint32_t>(gidx_buffer.data(), n_bins);
// row_ptr
thrust::copy(gmat.row_ptr.data() + row_begin,
gmat.row_ptr.data() + row_end + 1, row_ptr.tbegin());
// normalise row_ptr
size_t start = gmat.row_ptr[row_begin];
thrust::transform(row_ptr.tbegin(), row_ptr.tend(), row_ptr.tbegin(),
[=] __device__(size_t val) { return val - start; });
}
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
unsigned long long int old = *address_as_ull, assumed; // NOLINT
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
__device__ void HistBuilder::Add(bst_gpair_precise gpair, int gidx, int nidx) const {
int hist_idx = nidx * n_bins + gidx;
atomicAdd(&(d_hist[hist_idx].grad), gpair.grad); // OPTMARK: This and below
// line lead to about 3X
// slowdown due to memory
// dependency and access
// pattern issues.
atomicAdd(&(d_hist[hist_idx].hess), gpair.hess);
}
__device__ bst_gpair_precise HistBuilder::Get(int gidx, int nidx) const {
return d_hist[nidx * n_bins + gidx];
}
GPUHistBuilder::GPUHistBuilder()
: initialised(false),
is_dense(false),
p_last_fmat_(nullptr),
prediction_cache_initialised(false) {}
GPUHistBuilder::~GPUHistBuilder() {
if (initialised) {
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
// Build feature segments
std::vector<int> h_feature_segments;
for (int node = 0; node < n_nodes_level(param.max_depth - 1); node++) {
for (int fidx = 0; fidx < n_features; fidx++) {
h_feature_segments.push_back(hmat_.row_ptr[fidx] + node * n_bins);
}
}
h_feature_segments.push_back(n_nodes_level(param.max_depth - 1) * n_bins);
// Construct feature map
std::vector<int> h_gidx_feature_map(n_bins);
for (int fidx = 0; fidx < n_features; fidx++) {
for (int i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
h_gidx_feature_map[i] = fidx;
}
}
int level_max_bins = n_nodes_level(param.max_depth - 1) * n_bins;
// allocate unique common data that reside on master device (NOTE: None
// currently)
// int master_device=dList[0];
// ba.allocate(master_device, );
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
left_child_smallest.resize(n_devices);
left_child_smallest_temp.resize(n_devices);
feature_flags.resize(n_devices);
fidx_min_map.resize(n_devices);
feature_segments.resize(n_devices);
prediction_cache.resize(n_devices);
position.resize(n_devices);
position_tmp.resize(n_devices);
device_matrix.resize(n_devices);
device_gpair.resize(n_devices);
gidx_feature_map.resize(n_devices);
gidx_fvalue_map.resize(n_devices);
int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
find_split_n_devices =
std::min(n_nodes_level(param.max_depth), find_split_n_devices);
int max_num_nodes_device =
n_nodes_level(param.max_depth) / find_split_n_devices;
// num_rows_segment: for sharding rows onto gpus for splitting data
// num_elements_segment: for sharding rows (of elements) onto gpus for
// splitting data
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
&left_child_smallest_temp[d_idx], max_num_nodes_device,
&feature_flags[d_idx],
n_features, // may change but same on all devices
&fidx_min_map[d_idx],
hmat_.min_val.size(), // constant and same on all devices
&feature_segments[d_idx],
h_feature_segments.size(), // constant and same on all devices
&prediction_cache[d_idx], num_rows_segment, &position[d_idx],
num_rows_segment, &position_tmp[d_idx], num_rows_segment,
&device_gpair[d_idx], num_rows_segment,
&device_matrix[d_idx].gidx_buffer,
common::CompressedBufferWriter::CalculateBufferSize(
num_elements_segment,
n_bins), // constant and same on all devices
&device_matrix[d_idx].row_ptr, num_rows_segment + 1,
&gidx_feature_map[d_idx], n_bins, // constant and same on all devices
&gidx_fvalue_map[d_idx],
hmat_.cut.size()); // constant and same on all devices
// Copy Host to Device (assumes comes after ba.allocate that sets device)
device_matrix[d_idx].Init(
device_idx, gmat_, device_element_segments[d_idx],
device_element_segments[d_idx + 1], device_row_segments[d_idx],
device_row_segments[d_idx + 1], n_bins);
gidx_feature_map[d_idx] = h_gidx_feature_map;
gidx_fvalue_map[d_idx] = hmat_.cut;
feature_segments[d_idx] = h_feature_segments;
fidx_min_map[d_idx] = hmat_.min_val;
// Initialize, no copy
hist_vec[d_idx].Init(n_bins); // init host object
prediction_cache[d_idx].fill(0); // init device object (assumes comes
// after ba.allocate that sets device)
feature_flags[d_idx].fill(1); // init device object (assumes comes after
// ba.allocate that sets device)
}
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
}
void GPUHistBuilder::BuildHist(int depth) {
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
size_t begin = device_element_segments[d_idx];
size_t end = device_element_segments[d_idx + 1];
size_t row_begin = device_row_segments[d_idx];
size_t row_end = device_row_segments[d_idx + 1];
auto d_gidx = device_matrix[d_idx].gidx;
auto d_row_ptr = device_matrix[d_idx].row_ptr.tbegin();
auto d_position = position[d_idx].data();
auto d_gpair = device_gpair[d_idx].data();
auto d_left_child_smallest = left_child_smallest[d_idx].data();
auto hist_builder = hist_vec[d_idx].GetBuilder();
dh::TransformLbs(
device_idx, &temp_memory[d_idx], end - begin, d_row_ptr,
row_end - row_begin, is_dense, [=] __device__(size_t local_idx, int local_ridx) {
int nidx = d_position[local_ridx]; // OPTMARK: latency
if (!is_active(nidx, depth)) return;
// Only increment smallest node
bool is_smallest = (d_left_child_smallest[parent_nidx(nidx)] &&
is_left_child(nidx)) ||
(!d_left_child_smallest[parent_nidx(nidx)] &&
!is_left_child(nidx));
if (!is_smallest && depth > 0) return;
int gidx = d_gidx[local_idx];
bst_gpair gpair = d_gpair[local_ridx];
hist_builder.Add(gpair, gidx,
nidx); // OPTMARK: This is slow, could use
// shared memory or cache results
// intead of writing to global
// memory every time in atomic way.
});
}
dh::synchronize_n_devices(n_devices, dList);
// time.printElapsed("Add Time");
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
auto d_left_child_smallest = left_child_smallest[d_idx].data();
int n_sub_bins = (n_nodes_level(depth) / 2) * hist_builder.n_bins;
dh::launch_n(device_idx, n_sub_bins, [=] __device__(int idx) {
int nidx = n_nodes(depth - 1) + ((idx / hist_builder.n_bins) * 2);
bool left_smallest = d_left_child_smallest[parent_nidx(nidx)];
if (left_smallest) {
nidx++; // If left is smallest switch to right child
}
int gidx = idx % hist_builder.n_bins;
bst_gpair_precise parent = hist_builder.Get(gidx, parent_nidx(nidx));
int other_nidx = left_smallest ? nidx - 1 : nidx + 1;
bst_gpair_precise other = hist_builder.Get(gidx, other_nidx);
hist_builder.Add(parent - other, gidx,
nidx); // OPTMARK: This is slow, could use shared
// memory or cache results intead of writing to
// global memory every time in atomic way.
});
}
dh::synchronize_n_devices(n_devices, dList);
}
}
template <int BLOCK_THREADS>
__global__ void find_split_kernel(
const bst_gpair_precise* d_level_hist, int* d_feature_segments, int depth,
int n_features, int n_bins, Node* d_nodes, Node* d_nodes_temp,
Node* d_nodes_child_temp, int nodes_offset_device, float* d_fidx_min_map,
float* d_gidx_fvalue_map, GPUTrainingParam gpu_param,
bool* d_left_child_smallest_temp, bool colsample, int* d_feature_flags) {
typedef cub::KeyValuePair<int, float> ArgMaxT;
typedef cub::BlockScan<bst_gpair_precise, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>
BlockScanT;
typedef cub::BlockReduce<ArgMaxT, BLOCK_THREADS> MaxReduceT;
typedef cub::BlockReduce<bst_gpair_precise, BLOCK_THREADS> SumReduceT;
union TempStorage {
typename BlockScanT::TempStorage scan;
typename MaxReduceT::TempStorage max_reduce;
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
if (threadIdx.x == 0) {
block_max = best;
}
__syncthreads();
// Best thread updates split
if (threadIdx.x == block_max.key) {
float fvalue;
int gidx = (scan_begin - (level_node_idx * n_bins)) + threadIdx.x;
if (threadIdx.x == 0 &&
begin == scan_begin) { // check at start of first tile
fvalue = d_fidx_min_map[fidx];
} else {
fvalue = d_gidx_fvalue_map[gidx - 1];
}
bst_gpair_precise left = missing_left ? bin + missing : bin;
bst_gpair_precise right = parent_sum - left;
split.Update(gain, missing_left, fvalue, fidx, left, right, gpu_param);
}
__syncthreads();
} // end scan
} // end over features
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
int nodes_offset_device = d_idx * num_nodes_device;
find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
(const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
feature_segments[d_idx].data(), depth, (info->num_col),
(hmat_.row_ptr.back()), nodes[d_idx].data(), nodes_temp[d_idx].data(),
nodes_child_temp[d_idx].data(), nodes_offset_device,
fidx_min_map[d_idx].data(), gidx_fvalue_map[d_idx].data(),
GPUTrainingParam(param), left_child_smallest_temp[d_idx].data(),
colsample, feature_flags[d_idx].data());
}
// nccl only on devices that did split
dh::synchronize_n_devices(find_split_n_devices, dList);
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
int master_device = dList[d_idx];
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
int nodes_offset_device = d_idx * num_nodes_device;
find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
(const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
feature_segments[d_idx].data(), depth, (info->num_col),
(hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL,
nodes_offset_device, fidx_min_map[d_idx].data(),
gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
left_child_smallest[d_idx].data(), colsample,
feature_flags[d_idx].data());
// broadcast result
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
dh::safe_nccl(ncclBcast(
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
int nodes_offset_device = 0;
find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
(const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
feature_segments[d_idx].data(), depth, (info->num_col),
(hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL,
nodes_offset_device, fidx_min_map[d_idx].data(),
gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
left_child_smallest[d_idx].data(), colsample,
feature_flags[d_idx].data());
}
}
// NOTE: No need to syncrhonize with host as all above pure P2P ops or
// on-device ops
}
void GPUHistBuilder::InitFirstNode(const std::vector<bst_gpair>& gpair) {
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
this->UpdatePositionSparse(depth);
}
}
void GPUHistBuilder::UpdatePositionDense(int depth) {
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
auto d_position = position[d_idx].data();
Node* d_nodes = nodes[d_idx].data();
auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
auto d_gidx = device_matrix[d_idx].gidx;
int n_columns = info->num_col;
size_t begin = device_row_segments[d_idx];
size_t end = device_row_segments[d_idx + 1];
dh::launch_n(device_idx, end - begin, [=] __device__(size_t local_idx) {
int pos = d_position[local_idx];
if (!is_active(pos, depth)) {
return;
}
Node node = d_nodes[pos];
if (node.IsLeaf()) {
return;
}
int gidx = d_gidx[local_idx *
static_cast<size_t>(n_columns) + static_cast<size_t>(node.split.findex)];
float fvalue = d_gidx_fvalue_map[gidx];
if (fvalue <= node.split.fvalue) {
d_position[local_idx] = left_child_nidx(pos);
} else {
d_position[local_idx] = right_child_nidx(pos);
}
});
}
dh::synchronize_n_devices(n_devices, dList);
// dh::safe_cuda(cudaDeviceSynchronize());
}
void GPUHistBuilder::UpdatePositionSparse(int depth) {
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
auto d_position = position[d_idx].data();
auto d_position_tmp = position_tmp[d_idx].data();
Node* d_nodes = nodes[d_idx].data();
auto d_gidx_feature_map = gidx_feature_map[d_idx].data();
auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
auto d_gidx = device_matrix[d_idx].gidx;
auto d_row_ptr = device_matrix[d_idx].row_ptr.tbegin();
size_t row_begin = device_row_segments[d_idx];
size_t row_end = device_row_segments[d_idx + 1];
size_t element_begin = device_element_segments[d_idx];
size_t element_end = device_element_segments[d_idx + 1];
// Update missing direction
dh::launch_n(device_idx, row_end - row_begin,
[=] __device__(int local_idx) {
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN
if (!is_active(pos, depth)) {
return;
}
Node node = d_nodes[pos];
if (node.IsLeaf()) {
return;
}
int gidx = d_gidx[local_idx];
int findex = d_gidx_feature_map[gidx]; // OPTMARK: slowest global
// memory access, maybe setup
// position, gidx, etc. as
// combined structure?
if (findex == node.split.findex) {
float fvalue = d_gidx_fvalue_map[gidx];
if (fvalue <= node.split.fvalue) {
d_position_tmp[local_ridx] = left_child_nidx(pos);
} else {
d_position_tmp[local_ridx] = right_child_nidx(pos);
}
}
});
position[d_idx] = position_tmp[d_idx];
}
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cuh view on Meta::CPAN
#include "../../src/tree/param.h"
#include "../../src/common/compressed_iterator.h"
#include "device_helpers.cuh"
#include "types.cuh"
#include "nccl.h"
namespace xgboost {
namespace tree {
struct DeviceGMat {
dh::dvec<common::compressed_byte_t> gidx_buffer;
common::CompressedIterator<uint32_t> gidx;
dh::dvec<size_t> row_ptr;
void Init(int device_idx, const common::GHistIndexMatrix &gmat,
bst_ulong element_begin, bst_ulong element_end, bst_ulong row_begin, bst_ulong row_end,int n_bins);
};
struct HistBuilder {
bst_gpair_precise *d_hist;
int n_bins;
__host__ __device__ HistBuilder(bst_gpair_precise *ptr, int n_bins);
__device__ void Add(bst_gpair_precise gpair, int gidx, int nidx) const;
__device__ bst_gpair_precise Get(int gidx, int nidx) const;
};
struct DeviceHist {
int n_bins;
dh::dvec<bst_gpair_precise> data;
void Init(int max_depth);
void Reset(int device_idx);
xgboost/plugin/updater_gpu/src/gpu_hist_builder.cuh view on Meta::CPAN
std::vector<dh::dvec<bool>> left_child_smallest;
std::vector<dh::dvec<bool>> left_child_smallest_temp;
std::vector<dh::dvec<int>> feature_flags;
std::vector<dh::dvec<float>> fidx_min_map;
std::vector<dh::dvec<int>> feature_segments;
std::vector<dh::dvec<bst_float>> prediction_cache;
std::vector<dh::dvec<int>> position;
std::vector<dh::dvec<int>> position_tmp;
std::vector<DeviceGMat> device_matrix;
std::vector<dh::dvec<bst_gpair>> device_gpair;
std::vector<dh::dvec<int>> gidx_feature_map;
std::vector<dh::dvec<float>> gidx_fvalue_map;
std::vector<cudaStream_t *> streams;
std::vector<ncclComm_t> comms;
std::vector<std::vector<ncclComm_t>> find_split_comms;
double cpu_init_time;
double gpu_init_time;
dh::Timer cpu_time;
double gpu_time;
xgboost/src/common/hist_util.cc view on Meta::CPAN
= static_cast<size_t>(param.max_conflict_rate * nrow);
for (auto fid : feature_list) {
const Column<T>& column = colmat.GetColumn<T>(fid);
const size_t cur_fid_nnz = feature_nnz[fid];
bool need_new_group = true;
// randomly choose some of existing groups as candidates
std::vector<unsigned> search_groups;
for (size_t gid = 0; gid < groups.size(); ++gid) {
if (group_nnz[gid] + cur_fid_nnz <= nrow + max_conflict_cnt) {
search_groups.push_back(gid);
}
}
std::shuffle(search_groups.begin(), search_groups.end(), common::GlobalRandom());
if (param.max_search_group > 0 && search_groups.size() > param.max_search_group) {
search_groups.resize(param.max_search_group);
}
// examine each candidate group: is it okay to insert fid?
for (auto gid : search_groups) {
const size_t rest_max_cnt = max_conflict_cnt - group_conflict_cnt[gid];
const size_t cnt = GetConflictCount(conflict_marks[gid], column, rest_max_cnt);
if (cnt <= rest_max_cnt) {
need_new_group = false;
groups[gid].push_back(fid);
group_conflict_cnt[gid] += cnt;
group_nnz[gid] += cur_fid_nnz - cnt;
MarkUsed(&conflict_marks[gid], column);
break;
}
}
// create new group if necessary
if (need_new_group) {
groups.emplace_back();
groups.back().push_back(fid);
group_conflict_cnt.push_back(0);
conflict_marks.emplace_back(nrow, false);
xgboost/src/gbm/gblinear.cc view on Meta::CPAN
ObjFunction* obj) override {
// lazily initialize the model when not ready.
if (model.weight.size() == 0) {
model.InitModel();
}
std::vector<bst_gpair> &gpair = *in_gpair;
const int ngroup = model.param.num_output_group;
const RowSet &rowset = p_fmat->buffered_rowset();
// for all the output group
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static) reduction(+: sum_grad, sum_hess)
for (bst_omp_uint i = 0; i < ndata; ++i) {
bst_gpair &p = gpair[rowset[i] * ngroup + gid];
if (p.hess >= 0.0f) {
sum_grad += p.grad; sum_hess += p.hess;
}
}
// remove bias effect
bst_float dw = static_cast<bst_float>(
param.learning_rate * param.CalcDeltaBias(sum_grad, sum_hess, model.bias()[gid]));
model.bias()[gid] += dw;
// update grad value
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
bst_gpair &p = gpair[rowset[i] * ngroup + gid];
if (p.hess >= 0.0f) {
p.grad += p.hess * dw;
}
}
}
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
// number of features
const ColBatch &batch = iter->Value();
const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) {
const bst_uint fid = batch.col_index[i];
ColBatch::Inst col = batch[i];
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
for (bst_uint j = 0; j < col.length; ++j) {
const bst_float v = col[j].fvalue;
bst_gpair &p = gpair[col[j].index * ngroup + gid];
if (p.hess < 0.0f) continue;
sum_grad += p.grad * v;
sum_hess += p.hess * v * v;
}
bst_float &w = model[fid][gid];
bst_float dw = static_cast<bst_float>(param.learning_rate *
param.CalcDelta(sum_grad, sum_hess, w));
w += dw;
// update grad value
for (bst_uint j = 0; j < col.length; ++j) {
bst_gpair &p = gpair[col[j].index * ngroup + gid];
if (p.hess < 0.0f) continue;
p.grad += p.hess * col[j].fvalue * dw;
}
}
}
}
}
void PredictBatch(DMatrix *p_fmat,
std::vector<bst_float> *out_preds,
xgboost/src/gbm/gblinear.cc view on Meta::CPAN
CHECK_EQ(batch.base_rowid * ngroup, preds.size());
// output convention: nrow * k, where nrow is number of rows
// k is number of group
preds.resize(preds.size() + batch.size * ngroup);
// parallel over local batch
const omp_ulong nsize = static_cast<omp_ulong>(batch.size);
#pragma omp parallel for schedule(static)
for (omp_ulong i = 0; i < nsize; ++i) {
const size_t ridx = batch.base_rowid + i;
// loop over output groups
for (int gid = 0; gid < ngroup; ++gid) {
bst_float margin = (base_margin.size() != 0) ?
base_margin[ridx * ngroup + gid] : base_margin_;
this->Pred(batch[i], &preds[ridx * ngroup], gid, margin);
}
}
}
}
// add base margin
void PredictInstance(const SparseBatch::Inst &inst,
std::vector<bst_float> *out_preds,
unsigned ntree_limit,
unsigned root_index) override {
const int ngroup = model.param.num_output_group;
for (int gid = 0; gid < ngroup; ++gid) {
this->Pred(inst, dmlc::BeginPtr(*out_preds), gid, base_margin_);
}
}
void PredictLeaf(DMatrix *p_fmat,
std::vector<bst_float> *out_preds,
unsigned ntree_limit) override {
LOG(FATAL) << "gblinear does not support prediction of leaf index";
}
void PredictContribution(DMatrix* p_fmat,
xgboost/src/gbm/gblinear.cc view on Meta::CPAN
iter->BeforeFirst();
while (iter->Next()) {
const RowBatch& batch = iter->Value();
// parallel over local batch
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) {
const RowBatch::Inst &inst = batch[i];
size_t row_idx = static_cast<size_t>(batch.base_rowid + i);
// loop over output groups
for (int gid = 0; gid < ngroup; ++gid) {
bst_float *p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
// calculate linear terms' contributions
for (bst_uint c = 0; c < inst.length; ++c) {
if (inst[c].index >= model.param.num_feature) continue;
p_contribs[inst[c].index] = inst[c].fvalue * model[inst[c].index][gid];
}
// add base margin to BIAS
p_contribs[ncolumns - 1] = model.bias()[gid] +
((base_margin.size() != 0) ? base_margin[row_idx * ngroup + gid] : base_margin_);
}
}
}
}
std::vector<std::string> DumpModel(const FeatureMap& fmap,
bool with_stats,
std::string format) const override {
const int ngroup = model.param.num_output_group;
const unsigned nfeature = model.param.num_feature;
std::stringstream fo("");
if (format == "json") {
fo << " { \"bias\": [" << std::endl;
for (int gid = 0; gid < ngroup; ++gid) {
if (gid != 0) fo << "," << std::endl;
fo << " " << model.bias()[gid];
}
fo << std::endl << " ]," << std::endl
<< " \"weight\": [" << std::endl;
for (unsigned i = 0; i < nfeature; ++i) {
for (int gid = 0; gid < ngroup; ++gid) {
if (i != 0 || gid != 0) fo << "," << std::endl;
fo << " " << model[i][gid];
}
}
fo << std::endl << " ]" << std::endl << " }";
} else {
fo << "bias:\n";
for (int gid = 0; gid < ngroup; ++gid) {
fo << model.bias()[gid] << std::endl;
}
fo << "weight:\n";
for (unsigned i = 0; i < nfeature; ++i) {
for (int gid = 0; gid < ngroup; ++gid) {
fo << model[i][gid] << std::endl;
}
}
}
std::vector<std::string> v;
v.push_back(fo.str());
return v;
}
protected:
inline void Pred(const RowBatch::Inst &inst, bst_float *preds, int gid, bst_float base) {
bst_float psum = model.bias()[gid] + base;
for (bst_uint i = 0; i < inst.length; ++i) {
if (inst[i].index >= model.param.num_feature) continue;
psum += inst[i].fvalue * model[inst[i].index][gid];
}
preds[gid] = psum;
}
// model for linear booster
class Model {
public:
// parameter
GBLinearModelParam param;
// weight for each of feature, bias is the last one
std::vector<bst_float> weight;
// initialize the model parameter
inline void InitModel(void) {
xgboost/src/gbm/gbtree.cc view on Meta::CPAN
std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
const int ngroup = model_.param.num_output_group;
if (ngroup == 1) {
std::vector<std::unique_ptr<RegTree> > ret;
BoostNewTrees(gpair, p_fmat, 0, &ret);
new_trees.push_back(std::move(ret));
} else {
CHECK_EQ(gpair.size() % ngroup, 0U)
<< "must have exactly ngroup*nrow gpairs";
std::vector<bst_gpair> tmp(gpair.size() / ngroup);
for (int gid = 0; gid < ngroup; ++gid) {
bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) {
tmp[i] = gpair[i * ngroup + gid];
}
std::vector<std::unique_ptr<RegTree> > ret;
BoostNewTrees(tmp, p_fmat, gid, &ret);
new_trees.push_back(std::move(ret));
}
}
double tstart = dmlc::GetTime();
for (int gid = 0; gid < ngroup; ++gid) {
this->CommitModel(std::move(new_trees[gid]), gid);
}
if (tparam.debug_verbose > 0) {
LOG(INFO) << "CommitModel(): " << dmlc::GetTime() - tstart << " sec";
}
}
void PredictBatch(DMatrix* p_fmat,
std::vector<bst_float>* out_preds,
unsigned ntree_limit) override {
predictor->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit);
xgboost/src/gbm/gbtree.cc view on Meta::CPAN
if (thread_temp.size() == 0) {
thread_temp.resize(1, RegTree::FVec());
thread_temp[0].Init(model_.param.num_feature);
}
out_preds->resize(model_.param.num_output_group);
ntree_limit *= model_.param.num_output_group;
if (ntree_limit == 0 || ntree_limit > model_.trees.size()) {
ntree_limit = static_cast<unsigned>(model_.trees.size());
}
// loop over output groups
for (int gid = 0; gid < model_.param.num_output_group; ++gid) {
(*out_preds)[gid]
= PredValue(inst, gid, root_index,
&thread_temp[0], 0, ntree_limit) + model_.base_margin;
}
}
protected:
friend class GBTree;
// internal prediction loop
// add predictions to out_preds
template<typename Derived>
inline void PredLoopInternal(
xgboost/src/gbm/gbtree.cc view on Meta::CPAN
RegTree::FVec& feats = thread_temp[tid];
int64_t ridx[K];
RowBatch::Inst inst[K];
for (int k = 0; k < K; ++k) {
ridx[k] = static_cast<int64_t>(batch.base_rowid + i + k);
}
for (int k = 0; k < K; ++k) {
inst[k] = batch[i + k];
}
for (int k = 0; k < K; ++k) {
for (int gid = 0; gid < num_group; ++gid) {
const size_t offset = ridx[k] * num_group + gid;
preds[offset] +=
self->PredValue(inst[k], gid, info.GetRoot(ridx[k]),
&feats, tree_begin, tree_end);
}
}
}
for (bst_omp_uint i = nsize - rest; i < nsize; ++i) {
RegTree::FVec& feats = thread_temp[0];
const int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
const RowBatch::Inst inst = batch[i];
for (int gid = 0; gid < num_group; ++gid) {
const size_t offset = ridx * num_group + gid;
preds[offset] +=
self->PredValue(inst, gid, info.GetRoot(ridx),
&feats, tree_begin, tree_end);
}
}
}
}
// commit new trees all at once
void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
int bst_group) override {
for (size_t i = 0; i < new_trees.size(); ++i) {
model_.trees.push_back(std::move(new_trees[i]));
xgboost/src/predictor/cpu_predictor.cc view on Meta::CPAN
RegTree::FVec& feats = thread_temp[tid];
int64_t ridx[K];
RowBatch::Inst inst[K];
for (int k = 0; k < K; ++k) {
ridx[k] = static_cast<int64_t>(batch.base_rowid + i + k);
}
for (int k = 0; k < K; ++k) {
inst[k] = batch[i + k];
}
for (int k = 0; k < K; ++k) {
for (int gid = 0; gid < num_group; ++gid) {
const size_t offset = ridx[k] * num_group + gid;
preds[offset] += this->PredValue(
inst[k], model.trees, model.tree_info, gid,
info.GetRoot(ridx[k]), &feats, tree_begin, tree_end);
}
}
}
for (bst_omp_uint i = nsize - rest; i < nsize; ++i) {
RegTree::FVec& feats = thread_temp[0];
const int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
const RowBatch::Inst inst = batch[i];
for (int gid = 0; gid < num_group; ++gid) {
const size_t offset = ridx * num_group + gid;
preds[offset] +=
this->PredValue(inst, model.trees, model.tree_info, gid,
info.GetRoot(ridx), &feats, tree_begin, tree_end);
}
}
}
}
void PredLoopInternal(DMatrix* dmat, std::vector<bst_float>* out_preds,
const gbm::GBTreeModel& model, int tree_begin,
unsigned ntree_limit) {
// TODO(Rory): Check if this specialisation actually improves performance
xgboost/src/predictor/cpu_predictor.cc view on Meta::CPAN
thread_temp.resize(1, RegTree::FVec());
thread_temp[0].Init(model.param.num_feature);
}
ntree_limit *= model.param.num_output_group;
if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
ntree_limit = static_cast<unsigned>(model.trees.size());
}
out_preds->resize(model.param.num_output_group *
(model.param.size_leaf_vector + 1));
// loop over output groups
for (int gid = 0; gid < model.param.num_output_group; ++gid) {
(*out_preds)[gid] =
PredValue(inst, model.trees, model.tree_info, gid, root_index,
&thread_temp[0], 0, ntree_limit) +
model.base_margin;
}
}
void PredictLeaf(DMatrix* p_fmat, std::vector<bst_float>* out_preds,
const gbm::GBTreeModel& model, unsigned ntree_limit) override {
const int nthread = omp_get_max_threads();
InitThreadTemp(nthread, model.param.num_feature);
const MetaInfo& info = p_fmat->info();
// number of valid trees
xgboost/src/predictor/cpu_predictor.cc view on Meta::CPAN
while (iter->Next()) {
const RowBatch& batch = iter->Value();
// parallel over local batch
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) {
size_t row_idx = static_cast<size_t>(batch.base_rowid + i);
unsigned root_id = info.GetRoot(row_idx);
RegTree::FVec& feats = thread_temp[omp_get_thread_num()];
// loop over all classes
for (int gid = 0; gid < ngroup; ++gid) {
bst_float* p_contribs =
&contribs[(row_idx * ngroup + gid) * ncolumns];
feats.Fill(batch[i]);
// calculate contributions
for (unsigned j = 0; j < ntree_limit; ++j) {
if (model.tree_info[j] != gid) {
continue;
}
model.trees[j]->CalculateContributions(feats, root_id, p_contribs);
}
feats.Drop(batch[i]);
// add base margin to BIAS
if (base_margin.size() != 0) {
p_contribs[ncolumns - 1] += base_margin[row_idx * ngroup + gid];
} else {
p_contribs[ncolumns - 1] += model.base_margin;
}
}
}
}
}
std::vector<RegTree::FVec> thread_temp;
};