gid results from the CPAN

Alien-XGBoost

#include "dmlc/timer.h"
#include "gpu_hist_builder.cuh"

namespace xgboost {
namespace tree {

// Fill this object's pre-allocated buffers with one shard of `gmat`.
//
//   device_idx                   CUDA device made current before any work.
//   gmat                         host-side quantised matrix to copy from.
//   [element_begin, element_end) range of gmat.index compressed into
//                                gidx_buffer.
//   [row_begin, row_end]         range of gmat.row_ptr copied into row_ptr
//                                (inclusive, hence row_end - row_begin + 1
//                                entries).
//   n_bins                       symbol count handed to the compressed
//                                writer and iterator.
//
// Both gidx_buffer and row_ptr must have been sized by the caller; the
// CHECKs below fail otherwise. After the call, every row_ptr entry has had
// gmat.row_ptr[row_begin] subtracted from it, so the first entry is zero.
void DeviceGMat::Init(int device_idx, const common::GHistIndexMatrix& gmat,
                      bst_ulong element_begin, bst_ulong element_end,
                      bst_ulong row_begin, bst_ulong row_end, int n_bins) {
  dh::safe_cuda(cudaSetDevice(device_idx));
  CHECK(gidx_buffer.size()) << "gidx_buffer must be externally allocated";
  CHECK_EQ(row_ptr.size(), (row_end - row_begin) + 1)
      << "row_ptr must be externally allocated";

  // Compress the shard's bin indices into a host staging buffer, then hand
  // the staging buffer to gidx_buffer in a single assignment.
  const auto index_first = gmat.index.begin() + element_begin;
  const auto index_last = gmat.index.begin() + element_end;
  std::vector<common::compressed_byte_t> staging(gidx_buffer.size());
  common::CompressedBufferWriter writer(n_bins);
  writer.Write(staging.data(), index_first, index_last);
  gidx_buffer = staging;
  gidx = common::CompressedIterator<uint32_t>(gidx_buffer.data(), n_bins);

  // Copy the shard's row offsets (one more entry than it has rows).
  const auto* host_row_ptr = gmat.row_ptr.data();
  thrust::copy(host_row_ptr + row_begin, host_row_ptr + row_end + 1,
               row_ptr.tbegin());

  // Rebase the offsets so they are relative to the start of this shard.
  const size_t shard_offset = gmat.row_ptr[row_begin];
  thrust::transform(
      row_ptr.tbegin(), row_ptr.tend(), row_ptr.tbegin(),
      [=] __device__(size_t offset) { return offset - shard_offset; });
}

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN

  unsigned long long int old = *address_as_ull, assumed; // NOLINT
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif

// Atomically accumulate `gpair` into the histogram bin for (nidx, gidx).
// Bins are addressed as nidx * n_bins + gidx inside d_hist.
__device__ void HistBuilder::Add(bst_gpair_precise gpair, int gidx, int nidx) const {
  bst_gpair_precise& bin = d_hist[nidx * n_bins + gidx];
  // OPTMARK: These two atomics lead to about 3X slowdown due to memory
  // dependency and access pattern issues.
  atomicAdd(&bin.grad, gpair.grad);
  atomicAdd(&bin.hess, gpair.hess);
}

// Read back the accumulated gradient pair stored for (nidx, gidx); uses the
// same nidx * n_bins + gidx addressing as Add().
__device__ bst_gpair_precise HistBuilder::Get(int gidx, int nidx) const {
  const int hist_idx = nidx * n_bins + gidx;
  return d_hist[hist_idx];
}

// Construct a builder in its "nothing set up yet" state: both
// initialisation flags are cleared, the dense flag is off and no DMatrix
// has been remembered. Members not listed here are left to their own
// default initialisation.
GPUHistBuilder::GPUHistBuilder()
    : initialised{false},
      is_dense{false},
      p_last_fmat_{nullptr},
      prediction_cache_initialised{false} {}

GPUHistBuilder::~GPUHistBuilder() {
  if (initialised) {

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN

    // Build feature segments
    std::vector<int> h_feature_segments;
    for (int node = 0; node < n_nodes_level(param.max_depth - 1); node++) {
      for (int fidx = 0; fidx < n_features; fidx++) {
        h_feature_segments.push_back(hmat_.row_ptr[fidx] + node * n_bins);
      }
    }
    h_feature_segments.push_back(n_nodes_level(param.max_depth - 1) * n_bins);

    // Construct feature map
    std::vector<int> h_gidx_feature_map(n_bins);
    for (int fidx = 0; fidx < n_features; fidx++) {
      for (int i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
        h_gidx_feature_map[i] = fidx;
      }
    }

    int level_max_bins = n_nodes_level(param.max_depth - 1) * n_bins;

    // allocate unique common data that reside on master device (NOTE: None
    // currently)
    //    int master_device=dList[0];
    //    ba.allocate(master_device, );

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN

    left_child_smallest.resize(n_devices);
    left_child_smallest_temp.resize(n_devices);
    feature_flags.resize(n_devices);
    fidx_min_map.resize(n_devices);
    feature_segments.resize(n_devices);
    prediction_cache.resize(n_devices);
    position.resize(n_devices);
    position_tmp.resize(n_devices);
    device_matrix.resize(n_devices);
    device_gpair.resize(n_devices);
    gidx_feature_map.resize(n_devices);
    gidx_fvalue_map.resize(n_devices);

    int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
    find_split_n_devices =
        std::min(n_nodes_level(param.max_depth), find_split_n_devices);
    int max_num_nodes_device =
        n_nodes_level(param.max_depth) / find_split_n_devices;

    // num_rows_segment: for sharding rows onto gpus for splitting data
    // num_elements_segment: for sharding rows (of elements) onto gpus for
    // splitting data

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN

          &left_child_smallest_temp[d_idx], max_num_nodes_device,
          &feature_flags[d_idx],
          n_features,  // may change but same on all devices
          &fidx_min_map[d_idx],
          hmat_.min_val.size(),  // constant and same on all devices
          &feature_segments[d_idx],
          h_feature_segments.size(),  // constant and same on all devices
          &prediction_cache[d_idx], num_rows_segment, &position[d_idx],
          num_rows_segment, &position_tmp[d_idx], num_rows_segment,
          &device_gpair[d_idx], num_rows_segment,
          &device_matrix[d_idx].gidx_buffer,
          common::CompressedBufferWriter::CalculateBufferSize(
              num_elements_segment,
              n_bins),  // constant and same on all devices
          &device_matrix[d_idx].row_ptr, num_rows_segment + 1,
          &gidx_feature_map[d_idx], n_bins,  // constant and same on all devices
          &gidx_fvalue_map[d_idx],
          hmat_.cut.size());  // constant and same on all devices

      // Copy Host to Device (assumes comes after ba.allocate that sets device)
      device_matrix[d_idx].Init(
          device_idx, gmat_, device_element_segments[d_idx],
          device_element_segments[d_idx + 1], device_row_segments[d_idx],
          device_row_segments[d_idx + 1], n_bins);
      gidx_feature_map[d_idx] = h_gidx_feature_map;
      gidx_fvalue_map[d_idx] = hmat_.cut;
      feature_segments[d_idx] = h_feature_segments;
      fidx_min_map[d_idx] = hmat_.min_val;

      // Initialize, no copy
      hist_vec[d_idx].Init(n_bins);     // init host object
      prediction_cache[d_idx].fill(0);  // init device object (assumes comes
                                        // after ba.allocate that sets device)
      feature_flags[d_idx].fill(1);  // init device object (assumes comes after
                                     // ba.allocate that sets device)
    }

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN

}

void GPUHistBuilder::BuildHist(int depth) {
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    size_t begin = device_element_segments[d_idx];
    size_t end = device_element_segments[d_idx + 1];
    size_t row_begin = device_row_segments[d_idx];
    size_t row_end = device_row_segments[d_idx + 1];

    auto d_gidx = device_matrix[d_idx].gidx;
    auto d_row_ptr = device_matrix[d_idx].row_ptr.tbegin();
    auto d_position = position[d_idx].data();
    auto d_gpair = device_gpair[d_idx].data();
    auto d_left_child_smallest = left_child_smallest[d_idx].data();
    auto hist_builder = hist_vec[d_idx].GetBuilder();
    dh::TransformLbs(
        device_idx, &temp_memory[d_idx], end - begin, d_row_ptr,
        row_end - row_begin, is_dense, [=] __device__(size_t local_idx, int local_ridx) {
          int nidx = d_position[local_ridx];  // OPTMARK: latency
          if (!is_active(nidx, depth)) return;

          // Only increment smallest node
          bool is_smallest = (d_left_child_smallest[parent_nidx(nidx)] &&
                              is_left_child(nidx)) ||
                             (!d_left_child_smallest[parent_nidx(nidx)] &&
                              !is_left_child(nidx));
          if (!is_smallest && depth > 0) return;

          int gidx = d_gidx[local_idx];
          bst_gpair gpair = d_gpair[local_ridx];

          hist_builder.Add(gpair, gidx,
                           nidx);  // OPTMARK: This is slow, could use
                                   // shared memory or cache results
                                   // intead of writing to global
                                   // memory every time in atomic way.
        });
  }

  dh::synchronize_n_devices(n_devices, dList);

  //  time.printElapsed("Add Time");

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN

      auto d_left_child_smallest = left_child_smallest[d_idx].data();
      int n_sub_bins = (n_nodes_level(depth) / 2) * hist_builder.n_bins;

      dh::launch_n(device_idx, n_sub_bins, [=] __device__(int idx) {
        int nidx = n_nodes(depth - 1) + ((idx / hist_builder.n_bins) * 2);
        bool left_smallest = d_left_child_smallest[parent_nidx(nidx)];
        if (left_smallest) {
          nidx++;  // If left is smallest switch to right child
        }

        int gidx = idx % hist_builder.n_bins;
        bst_gpair_precise parent = hist_builder.Get(gidx, parent_nidx(nidx));
        int other_nidx = left_smallest ? nidx - 1 : nidx + 1;
        bst_gpair_precise other = hist_builder.Get(gidx, other_nidx);
        hist_builder.Add(parent - other, gidx,
                         nidx);  // OPTMARK: This is slow, could use shared
                                 // memory or cache results intead of writing to
                                 // global memory every time in atomic way.
      });
    }
    dh::synchronize_n_devices(n_devices, dList);
  }
}

template <int BLOCK_THREADS>
__global__ void find_split_kernel(
    const bst_gpair_precise* d_level_hist, int* d_feature_segments, int depth,
    int n_features, int n_bins, Node* d_nodes, Node* d_nodes_temp,
    Node* d_nodes_child_temp, int nodes_offset_device, float* d_fidx_min_map,
    float* d_gidx_fvalue_map, GPUTrainingParam gpu_param,
    bool* d_left_child_smallest_temp, bool colsample, int* d_feature_flags) {
  typedef cub::KeyValuePair<int, float> ArgMaxT;
  typedef cub::BlockScan<bst_gpair_precise, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>
      BlockScanT;
  typedef cub::BlockReduce<ArgMaxT, BLOCK_THREADS> MaxReduceT;
  typedef cub::BlockReduce<bst_gpair_precise, BLOCK_THREADS> SumReduceT;

  union TempStorage {
    typename BlockScanT::TempStorage scan;
    typename MaxReduceT::TempStorage max_reduce;

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN


      if (threadIdx.x == 0) {
        block_max = best;
      }

      __syncthreads();

      // Best thread updates split
      if (threadIdx.x == block_max.key) {
        float fvalue;
        int gidx = (scan_begin - (level_node_idx * n_bins)) + threadIdx.x;
        if (threadIdx.x == 0 &&
            begin == scan_begin) {  // check at start of first tile
          fvalue = d_fidx_min_map[fidx];
        } else {
          fvalue = d_gidx_fvalue_map[gidx - 1];
        }

        bst_gpair_precise left = missing_left ? bin + missing : bin;
        bst_gpair_precise right = parent_sum - left;

        split.Update(gain, missing_left, fvalue, fidx, left, right, gpu_param);
      }
      __syncthreads();
    }  // end scan
  }    // end over features

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN

    for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      int nodes_offset_device = d_idx * num_nodes_device;
      find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
          (const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
          feature_segments[d_idx].data(), depth, (info->num_col),
          (hmat_.row_ptr.back()), nodes[d_idx].data(), nodes_temp[d_idx].data(),
          nodes_child_temp[d_idx].data(), nodes_offset_device,
          fidx_min_map[d_idx].data(), gidx_fvalue_map[d_idx].data(),
          GPUTrainingParam(param), left_child_smallest_temp[d_idx].data(),
          colsample, feature_flags[d_idx].data());
    }

    // nccl only on devices that did split
    dh::synchronize_n_devices(find_split_n_devices, dList);

    for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN

    int master_device = dList[d_idx];
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));

    int nodes_offset_device = d_idx * num_nodes_device;
    find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
        (const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
        feature_segments[d_idx].data(), depth, (info->num_col),
        (hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL,
        nodes_offset_device, fidx_min_map[d_idx].data(),
        gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
        left_child_smallest[d_idx].data(), colsample,
        feature_flags[d_idx].data());

    // broadcast result
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      dh::safe_nccl(ncclBcast(
          reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN

    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      int nodes_offset_device = 0;
      find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
          (const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
          feature_segments[d_idx].data(), depth, (info->num_col),
          (hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL,
          nodes_offset_device, fidx_min_map[d_idx].data(),
          gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
          left_child_smallest[d_idx].data(), colsample,
          feature_flags[d_idx].data());
    }
  }

  // NOTE: No need to synchronize with host as all of the above are pure P2P
  // ops or on-device ops
}

void GPUHistBuilder::InitFirstNode(const std::vector<bst_gpair>& gpair) {

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN

    this->UpdatePositionSparse(depth);
  }
}

// Move every active row one level down the tree, for the dense layout.
//
// For each device, one work item is launched per row of that device's row
// segment. A row whose current node is inactive at `depth`, or is a leaf,
// is left untouched. Otherwise the row's bin index for the node's split
// feature is looked up at row * n_columns + findex, mapped to a feature
// value through gidx_fvalue_map, and compared with the split value: rows
// with value <= split go to the left child, all others to the right child.
// All devices are synchronised before returning.
void GPUHistBuilder::UpdatePositionDense(int depth) {
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];

    // Plain local copies of everything the device lambda reads, so the
    // by-value capture below only has to copy these locals.
    size_t seg_first = device_row_segments[d_idx];
    size_t seg_last = device_row_segments[d_idx + 1];
    int n_columns = info->num_col;
    auto d_gidx = device_matrix[d_idx].gidx;
    auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
    Node* d_nodes = nodes[d_idx].data();
    auto d_position = position[d_idx].data();

    dh::launch_n(device_idx, seg_last - seg_first,
                 [=] __device__(size_t row) {
      int pos = d_position[row];
      if (!is_active(pos, depth)) return;

      Node node = d_nodes[pos];
      if (node.IsLeaf()) return;

      // Index arithmetic is done in size_t to avoid overflowing int.
      size_t element = row * static_cast<size_t>(n_columns) +
                       static_cast<size_t>(node.split.findex);
      int gidx = d_gidx[element];
      float fvalue = d_gidx_fvalue_map[gidx];

      d_position[row] = (fvalue <= node.split.fvalue)
                            ? left_child_nidx(pos)
                            : right_child_nidx(pos);
    });
  }
  dh::synchronize_n_devices(n_devices, dList);
  // dh::safe_cuda(cudaDeviceSynchronize());
}

void GPUHistBuilder::UpdatePositionSparse(int depth) {
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];

    auto d_position = position[d_idx].data();
    auto d_position_tmp = position_tmp[d_idx].data();
    Node* d_nodes = nodes[d_idx].data();
    auto d_gidx_feature_map = gidx_feature_map[d_idx].data();
    auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
    auto d_gidx = device_matrix[d_idx].gidx;
    auto d_row_ptr = device_matrix[d_idx].row_ptr.tbegin();

    size_t row_begin = device_row_segments[d_idx];
    size_t row_end = device_row_segments[d_idx + 1];
    size_t element_begin = device_element_segments[d_idx];
    size_t element_end = device_element_segments[d_idx + 1];

    // Update missing direction
    dh::launch_n(device_idx, row_end - row_begin,
                 [=] __device__(int local_idx) {

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu view on Meta::CPAN

          if (!is_active(pos, depth)) {
            return;
          }

          Node node = d_nodes[pos];

          if (node.IsLeaf()) {
            return;
          }

          int gidx = d_gidx[local_idx];
          int findex = d_gidx_feature_map[gidx];  // OPTMARK: slowest global
                                                  // memory access, maybe setup
                                                  // position, gidx, etc. as
                                                  // combined structure?

          if (findex == node.split.findex) {
            float fvalue = d_gidx_fvalue_map[gidx];

            if (fvalue <= node.split.fvalue) {
              d_position_tmp[local_ridx] = left_child_nidx(pos);
            } else {
              d_position_tmp[local_ridx] = right_child_nidx(pos);
            }
          }
        });
    position[d_idx] = position_tmp[d_idx];
  }

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cuh view on Meta::CPAN

#include "../../src/tree/param.h"
#include "../../src/common/compressed_iterator.h"
#include "device_helpers.cuh"
#include "types.cuh"
#include "nccl.h"

namespace xgboost {
namespace tree {

// One device's shard of the quantised (bin-index) matrix.
struct DeviceGMat {
  // Compressed bin indices for the shard. Init() requires this buffer to be
  // allocated (non-empty) before it is called.
  dh::dvec<common::compressed_byte_t> gidx_buffer;
  // Iterator that decodes gidx_buffer; (re)created by Init().
  common::CompressedIterator<uint32_t> gidx;
  // Row offsets for the shard, rebased by Init() so that the first entry is
  // zero. Must be allocated with (row_end - row_begin) + 1 entries before
  // Init() is called.
  dh::dvec<size_t> row_ptr;
  // Fills gidx_buffer, gidx and row_ptr from the given element range and
  // row range of `gmat`, after making `device_idx` the current CUDA device.
  void Init(int device_idx, const common::GHistIndexMatrix &gmat,
            bst_ulong element_begin, bst_ulong element_end, bst_ulong row_begin, bst_ulong row_end,int n_bins);
};

// Lightweight view over a gradient histogram, usable from device code.
// A bin is addressed as d_hist[nidx * n_bins + gidx].
struct HistBuilder {
  // Start of the histogram storage this view reads and writes.
  bst_gpair_precise *d_hist;
  // Number of bins per node (the stride between consecutive nodes).
  int n_bins;
  __host__ __device__ HistBuilder(bst_gpair_precise *ptr, int n_bins);
  // Atomically adds gpair.grad and gpair.hess into bin (nidx, gidx).
  __device__ void Add(bst_gpair_precise gpair, int gidx, int nidx) const;
  // Returns the current contents of bin (nidx, gidx).
  __device__ bst_gpair_precise Get(int gidx, int nidx) const;
};

struct DeviceHist {
  int n_bins;
  dh::dvec<bst_gpair_precise> data;

  void Init(int max_depth);

  void Reset(int device_idx);

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cuh view on Meta::CPAN

  std::vector<dh::dvec<bool>> left_child_smallest;
  std::vector<dh::dvec<bool>> left_child_smallest_temp;
  std::vector<dh::dvec<int>> feature_flags;
  std::vector<dh::dvec<float>> fidx_min_map;
  std::vector<dh::dvec<int>> feature_segments;
  std::vector<dh::dvec<bst_float>> prediction_cache;
  std::vector<dh::dvec<int>> position;
  std::vector<dh::dvec<int>> position_tmp;
  std::vector<DeviceGMat> device_matrix;
  std::vector<dh::dvec<bst_gpair>> device_gpair;
  std::vector<dh::dvec<int>> gidx_feature_map;
  std::vector<dh::dvec<float>> gidx_fvalue_map;

  std::vector<cudaStream_t *> streams;
  std::vector<ncclComm_t> comms;
  std::vector<std::vector<ncclComm_t>> find_split_comms;

  double cpu_init_time;
  double gpu_init_time;
  dh::Timer cpu_time;
  double gpu_time;

xgboost/src/common/hist_util.cc view on Meta::CPAN

    = static_cast<size_t>(param.max_conflict_rate * nrow);

  for (auto fid : feature_list) {
    const Column<T>& column = colmat.GetColumn<T>(fid);

    const size_t cur_fid_nnz = feature_nnz[fid];
    bool need_new_group = true;

    // randomly choose some of existing groups as candidates
    std::vector<unsigned> search_groups;
    for (size_t gid = 0; gid < groups.size(); ++gid) {
      if (group_nnz[gid] + cur_fid_nnz <= nrow + max_conflict_cnt) {
        search_groups.push_back(gid);
      }
    }
    std::shuffle(search_groups.begin(), search_groups.end(), common::GlobalRandom());
    if (param.max_search_group > 0 && search_groups.size() > param.max_search_group) {
      search_groups.resize(param.max_search_group);
    }

    // examine each candidate group: is it okay to insert fid?
    for (auto gid : search_groups) {
      const size_t rest_max_cnt = max_conflict_cnt - group_conflict_cnt[gid];
      const size_t cnt = GetConflictCount(conflict_marks[gid], column, rest_max_cnt);
      if (cnt <= rest_max_cnt) {
        need_new_group = false;
        groups[gid].push_back(fid);
        group_conflict_cnt[gid] += cnt;
        group_nnz[gid] += cur_fid_nnz - cnt;
        MarkUsed(&conflict_marks[gid], column);
        break;
      }
    }

    // create new group if necessary
    if (need_new_group) {
      groups.emplace_back();
      groups.back().push_back(fid);
      group_conflict_cnt.push_back(0);
      conflict_marks.emplace_back(nrow, false);

xgboost/src/gbm/gblinear.cc view on Meta::CPAN

               ObjFunction* obj) override {
    // lazily initialize the model when not ready.
    if (model.weight.size() == 0) {
      model.InitModel();
    }

    std::vector<bst_gpair> &gpair = *in_gpair;
    const int ngroup = model.param.num_output_group;
    const RowSet &rowset = p_fmat->buffered_rowset();
    // for all the output group
    for (int gid = 0; gid < ngroup; ++gid) {
      double sum_grad = 0.0, sum_hess = 0.0;
      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
      #pragma omp parallel for schedule(static) reduction(+: sum_grad, sum_hess)
      for (bst_omp_uint i = 0; i < ndata; ++i) {
        bst_gpair &p = gpair[rowset[i] * ngroup + gid];
        if (p.hess >= 0.0f) {
          sum_grad += p.grad; sum_hess += p.hess;
        }
      }
      // remove bias effect
      bst_float dw = static_cast<bst_float>(
        param.learning_rate * param.CalcDeltaBias(sum_grad, sum_hess, model.bias()[gid]));
      model.bias()[gid] += dw;
      // update grad value
      #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < ndata; ++i) {
        bst_gpair &p = gpair[rowset[i] * ngroup + gid];
        if (p.hess >= 0.0f) {
          p.grad += p.hess * dw;
        }
      }
    }
    dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
    while (iter->Next()) {
      // number of features
      const ColBatch &batch = iter->Value();
      const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
      #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nfeat; ++i) {
        const bst_uint fid = batch.col_index[i];
        ColBatch::Inst col = batch[i];
        for (int gid = 0; gid < ngroup; ++gid) {
          double sum_grad = 0.0, sum_hess = 0.0;
          for (bst_uint j = 0; j < col.length; ++j) {
            const bst_float v = col[j].fvalue;
            bst_gpair &p = gpair[col[j].index * ngroup + gid];
            if (p.hess < 0.0f) continue;
            sum_grad += p.grad * v;
            sum_hess += p.hess * v * v;
          }
          bst_float &w = model[fid][gid];
          bst_float dw = static_cast<bst_float>(param.learning_rate *
                                                param.CalcDelta(sum_grad, sum_hess, w));
          w += dw;
          // update grad value
          for (bst_uint j = 0; j < col.length; ++j) {
            bst_gpair &p = gpair[col[j].index * ngroup + gid];
            if (p.hess < 0.0f) continue;
            p.grad += p.hess * col[j].fvalue * dw;
          }
        }
      }
    }
  }

  void PredictBatch(DMatrix *p_fmat,
               std::vector<bst_float> *out_preds,

xgboost/src/gbm/gblinear.cc view on Meta::CPAN

      CHECK_EQ(batch.base_rowid * ngroup, preds.size());
      // output convention: nrow * k, where nrow is number of rows
      // k is number of group
      preds.resize(preds.size() + batch.size * ngroup);
      // parallel over local batch
      const omp_ulong nsize = static_cast<omp_ulong>(batch.size);
      #pragma omp parallel for schedule(static)
      for (omp_ulong i = 0; i < nsize; ++i) {
        const size_t ridx = batch.base_rowid + i;
        // loop over output groups
        for (int gid = 0; gid < ngroup; ++gid) {
          bst_float margin =  (base_margin.size() != 0) ?
              base_margin[ridx * ngroup + gid] : base_margin_;
          this->Pred(batch[i], &preds[ridx * ngroup], gid, margin);
        }
      }
    }
  }
  // add base margin
  // Predict a single instance.
  //
  // Writes one value per output group into (*out_preds)[0 .. ngroup-1],
  // each computed by Pred() with base_margin_ as the base term.
  // ntree_limit and root_index are not used by this booster.
  //
  // out_preds is grown to ngroup elements when it is smaller than that, so
  // callers may pass an empty vector. A vector that is already large enough
  // is left at its current size and only its first ngroup entries are
  // overwritten.
  void PredictInstance(const SparseBatch::Inst &inst,
               std::vector<bst_float> *out_preds,
               unsigned ntree_limit,
               unsigned root_index) override {
    const int ngroup = model.param.num_output_group;
    // Pred() stores through a raw pointer, so the storage must exist before
    // the loop runs: an empty vector has no element storage to write to.
    if (ngroup > 0 && out_preds->size() < static_cast<size_t>(ngroup)) {
      out_preds->resize(ngroup);
    }
    bst_float *preds = dmlc::BeginPtr(*out_preds);
    for (int gid = 0; gid < ngroup; ++gid) {
      this->Pred(inst, preds, gid, base_margin_);
    }
  }

  // Leaf-index prediction is not available for the linear booster: this
  // override ignores all of its arguments and unconditionally reports the
  // problem through LOG(FATAL).
  void PredictLeaf(DMatrix *p_fmat,
                   std::vector<bst_float> *out_preds,
                   unsigned ntree_limit) override {
    LOG(FATAL) << "gblinear does not support prediction of leaf index";
  }

  void PredictContribution(DMatrix* p_fmat,

xgboost/src/gbm/gblinear.cc view on Meta::CPAN

    iter->BeforeFirst();
    while (iter->Next()) {
      const RowBatch& batch = iter->Value();
      // parallel over local batch
      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
      #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nsize; ++i) {
        const RowBatch::Inst &inst = batch[i];
        size_t row_idx = static_cast<size_t>(batch.base_rowid + i);
        // loop over output groups
        for (int gid = 0; gid < ngroup; ++gid) {
          bst_float *p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
          // calculate linear terms' contributions
          for (bst_uint c = 0; c < inst.length; ++c) {
            if (inst[c].index >= model.param.num_feature) continue;
            p_contribs[inst[c].index] = inst[c].fvalue * model[inst[c].index][gid];
          }
          // add base margin to BIAS
          p_contribs[ncolumns - 1] = model.bias()[gid] +
            ((base_margin.size() != 0) ? base_margin[row_idx * ngroup + gid] : base_margin_);
        }
      }
    }
  }

  // Render the linear model as text and return it as a one-element vector.
  //
  // When `format` is exactly "json" the result is a JSON object with a
  // "bias" array (one entry per output group) and a "weight" array (for
  // each feature in turn, one entry per output group). Any other `format`
  // value produces a plain listing: a "bias:" header followed by one bias
  // per line, then a "weight:" header followed by one weight per line in
  // the same feature-major order. `fmap` and `with_stats` are not used.
  std::vector<std::string> DumpModel(const FeatureMap& fmap,
                                     bool with_stats,
                                     std::string format) const override {
    const int ngroup = model.param.num_output_group;
    const unsigned nfeature = model.param.num_feature;

    std::stringstream fo("");
    if (format != "json") {
      // Plain text: one value per line under each header.
      fo << "bias:\n";
      for (int gid = 0; gid < ngroup; ++gid) {
        fo << model.bias()[gid] << '\n';
      }
      fo << "weight:\n";
      for (unsigned fid = 0; fid < nfeature; ++fid) {
        for (int gid = 0; gid < ngroup; ++gid) {
          fo << model[fid][gid] << '\n';
        }
      }
    } else {
      // JSON: entries are separated by ",\n", with no separator before the
      // first entry of each array.
      fo << "  { \"bias\": [\n";
      for (int gid = 0; gid < ngroup; ++gid) {
        fo << (gid == 0 ? "" : ",\n") << "      " << model.bias()[gid];
      }
      fo << "\n    ],\n    \"weight\": [\n";
      bool first_weight = true;
      for (unsigned fid = 0; fid < nfeature; ++fid) {
        for (int gid = 0; gid < ngroup; ++gid) {
          if (!first_weight) fo << ",\n";
          first_weight = false;
          fo << "      " << model[fid][gid];
        }
      }
      fo << "\n    ]\n  }";
    }
    return std::vector<std::string>(1, fo.str());
  }

 protected:
  // Compute the linear score of `inst` for output group `gid` and store it
  // in preds[gid]. The score is bias[gid] + base plus, for every entry of
  // the instance, fvalue * weight[index][gid]. Entries whose feature index
  // is not below model.param.num_feature are ignored.
  inline void Pred(const RowBatch::Inst &inst, bst_float *preds, int gid, bst_float base) {
    bst_float score = model.bias()[gid] + base;
    const bst_uint n_entries = inst.length;
    for (bst_uint k = 0; k != n_entries; ++k) {
      const auto &entry = inst[k];
      if (entry.index < model.param.num_feature) {
        score += entry.fvalue * model[entry.index][gid];
      }
    }
    preds[gid] = score;
  }
  // model for linear booster
  class Model {
   public:
    // parameter
    GBLinearModelParam param;
    // weight for each of feature, bias is the last one
    std::vector<bst_float> weight;
    // initialize the model parameter
    inline void InitModel(void) {

xgboost/src/gbm/gbtree.cc view on Meta::CPAN

    std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
    const int ngroup = model_.param.num_output_group;
    if (ngroup == 1) {
      std::vector<std::unique_ptr<RegTree> > ret;
      BoostNewTrees(gpair, p_fmat, 0, &ret);
      new_trees.push_back(std::move(ret));
    } else {
      CHECK_EQ(gpair.size() % ngroup, 0U)
          << "must have exactly ngroup*nrow gpairs";
      std::vector<bst_gpair> tmp(gpair.size() / ngroup);
      for (int gid = 0; gid < ngroup; ++gid) {
        bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
        #pragma omp parallel for schedule(static)
        for (bst_omp_uint i = 0; i < nsize; ++i) {
          tmp[i] = gpair[i * ngroup + gid];
        }
        std::vector<std::unique_ptr<RegTree> > ret;
        BoostNewTrees(tmp, p_fmat, gid, &ret);
        new_trees.push_back(std::move(ret));
      }
    }
    double tstart = dmlc::GetTime();
    for (int gid = 0; gid < ngroup; ++gid) {
      this->CommitModel(std::move(new_trees[gid]), gid);
    }
    if (tparam.debug_verbose > 0) {
      LOG(INFO) << "CommitModel(): " << dmlc::GetTime() - tstart << " sec";
    }
  }

  void PredictBatch(DMatrix* p_fmat,
               std::vector<bst_float>* out_preds,
               unsigned ntree_limit) override {
    predictor->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit);

xgboost/src/gbm/gbtree.cc view on Meta::CPAN

    if (thread_temp.size() == 0) {
      thread_temp.resize(1, RegTree::FVec());
      thread_temp[0].Init(model_.param.num_feature);
    }
    out_preds->resize(model_.param.num_output_group);
    ntree_limit *= model_.param.num_output_group;
    if (ntree_limit == 0 || ntree_limit > model_.trees.size()) {
      ntree_limit = static_cast<unsigned>(model_.trees.size());
    }
    // loop over output groups
    for (int gid = 0; gid < model_.param.num_output_group; ++gid) {
      (*out_preds)[gid]
          = PredValue(inst, gid, root_index,
                      &thread_temp[0], 0, ntree_limit) + model_.base_margin;
    }
  }

 protected:
  friend class GBTree;
  // internal prediction loop
  // add predictions to out_preds
  template<typename Derived>
  inline void PredLoopInternal(

xgboost/src/gbm/gbtree.cc view on Meta::CPAN

        RegTree::FVec& feats = thread_temp[tid];
        int64_t ridx[K];
        RowBatch::Inst inst[K];
        for (int k = 0; k < K; ++k) {
          ridx[k] = static_cast<int64_t>(batch.base_rowid + i + k);
        }
        for (int k = 0; k < K; ++k) {
          inst[k] = batch[i + k];
        }
        for (int k = 0; k < K; ++k) {
          for (int gid = 0; gid < num_group; ++gid) {
            const size_t offset = ridx[k] * num_group + gid;
            preds[offset] +=
                self->PredValue(inst[k], gid, info.GetRoot(ridx[k]),
                                &feats, tree_begin, tree_end);
          }
        }
      }
      for (bst_omp_uint i = nsize - rest; i < nsize; ++i) {
        RegTree::FVec& feats = thread_temp[0];
        const int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
        const RowBatch::Inst inst = batch[i];
        for (int gid = 0; gid < num_group; ++gid) {
          const size_t offset = ridx * num_group + gid;
          preds[offset] +=
              self->PredValue(inst, gid, info.GetRoot(ridx),
                              &feats, tree_begin, tree_end);
        }
      }
    }
  }
  // commit new trees all at once
  void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
                   int bst_group) override {
    for (size_t i = 0; i < new_trees.size(); ++i) {
      model_.trees.push_back(std::move(new_trees[i]));

xgboost/src/predictor/cpu_predictor.cc view on Meta::CPAN

        RegTree::FVec& feats = thread_temp[tid];
        int64_t ridx[K];
        RowBatch::Inst inst[K];
        for (int k = 0; k < K; ++k) {
          ridx[k] = static_cast<int64_t>(batch.base_rowid + i + k);
        }
        for (int k = 0; k < K; ++k) {
          inst[k] = batch[i + k];
        }
        for (int k = 0; k < K; ++k) {
          for (int gid = 0; gid < num_group; ++gid) {
            const size_t offset = ridx[k] * num_group + gid;
            preds[offset] += this->PredValue(
                inst[k], model.trees, model.tree_info, gid,
                info.GetRoot(ridx[k]), &feats, tree_begin, tree_end);
          }
        }
      }
      for (bst_omp_uint i = nsize - rest; i < nsize; ++i) {
        RegTree::FVec& feats = thread_temp[0];
        const int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
        const RowBatch::Inst inst = batch[i];
        for (int gid = 0; gid < num_group; ++gid) {
          const size_t offset = ridx * num_group + gid;
          preds[offset] +=
              this->PredValue(inst, model.trees, model.tree_info, gid,
                              info.GetRoot(ridx), &feats, tree_begin, tree_end);
        }
      }
    }
  }

  void PredLoopInternal(DMatrix* dmat, std::vector<bst_float>* out_preds,
                        const gbm::GBTreeModel& model, int tree_begin,
                        unsigned ntree_limit) {
    // TODO(Rory): Check if this specialisation actually improves performance

xgboost/src/predictor/cpu_predictor.cc view on Meta::CPAN

      thread_temp.resize(1, RegTree::FVec());
      thread_temp[0].Init(model.param.num_feature);
    }
    ntree_limit *= model.param.num_output_group;
    if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
      ntree_limit = static_cast<unsigned>(model.trees.size());
    }
    out_preds->resize(model.param.num_output_group *
                      (model.param.size_leaf_vector + 1));
    // loop over output groups
    for (int gid = 0; gid < model.param.num_output_group; ++gid) {
      (*out_preds)[gid] =
          PredValue(inst, model.trees, model.tree_info, gid, root_index,
                    &thread_temp[0], 0, ntree_limit) +
          model.base_margin;
    }
  }
  void PredictLeaf(DMatrix* p_fmat, std::vector<bst_float>* out_preds,
                   const gbm::GBTreeModel& model, unsigned ntree_limit) override {
    const int nthread = omp_get_max_threads();
    InitThreadTemp(nthread, model.param.num_feature);
    const MetaInfo& info = p_fmat->info();
    // number of valid trees

xgboost/src/predictor/cpu_predictor.cc view on Meta::CPAN

    while (iter->Next()) {
      const RowBatch& batch = iter->Value();
      // parallel over local batch
      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nsize; ++i) {
        size_t row_idx = static_cast<size_t>(batch.base_rowid + i);
        unsigned root_id = info.GetRoot(row_idx);
        RegTree::FVec& feats = thread_temp[omp_get_thread_num()];
        // loop over all classes
        for (int gid = 0; gid < ngroup; ++gid) {
          bst_float* p_contribs =
              &contribs[(row_idx * ngroup + gid) * ncolumns];
          feats.Fill(batch[i]);
          // calculate contributions
          for (unsigned j = 0; j < ntree_limit; ++j) {
            if (model.tree_info[j] != gid) {
              continue;
            }
            model.trees[j]->CalculateContributions(feats, root_id, p_contribs);
          }
          feats.Drop(batch[i]);
          // add base margin to BIAS
          if (base_margin.size() != 0) {
            p_contribs[ncolumns - 1] += base_margin[row_idx * ngroup + gid];
          } else {
            p_contribs[ncolumns - 1] += model.base_margin;
          }
        }
      }
    }
  }
  std::vector<RegTree::FVec> thread_temp;
};

( run in 1.496 second using v1.01-cache-2.11-cpan-3c2a17b8caa )