Alien-XGBoost


xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu

  // TODO(JCM): use out of place with pre-allocated buffer, but then have to
  // copy back on device
  //  fprintf(stderr,"sizeof(bst_gpair)/sizeof(float)=%d\n",sizeof(bst_gpair)/sizeof(float));
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));
    dh::safe_nccl(ncclAllReduce(
        reinterpret_cast<const void*>(hist_vec[d_idx].GetLevelPtr(depth)),
        reinterpret_cast<void*>(hist_vec[d_idx].GetLevelPtr(depth)),
        hist_vec[d_idx].LevelSize(depth) * sizeof(bst_gpair_precise) / sizeof(double),
        ncclDouble, ncclSum, comms[d_idx], *(streams[d_idx])));
  }

  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));
    dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
  }
  // if there is no NCCL, presume a single GPU, in which case the histogram is
  // already correct

  //  time.printElapsed("Reduce-Add Time");

  // Subtraction trick (applied identically on every device, to avoid doing it
  // on the master and then Bcast-ing the result)
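  // BuildHist accumulated histograms only for the smaller child of each
  // parent, so the sibling's histogram is recovered below for free as
  // hist(parent) - hist(smaller child), halving the per-level histogram work.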
  if (depth > 0) {
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      auto hist_builder = hist_vec[d_idx].GetBuilder();
      auto d_left_child_smallest = left_child_smallest[d_idx].data();
      int n_sub_bins = (n_nodes_level(depth) / 2) * hist_builder.n_bins;

      dh::launch_n(device_idx, n_sub_bins, [=] __device__(int idx) {
        int nidx = n_nodes(depth - 1) + ((idx / hist_builder.n_bins) * 2);
        bool left_smallest = d_left_child_smallest[parent_nidx(nidx)];
        if (left_smallest) {
          nidx++;  // If left is smallest switch to right child
        }

        int gidx = idx % hist_builder.n_bins;
        bst_gpair_precise parent = hist_builder.Get(gidx, parent_nidx(nidx));
        int other_nidx = left_smallest ? nidx - 1 : nidx + 1;
        bst_gpair_precise other = hist_builder.Get(gidx, other_nidx);
        hist_builder.Add(parent - other, gidx,
                         nidx);  // OPTMARK: This is slow; could use shared
                                 // memory or cache results instead of writing
                                 // to global memory atomically every time.
      });
    }
    dh::synchronize_n_devices(n_devices, dList);
  }
}
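
// For reference, a minimal host-side sketch of the subtraction trick above
// (hypothetical standalone types, not part of this file): only the smaller
// child's histogram is accumulated directly, and its sibling follows by
// elementwise subtraction from the parent:
//
//   struct GpairSum { double grad; double hess; };
//   void FillSibling(const GpairSum* parent, const GpairSum* built_child,
//                    GpairSum* sibling, int n_bins) {
//     for (int b = 0; b < n_bins; ++b) {
//       sibling[b].grad = parent[b].grad - built_child[b].grad;
//       sibling[b].hess = parent[b].hess - built_child[b].hess;
//     }
//   }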

template <int BLOCK_THREADS>
__global__ void find_split_kernel(
    const bst_gpair_precise* d_level_hist, int* d_feature_segments, int depth,
    int n_features, int n_bins, Node* d_nodes, Node* d_nodes_temp,
    Node* d_nodes_child_temp, int nodes_offset_device, float* d_fidx_min_map,
    float* d_gidx_fvalue_map, GPUTrainingParam gpu_param,
    bool* d_left_child_smallest_temp, bool colsample, int* d_feature_flags) {
  typedef cub::KeyValuePair<int, float> ArgMaxT;
  typedef cub::BlockScan<bst_gpair_precise, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>
      BlockScanT;
  typedef cub::BlockReduce<ArgMaxT, BLOCK_THREADS> MaxReduceT;
  typedef cub::BlockReduce<bst_gpair_precise, BLOCK_THREADS> SumReduceT;

  union TempStorage {
    typename BlockScanT::TempStorage scan;
    typename MaxReduceT::TempStorage max_reduce;
    typename SumReduceT::TempStorage sum_reduce;
  };
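
  // The union lets the scan and the two reductions share one shared-memory
  // allocation, since the three cub primitives are never in use at the same
  // time within this kernel.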

  struct UninitializedSplit : cub::Uninitialized<Split> {};
  struct UninitializedGpair : cub::Uninitialized<bst_gpair_precise> {};

  __shared__ UninitializedSplit uninitialized_split;
  Split& split = uninitialized_split.Alias();
  __shared__ UninitializedGpair uninitialized_sum;
  bst_gpair_precise& shared_sum = uninitialized_sum.Alias();
  __shared__ ArgMaxT block_max;
  __shared__ TempStorage temp_storage;

  if (threadIdx.x == 0) {
    split = Split();
  }

  __syncthreads();

  // the two indices below access the full-sized node list stored on each
  // device; there is always one block per node, BLOCK_THREADS threads per block
  int level_node_idx = blockIdx.x + nodes_offset_device;
  int node_idx = n_nodes(depth - 1) + level_node_idx;

  for (int fidx = 0; fidx < n_features; fidx++) {
    if (colsample && d_feature_flags[fidx] == 0) continue;

    int begin = d_feature_segments[level_node_idx * n_features + fidx];
    int end = d_feature_segments[level_node_idx * n_features + fidx + 1];

    bst_gpair_precise feature_sum = bst_gpair_precise();
    for (int reduce_begin = begin; reduce_begin < end;
         reduce_begin += BLOCK_THREADS) {
      bool thread_active = reduce_begin + threadIdx.x < end;
      // load this thread's histogram bin (zero when out of range)
      bst_gpair_precise bin = thread_active
                                  ? d_level_hist[reduce_begin + threadIdx.x]
                                  : bst_gpair_precise();

      feature_sum +=
          SumReduceT(temp_storage.sum_reduce).Reduce(bin, cub::Sum());
    }

    if (threadIdx.x == 0) {
      shared_sum = feature_sum;
    }
    //    __syncthreads();  // no need to sync: the Scan below synchronizes the block

    GpairCallbackOp prefix_op = GpairCallbackOp();
    for (int scan_begin = begin; scan_begin < end;
         scan_begin += BLOCK_THREADS) {

xgboost/plugin/updater_gpu/src/gpu_hist_builder.cu

// Maximum number of threads per block
#define MAX_BLOCK_THREADS 1024

void GPUHistBuilder::FindSplit(int depth) {
  // Specialized based on param.max_bin
  this->FindSplitSpecialize<MIN_BLOCK_THREADS>(depth);
}

template <>
void GPUHistBuilder::FindSplitSpecialize<MAX_BLOCK_THREADS>(int depth) {
  LaunchFindSplit<MAX_BLOCK_THREADS>(depth);
}
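// The generic template below walks BLOCK_THREADS up in CHUNK_BLOCK_THREADS
// steps until it covers param.max_bin; the MAX_BLOCK_THREADS specialization
// above terminates the compile-time recursion at the hardware limit.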
template <int BLOCK_THREADS>
void GPUHistBuilder::FindSplitSpecialize(int depth) {
  if (param.max_bin <= BLOCK_THREADS) {
    LaunchFindSplit<BLOCK_THREADS>(depth);
  } else {
    this->FindSplitSpecialize<BLOCK_THREADS + CHUNK_BLOCK_THREADS>(depth);
  }
}
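
// A minimal self-contained sketch of this dispatch idiom (illustrative names
// only, not part of this file): recurse on the template parameter until it
// covers the runtime value, with an explicit specialization as the base case.
//
//   template <int N> void Dispatch(int runtime_n);
//   template <> void Dispatch<1024>(int runtime_n) { Launch<1024>(); }
//   template <int N> void Dispatch(int runtime_n) {
//     if (runtime_n <= N) Launch<N>();
//     else Dispatch<N + 128>(runtime_n);
//   }
//   // e.g. Dispatch<128>(max_bin) picks the smallest covering block size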

template <int BLOCK_THREADS>
void GPUHistBuilder::LaunchFindSplit(int depth) {
  bool colsample =
      param.colsample_bylevel < 1.0 || param.colsample_bytree < 1.0;

  int dosimuljob = 1;

  int simuljob = 1;  // whether to do the job on a single GPU and broadcast (0)
                     // or to do the same job on every GPU (1) (could be a user
                     // parameter, but perhaps too fine-grained)
  int findsplit_shardongpus = 0;  // generally too expensive, disabled for now
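  // Three code paths follow: (1) shard this level's nodes across a
  // power-of-two subset of GPUs and allgather the per-node results, (2) find
  // all splits on one GPU and broadcast them, or (3) have every GPU
  // redundantly find all splits, which needs no communication at all (the
  // effective default here).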

  if (findsplit_shardongpus) {
    dosimuljob = 0;
    // use a power-of-two device count for the split finder because the number
    // of nodes per level is a power of two (results are broadcast to the
    // remaining devices)
    int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
    find_split_n_devices = std::min(n_nodes_level(depth), find_split_n_devices);
    int num_nodes_device = n_nodes_level(depth) / find_split_n_devices;
    int num_nodes_child_device =
        n_nodes_level(depth + 1) / find_split_n_devices;
    const int GRID_SIZE = num_nodes_device;

    // NOTE: No need to scatter before gather as all devices have same copy of
    // nodes, and within find_split_kernel() nodes_temp is given values from
    // nodes

    // for all nodes (split among devices) find best split per node
    for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      int nodes_offset_device = d_idx * num_nodes_device;
      find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
          (const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
          feature_segments[d_idx].data(), depth, (info->num_col),
          (hmat_.row_ptr.back()), nodes[d_idx].data(), nodes_temp[d_idx].data(),
          nodes_child_temp[d_idx].data(), nodes_offset_device,
          fidx_min_map[d_idx].data(), gidx_fvalue_map[d_idx].data(),
          GPUTrainingParam(param), left_child_smallest_temp[d_idx].data(),
          colsample, feature_flags[d_idx].data());
    }

    // NCCL is used only on the devices that did the split search
    dh::synchronize_n_devices(find_split_n_devices, dList);

    for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

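      // NCCL has no struct datatype, so Node and bool buffers are exchanged
      // as raw bytes: count = n_elements * sizeof(T) / sizeof(char), with
      // ncclChar as the element type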
      dh::safe_nccl(ncclAllGather(
          reinterpret_cast<const void*>(nodes_temp[d_idx].data()),
          num_nodes_device * sizeof(Node) / sizeof(char), ncclChar,
          reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
          find_split_comms[find_split_n_devices - 1][d_idx],
          *(streams[d_idx])));

      if (depth !=
          param.max_depth) {  // don't copy over children nodes if no more nodes
        dh::safe_nccl(ncclAllGather(
            reinterpret_cast<const void*>(nodes_child_temp[d_idx].data()),
            num_nodes_child_device * sizeof(Node) / sizeof(char), ncclChar,
            reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
            find_split_comms[find_split_n_devices - 1][d_idx],
            *(streams[d_idx])));  // note: the recvbuff for child nodes is
                                  // offset by n_nodes(depth)
      }

      dh::safe_nccl(ncclAllGather(
          reinterpret_cast<const void*>(left_child_smallest_temp[d_idx].data()),
          num_nodes_device * sizeof(bool) / sizeof(char), ncclChar,
          reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
                                  n_nodes(depth - 1)),
          find_split_comms[find_split_n_devices - 1][d_idx],
          *(streams[d_idx])));
    }

    for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));
      dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
    }

    if (n_devices > find_split_n_devices && n_devices > 1) {
      // if n_devices==1 there is no need to Bcast; if find_split_n_devices==1
      // this is just a copy, otherwise it copies from the master to all
      // devices so that devices not involved in the split search get the
      // results
      for (int d_idx = 0; d_idx < n_devices; d_idx++) {
        int device_idx = dList[d_idx];
        dh::safe_cuda(cudaSetDevice(device_idx));

        int master_device = dList[0];
        dh::safe_nccl(ncclBcast(
            reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
            n_nodes_level(depth) * sizeof(Node) / sizeof(char), ncclChar,
            master_device, comms[d_idx], *(streams[d_idx])));

        if (depth != param.max_depth) {  // don't copy over children nodes if no
                                         // more nodes
          dh::safe_nccl(ncclBcast(
              reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
              n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar,
              master_device, comms[d_idx], *(streams[d_idx])));
        }

        dh::safe_nccl(ncclBcast(
            reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
                                    n_nodes(depth - 1)),
            n_nodes_level(depth) * sizeof(bool) / sizeof(char), ncclChar,
            master_device, comms[d_idx], *(streams[d_idx])));
      }

      for (int d_idx = 0; d_idx < n_devices; d_idx++) {
        int device_idx = dList[d_idx];
        dh::safe_cuda(cudaSetDevice(device_idx));
        dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
      }
    }
  } else if (simuljob == 0) {
    dosimuljob = 0;
    int num_nodes_device = n_nodes_level(depth);
    const int GRID_SIZE = num_nodes_device;

    int d_idx = 0;
    int master_device = dList[d_idx];
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));

    int nodes_offset_device = d_idx * num_nodes_device;
    find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
        (const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
        feature_segments[d_idx].data(), depth, (info->num_col),
        (hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL,
        nodes_offset_device, fidx_min_map[d_idx].data(),
        gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
        left_child_smallest[d_idx].data(), colsample,
        feature_flags[d_idx].data());

    // broadcast result
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      dh::safe_nccl(ncclBcast(
          reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
          n_nodes_level(depth) * sizeof(Node) / sizeof(char), ncclChar,
          master_device, comms[d_idx], *(streams[d_idx])));

      if (depth !=
          param.max_depth) {  // don't copy over children nodes if no more nodes
        dh::safe_nccl(ncclBcast(
            reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
            n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar,
            master_device, comms[d_idx], *(streams[d_idx])));
      }

      dh::safe_nccl(
          ncclBcast(reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
                                            n_nodes(depth - 1)),
                    n_nodes_level(depth) * sizeof(bool) / sizeof(char),
                    ncclChar, master_device, comms[d_idx], *(streams[d_idx])));
    }

    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));
      dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
    }
  } else {
    dosimuljob = 1;
  }

  if (dosimuljob) {  // if no NCCL or simuljob==1, do this
    int num_nodes_device = n_nodes_level(depth);
    const int GRID_SIZE = num_nodes_device;

    // all GPUs do same work
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      int nodes_offset_device = 0;
      find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
          (const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
          feature_segments[d_idx].data(), depth, (info->num_col),
          (hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL,
          nodes_offset_device, fidx_min_map[d_idx].data(),
          gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
          left_child_smallest[d_idx].data(), colsample,
          feature_flags[d_idx].data());
    }
  }

  // NOTE: no need to synchronize with the host, as everything above is pure
  // P2P or on-device work
}
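
// For reference, a minimal single-process sketch of the byte-wise NCCL
// broadcast pattern used above (hypothetical names, not part of this file):
// POD struct arrays are moved as ncclChar with byte counts, and the
// per-device calls are grouped (NCCL 2 idiom) so one thread can drive all
// communicators.
//
//   #include <nccl.h>
//   struct PodNode { float a, b, c; };
//   void BcastPod(PodNode** d_bufs, size_t n, ncclComm_t* comms,
//                 cudaStream_t* streams, int n_dev, const int* dev_list) {
//     ncclGroupStart();
//     for (int i = 0; i < n_dev; ++i) {
//       cudaSetDevice(dev_list[i]);
//       ncclBcast(d_bufs[i], n * sizeof(PodNode), ncclChar, /*root=*/0,
//                 comms[i], streams[i]);
//     }
//     ncclGroupEnd();
//     for (int i = 0; i < n_dev; ++i) {
//       cudaSetDevice(dev_list[i]);
//       cudaStreamSynchronize(streams[i]);
//     }
//   }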

void GPUHistBuilder::InitFirstNode(const std::vector<bst_gpair>& gpair) {
  // Perform asynchronous reduction on each gpu
  std::vector<bst_gpair> device_sums(n_devices);
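  // One host thread per GPU: each thread pins its device with cudaSetDevice
  // and runs a blocking thrust::reduce, so the per-device partial sums are
  // computed concurrently.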
#pragma omp parallel for num_threads(n_devices)
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));
    auto begin = device_gpair[d_idx].tbegin();
    auto end = device_gpair[d_idx].tend();
    bst_gpair init = bst_gpair();
    auto binary_op = thrust::plus<bst_gpair>();
    device_sums[d_idx] = thrust::reduce(begin, end, init, binary_op);
  }

  bst_gpair sum = bst_gpair();
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    sum += device_sums[d_idx];
  }

  // Set up the first node so all devices hold the same root (done identically
  // on every device here; alternatively one device could compute it and Bcast
  // if exact precision were a concern)
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];

    auto d_nodes = nodes[d_idx].data();
    auto gpu_param = GPUTrainingParam(param);

    dh::launch_n(device_idx, 1, [=] __device__(int idx) {
      bst_gpair sum_gradients = sum;
      d_nodes[idx] =
          Node(sum_gradients,
               CalcGain(gpu_param, sum_gradients.grad, sum_gradients.hess),
               CalcWeight(gpu_param, sum_gradients.grad, sum_gradients.hess));
    });
  }
  // syncing all devices to the host before moving on can be avoided because
  // BuildHist launches another kernel in the default stream:
  //  dh::synchronize_n_devices(n_devices, dList);
}

void GPUHistBuilder::UpdatePosition(int depth) {
  if (is_dense) {
    this->UpdatePositionDense(depth);
  } else {
    this->UpdatePositionSparse(depth);
  }
}

void GPUHistBuilder::UpdatePositionDense(int depth) {
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];

    auto d_position = position[d_idx].data();
    Node* d_nodes = nodes[d_idx].data();
    auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
    auto d_gidx = device_matrix[d_idx].gidx;
    int n_columns = info->num_col;
    size_t begin = device_row_segments[d_idx];
    size_t end = device_row_segments[d_idx + 1];

    dh::launch_n(device_idx, end - begin, [=] __device__(size_t local_idx) {
      int pos = d_position[local_idx];
      if (!is_active(pos, depth)) {
        return;
      }
      Node node = d_nodes[pos];

      if (node.IsLeaf()) {
        return;
      }

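      // dense storage is row-major: the bin index for (row, feature) lives at
      // row * n_columns + feature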
      int gidx = d_gidx[local_idx * static_cast<size_t>(n_columns) +
                        static_cast<size_t>(node.split.findex)];

      float fvalue = d_gidx_fvalue_map[gidx];

      if (fvalue <= node.split.fvalue) {
        d_position[local_idx] = left_child_nidx(pos);
      } else {
        d_position[local_idx] = right_child_nidx(pos);
      }
    });
  }
  dh::synchronize_n_devices(n_devices, dList);
  // dh::safe_cuda(cudaDeviceSynchronize());
}
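
// For reference (hypothetical restatement, not part of this file): the node
// indexing helpers above follow the usual implicit binary-tree layout, e.g.
//
//   __host__ __device__ inline int left_child_nidx(int nidx) { return 2 * nidx + 1; }
//   __host__ __device__ inline int right_child_nidx(int nidx) { return 2 * nidx + 2; }
//   __host__ __device__ inline int parent_nidx(int nidx) { return (nidx - 1) / 2; }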


