Alien-XGBoost


LICENSE  view on Meta::CPAN

      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this

xgboost/cub/cub/block/block_exchange.cuh  view on Meta::CPAN

{
private:

    /******************************************************************************
     * Constants
     ******************************************************************************/

    /// Constants
    enum
    {
        // Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        WARP_ITEMS                  = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,

        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,

        // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
        PADDING_ITEMS               = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,

xgboost/cub/cub/warp/specializations/warp_reduce_shfl.cuh  view on Meta::CPAN

    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
struct WarpReduceShfl
{
    //---------------------------------------------------------------------
    // Constants and type definitions
    //---------------------------------------------------------------------

    enum
    {
        /// Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        /// The number of warp reduction steps
        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,

        /// Number of logical warps in a PTX warp
        LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS,
    };

    template <typename S>

xgboost/cub/cub/warp/specializations/warp_reduce_smem.cuh  view on Meta::CPAN

    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
struct WarpReduceSmem
{
    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    enum
    {
        /// Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        /// Whether the logical warp size is a power-of-two
        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of warp scan steps
        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of threads in half a warp
        HALF_WARP_THREADS = 1 << (STEPS - 1),

xgboost/cub/cub/warp/specializations/warp_scan_shfl.cuh  view on Meta::CPAN

    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
struct WarpScanShfl
{
    //---------------------------------------------------------------------
    // Constants and type definitions
    //---------------------------------------------------------------------

    enum
    {
        /// Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        /// The number of warp scan steps
        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,

        /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
        SHFL_C = ((-1 << STEPS) & 31) << 8,
    };

    template <typename S>

xgboost/cub/cub/warp/specializations/warp_scan_smem.cuh  view on Meta::CPAN

    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
struct WarpScanSmem
{
    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    enum
    {
        /// Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        /// Whether the logical warp size is a power-of-two
        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of warp scan steps
        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of threads in half a warp
        HALF_WARP_THREADS = 1 << (STEPS - 1),

xgboost/cub/cub/warp/warp_reduce.cuh  view on Meta::CPAN

class WarpReduce
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    enum
    {
        /// Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        /// Whether the logical warp size is a power-of-two
        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
    };

public:

    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

xgboost/cub/cub/warp/warp_scan.cuh  view on Meta::CPAN

class WarpScan
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    enum
    {
        /// Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        /// Whether the logical warp size is a power-of-two
        IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),

        /// Whether the data type is an integer (which has fully-associative addition)
        IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
    };

    /// Internal specialization.  Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)

xgboost/dmlc-core/cmake/Modules/FindHDFS.cmake  view on Meta::CPAN

#  HDFS_LIBRARIES, location of libhdfs.so
#  HDFS_FOUND, whether HDFS is found.
#  hdfs_static, imported static hdfs library.

exec_program(hadoop ARGS version OUTPUT_VARIABLE Hadoop_VERSION
             RETURN_VALUE Hadoop_RETURN)

# currently only looking in HADOOP_HOME
find_path(HDFS_INCLUDE_DIR hdfs.h PATHS
  $ENV{HADOOP_HOME}/include/
  # make sure we don't accidentally pick up a different version
  NO_DEFAULT_PATH
)

if ("${CMAKE_SIZEOF_VOID_P}" STREQUAL "8")
  set(arch_hint "x64")
elseif ("$ENV{LIB}" MATCHES "(amd64|ia64)")
  set(arch_hint "x64")
else ()
  set(arch_hint "x86")
endif()

xgboost/dmlc-core/cmake/Modules/FindHDFS.cmake  view on Meta::CPAN

if ("${arch_hint}" STREQUAL "x64")
  set(HDFS_LIB_PATHS $ENV{HADOOP_HOME}/lib/native)
else ()
  set(HDFS_LIB_PATHS $ENV{HADOOP_HOME}/lib/native)
endif ()

message(STATUS "HDFS_LIB_PATHS: ${HDFS_LIB_PATHS}")

find_library(HDFS_LIB NAMES hdfs PATHS
  ${HDFS_LIB_PATHS}
  # make sure we don't accidentally pick up a different version
  NO_DEFAULT_PATH
)

if (HDFS_LIB)
  set(HDFS_FOUND TRUE)
  set(HDFS_LIBRARIES ${HDFS_LIB})
  set(HDFS_STATIC_LIB ${HDFS_LIB_PATHS}/libhdfs.a)

  add_library(hdfs_static STATIC IMPORTED)
  set_target_properties(hdfs_static PROPERTIES IMPORTED_LOCATION ${HDFS_STATIC_LIB})

xgboost/dmlc-core/include/dmlc/base.h  view on Meta::CPAN

/*!
 * \brief whether to always log a message before throwing
 * This can help identify errors that cannot be caught.
 */
#ifndef DMLC_LOG_BEFORE_THROW
#define DMLC_LOG_BEFORE_THROW 1
#endif

/*!
 * \brief Whether to use customized logger,
 * whose output can be decided by other libraries.
 */
#ifndef DMLC_LOG_CUSTOMIZE
#define DMLC_LOG_CUSTOMIZE 0
#endif

/*!
 * \brief Whether to print stack trace for fatal error,
 *  enabled on Linux when using gcc.
 */
#if (!defined(DMLC_LOG_STACK_TRACE) && defined(__GNUC__) && !defined(__MINGW32__))

xgboost/dmlc-core/include/dmlc/data.h  view on Meta::CPAN

class Parser : public DataIter<RowBlock<IndexType> > {
 public:
  /*!
  * \brief create a new instance of parser based on the "type"
  *
  * \param uri_ the uri of the input, can contain hdfs prefix
  * \param part_index the part id of current input
  * \param num_parts total number of splits
  * \param type type of dataset can be: "libsvm", "auto", ...
  *
  * When "auto" is passed, the type is decided by format argument string in URI.
  *
  * \return the created parser
  */
  static Parser<IndexType> *
  Create(const char *uri_,
         unsigned part_index,
         unsigned num_parts,
         const char *type);
  /*! \return size of bytes read so far */
  virtual size_t BytesRead(void) const = 0;

xgboost/dmlc-core/tracker/dmlc_tracker/mpi.py  view on Meta::CPAN

from __future__ import absolute_import

import subprocess, logging
from threading import Thread
from . import tracker

def get_mpi_env(envs):
    """get the mpirun command for setting the envornment
    support both openmpi and mpich2
    """
    # decide MPI version.
    (_, err) = subprocess.Popen('mpirun',
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE).communicate()
    cmd = ''
    if 'Open MPI' in err:
        for k, v in envs.items():
            cmd += ' -x %s=%s' % (k, str(v))
    elif 'mpich' in err:
        for k, v in envs.items():
            cmd += ' -env %s %s' % (k, str(v))

xgboost/dmlc-core/tracker/dmlc_tracker/tracker.py  view on Meta::CPAN

        magic = slave.recvint()
        assert magic == kMagic, 'invalid magic number=%d from %s' % (magic, self.host)
        slave.sendint(kMagic)
        self.rank = slave.recvint()
        self.world_size = slave.recvint()
        self.jobid = slave.recvstr()
        self.cmd = slave.recvstr()
        self.wait_accept = 0
        self.port = None

    def decide_rank(self, job_map):
        if self.rank >= 0:
            return self.rank
        if self.jobid != 'NULL' and self.jobid in job_map:
            return job_map[self.jobid]
        return -1

    def assign_rank(self, rank, wait_conn, tree_map, parent_map, ring_map):
        self.rank = rank
        nnset = set(tree_map[rank])
        rprev, rnext = ring_map[rank]

xgboost/dmlc-core/tracker/dmlc_tracker/tracker.py  view on Meta::CPAN

                if s.world_size > 0:
                    nslave = s.world_size
                tree_map, parent_map, ring_map = self.get_link_map(nslave)
                # set of nodes that are pending startup
                todo_nodes = list(range(nslave))
            else:
                assert s.world_size == -1 or s.world_size == nslave
            if s.cmd == 'recover':
                assert s.rank >= 0

            rank = s.decide_rank(job_map)
            # batch assignment of ranks
            if rank == -1:
                assert len(todo_nodes) != 0
                pending.append(s)
                if len(pending) == len(todo_nodes):
                    pending.sort(key=lambda x: x.host)
                    for s in pending:
                        rank = todo_nodes.pop(0)
                        if s.jobid != 'NULL':
                            job_map[s.jobid] = rank

xgboost/dmlc-core/tracker/yarn/src/main/java/org/apache/hadoop/yarn/dmlc/ApplicationMaster.java  view on Meta::CPAN

            assert (!this.runningTasks.containsKey(container.getId()));
            this.runningTasks.put(container.getId(), task);
            this.nmClient.startContainerAsync(container, ctx);
        }
    }
    /**
     * handle the error of starting a container
     *
     * @param cid the id of the container that failed to start
     */
    private synchronized void onStartContainerError(ContainerId cid) {
        ApplicationMaster.this.handleFailure(Collections.singletonList(cid));
    }
    /**
     * free the containers that have not yet been launched
     *
     * @param containers
     */
    private synchronized void freeUnusedContainers(
            Collection<Container> containers) {
        if(containers.size() == 0) return;
        for(Container c : containers){

xgboost/dmlc-core/tracker/yarn/src/main/java/org/apache/hadoop/yarn/dmlc/ApplicationMaster.java  view on Meta::CPAN

            rmClient.removeContainerRequest(r.containerRequest);
        }
        this.pendingTasks.clear();
        this.runningTasks.clear();
        LOG.info(msg);
    }

    /**
     * handle non-fatal failures
     *
     * @param failed the ids of the failed containers
     */
    private synchronized void handleFailure(Collection<ContainerId> failed) {
        Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
        for (ContainerId cid : failed) {
            TaskRecord r = runningTasks.remove(cid);
            if (r == null) {
                continue;
            }
            LOG.info("Task "
                    + r.taskId
                    + " failed on "
                    + r.container.getId()
                    + ". See LOG at : "
                    + String.format("http://%s/node/containerlogs/%s/"
                            + userName, r.container.getNodeHttpAddress(),

xgboost/dmlc-core/tracker/yarn/src/main/java/org/apache/hadoop/yarn/dmlc/ApplicationMaster.java  view on Meta::CPAN


        @Override
        public void onShutdownRequest() {
            ApplicationMaster.this
                    .abortJob("[DMLC] Get shutdown request, start to shutdown...");
        }
    }

    private class NMCallbackHandler implements NMClientAsync.CallbackHandler {
        @Override
        public void onContainerStarted(ContainerId cid,
                Map<String, ByteBuffer> services) {
            LOG.info("onContainerStarted Invoked");
        }

        @Override
        public void onContainerStatusReceived(ContainerId cid,
                ContainerStatus status) {
            LOG.info("onContainerStatusReceived Invoked");
        }

        @Override
        public void onContainerStopped(ContainerId cid) {
            LOG.info("onContainerStopped Invoked");
        }

        @Override
        public void onGetContainerStatusError(ContainerId cid, Throwable ex) {
            LOG.info("onGetContainerStatusError Invoked: " + ex.toString());
            ApplicationMaster.this
                    .handleFailure(Collections.singletonList(cid));
        }

        @Override
        public void onStartContainerError(ContainerId cid, Throwable ex) {
            LOG.info("onStartContainerError Invoked: " + ex.getMessage());
            ApplicationMaster.this
               .onStartContainerError(cid);
        }

        @Override
        public void onStopContainerError(ContainerId cid, Throwable ex) {
            LOG.info("onStopContainerError Invoked: " + ex.toString());
        }
    }
}

xgboost/doc/faq.md  view on Meta::CPAN

You can run xgboost on YARN directly. In theory, Mesos and other resource allocation engines could easily be supported as well.


Why not implement distributed xgboost on top of X (Spark, Hadoop)
-----------------------------------------------------------------
The first fact we need to know is that going distributed does not necessarily solve all problems.
Instead, it creates new ones, such as extra communication overhead and the need for fault tolerance.
The ultimate question still comes back to how to push the limit of each computation node
and use fewer resources to complete the task (and thus incur less communication and a lower chance of failure).

To achieve this, we decided to reuse the optimizations in the single-node xgboost and build the distributed version on top of it.
The communication requirements of machine learning are rather simple, in the sense that we can depend on a limited set of APIs (in our case, rabit).
Such a design allows us to reuse most of the code, while staying portable to major platforms such as Hadoop/YARN, MPI, and SGE.
Most importantly, it pushes the limit of the computation resources we can use.


How can I port the model to my own system
-----------------------------------------
The model and data format of XGBoost are exchangeable,
which means a model trained in one language binding can be loaded in another.
This means you can train the model using R, while running prediction using
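
As an illustration, here is a minimal sketch using the Python binding (the file and variable names `model.bin`, `dtrain`, and `dtest` are hypothetical); the binary model saved this way can then be loaded by the R, Java, or CLI bindings as well:

```python
import xgboost as xgb

# train on an existing DMatrix (dtrain is assumed to exist)
bst = xgb.train({'objective': 'reg:linear'}, dtrain, num_boost_round=10)

# save the binary model; the same file can be loaded from other bindings
bst.save_model('model.bin')

# later, possibly in another process, reload the model and predict
bst2 = xgb.Booster(model_file='model.bin')
preds = bst2.predict(dtest)  # dtest is another DMatrix
```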

xgboost/doc/input_format.md  view on Meta::CPAN

### Group Input Format
Since XGBoost supports the [ranking task](../demo/rank), we support the group input format. In ranking tasks, instances are categorized into different groups in real-world scenarios; for example, in the learning-to-rank web pages scenario, the ...
and the group file is as below:

train.txt.group
```
2
3
```
This means that the data set contains 5 instances: the first two instances are in one group and the other three are in another group. The numbers in the group file indicate the number of instances in each group in the instance file ...
During configuration, you do not have to specify the path of the group file. If the instance file name is "xxx", XGBoost will check whether there is a file named "xxx.group" in the same directory and decide whether to read the data as group input fo...
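
For reference, a minimal sketch of supplying the same grouping programmatically through the Python binding (the file names and group sizes below are just the ones from this example, and it assumes the Python package is available):

```python
import xgboost as xgb

# loading "train.txt" also picks up "train.txt.group" automatically if it exists
dtrain = xgb.DMatrix('train.txt')

# alternatively, supply the group sizes explicitly:
# 5 instances in total, the first 2 in one group and the next 3 in another
dtrain.set_group([2, 3])

bst = xgb.train({'objective': 'rank:pairwise'}, dtrain, num_boost_round=10)
```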

### Instance Weight File
XGBoost supports assigning each instance a weight to differentiate the importance of instances. For example, we can provide an instance weight file for the "train.txt" file in the example as below:

train.txt.weight
```
1
0.5
0.5
1

xgboost/doc/parameter.md  view on Meta::CPAN

XGBoost Parameters
==================
Before running XGBoost, we must set three types of parameters: general parameters, booster parameters and task parameters (a short example follows the list below).
- General parameters relate to which booster we are using to do boosting, commonly a tree or a linear model.
- Booster parameters depend on which booster you have chosen.
- Learning task parameters decide the learning scenario; for example, regression tasks may use different parameters from ranking tasks.
- Command line parameters relate to the behavior of the CLI version of xgboost.
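
As a minimal sketch (the parameter values are illustrative only, and `dtrain` is assumed to be an existing `DMatrix`), the first three kinds of parameters are typically passed together as a single dictionary in the Python binding:

```python
import xgboost as xgb

params = {
    'booster': 'gbtree',           # general parameter: which booster to use
    'max_depth': 4, 'eta': 0.1,    # booster parameters (tree booster)
    'objective': 'reg:linear',     # learning task parameter
}
bst = xgb.train(params, dtrain, num_boost_round=10)
```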

Parameters in R Package
-----------------------
In the R package, you can use a dot (.) in place of an underscore in parameter names; for example, you can use max.depth instead of max_depth. The underscore form of the parameters is also valid in R.

General Parameters
------------------
* booster [default=gbtree]
  - which booster to use: can be gbtree, gblinear or dart. gbtree and dart use a tree-based model, while gblinear uses a linear function.

xgboost/include/xgboost/data.h  view on Meta::CPAN

   */
  virtual void InitColAccess(const std::vector<bool>& enabled,
                             float subsample,
                             size_t max_row_perbatch) = 0;
  // the following are column meta data, should be able to answer them fast.
  /*! \return whether column access is enabled */
  virtual bool HaveColAccess() const = 0;
  /*! \return Whether the data columns are stored in a single column block. */
  virtual bool SingleColBlock() const = 0;
  /*! \brief get number of non-missing entries in column */
  virtual size_t GetColSize(size_t cidx) const = 0;
  /*! \brief get column density */
  virtual float GetColDensity(size_t cidx) const = 0;
  /*! \return reference of buffered rowset, in column access */
  virtual const RowSet& buffered_rowset() const = 0;
  /*! \brief virtual destructor */
  virtual ~DMatrix() {}
  /*!
   * \brief Save DMatrix to local file.
   *  The saved file only works for a non-sharded dataset (single machine training).
   *  This API is deprecated and its use is discouraged.
   * \param fname The file name to be saved.
   * \return The created DMatrix.

xgboost/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitTrackerHandler.scala  view on Meta::CPAN

  private[this] val jobToRankMap = mutable.HashMap.empty[String, Int]
  private[this] val actorRefToHost = mutable.HashMap.empty[ActorRef, String]
  private[this] val ranksToAssign = mutable.ListBuffer(0 until numWorkers: _*)
  private[this] var maxPortTrials = 0
  private[this] var workerConnectionTimeout: Duration = Duration.Inf
  private[this] var portTrials = 0
  private[this] val startedWorkers = mutable.Set.empty[Int]

  val linkMap = new LinkMap(numWorkers)

  def decideRank(rank: Int, jobId: String = "NULL"): Option[Int] = {
    rank match {
      case r if r >= 0 => Some(r)
      case _ =>
        jobId match {
          case "NULL" => None
          case jid => jobToRankMap.get(jid)
        }
    }
  }

xgboost/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitTrackerHandler.scala  view on Meta::CPAN


    case WorkerRecover(prevRank, worldSize, jobId) =>
      assert(prevRank >= 0)
      sender() ! linkMap.assignRank(prevRank)

    case WorkerStart(rank, worldSize, jobId) =>
      assert(worldSize == numWorkers || worldSize == -1,
        s"Purported worldSize ($worldSize) does not match worker count ($numWorkers)."
      )

      Try(decideRank(rank, jobId).getOrElse(ranksToAssign.remove(0))) match {
        case Success(wkRank) =>
          if (jobId != "NULL") {
            jobToRankMap.put(jobId, wkRank)
          }

          val assignedRank = linkMap.assignRank(wkRank)
          sender() ! assignedRank
          resolver ! assignedRank

          log.info("Received start signal from " +

xgboost/plugin/lz4/sparse_page_lz4_format.cc  view on Meta::CPAN

  }

  bool Read(SparsePage* page,
            dmlc::SeekStream* fi,
            const std::vector<bst_uint>& sorted_index_set) override {
    if (!fi->Read(&disk_offset_)) return false;
    this->LoadIndexValue(fi);

    page->offset.clear();
    page->offset.push_back(0);
    for (bst_uint cid : sorted_index_set) {
      page->offset.push_back(
          page->offset.back() + disk_offset_[cid + 1] - disk_offset_[cid]);
    }
    page->data.resize(page->offset.back());
    CHECK_EQ(index_.data.size(), value_.data.size());
    CHECK_EQ(index_.data.size(), disk_offset_.back());

    for (size_t i = 0; i < sorted_index_set.size(); ++i) {
      bst_uint cid = sorted_index_set[i];
      size_t dst_begin = page->offset[i];
      size_t src_begin = disk_offset_[cid];
      size_t num = disk_offset_[cid + 1] - disk_offset_[cid];
      for (size_t j = 0; j < num; ++j) {
        page->data[dst_begin + j] = SparseBatch::Entry(
            index_.data[src_begin + j] + min_index_, value_.data[src_begin + j]);
      }
    }
    return true;
  }

  void Write(const SparsePage& page, dmlc::Stream* fo) override {
    CHECK(page.offset.size() != 0 && page.offset[0] == 0);

xgboost/plugin/updater_gpu/src/exact/argmax_by_key.cuh  view on Meta::CPAN

#include "../common.cuh"
#include "node.cuh"
#include "../types.cuh"

namespace xgboost {
namespace tree {
namespace exact {

/**
 * @enum ArgMaxByKeyAlgo best_split_evaluation.cuh
 * @brief Help decide which algorithm to use for multi-argmax operation
 */
enum ArgMaxByKeyAlgo {
  /** simplest, use gmem-atomics for all updates */
  ABK_GMEM = 0,
  /** use smem-atomics for updates (when number of keys are less) */
  ABK_SMEM
};

/** max depth until which to use shared mem based atomics for argmax */
static const int MAX_ABK_LEVELS = 3;

xgboost/rabit/doc/guide.md  view on Meta::CPAN


#### Link against Mock Test Rabit Library
If you want to use a mock to test the program in order to see the behavior of the code when some nodes go down, you can link against ```rabit_mock.a```.
* Simply change the linker flag from ```-lrabit``` to ```-lrabit_mock```

The resulting rabit mock program can take in additional arguments in the following format
```
mock=rank,version,seq,ndeath
```

The four integers specify an event that will cause the program to ```commit suicide``` (exit with -2)
* rank specifies the rank of the node to kill
* version specifies the version (iteration) of the model where you want the process to die
* seq specifies the sequence number of the Allreduce/Broadcast call since last checkpoint, where the process will be killed
* ndeath specifies how many times this node has already died

For example, consider the following script in the test case
```bash
../tracker/rabit_demo.py -n 10 test_model_recover 10000\
                         mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1
```

xgboost/rabit/include/dmlc/base.h  view on Meta::CPAN

/*!
 * \brief whether to always log a message before throwing
 * This can help identify errors that cannot be caught.
 */
#ifndef DMLC_LOG_BEFORE_THROW
#define DMLC_LOG_BEFORE_THROW 1
#endif

/*!
 * \brief Whether to use customized logger,
 * whose output can be decided by other libraries.
 */
#ifndef DMLC_LOG_CUSTOMIZE
#define DMLC_LOG_CUSTOMIZE 0
#endif

/*!
 * \brief Whether to print stack trace for fatal error,
 *  enabled on Linux when using gcc.
 */
#if (!defined(DMLC_LOG_STACK_TRACE) && defined(__GNUC__) && !defined(__MINGW32__))

xgboost/rabit/src/allreduce_robust.cc  view on Meta::CPAN

        ReConnectLinks();
        err_type = kSuccess;
        break;
      }
      default: utils::Assert(false, "RecoverLinks: cannot reach here");
    }
  }
  return false;
}
/*!
 * \brief message passing function, used to decide the
 *        shortest distance to the possible source of data
 * \param node_value a pair of have_data and size
 *           have_data whether the current node has data
 *           size gives the size of the data, if the current node is kHaveData
 * \param dist_in the shortest distance to any data source in each direction
 * \param out_index the edge index of the output link
 * \return the shortest distance result of the out edge specified by out_index
 */
inline std::pair<int, size_t>
ShortestDist(const std::pair<bool, size_t> &node_value,

xgboost/rabit/src/allreduce_robust.cc  view on Meta::CPAN

    if (dist_in[i].first + 1 < res) {
      res = dist_in[i].first + 1;
      size = dist_in[i].second;
    }
  }
  // add one hop

  return std::make_pair(res, size);
}
/*!
 * \brief message passing function, used to decide the
 *    data request from each edge, i.e. whether we need to request data from a certain edge
 * \param node_value a pair of request_data and best_link
 *           request_data stores whether the current node needs to request data
 *           best_link gives the best edge index to fetch the data
 * \param req_in the data request from incoming edges
 * \param out_index the edge index of output link
 * \return the request to the output edge
 */
inline char DataRequest(const std::pair<bool, int> &node_value,
                        const std::vector<char> &req_in,

xgboost/rabit/src/allreduce_robust.cc  view on Meta::CPAN

  if (static_cast<int>(out_index) == best_link) {
    if (request_data) return 1;
    for (size_t i = 0; i < req_in.size(); ++i) {
      if (i == out_index) continue;
      if (req_in[i] != 0) return 1;
    }
  }
  return 0;
}
/*!
 * \brief try to decide the recovery message passing request
 * \param role the current role of the node
 * \param p_size used to store the size of the message; for a node in state kHaveData,
 *               this size must be set correctly before calling the function,
 *               for others, this serves as an output parameter
 *
 * \param p_recvlink used to store the link the current node should receive data from, if necessary;
 *          this can be -1, which means the current node already has the data
 * \param p_req_in used to store the resulting vector, indicating which link we should send the data to
 *
 * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
 * \sa ReturnType
 */
AllreduceRobust::ReturnType
AllreduceRobust::TryDecideRouting(AllreduceRobust::RecoverType role,
                                  size_t *p_size,
                                  int *p_recvlink,
                                  std::vector<bool> *p_req_in) {
  int best_link = -2;
  {
    // get the shortest distance to the request point
    std::vector<std::pair<int, size_t> > dist_in, dist_out;
    ReturnType succ = MsgPassing(std::make_pair(role == kHaveData, *p_size),
                                 &dist_in, &dist_out, ShortestDist);
    if (succ != kSuccess) return succ;

xgboost/rabit/src/allreduce_robust.cc  view on Meta::CPAN

    if (req_out[i] != 0) {
      utils::Assert(req_in[i] == 0, "cannot get and receive request");
      utils::Assert(static_cast<int>(i) == best_link, "request result inconsistent");
    }
  }
  *p_recvlink = best_link;
  return kSuccess;
}
/*!
 * \brief try to finish the data recovery request,
 *        this function is used together with TryDecideRouting
 * \param role the current role of the node
 * \param sendrecvbuf_ the buffer to store the data to be sent/received
 *          - if the role is kHaveData, this stores the data to be sent
 *          - if the role is kRequestData, this is the buffer to store the result
 *          - if the role is kPassData, this will not be used, and can be NULL
 * \param size the size of the data, obtained from TryDecideRouting
 * \param recv_link the link index to receive data, if necessary, obtained from TryDecideRouting
 * \param req_in the request of each link to send data, obtained from TryDecideRouting
 *
 * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
 * \sa ReturnType, TryDecideRouting
 */
AllreduceRobust::ReturnType
AllreduceRobust::TryRecoverData(RecoverType role,
                                void *sendrecvbuf_,
                                size_t size,
                                int recv_link,
                                const std::vector<bool> &req_in) {
  RefLinkVector &links = tree_links;
  // no need to run recovery for zero size messages
  if (links.size() == 0 || size == 0) return kSuccess;
  utils::Assert(req_in.size() == links.size(), "TryRecoverData");
  const int nlink = static_cast<int>(links.size());
  {
    bool req_data = role == kRequestData;
    for (int i = 0; i < nlink; ++i) {
      if (req_in[i]) {
        utils::Assert(i != recv_link, "TryDecideRouting");
        req_data = true;
      }
    }
    // do not need to provide data or receive data, directly exit
    if (!req_data) return kSuccess;
  }
  utils::Assert(recv_link >= 0 || role == kHaveData, "recv_link must be active");
  if (role == kPassData) {
    links[recv_link].InitBuffer(1, size, reduce_buffer_size);
  }

xgboost/rabit/src/allreduce_robust.cc  view on Meta::CPAN

    global_checkpoint.resize(0);
    utils::MemoryBufferStream fs(&global_checkpoint);
    fs.Write(&version_number, sizeof(version_number));
    global_lazycheck->Save(&fs);
    global_lazycheck = NULL;
  }
  // recover global checkpoint
  size_t size = this->global_checkpoint.length();
  int recv_link;
  std::vector<bool> req_in;
  succ = TryDecideRouting(role, &size, &recv_link, &req_in);
  if (succ != kSuccess) return succ;
  if (role == kRequestData) {
    global_checkpoint.resize(size);
  }
  if (size == 0) return kSuccess;
  return TryRecoverData(role, BeginPtr(global_checkpoint), size, recv_link, req_in);
}
/*!
 * \brief try to get the result of operation specified by seqno
 *

xgboost/rabit/src/allreduce_robust.cc  view on Meta::CPAN

  if (!requester) {
    sendrecvbuf = resbuf.Query(seqno, &size);
    role = sendrecvbuf != NULL ? kHaveData : kPassData;
  } else {
    role = kRequestData;
  }
  int recv_link;
  std::vector<bool> req_in;
  // size of data
  size_t data_size = size;
  ReturnType succ = TryDecideRouting(role, &data_size, &recv_link, &req_in);
  if (succ != kSuccess) return succ;
  utils::Check(data_size != 0, "zero size check point is not allowed");
  if (role == kRequestData || role == kHaveData) {
    utils::Check(data_size == size,
                 "Allreduce Recovered data size do not match the specification of function call.\n"\
                 "Please check if calling sequence of recovered program is the " \
                 "same the original one in current VersionNumber");
  }
  return TryRecoverData(role, sendrecvbuf, data_size, recv_link, req_in);
}

xgboost/rabit/src/allreduce_robust.h  view on Meta::CPAN

   *
   * \param buf the buffer to store the result, this parameter is only used when the current node is the requester
   * \param size the total size of the buffer, this parameter is only used when the current node is the requester
   * \param seqno sequence number of the operation, this is the unique index of an operation in the current iteration
   * \param requester whether the current node is the requester
   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
   * \sa ReturnType
   */
  ReturnType TryGetResult(void *buf, size_t size, int seqno, bool requester);
  /*!
   * \brief try to decide the routing strategy for recovery
   * \param role the current role of the node
   * \param p_size used to store the size of the message; for a node in state kHaveData,
   *               this size must be set correctly before calling the function,
   *               for others, this serves as an output parameter

   * \param p_recvlink used to store the link the current node should receive data from, if necessary;
   *          this can be -1, which means the current node already has the data
   * \param p_req_in used to store the resulting vector, indicating which link we should send the data to
   *
   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
   * \sa ReturnType, TryRecoverData
   */
  ReturnType TryDecideRouting(RecoverType role,
                              size_t *p_size,
                              int *p_recvlink,
                              std::vector<bool> *p_req_in);
  /*!
   * \brief try to finish the data recovery request,
   *        this function is used together with TryDecideRouting
   * \param role the current role of the node
   * \param sendrecvbuf_ the buffer to store the data to be sent/received
   *          - if the role is kHaveData, this stores the data to be sent
   *          - if the role is kRequestData, this is the buffer to store the result
   *          - if the role is kPassData, this will not be used, and can be NULL
   * \param size the size of the data, obtained from TryDecideRouting
   * \param recv_link the link index to receive data, if necessary, obtained from TryDecideRouting
   * \param req_in the request of each link to send data, obtained from TryDecideRouting
   *
   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
   * \sa ReturnType, TryDecideRouting
   */
  ReturnType TryRecoverData(RecoverType role,
                            void *sendrecvbuf_,
                            size_t size,
                            int recv_link,
                            const std::vector<bool> &req_in);
  /*!
   * \brief try to recover the local state, making each local state to be the result of itself
   *        plus replication of states in previous num_local_replica hops in the ring
   *

xgboost/rabit/src/allreduce_robust.h  view on Meta::CPAN

  // result buffer of all reduce
  ResultBuffer resbuf;
  // last check point global model
  std::string global_checkpoint;
  // lazy checkpoint of global model
  const Serializable *global_lazycheck;
  // number of replica for local state/model
  int num_local_replica;
  // number of default local replica
  int default_local_replica;
  // flag to decide whether local model is used, -1: unknown, 0: no, 1:yes
  int use_local_model;
  // number of replica for global state/model
  int num_global_replica;
  // number of times recovery happens
  int recover_counter;
  // --- recovery data structure for local checkpoint
  // there is two version of the data structure,
  // at one time one version is valid and another is used as temp memory
  // pointer to memory position in the local model
  // local model is stored in CSR format(like a sparse matrices)

xgboost/rabit/src/socket.h  view on Meta::CPAN

  }
  /*! \brief get a new connection */
  TCPSocket Accept(void) {
    SOCKET newfd = accept(sockfd, NULL, NULL);
    if (newfd == INVALID_SOCKET) {
      Socket::Error("Accept");
    }
    return TCPSocket(newfd);
  }
  /*!
   * \brief decide whether the socket is at OOB mark
   * \return 1 if at mark, 0 if not, -1 if an error occurred
   */
  inline int AtMark(void) const {
#ifdef _WIN32
    unsigned long atmark;  // NOLINT(*)
    if (ioctlsocket(sockfd, SIOCATMARK, &atmark) != NO_ERROR) return -1;
#else
    int atmark;
    if (ioctl(sockfd, SIOCATMARK, &atmark) == -1) return -1;
#endif

xgboost/src/data/data.cc  view on Meta::CPAN

namespace data {
SparsePage::Format* SparsePage::Format::Create(const std::string& name) {
  auto *e = ::dmlc::Registry< ::xgboost::data::SparsePageFormatReg>::Get()->Find(name);
  if (e == nullptr) {
    LOG(FATAL) << "Unknown format type " << name;
  }
  return (e->body)();
}

std::pair<std::string, std::string>
SparsePage::Format::DecideFormat(const std::string& cache_prefix) {
  size_t pos = cache_prefix.rfind(".fmt-");

  if (pos != std::string::npos) {
    std::string fmt = cache_prefix.substr(pos + 5, cache_prefix.length());
    size_t cpos = fmt.rfind('-');
    if (cpos != std::string::npos) {
      return std::make_pair(fmt.substr(0, cpos), fmt.substr(cpos + 1, fmt.length()));
    } else {
      return std::make_pair(fmt, fmt);
    }

xgboost/src/data/simple_dmatrix.h  view on Meta::CPAN

  }

  bool HaveColAccess() const override {
    return col_size_.size() != 0;
  }

  const RowSet& buffered_rowset() const override {
    return buffered_rowset_;
  }

  size_t GetColSize(size_t cidx) const override {
    return col_size_[cidx];
  }

  float GetColDensity(size_t cidx) const override {
    size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
  }

  dmlc::DataIter<ColBatch>* ColIterator() override;

  dmlc::DataIter<ColBatch>* ColIterator(const std::vector<bst_uint>& fset) override;

  void InitColAccess(const std::vector<bool>& enabled,
                     float subsample,
                     size_t max_row_perbatch) override;

xgboost/src/data/sparse_batch_page.h  view on Meta::CPAN

   * \brief save the data to fo, when a page was written.
   * \param fo output stream
   */
  virtual void Write(const SparsePage& page, dmlc::Stream* fo) = 0;
  /*!
   * \brief Create sparse page of format.
   * \return The created format functors.
   */
  static Format* Create(const std::string& name);
  /*!
   * \brief decide the format from cache prefix.
   * \return pair of row format, column format type of the cache prefix.
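   *  e.g. a cache prefix ending in ".fmt-raw-lz4" yields the pair ("raw", "lz4").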
   */
  static std::pair<std::string, std::string> DecideFormat(const std::string& cache_prefix);
};

#if DMLC_ENABLE_STD_THREAD
/*!
 * \brief A threaded writer to write sparse batch page to sharded files.
 */
class SparsePage::Writer {
 public:
  /*!
   * \brief constructor

xgboost/src/data/sparse_page_dmatrix.cc  view on Meta::CPAN

      return true;
    } else {
      return false;
    }
  };

  std::vector<std::string> cache_shards = common::Split(cache_info_, ':');
  std::vector<std::string> name_shards, format_shards;
  for (const std::string& prefix : cache_shards) {
    name_shards.push_back(prefix + ".col.page");
    format_shards.push_back(SparsePage::Format::DecideFormat(prefix).second);
  }

  {
    SparsePage::Writer writer(name_shards, format_shards, 6);
    std::shared_ptr<SparsePage> page;
    writer.Alloc(&page); page->Clear();

    double tstart = dmlc::GetTime();
    size_t bytes_write = 0;
    // print every 4 sec.


