Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/rabit/src/allreduce_robust.h  view on Meta::CPAN

   * \return this function can return kSuccess or kSockError
   *         when kSockError is returned, it simply means there are bad sockets in the links,
   *         and some link recovery procedure is needed
   */
  ReturnType TryResetLinks(void);
  /*!
   * \brief check the reported error type and, if it indicates an error,
   *        recover the links according to that error type;
   *        if there is no error, simply return true
   * \param err_type the type of error happening in the system
   * \return true if err_type is kSuccess, false otherwise
   */
  bool CheckAndRecover(ReturnType err_type);
  /*!
   * \brief try to run recovery execution for a requested action described by flag and seqno,
   *        the function will keep blocking to run possible recovery operations before the specified action,
   *        until the requested result is received by a recovering procedure,
   *        or the function discovers that the requested action is not yet executed, and returns false
   *
   * \param buf the buffer to store the result
   * \param size the total size of the buffer
   * \param flag flag information about the action \sa ActionSummary
   * \param seqno sequence number of the action, if it is a special action with flag set,
   *        seqno needs to be set to ActionSummary::kSpecialOp (this is also the default value)
   *
   * \return this function can return true or false
   *    - true means buf is already set to the result by the recovering procedure,
   *           the action is complete, no further action is needed
   *    - false means this is the latest action that has not yet been executed,
   *           the action still needs to be executed
   */
  bool RecoverExec(void *buf, size_t size, int flag,
                   int seqno = ActionSummary::kSpecialOp);
  /*!
   * \brief try to load the check point
   *
   *        This is a collaborative function called by all nodes;
   *        only the nodes with requester set to true really need to load the check point,
   *        the other nodes act in a collaborative role to complete this request
   *
   * \param requester whether the current node is the requester
   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
   * \sa ReturnType
   */
  ReturnType TryLoadCheckPoint(bool requester);
  /*!
   * \brief try to get the result of the operation specified by seqno
   *
   *        This is a collaborative function called by all nodes;
   *        only the nodes with requester set to true really need to get the result,
   *        the other nodes act in a collaborative role to complete this request
   *
   * \param buf the buffer to store the result, this parameter is only used when the current node is a requester
   * \param size the total size of the buffer, this parameter is only used when the current node is a requester
   * \param seqno sequence number of the operation, this is the unique index of an operation in the current iteration
   * \param requester whether the current node is the requester
   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
   * \sa ReturnType
   */
  ReturnType TryGetResult(void *buf, size_t size, int seqno, bool requester);
  /*!
   * \brief try to decide the routing strategy for recovery
   * \param role the current role of the node
   * \param p_size used to store the size of the message; for a node in state kHaveData,
   *               this size must be set correctly before calling the function,
   *               for the others, this serves as an output parameter
   * \param p_recvlink used to store the link the current node should receive data from, if necessary;
   *          this can be -1, which means the current node already has the data
   * \param p_req_in used to store the resulting vector, indicating which links we should send the data to
   *
   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
   * \sa ReturnType, TryRecoverData
   */
  ReturnType TryDecideRouting(RecoverType role,
                              size_t *p_size,
                              int *p_recvlink,
                              std::vector<bool> *p_req_in);
  /*!
   * \brief try to finish the data recovery request,
   *        this function is used together with TryDecideRouting
   * \param role the current role of the node
   * \param sendrecvbuf_ the buffer to store the data to be sent/received
   *          - if the role is kHaveData, this stores the data to be sent
   *          - if the role is kRequestData, this is the buffer to store the result
   *          - if the role is kPassData, this will not be used, and can be NULL
   * \param size the size of the data, obtained from TryDecideRouting
   * \param recv_link the link index to receive data from, if necessary, obtained from TryDecideRouting
   * \param req_in the request of each link to send data, obtained from TryDecideRouting
   *
   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
   * \sa ReturnType, TryDecideRouting
   */
  ReturnType TryRecoverData(RecoverType role,
                            void *sendrecvbuf_,
                            size_t size,
                            int recv_link,
                            const std::vector<bool> &req_in);
  /*!
   * \brief try to recover the local state, making each local state the result of itself
   *        plus a replication of the states in the previous num_local_replica hops in the ring
   *
   * The input parameters must contain the valid local states available in the current node.
   * This function tries its best to "complete" the missing parts of local_rptr and local_chkpt.
   * If there is sufficient information in the ring, when the function returns, local_chkpt will
   * contain num_local_replica + 1 checkpoints (including the chkpt of this node).
   * If there is not sufficient information in the ring, the number of checkpoints
   * will be less than the specified value.
   *
   * \param p_local_rptr the pointer to the segment pointers in the states array
   * \param p_local_chkpt the pointer to the storage of local check points
   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
   * \sa ReturnType
   */
  ReturnType TryRecoverLocalState(std::vector<size_t> *p_local_rptr,
                                  std::string *p_local_chkpt);
  /*!
   * \brief try to checkpoint the local state, this function is called in the normal execution phase
   *    of a checkpoint that contains local state
   *
   *  The input state must contain exactly one saved state (the local state of the current node);
   *  after completion, this function will get the local state from the previous num_local_replica nodes
   *  and put them into local_chkpt and local_rptr
   *
   *  It is also OK to call TryRecoverLocalState instead;
   *  TryRecoverLocalState makes fewer assumptions about the input, and requires more communication
   *
   * \param p_local_rptr the pointer to the segment pointers in the states array
   * \param p_local_chkpt the pointer to the storage of local check points
   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
   * \sa ReturnType, TryRecoverLocalState
   */
  ReturnType TryCheckinLocalState(std::vector<size_t> *p_local_rptr,
                                  std::string *p_local_chkpt);
  /*!
   * \brief perform a ring passing to receive data from the prev link, and send data to the next link;
   *  this allows data to stream over a ring structure
   *  sendrecvbuf[0:read_ptr] are already provided by the current node
   *  current node will recv sendrecvbuf[read_ptr:read_end] from the prev link
   *  current node will send sendrecvbuf[write_ptr:write_end] to the next link
   *  write_ptr will wait till the data has been read before sending the data
   *  this function requires read_end >= write_end
   *
   * \param sendrecvbuf_ the place to hold the incoming and outgoing data
   * \param read_ptr the initial read pointer
   * \param read_end the ending position to read
   * \param write_ptr the initial write pointer
   * \param write_end the ending position to write
   * \param read_link pointer to the link to the previous position in the ring
   * \param write_link pointer to the link of the next position in the ring
   * \return a ReturnType status code, see ReturnType for details
   * \sa ReturnType
   */
  ReturnType RingPassing(void *sendrecvbuf_,
                         size_t read_ptr,
                         size_t read_end,
                         size_t write_ptr,
                         size_t write_end,
                         LinkRecord *read_link,
                         LinkRecord *write_link);
  /*!
   * \brief run a message passing algorithm on the allreduce tree,
   *        the result is the edge messages stored in p_edge_in and p_edge_out
   * \param node_value the value associated with the current node
   * \param p_edge_in used to store the input message from each of the edges
   * \param p_edge_out used to store the output message to each of the edges
   * \param func a function that defines the message passing rule
   *        Parameters of func:
   *           - node_value same as node_value in the main function
   *           - edge_in the array of input messages from each edge,
   *                     this includes the output edge, which should be excluded
   *           - out_index the index of the output edge, the function should
   *                       exclude the output edge when computing the message passing value
   *        Return of func:
   *           the function returns the output message based on the input messages and node_value
   * \return a ReturnType status code, see ReturnType for details
   *
   * \tparam NodeType type of node value
   * \tparam EdgeType type of edge message, must be a simple struct
   * \sa ReturnType
   */
  template<typename NodeType, typename EdgeType>
  inline ReturnType MsgPassing(const NodeType &node_value,
                               std::vector<EdgeType> *p_edge_in,
                               std::vector<EdgeType> *p_edge_out,
                               EdgeType(*func)
                               (const NodeType &node_value,
                                const std::vector<EdgeType> &edge_in,
                                size_t out_index));
  //---- recovery data structure ----
  // the round of the result buffer, used to mode the result
  // NOTE(review): "mode" presumably means taking a modulus over sequence numbers -- confirm against the implementation
  int result_buffer_round;
  // result buffer of allreduce
  ResultBuffer resbuf;
  // last check point of the global model
  std::string global_checkpoint;
  // lazy checkpoint of the global model
  const Serializable *global_lazycheck;
  // number of replicas for the local state/model
  int num_local_replica;
  // default number of local replicas
  int default_local_replica;
  // flag to decide whether the local model is used, -1: unknown, 0: no, 1: yes
  int use_local_model;
  // number of replicas for the global state/model
  int num_global_replica;
  // number of times recovery has happened
  int recover_counter;
  // --- recovery data structure for local checkpoint
  // there are two versions of the data structure,
  // at any one time one version is valid and the other is used as temporary memory
  // pointer to the memory position in the local model
  // the local model is stored in CSR format (like a sparse matrix)
  // local_model[rptr[0]:rptr[1]] stores the model of the current node
  // local_model[rptr[k]:rptr[k+1]] stores the model of the node k hops before
  std::vector<size_t> local_rptr[2];
  // storage for the local model replicas
  std::string local_chkpt[2];
  // version of the local checkpoint, can be 1 or 0
  // NOTE(review): presumably selects which of the two slots above is currently valid -- confirm
  int local_chkpt_version;
};
}  // namespace engine
}  // namespace rabit
// implementation of inline template function
#include "./allreduce_robust-inl.h"
#endif  // RABIT_ALLREDUCE_ROBUST_H_



( run in 0.619 second using v1.01-cache-2.11-cpan-39bf76dae61 )