view release on metacpan or search on metacpan
xgboost/NEWS.md view on Meta::CPAN
* Add CMake build system
- https://github.com/dmlc/xgboost/pull/1314
## v0.47 (2016.01.14)
* Changes in R library
- fixed possible problem of poisson regression.
- switched from 0 to NA for missing values.
- exposed access to additional model parameters.
* Changes in Python library
- throws an exception instead of crashing the terminal when a parameter error happens.
- has importance plot and tree plot functions.
- accepts different learning rates for each boosting round.
- allows model training continuation from previously saved model.
- allows early stopping in CV.
- allows feval to return a list of tuples.
- allows eval_metric to handle additional format.
- improved compatibility in sklearn module.
- additional parameters added for sklearn wrapper.
- added pip installation functionality.
- supports more Pandas DataFrame dtypes.
xgboost/R-package/tests/testthat/test_helpers.R view on Meta::CPAN
expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
expect_equal(dt.tree[, -4, with=FALSE], dt.tree.x[, -4, with=FALSE])
# using integer node ID instead of character
dt.tree.int <- xgb.model.dt.tree(model = bst.Tree, use_int_id = TRUE)
expect_equal(as.integer(tstrsplit(dt.tree$Yes, '-')[[2]]), dt.tree.int$Yes)
expect_equal(as.integer(tstrsplit(dt.tree$No, '-')[[2]]), dt.tree.int$No)
expect_equal(as.integer(tstrsplit(dt.tree$Missing, '-')[[2]]), dt.tree.int$Missing)
})
# xgb.model.dt.tree() must signal an error when it is handed bst.GLM.
# NOTE(review): bst.GLM is presumably the gblinear booster fitted earlier in this file - confirm.
test_that("xgb.model.dt.tree throws error for gblinear", {
expect_error(xgb.model.dt.tree(model = bst.GLM))
})
test_that("xgb.importance works with and without feature names", {
importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree)
expect_equal(dim(importance.Tree), c(7, 4))
expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
expect_output(str(importance.Tree), 'Feature.*\\"Age\\"')
importance.Tree.0 <- xgb.importance(model = bst.Tree)
xgboost/cub/eclipse code style profile.xml view on Meta::CPAN
<setting id="org.eclipse.cdt.core.formatter.alignment_for_compact_if" value="0"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_array_initializer" value="insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_new_line_at_end_of_file_if_missing" value="do not insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_parameters" value="insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_expression_list" value="insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_before_question_in_conditional" value="insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_exception_specification" value="insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_before_binary_operator" value="insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_identifier_in_function_declaration" value="do not insert"/>
<setting id="org.eclipse.cdt.core.formatter.alignment_for_base_clause_in_type_declaration" value="48"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_throws" value="do not insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_exception_specification" value="do not insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments" value="do not insert"/>
<setting id="org.eclipse.cdt.core.formatter.indent_declaration_compare_to_template_header" value="false"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_after_unary_operator" value="do not insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_switch" value="insert"/>
<setting id="org.eclipse.cdt.core.formatter.indent_statements_compare_to_body" value="true"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_throws" value="insert"/>
<setting id="org.eclipse.cdt.core.formatter.indent_statements_compare_to_block" value="true"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_arguments" value="do not insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_catch_in_try_statement" value="insert"/>
<setting id="org.eclipse.cdt.core.formatter.alignment_for_throws_clause_in_method_declaration" value="48"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_invocation" value="do not insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_paren_in_cast" value="insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_catch" value="do not insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_parameters" value="do not insert"/>
<setting id="org.eclipse.cdt.core.formatter.tabulation.char" value="space"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_parameters" value="do not insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_while" value="do not insert"/>
<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments" value="insert"/>
<setting id="org.eclipse.cdt.core.formatter.brace_position_for_block_in_case" value="end_of_line"/>
<setting id="org.eclipse.cdt.core.formatter.compact_else_if" value="true"/>
xgboost/demo/kaggle-higgs/higgs-numpy.py view on Meta::CPAN
# since we only need the rank
param['objective'] = 'binary:logitraw'
# scale weight of positive examples
param['scale_pos_weight'] = sum_wneg/sum_wpos
param['eta'] = 0.1
param['max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 1
param['nthread'] = 16
# you can directly throw param in, though we want to watch multiple metrics here
plst = list(param.items())+[('eval_metric', 'ams@0.15')]
watchlist = [ (xgmat,'train') ]
# boost 120 trees
num_round = 120
print ('loading data end, start to boost trees')
bst = xgb.train( plst, xgmat, num_round, watchlist );
# save out model
bst.save_model('higgs.model')
xgboost/dmlc-core/doc/parameter.md view on Meta::CPAN
Parameter Structure for Machine Learning
========================================
Parameters are one of the most important ingredients of machine learning projects.
Parameters act as a way of communication between users and the library. In this article, we will introduce the parameter module of DMLC, a lightweight C++ module that is designed to support
general machine learning libraries. It comes with the following nice properties:
- Easy declaration of typed fields, default values and constraints.
- Automatic checking of constraints, throwing an exception when a constraint is not met.
- Auto generation of human readable docstrings on parameters.
- Serialization and de-serialization into JSON and ```std::map<std::string, std::string>```.
Use Parameter Module
--------------------
### Declare the Parameter
In the dmlc parameter module, every parameter can be declared as a structure.
This means you can access these fields directly and efficiently, just like the members of any ordinary struct.
For example, it is very common to write
```c++
xgboost/dmlc-core/doc/parameter.md view on Meta::CPAN
{"num_hidden", "100"},
{"activation", "relu"},
{"name", "myname"}
};
// set the parameters
param.Init(param_data);
return 0;
}
```
After the ```Init``` function is called, the ```param``` will be filled with the specified key values in ```param_data```.
More importantly, the ```Init``` function will automatically check parameter ranges and throw a ```dmlc::ParamError```
with a detailed error message if anything goes wrong.
### Generate Human Readable Docstrings
Another useful feature of the parameter module is the ability to get a human-readable docstring for the parameter.
This is helpful when creating language bindings such as Python and R, because the docstring can be used to generate the documentation of
the foreign language interface.
The following code obtains the docstring of ```MyParam```.
```c++
std::string docstring = MyParam::__DOC__();
xgboost/dmlc-core/include/dmlc/base.h view on Meta::CPAN
*/
#ifndef DMLC_BASE_H_
#define DMLC_BASE_H_
/*! \brief whether use glog for logging */
#ifndef DMLC_USE_GLOG
#define DMLC_USE_GLOG 0
#endif
/*!
* \brief whether to throw dmlc::Error instead of
* directly calling abort when a FATAL error occurs
* NOTE: this may still not be perfect.
* do not use FATAL and CHECK in destructors
*/
#ifndef DMLC_LOG_FATAL_THROW
#define DMLC_LOG_FATAL_THROW 1
#endif
/*!
* \brief whether to always log a message before throwing
* This can help identify errors that cannot be caught.
*/
#ifndef DMLC_LOG_BEFORE_THROW
#define DMLC_LOG_BEFORE_THROW 1
#endif
/*!
* \brief Whether to use customized logger,
* whose output can be decided by other libraries.
*/
xgboost/dmlc-core/include/dmlc/base.h view on Meta::CPAN
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
//! \endcond
#else
#include <inttypes.h>
#endif
#include <string>
#include <vector>
#if defined(_MSC_VER) && _MSC_VER < 1900
#define noexcept_true throw ()
#define noexcept_false
#define noexcept(a) noexcept_##a
#endif
#if DMLC_USE_CXX11
#define DMLC_THROW_EXCEPTION noexcept(false)
#define DMLC_NO_EXCEPTION noexcept(true)
#else
#define DMLC_THROW_EXCEPTION
#define DMLC_NO_EXCEPTION
xgboost/dmlc-core/include/dmlc/json.h view on Meta::CPAN
* \brief Constructor.
* \param is the input stream.
*/
explicit JSONReader(std::istream *is)
: is_(is),
line_count_r_(0),
line_count_n_(0) {}
/*!
* \brief Parse next JSON string.
* \param out_str the output string.
* \throw dmlc::Error when next token is not string
*/
inline void ReadString(std::string *out_str);
/*!
* \brief Read Number.
* \param out_value output value;
* \throw dmlc::Error when next token is not number of ValueType.
* \tparam ValueType type of the number
*/
template<typename ValueType>
inline void ReadNumber(ValueType *out_value);
/*!
* \brief Begin parsing an object.
* \code
* std::string key;
* // value can be any type that is json serializable.
* std::string value;
xgboost/dmlc-core/include/dmlc/json.h view on Meta::CPAN
/*!
* \brief Try to read the next element in the array.
* If this call is successful, user can proceed to call
* reader->Read to read in the value.
* \return true if the read is successful, false if we are at end of the array.
*/
inline bool NextArrayItem();
/*!
* \brief Read next ValueType.
* \param out_value any STL or json readable type to be read
* \throw dmlc::Error when the read of ValueType is not successful.
* \tparam ValueType the data type to be read.
*/
template<typename ValueType>
inline void Read(ValueType *out_value);
/*! \return current line count */
inline std::string line_info() const {
char temp[64];
std::ostringstream os;
os << " Line " << std::max(line_count_r_, line_count_n_);
xgboost/dmlc-core/include/dmlc/logging.h view on Meta::CPAN
#include <vector>
#include <stdexcept>
#include "./base.h"
#if DMLC_LOG_STACK_TRACE
#include <execinfo.h>
#endif
namespace dmlc {
/*!
* \brief exception class that will be thrown by
* default logger if DMLC_LOG_FATAL_THROW == 1
*/
struct Error : public std::runtime_error {
/*!
* \brief constructor
* \param s the error message
*/
explicit Error(const std::string &s) : std::runtime_error(s) {}
};
} // namespace dmlc
xgboost/dmlc-core/include/dmlc/logging.h view on Meta::CPAN
int nframes = backtrace(stack, MAX_STACK_SIZE);
log_stream_ << "\n\n" << "Stack trace returned " << nframes << " entries:\n";
char **msgs = backtrace_symbols(stack, nframes);
if (msgs != nullptr) {
for (int i = 0; i < nframes; ++i) {
log_stream_ << "[bt] (" << i << ") " << msgs[i] << "\n";
}
}
#endif
// throwing out of destructor is evil
// hopefully we can do it here
// also log the message before throw
#if DMLC_LOG_BEFORE_THROW
LOG(ERROR) << log_stream_.str();
#endif
throw Error(log_stream_.str());
}
private:
std::ostringstream log_stream_;
DateLogger pretty_date_;
LogMessageFatal(const LogMessageFatal&);
void operator=(const LogMessageFatal&);
};
#endif
xgboost/dmlc-core/include/dmlc/optional.h view on Meta::CPAN
}
/*! \brief non-const dereference operator */
T& operator*() { // NOLINT(*)
return *reinterpret_cast<T*>(&val);
}
/*! \brief const dereference operator */
const T& operator*() const {
return *reinterpret_cast<const T*>(&val);
}
/*! \brief return the held value.
* \return const reference to the value stored in this optional
* \throw std::logic_error if no value is held (is_none is true)
*/
const T& value() const {
if (is_none) {
throw std::logic_error("bad optional access");
}
return *reinterpret_cast<const T*>(&val);
}
/*! \brief whether this object is holding a value */
explicit operator bool() const { return !is_none; }
private:
// whether this is none
bool is_none;
// on stack storage of value
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
#include <utility>
#include <iostream>
#include "./base.h"
#include "./json.h"
#include "./logging.h"
#include "./type_traits.h"
#include "./optional.h"
namespace dmlc {
// this file is backward compatible with non-c++11
/*! \brief Error thrown by parameter checking; a dmlc::Error carrying the failure message */
struct ParamError : public dmlc::Error {
/*!
* \brief constructor
* \param msg error message
*/
explicit ParamError(const std::string &msg)
: dmlc::Error(msg) {}
};
/*!
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
* \tparam PType the type of parameter struct
*
* \sa DMLC_DECLARE_FIELD, DMLC_REGISTER_PARAMETER, DMLC_DECLARE_PARAMETER
*/
template<typename PType>
struct Parameter {
public:
/*!
* \brief initialize the parameter by keyword arguments.
* This function forwards the [begin, end) range of kwargs to the
* parameter manager's RunInit, which initializes the parameter struct,
* checks consistency and throws an error if something is wrong.
*
* \param kwargs map of keyword arguments, or vector of pairs
* \param option The option on initialization; defaults to parameter::kAllowHidden.
* \tparam Container container type
* \throw ParamError when something goes wrong.
*/
template<typename Container>
inline void Init(const Container &kwargs,
parameter::ParamInitOption option = parameter::kAllowHidden) {
PType::__MANAGER__()->RunInit(static_cast<PType*>(this),
kwargs.begin(), kwargs.end(),
NULL,
option);
}
/*!
* \brief initialize the parameter by keyword arguments.
* This is same as Init, but allow unknown arguments.
*
* \param kwargs map of keyword arguments, or vector of pairs
* \tparam Container container type
* \throw ParamError when something go wrong.
* \return vector of pairs of unknown arguments.
*/
template<typename Container>
inline std::vector<std::pair<std::string, std::string> >
InitAllowUnknown(const Container &kwargs) {
std::vector<std::pair<std::string, std::string> > unknown;
PType::__MANAGER__()->RunInit(static_cast<PType*>(this),
kwargs.begin(), kwargs.end(),
&unknown, parameter::kAllowUnknown);
return unknown;
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
/*!
* \brief Write the parameters in JSON format.
* \param writer JSONWriter used for writing.
*/
inline void Save(dmlc::JSONWriter *writer) const {
writer->Write(this->__DICT__());
}
/*!
* \brief Load the parameters from JSON.
* The JSON value is first read into a string-to-string map, which is
* then passed to Init with its default option.
* \param reader JSONReader used for loading.
* \throw ParamError when something goes wrong.
*/
inline void Load(dmlc::JSONReader *reader) {
std::map<std::string, std::string> kwargs;
reader->Read(&kwargs);
this->Init(kwargs);
}
/*!
* \brief Get the fields of the parameters.
* \return List of ParamFieldInfo of each field.
*/
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
*/
class FieldAccessEntry {
public:
FieldAccessEntry()
: has_default_(false) {}
/*! \brief destructor */
virtual ~FieldAccessEntry() {}
/*!
* \brief set the default value.
* \param head the pointer to the head of the struct
* \throw error if no default is presented
*/
virtual void SetDefault(void *head) const = 0;
/*!
* \brief set the parameter by string value
* \param head the pointer to the head of the struct
* \param value the value to be set
*/
virtual void Set(void *head, const std::string &value) const = 0;
// check if value is OK
virtual void Check(void *head) const {}
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
return it->second;
}
/*!
* \brief set parameter by keyword arguments.
* \param head head to the parameter field.
* \param begin begin iterator of original kwargs
* \param end end iterator of original kwargs
* \param unknown_args optional, used to hold unknown arguments
* When it is specified, unknown arguments will be stored into here, instead of raise an error
* \tparam RandomAccessIterator iterator type
* \throw ParamError when there is unknown argument and unknown_args == NULL, or required argument is missing.
*/
template<typename RandomAccessIterator>
inline void RunInit(void *head,
RandomAccessIterator begin,
RandomAccessIterator end,
std::vector<std::pair<std::string, std::string> > *unknown_args,
parameter::ParamInitOption option) const {
std::set<FieldAccessEntry*> selected_args;
for (RandomAccessIterator it = begin; it != end; ++it) {
FieldAccessEntry *e = Find(it->first);
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
if (option == parameter::kAllowHidden &&
it->first.length() > 4 &&
it->first.find("__") == 0 &&
it->first.rfind("__") == it->first.length()-2) {
continue;
}
std::ostringstream os;
os << "Cannot find argument \'" << it->first << "\', Possible Arguments:\n";
os << "----------------\n";
PrintDocString(os);
throw dmlc::ParamError(os.str());
}
}
}
}
for (std::map<std::string, FieldAccessEntry*>::const_iterator it = entry_map_.begin();
it != entry_map_.end(); ++it) {
if (selected_args.count(it->second) == 0) {
it->second->SetDefault(head);
}
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
if (!isspace(ch)) {
is.setstate(std::ios::failbit); break;
}
}
}
if (is.fail()) {
std::ostringstream os;
os << "Invalid Parameter format for " << key_
<< " expect " << type_ << " but value=\'" << value<< '\'';
throw dmlc::ParamError(os.str());
}
}
virtual std::string GetStringValue(void *head) const {
std::ostringstream os;
PrintValue(os, this->Get(head));
return os.str();
}
virtual ParamFieldInfo GetFieldInfo() const {
ParamFieldInfo info;
std::ostringstream os;
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
info.type_info_str = os.str();
info.description = description_;
return info;
}
// Writes the registered default into the field located through head.
// Throws dmlc::ParamError when the field was declared without a default,
// i.e. the parameter is required and the caller did not supply it.
virtual void SetDefault(void *head) const {
  if (has_default_) {
    this->Get(head) = default_value_;
    return;
  }
  std::ostringstream msg;
  msg << "Required parameter " << key_
      << " of " << type_ << " is not presented";
  throw dmlc::ParamError(msg.str());
}
// return reference of self as derived type
inline TEntry &self() {
return *(static_cast<TEntry*>(this));
}
// implement set_default
inline TEntry &set_default(const DType &default_value) {
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
}
// consistency check for numeric ranges
virtual void Check(void *head) const {
FieldEntryBase<TEntry, DType>::Check(head);
DType v = this->Get(head);
if (has_begin_ && has_end_) {
if (v < begin_ || v > end_) {
std::ostringstream os;
os << "value " << v << " for Parameter " << this->key_
<< " exceed bound [" << begin_ << ',' << end_ <<']';
throw dmlc::ParamError(os.str());
}
} else if (has_begin_ && v < begin_) {
std::ostringstream os;
os << "value " << v << " for Parameter " << this->key_
<< " should be greater equal to " << begin_;
throw dmlc::ParamError(os.str());
} else if (has_end_ && v > end_) {
std::ostringstream os;
os << "value " << v << " for Parameter " << this->key_
<< " should be smaller equal to " << end_;
throw dmlc::ParamError(os.str());
}
}
protected:
// whether it have begin and end range
bool has_begin_, has_end_;
// data bound
DType begin_, end_;
};
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
typedef FieldEntryNumeric<FieldEntry<int>, int> Parent;
// Assigns the field from its textual form. For enum-style fields the text
// is looked up in enum_map_ and the mapped integer is forwarded to the
// parent parser; an unknown name raises dmlc::ParamError listing the
// valid choices. Non-enum fields are parsed by the parent directly.
virtual void Set(void *head, const std::string &value) const {
  if (!is_enum_) {
    Parent::Set(head, value);
    return;
  }
  const std::map<std::string, int>::const_iterator found = enum_map_.find(value);
  if (found == enum_map_.end()) {
    std::ostringstream err;
    err << "Invalid Input: \'" << value << "\', valid values are: ";
    PrintEnums(err);
    throw dmlc::ParamError(err.str());
  }
  // hand the numeric code to the parent as text so its usual parsing applies
  std::ostringstream encoded;
  encoded << found->second;
  Parent::Set(head, encoded.str());
}
virtual ParamFieldInfo GetFieldInfo() const {
if (is_enum_) {
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
// Registers (key, value) as a legal enum choice for this field and switches
// the field into enum mode. Both the name and the integer must be unused.
// \param key textual name of the enum choice
// \param value integer the name maps to
// \return reference to this entry, so declarations can be chained
// \throw dmlc::ParamError if either the name or the value is already registered
inline FieldEntry<int> &add_enum(const std::string &key, int value) {
  // count() on an empty map is already 0, so no separate size() test is needed
  if (enum_map_.count(key) != 0 || enum_back_map_.count(value) != 0) {
    std::ostringstream os;
    os << "Enum (" << key << ": " << value << ") already exists!\n";
    os << "Enums: ";
    for (std::map<std::string, int>::const_iterator it = enum_map_.begin();
         it != enum_map_.end(); ++it) {
      os << "(" << it->first << ": " << it->second << "), ";
    }
    throw dmlc::ParamError(os.str());
  }
  enum_map_[key] = value;
  enum_back_map_[value] = key;
  is_enum_ = true;
  return this->self();
}
protected:
// enum flag
bool is_enum_;
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
typedef FieldEntryBase<FieldEntry<optional<int> >, optional<int> > Parent;
// Assigns the optional<int> field from its textual form. The literal
// "None" and all non-enum fields go straight to the parent parser.
// For enum fields any other text is looked up in enum_map_ and the mapped
// integer is forwarded; an unknown name raises dmlc::ParamError listing
// the valid choices.
virtual void Set(void *head, const std::string &value) const {
  if (!is_enum_ || value == "None") {
    Parent::Set(head, value);
    return;
  }
  const std::map<std::string, int>::const_iterator found = enum_map_.find(value);
  if (found == enum_map_.end()) {
    std::ostringstream err;
    err << "Invalid Input: \'" << value << "\', valid values are: ";
    PrintEnums(err);
    throw dmlc::ParamError(err.str());
  }
  // hand the numeric code to the parent as text so its usual parsing applies
  std::ostringstream encoded;
  encoded << found->second;
  Parent::Set(head, encoded.str());
}
virtual ParamFieldInfo GetFieldInfo() const {
if (is_enum_) {
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
CHECK_NE(key, "None") << "None is reserved for empty optional<int>";
if ((enum_map_.size() != 0 && enum_map_.count(key) != 0) || \
enum_back_map_.count(value) != 0) {
std::ostringstream os;
os << "Enum " << "(" << key << ": " << value << " exisit!" << ")\n";
os << "Enums: ";
for (std::map<std::string, int>::const_iterator it = enum_map_.begin();
it != enum_map_.end(); ++it) {
os << "(" << it->first << ": " << it->second << "), ";
}
throw dmlc::ParamError(os.str());
}
enum_map_[key] = value;
enum_back_map_[value] = key;
is_enum_ = true;
return this->self();
}
protected:
// enum flag
bool is_enum_;
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
} else if (lower_case == "false") {
ref = false;
} else if (lower_case == "1") {
ref = true;
} else if (lower_case == "0") {
ref = false;
} else {
std::ostringstream os;
os << "Invalid Parameter format for " << key_
<< " expect " << type_ << " but value=\'" << value<< '\'';
throw dmlc::ParamError(os.str());
}
}
protected:
// print default string
virtual void PrintValue(std::ostream &os, bool value) const { // NOLINT(*)
if (value) {
os << "True";
} else {
os << "False";
xgboost/dmlc-core/include/dmlc/parameter.h view on Meta::CPAN
// parent
typedef FieldEntryNumeric<FieldEntry<float>, float> Parent;
// override set: parse the text with std::stof and store it in the field.
// std::stof reports two kinds of failure: std::invalid_argument when no
// conversion can be performed and std::out_of_range when the value does
// not fit into a float. Both are translated into dmlc::ParamError so that
// callers only ever see the parameter module's own error type.
virtual void Set(void *head, const std::string &value) const {
  try {
    this->Get(head) = std::stof(value);
  } catch (const std::invalid_argument &) {
    std::ostringstream os;
    os << "Invalid Parameter format for " << key_ << " expect " << type_
       << " but value=\'" << value << '\'';
    throw dmlc::ParamError(os.str());
  } catch (const std::out_of_range &) {
    std::ostringstream os;
    os << "Out of range value for " << key_ << " expect " << type_
       << " but value=\'" << value << '\'';
    throw dmlc::ParamError(os.str());
  }
}
};
// specialize define for double. Uses stod for platform independent handling of
// INF, -INF, NAN, etc.
template <>
class FieldEntry<double>
    : public FieldEntryNumeric<FieldEntry<double>, double> {
 public:
  // parent
  typedef FieldEntryNumeric<FieldEntry<double>, double> Parent;
  // override set: parse the text with std::stod and store it in the field.
  // std::stod reports two kinds of failure: std::invalid_argument when no
  // conversion can be performed and std::out_of_range when the value does
  // not fit into a double. Both are translated into dmlc::ParamError so
  // that callers only ever see the parameter module's own error type.
  virtual void Set(void *head, const std::string &value) const {
    try {
      this->Get(head) = std::stod(value);
    } catch (const std::invalid_argument &) {
      std::ostringstream os;
      os << "Invalid Parameter format for " << key_ << " expect " << type_
         << " but value=\'" << value << '\'';
      throw dmlc::ParamError(os.str());
    } catch (const std::out_of_range &) {
      std::ostringstream os;
      os << "Out of range value for " << key_ << " expect " << type_
         << " but value=\'" << value << '\'';
      throw dmlc::ParamError(os.str());
    }
  }
};
#endif // DMLC_USE_CXX11
} // namespace parameter
//! \endcond
// implement GetEnv
template<typename ValueType>
xgboost/dmlc-core/src/config.cc view on Meta::CPAN
namespace dmlc {
struct Token {
std::string buf;
bool is_string;
};
// Exception type carrying a human-readable description of a tokenization
// failure; the description is returned verbatim by what().
class TokenizeError : public exception {
public:
// msg: description of the failure, defaults to "tokenize error"
explicit TokenizeError(const string& msg = "tokenize error"): msg_(msg) { }
~TokenizeError() throw() {}
// returns a pointer into msg_, valid for as long as this object is alive
virtual const char* what() const throw() {
return msg_.c_str();
}
private:
// stored copy of the message passed to the constructor
string msg_;
};
class Tokenizer {
public:
explicit Tokenizer(istream& is): is_(is), state_(kNone) {} // NOLINT(*)
bool GetNextToken(Token* tok) {
xgboost/dmlc-core/src/config.cc view on Meta::CPAN
EatChar(); // eat the first quotation mark
char ch;
while ( (ch = PeekChar()) != '\"' ) {
switch (ch) {
case '\\':
EatChar();
ch = PeekChar();
if (ch == '\"') {
*tok += '\"';
} else {
throw TokenizeError("error parsing escape characters");
}
break;
case '\n': case '\r': case EOF:
throw TokenizeError("quotation mark is not closed");
default:
*tok += ch;
break;
}
EatChar();
}
EatChar(); // eat the last quotation mark
}
void ParseComments() {
xgboost/dmlc-core/tracker/yarn/src/main/java/org/apache/hadoop/yarn/dmlc/ApplicationMaster.java view on Meta::CPAN
// collection of tasks
private final Collection<TaskRecord> finishedTasks = new java.util.LinkedList<TaskRecord>();
// collection of killed tasks
private final Collection<TaskRecord> killedTasks = new java.util.LinkedList<TaskRecord>();
// worker environment
private final Map<String, String> env = new java.util.HashMap<String, String>();
//add the blacklist
private Collection<String> blackList = new java.util.HashSet();
public static void main(String[] args) throws Exception {
new ApplicationMaster().run(args);
}
private ApplicationMaster() throws IOException {
dfs = FileSystem.get(conf);
userName = UserGroupInformation.getCurrentUser().getShortUserName();
credentials = UserGroupInformation.getCurrentUser().getCredentials();
}
/**
* Serialize the token storage of the credentials field into a byte buffer.
* @return the ByteBuffer containing the security tokens
* @throws RuntimeException wrapping the IOException raised if the
*         credentials cannot be written to the in-memory buffer
*/
private ByteBuffer setupTokens() {
try {
DataOutputBuffer dob = new DataOutputBuffer();
credentials.writeTokenStorageToStream(dob);
return ByteBuffer.wrap(dob.getData(), 0, dob.getLength()).duplicate();
} catch (IOException e) {
throw new RuntimeException(e); // TODO: FIXME
}
}
/**
 * get integer argument from environment variable
 *
 * @param name
 *            name of key
 * @param required
 *            whether this is required
 * @param defv
 *            default value, returned when the variable is unset and not required
 * @return the requested result
 * @throws IOException
 *             if a required variable is not set, or if the variable is set
 *             but its value is not a valid integer
 */
private int getEnvInteger(String name, boolean required, int defv)
    throws IOException {
  String value = System.getenv(name);
  if (value == null) {
    if (required) {
      throw new IOException("environment variable " + name
          + " not set");
    }
    return defv;
  }
  try {
    // parseInt yields a primitive directly, avoiding the Integer boxing of valueOf
    return Integer.parseInt(value);
  } catch (NumberFormatException e) {
    // report which variable was malformed instead of a bare NumberFormatException
    throw new IOException("environment variable " + name
        + " is not a valid integer: " + value, e);
  }
}
/**
* initialize from arguments and command lines
*
* @param args
*/
private void initArgs(String args[]) throws IOException {
LOG.info("Start AM as user=" + this.userName);
// get user name
userName = UserGroupInformation.getCurrentUser().getShortUserName();
// cached maps
Map<String, Path> cacheFiles = new java.util.HashMap<String, Path>();
for (int i = 0; i < args.length; ++i) {
if (args[i].equals("-file")) {
String[] arr = args[++i].split("#");
Path path = new Path(arr[0]);
if (arr.length == 1) {
xgboost/dmlc-core/tracker/yarn/src/main/java/org/apache/hadoop/yarn/dmlc/ApplicationMaster.java view on Meta::CPAN
numServer = this.getEnvInteger("DMLC_NUM_SERVER", true, numServer);
numTasks = numWorker + numServer;
maxNumAttempt = this.getEnvInteger("DMLC_MAX_ATTEMPT", false,
maxNumAttempt);
LOG.info("Try to start " + numServer + " Servers and " + numWorker + " Workers");
}
/**
* called to start the application
*/
private void run(String args[]) throws Exception {
this.initArgs(args);
this.rmClient = AMRMClientAsync.createAMRMClientAsync(1000,
new RMCallbackHandler());
this.nmClient = NMClientAsync
.createNMClientAsync(new NMCallbackHandler());
this.rmClient.init(conf);
this.rmClient.start();
this.nmClient.init(conf);
this.nmClient.start();
RegisterApplicationMasterResponse response = this.rmClient
xgboost/dmlc-core/tracker/yarn/src/main/java/org/apache/hadoop/yarn/dmlc/ApplicationMaster.java view on Meta::CPAN
nmClient.stop();
LOG.info(diagnostics);
} catch (Exception e) {
diagnostics = e.toString();
}
rmClient.unregisterApplicationMaster(
success ? FinalApplicationStatus.SUCCEEDED
: FinalApplicationStatus.FAILED, diagnostics,
appTrackerUrl);
if (!success)
throw new Exception("Application not successful");
}
/**
 * Tells whether there is any work left.
 *
 * @return true when no task is pending and no task is running
 */
private synchronized boolean doneAllJobs() {
  if (!pendingTasks.isEmpty()) {
    return false;
  }
  return runningTasks.isEmpty();
}
xgboost/dmlc-core/tracker/yarn/src/main/java/org/apache/hadoop/yarn/dmlc/Client.java view on Meta::CPAN
private String jobName = "";
// queue
private String queue = "default";
// ApplicationMaster classpath
private String appCp = null;
// ApplicationMaster env
private Map<String, String> env = new java.util.HashMap<String, String>();
/**
* constructor
* @throws IOException
*/
private Client() throws IOException {
conf.addResource(new Path(System.getenv("HADOOP_CONF_DIR") +"/core-site.xml"));
conf.addResource(new Path(System.getenv("HADOOP_CONF_DIR") +"/hdfs-site.xml"));
dfs = FileSystem.get(conf);
userName = UserGroupInformation.getCurrentUser().getShortUserName();
credentials = UserGroupInformation.getCurrentUser().getCredentials();
}
/**
* setup security token given current user
* @return the ByeBuffer containing the security tokens
* @throws IOException
*/
private ByteBuffer setupTokens() throws IOException {
DataOutputBuffer buffer = new DataOutputBuffer();
String loc = System.getenv().get("HADOOP_TOKEN_FILE_LOCATION");
if ((loc != null && loc.trim().length() > 0)
|| (!UserGroupInformation.isSecurityEnabled())) {
this.credentials.writeTokenStorageToStream(buffer);
} else {
// Note: Credentials class is marked as LimitedPrivate for HDFS and MapReduce
Credentials credentials = new Credentials();
String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL);
if (tokenRenewer == null || tokenRenewer.length() == 0) {
throw new IOException(
"Can't get Master Kerberos principal for the RM to use as renewer");
}
// For now, only getting tokens for the default file-system.
final Token<?> tokens[] = dfs.addDelegationTokens(tokenRenewer, credentials);
if (tokens != null) {
for (Token<?> token : tokens) {
LOG.info("Got dt for " + dfs.getUri() + "; " + token);
}
}
xgboost/dmlc-core/tracker/yarn/src/main/java/org/apache/hadoop/yarn/dmlc/Client.java view on Meta::CPAN
}
return ByteBuffer.wrap(buffer.getData(), 0, buffer.getLength());
}
/**
* setup all the cached files
*
* @param fmaps
* the file maps
* @return the resource map
* @throws IOException
*/
private Map<String, LocalResource> setupCacheFiles(ApplicationId appId) throws IOException {
// create temporary dmlc directory
Path tmpPath = new Path(this.tempdir);
if (!dfs.exists(tmpPath)) {
dfs.mkdirs(tmpPath, permTemp);
LOG.info("HDFS temp directory do not exist, creating.. " + tmpPath);
}
tmpPath = new Path(tmpPath + "/temp-dmlc-yarn-" + appId);
if (dfs.exists(tmpPath)) {
dfs.delete(tmpPath, true);
}
xgboost/dmlc-core/tracker/yarn/src/main/java/org/apache/hadoop/yarn/dmlc/Client.java view on Meta::CPAN
String[] pair = args[++i].split("=", 2);
env.put(pair[0], (pair.length == 1) ? "" : pair[1]);
} else {
sargs.append(" ");
sargs.append(args[i]);
}
}
this.appArgs = sargs.toString();
}
private void run(String[] args) throws Exception {
if (args.length == 0) {
System.out.println("Usage: [options] [commands..]");
System.out.println("options: [-file filename] [-appcp appClasspath]");
return;
}
this.initArgs(args);
// Create yarnClient
YarnClient yarnClient = YarnClient.createYarnClient();
yarnClient.init(conf);
yarnClient.start();
xgboost/dmlc-core/tracker/yarn/src/main/java/org/apache/hadoop/yarn/dmlc/Client.java view on Meta::CPAN
this.yarnClient = yarnClient;
}
/**
 * Reacts to a delivered signal by asking YARN to kill the application
 * identified by {@code appId}.
 *
 * Any exception raised by the kill request is reduced to a fixed message on
 * standard output and is not propagated to the caller.
 *
 * @param signal the signal that was delivered (not inspected here)
 */
public void handle(Signal signal) {
  try {
    yarnClient.killApplication(appId);
  } catch (Exception killFailure) {
    // best effort: report the failure and carry on
    System.out.println("yarn client exception");
  }
}
}
/**
 * Command-line entry point: builds a fresh {@code Client} and hands it the
 * raw argument array.
 *
 * @param args command-line arguments, forwarded unchanged to {@code run}
 * @throws Exception whatever {@code run} throws
 */
public static void main(String[] args) throws Exception {
  Client client = new Client();
  client.run(args);
}
}
xgboost/jvm-packages/checkstyle.xml view on Meta::CPAN
value="GenericWhitespace ''{0}'' is preceded with whitespace."/>
<message key="ws.illegalFollow"
value="GenericWhitespace ''{0}'' should followed by whitespace."/>
<message key="ws.notPreceded"
value="GenericWhitespace ''{0}'' is not preceded with whitespace."/>
</module>
<module name="Indentation">
<property name="basicOffset" value="2"/>
<property name="braceAdjustment" value="0"/>
<property name="caseIndent" value="2"/>
<property name="throwsIndent" value="4"/>
<property name="lineWrappingIndentation" value="4"/>
<property name="arrayInitIndent" value="2"/>
</module>
<module name="ImportOrder">
<property name="separated" value="true"/>
<property name="ordered" value="true"/>
<property name="groups" value="/^javax?\./,scala,*,ml.dmlc.xgboost4j"/>
</module>
<module name="MethodParamPad"/>
<module name="AnnotationLocation">
xgboost/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java view on Meta::CPAN
for (int i = 0; i < fPredicts.length; i++) {
if (!Arrays.equals(fPredicts[i], sPredicts[i])) {
return false;
}
}
return true;
}
/**
 * Writes the given model dumps to a UTF-8 text file. Each entry is preceded
 * by a {@code booster[i]:} header line, where {@code i} is the entry's index.
 *
 * This is a best-effort helper: any exception raised while opening or writing
 * the file is printed via {@code printStackTrace()} and not rethrown.
 *
 * @param modelPath  path of the file to create or overwrite
 * @param modelInfos one dump string per booster, written in array order
 * @throws IOException declared for interface compatibility; failures are
 *                     caught and printed inside the method
 */
public static void saveDumpModel(String modelPath, String[] modelInfos) throws IOException {
  // try-with-resources closes the writer even when the loop body throws
  // (for example when modelInfos is null), so the file handle no longer leaks.
  try (PrintWriter writer = new PrintWriter(modelPath, "UTF-8")) {
    for (int i = 0; i < modelInfos.length; ++i) {
      writer.print("booster[" + i + "]:\n");
      writer.print(modelInfos[i]);
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
public static void main(String[] args) throws IOException, XGBoostError {
// load file from text file, also binary buffer generated by xgboost4j
DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
HashMap<String, Object> params = new HashMap<String, Object>();
params.put("eta", 1.0);
params.put("max_depth", 2);
params.put("silent", 1);
params.put("objective", "binary:logistic");
xgboost/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java view on Meta::CPAN
import ml.dmlc.xgboost4j.java.DMatrix;
import ml.dmlc.xgboost4j.java.XGBoost;
import ml.dmlc.xgboost4j.java.XGBoostError;
/**
* example for start from a initial base prediction
*
* @author hzx
*/
public class BoostFromPrediction {
public static void main(String[] args) throws XGBoostError {
System.out.println("start running example to start from a initial prediction");
// load file from text file, also binary buffer generated by xgboost4j
DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
//specify parameters
HashMap<String, Object> params = new HashMap<String, Object>();
params.put("eta", 1.0);
params.put("max_depth", 2);
xgboost/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java view on Meta::CPAN
import ml.dmlc.xgboost4j.java.DMatrix;
import ml.dmlc.xgboost4j.java.XGBoost;
import ml.dmlc.xgboost4j.java.XGBoostError;
/**
* an example of cross validation
*
* @author hzx
*/
public class CrossValidation {
public static void main(String[] args) throws IOException, XGBoostError {
//load train mat
DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
//set params
HashMap<String, Object> params = new HashMap<String, Object>();
params.put("eta", 1.0);
params.put("max_depth", 3);
params.put("silent", 1);
params.put("nthread", 6);
xgboost/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java view on Meta::CPAN
error++;
} else if (labels[i] == 1f && predicts[i][0] <= 0) {
error++;
}
}
return error / labels.length;
}
}
public static void main(String[] args) throws XGBoostError {
//load train mat (svmlight format)
DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
//load valid mat (svmlight format)
DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
HashMap<String, Object> params = new HashMap<String, Object>();
params.put("eta", 1.0);
params.put("max_depth", 2);
params.put("silent", 1);
xgboost/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java view on Meta::CPAN
import ml.dmlc.xgboost4j.java.DMatrix;
import ml.dmlc.xgboost4j.java.XGBoost;
import ml.dmlc.xgboost4j.java.XGBoostError;
/**
* simple example for using external memory version
*
* @author hzx
*/
public class ExternalMemory {
public static void main(String[] args) throws XGBoostError {
//this is the only difference, add a # followed by a cache prefix name
//several cache file with the prefix will be generated
//currently only support convert from libsvm file
DMatrix trainMat = new DMatrix("../demo/data/agaricus.txt.train#dtrain.cache");
DMatrix testMat = new DMatrix("../demo/data/agaricus.txt.test#dtest.cache");
//specify parameters
HashMap<String, Object> params = new HashMap<String, Object>();
params.put("eta", 1.0);
params.put("max_depth", 2);
xgboost/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java view on Meta::CPAN
import ml.dmlc.xgboost4j.java.XGBoostError;
import ml.dmlc.xgboost4j.java.example.util.CustomEval;
/**
* this is an example of fit generalized linear model in xgboost
* basically, we are using linear model, instead of tree for our boosters
*
* @author hzx
*/
public class GeneralizedLinearModel {
public static void main(String[] args) throws XGBoostError {
// load file from text file, also binary buffer generated by xgboost4j
DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
//specify parameters
//change booster to gblinear, so that we are fitting a linear model
// alpha is the L1 regularizer
//lambda is the L2 regularizer
//you can also set lambda_bias which is L2 regularizer on the bias term
HashMap<String, Object> params = new HashMap<String, Object>();
xgboost/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java view on Meta::CPAN
import ml.dmlc.xgboost4j.java.XGBoost;
import ml.dmlc.xgboost4j.java.XGBoostError;
import ml.dmlc.xgboost4j.java.example.util.CustomEval;
/**
* predict first ntree
*
* @author hzx
*/
public class PredictFirstNtree {
public static void main(String[] args) throws XGBoostError {
// load file from text file, also binary buffer generated by xgboost4j
DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
//specify parameters
HashMap<String, Object> params = new HashMap<String, Object>();
params.put("eta", 1.0);
params.put("max_depth", 2);
params.put("silent", 1);
xgboost/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java view on Meta::CPAN
import ml.dmlc.xgboost4j.java.DMatrix;
import ml.dmlc.xgboost4j.java.XGBoost;
import ml.dmlc.xgboost4j.java.XGBoostError;
/**
* predict leaf indices
*
* @author hzx
*/
public class PredictLeafIndices {
public static void main(String[] args) throws XGBoostError {
// load file from text file, also binary buffer generated by xgboost4j
DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
//specify parameters
HashMap<String, Object> params = new HashMap<String, Object>();
params.put("eta", 1.0);
params.put("max_depth", 2);
params.put("silent", 1);
params.put("objective", "binary:logistic");
xgboost/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/util/DataLoader.java view on Meta::CPAN
public int ncol;
}
/**
 * Plain mutable holder for a sparse data set: four public arrays and no
 * behavior of its own.
 *
 * NOTE(review): the class and field names suggest a compressed-sparse-row
 * layout (values, row offsets, column indices) - confirm against the loader
 * that fills this object.
 */
public static class CSRSparseData {
// label values - presumably one per row; TODO confirm
public float[] labels;
// stored feature values
public float[] data;
// presumably the CSR row offsets into data/colIndex; verify against the loader
public long[] rowHeaders;
// presumably the column index of each entry in data; verify against the loader
public int[] colIndex;
}
public static DenseData loadCSVFile(String filePath) throws IOException {
DenseData denseData = new DenseData();
File f = new File(filePath);
FileInputStream in = new FileInputStream(f);
BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
denseData.nrow = 0;
denseData.ncol = -1;
String line;
List<Float> tlabels = new ArrayList<>();
xgboost/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/util/DataLoader.java view on Meta::CPAN
reader.close();
in.close();
denseData.labels = ArrayUtils.toPrimitive(tlabels.toArray(new Float[tlabels.size()]));
denseData.data = ArrayUtils.toPrimitive(tdata.toArray(new Float[tdata.size()]));
return denseData;
}
public static CSRSparseData loadSVMFile(String filePath) throws IOException {
CSRSparseData spData = new CSRSparseData();
List<Float> tlabels = new ArrayList<>();
List<Float> tdata = new ArrayList<>();
List<Long> theaders = new ArrayList<>();
List<Integer> tindex = new ArrayList<>();
File f = new File(filePath);
FileInputStream in = new FileInputStream(f);
BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
xgboost/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoost.scala view on Meta::CPAN
* @param round Number of rounds to train.
*/
def train(dtrain: DataSet[LabeledVector], params: Map[String, Any], round: Int):
    XGBoostModel = {
  // One tracker slot per parallel task of the execution environment.
  val tracker = new RabitTracker(dtrain.getExecutionEnvironment.getParallelism)
  // Fail fast when the tracker cannot be started. `throw` has type Nothing, so no
  // placeholder return value is needed (the original had an unreachable `null` here).
  if (!tracker.start(0L)) {
    throw new Error("Tracker cannot be started")
  }
  // Train inside every partition, then keep the first partition's result:
  // the reduce function always returns its left argument.
  dtrain
    .mapPartition(new MapFunction(params, round, tracker.getWorkerEnvs))
    .reduce((x, y) => x).collect().head
}
}
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala view on Meta::CPAN
nUndefined += 1 // don't waste space for all-NaNs.
} else {
builder += baseMargin
}
}
if (nUndefined == nTotal) {
None
} else if (nUndefined == 0) {
Some(builder.result())
} else {
throw new IllegalArgumentException(
s"Encountered a partition with $nUndefined NaN base margin values. " +
"If you want to specify base margin, ensure all values are non-NaN.")
}
}
private[spark] def buildDistributedBoosters(
trainingSet: RDD[XGBLabeledPoint],
params: Map[String, Any],
rabitEnv: java.util.Map[String, String],
numWorkers: Int,
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala view on Meta::CPAN
} else {
trainingSet
}
val partitionedBaseMargin = partitionedTrainingSet.map(_.baseMargin)
val appName = partitionedTrainingSet.context.appName
// to workaround the empty partitions in training dataset,
// this might not be the best efficient implementation, see
// (https://github.com/dmlc/xgboost/issues/1277)
partitionedTrainingSet.zipPartitions(partitionedBaseMargin) { (trainingPoints, baseMargins) =>
if (trainingPoints.isEmpty) {
throw new XGBoostError(
s"detected an empty partition in the training data, partition ID:" +
s" ${TaskContext.getPartitionId()}")
}
val cacheFileName = if (useExternalMemory) {
s"$appName-${TaskContext.get().stageId()}-" +
s"dtrain_cache-${TaskContext.getPartitionId()}"
} else {
null
}
rabitEnv.put("DMLC_TASK_ID", TaskContext.getPartitionId().toString)
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala view on Meta::CPAN
* @param round the number of iterations
* @param nWorkers the number of xgboost workers, 0 by default which means that the number of
* workers equals to the partition number of trainingData RDD
* @param obj the user-defined objective function, null by default
* @param eval the user-defined evaluation function, null by default
* @param useExternalMemory indicate whether to use external memory cache, by setting this flag as
* true, the user may save the RAM cost for running XGBoost within Spark
* @param missing the value represented the missing value in the dataset
* @param featureCol the name of input column, "features" as default value
* @param labelCol the name of output column, "label" as default value
* @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training is failed
* @return XGBoostModel when successful training
*/
@throws(classOf[XGBoostError])
def trainWithDataFrame(
trainingData: Dataset[_],
params: Map[String, Any],
round: Int,
nWorkers: Int,
obj: ObjectiveTrait = null,
eval: EvalTrait = null,
useExternalMemory: Boolean = false,
missing: Float = Float.NaN,
featureCol: String = "features",
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala view on Meta::CPAN
* @param trainingData the trainingset represented as RDD
* @param params Map containing the configuration entries
* @param round the number of iterations
* @param nWorkers the number of xgboost workers, 0 by default which means that the number of
* workers equals to the partition number of trainingData RDD
* @param obj the user-defined objective function, null by default
* @param eval the user-defined evaluation function, null by default
* @param useExternalMemory indicate whether to use external memory cache, by setting this flag as
* true, the user may save the RAM cost for running XGBoost within Spark
* @param missing the value represented the missing value in the dataset
* @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training is failed
* @return XGBoostModel when successful training
*/
@deprecated("Use XGBoost.trainWithRDD instead.")
def train(
trainingData: RDD[MLLabeledPoint],
params: Map[String, Any],
round: Int,
nWorkers: Int,
obj: ObjectiveTrait = null,
eval: EvalTrait = null,
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala view on Meta::CPAN
* @param trainingData the trainingset represented as RDD
* @param params Map containing the configuration entries
* @param round the number of iterations
* @param nWorkers the number of xgboost workers, 0 by default which means that the number of
* workers equals to the partition number of trainingData RDD
* @param obj the user-defined objective function, null by default
* @param eval the user-defined evaluation function, null by default
* @param useExternalMemory indicate whether to use external memory cache, by setting this flag as
* true, the user may save the RAM cost for running XGBoost within Spark
* @param missing the value represented the missing value in the dataset
* @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training is failed
* @return XGBoostModel when successful training
*/
@throws(classOf[XGBoostError])
def trainWithRDD(
    trainingData: RDD[MLLabeledPoint],
    params: Map[String, Any],
    round: Int,
    nWorkers: Int,
    obj: ObjectiveTrait = null,
    eval: EvalTrait = null,
    useExternalMemory: Boolean = false,
    missing: Float = Float.NaN): XGBoostModel = {
  // brings the `asXGB` conversion into scope
  import DataUtils._
  // Convert every point's feature vector with asXGB and attach its label as a Float.
  val convertedPoints = trainingData.map { case MLLabeledPoint(pointLabel, pointFeatures) =>
    val xgbPoint = pointFeatures.asXGB
    xgbPoint.copy(label = pointLabel.toFloat)
  }
  // All remaining arguments are forwarded unchanged.
  trainDistributed(
    convertedPoints,
    params,
    round,
    nWorkers,
    obj,
    eval,
    useExternalMemory,
    missing)
}
@throws(classOf[XGBoostError])
private[spark] def trainDistributed(
trainingData: RDD[XGBLabeledPoint],
params: Map[String, Any],
round: Int,
nWorkers: Int,
obj: ObjectiveTrait = null,
eval: EvalTrait = null,
useExternalMemory: Boolean = false,
missing: Float = Float.NaN): XGBoostModel = {
if (params.contains("tree_method")) {
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala view on Meta::CPAN
}
require(nWorkers > 0, "you must specify more than 0 workers")
if (obj != null) {
require(params.get("obj_type").isDefined, "parameter \"obj_type\" is not defined," +
" you have to specify the objective type as classification or regression with a" +
" customized objective function")
}
val trackerConf = params.get("tracker_conf") match {
case None => TrackerConf()
case Some(conf: TrackerConf) => conf
case _ => throw new IllegalArgumentException("parameter \"tracker_conf\" must be an " +
"instance of TrackerConf.")
}
val tracker = startTracker(nWorkers, trackerConf)
try {
val overriddenParams = overrideParamsAccordingToTaskCPUs(params, trainingData.sparkContext)
val boosters = buildDistributedBoosters(trainingData, overriddenParams,
tracker.getWorkerEnvs, nWorkers, round, obj, eval, useExternalMemory, missing)
val sparkJobThread = new Thread() {
override def run() {
// force the job
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala view on Meta::CPAN
xgboostModel
} else {
try {
if (sparkJobThread.isAlive) {
sparkJobThread.interrupt()
}
} catch {
case ie: InterruptedException =>
logger.info("spark job thread is interrupted")
}
throw new XGBoostError("XGBoostModel training failed")
}
}
/**
 * Reads three consecutive UTF strings from the stream.
 *
 * @param inputStream stream positioned at the first of the three strings
 * @return the strings in read order: (featureCol, labelCol, predictionCol)
 */
private def loadGeneralModelParams(inputStream: FSDataInputStream): (String, String, String) = {
  // Tuple components are evaluated left to right, so the stream is consumed in
  // the order featureCol, labelCol, predictionCol.
  (inputStream.readUTF(), inputStream.readUTF(), inputStream.readUTF())
}
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassificationModel.scala view on Meta::CPAN
/** Number of classes of this model, as held in `numOfClasses`. */
def numClasses: Int = numOfClasses
/**
 * Creates a copy of this model via `defaultCopy` and points the copy at the
 * same booster as this instance.
 *
 * @param extra extra parameters handed to `defaultCopy`
 * @return the copied classification model
 */
override def copy(extra: ParamMap): XGBoostClassificationModel = {
  val duplicate = defaultCopy(extra).asInstanceOf[XGBoostClassificationModel]
  // the booster is assigned explicitly onto the copy after defaultCopy
  duplicate._booster = booster
  duplicate
}
/**
 * Single-vector prediction is not implemented for this model.
 *
 * @param features ignored
 * @throws Exception always
 */
override protected def predict(features: MLVector): Double =
  throw new Exception("XGBoost does not support online prediction ")
}
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressionModel.scala view on Meta::CPAN
val transformerForArrayTypedPredCol =
udf((regressionResults: mutable.WrappedArray[Float]) => regressionResults(0))
testSet.sparkSession.createDataFrame(predictRDD,
schema = testSet.schema.add(tempPredColName, ArrayType(FloatType, containsNull = false))
).withColumn(
$(predictionCol),
transformerForArrayTypedPredCol.apply(col(tempPredColName))).drop(tempPredColName)
}
/**
 * Single-vector prediction is not implemented for this model.
 *
 * @param features ignored
 * @throws Exception always
 */
override protected def predict(features: MLVector): Double =
  throw new Exception("XGBoost does not support online prediction for now")
/**
 * Creates a copy of this model via `defaultCopy` and points the copy at the
 * same booster as this instance.
 *
 * @param extra extra parameters handed to `defaultCopy`
 * @return the copied regression model
 */
override def copy(extra: ParamMap): XGBoostRegressionModel = {
  val duplicate = defaultCopy(extra).asInstanceOf[XGBoostRegressionModel]
  // the booster is assigned explicitly onto the copy after defaultCopy
  duplicate._booster = booster
  duplicate
}
}
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DefaultXGBoostParamsReader.scala view on Meta::CPAN
implicit val format = DefaultFormats
params match {
case JObject(pairs) =>
val values = pairs.filter { case (pName, jsonValue) =>
pName == paramName
}.map(_._2)
assert(values.length == 1, s"Expected one instance of Param '$paramName' but found" +
s" ${values.length} in JSON Params: " + pairs.map(_.toString).mkString(", "))
values.head
case _ =>
throw new IllegalArgumentException(
s"Cannot recognize JSON metadata: $metadataJson.")
}
}
}
/**
 * Load metadata saved using [[DefaultXGBoostParamsWriter.saveMetadata()]].
 *
 * Reads the first line of the text file at `path`/metadata and passes it to
 * `parseMetadata` together with `expectedClassName`.
 *
 * @param path directory that contains the "metadata" entry
 * @param sc Spark context used to read the file
 * @param expectedClassName If non empty, this is checked against the loaded metadata.
 * @throws IllegalArgumentException if expectedClassName is specified and does not match metadata
 */
def loadMetadata(path: String, sc: SparkContext, expectedClassName: String = ""): Metadata = {
  val metadataFile = new Path(path, "metadata")
  // read as a single partition; only the first line is consumed
  val firstLine = sc.textFile(metadataFile.toString, 1).first()
  parseMetadata(firstLine, expectedClassName)
}
/**
* Parse metadata JSON string produced by [[DefaultXGBoostParamsWriter.getMetadataToSave()]].
* This is a helper function for [[loadMetadata()]].
*
* @param metadataStr JSON string of metadata
* @param expectedClassName If non empty, this is checked against the loaded metadata.
* @throws IllegalArgumentException if expectedClassName is specified and does not match metadata
*/
def parseMetadata(metadataStr: String, expectedClassName: String = ""): Metadata = {
val metadata = parse(metadataStr)
implicit val format = DefaultFormats
val className = (metadata \ "class").extract[String]
val uid = (metadata \ "uid").extract[String]
val timestamp = (metadata \ "timestamp").extract[Long]
val sparkVersion = (metadata \ "sparkVersion").extract[String]
val params = metadata \ "paramMap"
xgboost/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DefaultXGBoostParamsReader.scala view on Meta::CPAN
/**
 * Applies every entry of the metadata's JSON parameter object to `instance`:
 * the param is looked up by name, the JSON value is decoded with that param's
 * own `jsonDecode`, and the result is set on the instance.
 *
 * @param instance the object whose params are set
 * @param metadata metadata whose `params` field holds the JSON parameter object
 * @throws IllegalArgumentException if `metadata.params` is not a JSON object
 */
def getAndSetParams(instance: Params, metadata: Metadata): Unit = {
  implicit val format = DefaultFormats
  metadata.params match {
    case JObject(entries) =>
      for ((name, json) <- entries) {
        val param = instance.getParam(name)
        // the value goes through its compact JSON text before being decoded
        val encoded = compact(render(json))
        instance.set(param, param.jsonDecode(encoded))
      }
    case _ =>
      throw new IllegalArgumentException(
        s"Cannot recognize JSON metadata: ${metadata.metadataJson}.")
  }
}
/**
* Load a `Params` instance from the given path, and return it.
* This assumes the instance implements [[org.apache.spark.ml.util.MLReadable]].
*/
def loadParamsInstance[T](path: String, sc: SparkContext): T = {
val metadata = DefaultXGBoostParamsReader.loadMetadata(path, sc)
xgboost/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/RabitTrackerRobustnessSuite.scala view on Meta::CPAN
import ml.dmlc.xgboost4j.java.IRabitTracker.TrackerStatus
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.FunSuite
class RabitTrackerRobustnessSuite extends FunSuite with PerTest {
test("test Java RabitTracker wrapper's exception handling: it should not hang forever.") {
/*
Deliberately create new instances of SparkContext in each unit test to avoid reusing the
same thread pool spawned by the local mode of Spark. As these tests simulate worker crashes
by throwing exceptions, the crashed worker thread never calls Rabit.shutdown, and therefore
corrupts the internal state of the native Rabit C++ code. Calling Rabit.init() in subsequent
tests on a reentrant thread will crash the entire Spark application, an undesired side-effect
that should be avoided.
*/
val rdd = sc.parallelize(1 to numWorkers, numWorkers).cache()
val tracker = new PyRabitTracker(numWorkers)
tracker.start(0)
val trackerEnvs = tracker.getWorkerEnvs
val workerCount: Int = numWorkers
/*
Simulate worker crash events by creating dummy Rabit workers, and throw exceptions in the
last created worker. A cascading event chain will be triggered once the RuntimeException is
thrown: the thread running the dummy spark job (sparkThread) catches the exception and
delegates it to the UnCaughtExceptionHandler, which is the Rabit tracker itself.
The Java RabitTracker class reacts to exceptions by killing the spawned process running
the Python tracker. If at least one Rabit worker has not yet connected to the tracker before
it is killed, the resulting connection failure will trigger the Rabit worker to call
"exit(-1);" in the native C++ code, effectively ending the dummy Spark task.
In cluster (standalone or YARN) mode of Spark, tasks are run in containers and thus are
isolated from each other. That is, one task calling "exit(-1);" has no effect on other tasks
running in separate containers. However, as unit tests are run in Spark local mode, in which
tasks are executed by threads belonging to the same process, one thread calling "exit(-1);"
ultimately kills the entire process, which also happens to host the Spark driver, causing
the entire Spark application to crash.
To prevent unit tests from crashing, deterministic delays were introduced to make sure that
the exception is thrown at last, ideally after all worker connections have been established.
For the same reason, the Java RabitTracker class delays the killing of the Python tracker
process to ensure that pending worker connections are handled.
*/
val dummyTasks = rdd.mapPartitions { iter =>
Rabit.init(trackerEnvs)
val index = iter.next()
Thread.sleep(100 + index * 10)
if (index == workerCount) {
// kill the worker by throwing an exception
throw new RuntimeException("Worker exception.")
}
Rabit.shutdown()
Iterator(index)
}.cache()
val sparkThread = new Thread() {
override def run(): Unit = {
// forces a Spark job.
dummyTasks.foreachPartition(() => _)
}
xgboost/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/RabitTrackerRobustnessSuite.scala view on Meta::CPAN
val tracker = new ScalaRabitTracker(numWorkers)
tracker.start(0)
val trackerEnvs = tracker.getWorkerEnvs
val workerCount: Int = numWorkers
val dummyTasks = rdd.mapPartitions { iter =>
Rabit.init(trackerEnvs)
val index = iter.next()
Thread.sleep(100 + index * 10)
if (index == workerCount) {
// kill the worker by throwing an exception
throw new RuntimeException("Worker exception.")
}
Rabit.shutdown()
Iterator(index)
}.cache()
val sparkThread = new Thread() {
override def run(): Unit = {
// forces a Spark job.
dummyTasks.foreachPartition(() => _)
}
xgboost/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSparkPipelinePersistence.scala view on Meta::CPAN
}
/**
 * Deletes `f`; when it is a directory, everything beneath it is deleted first.
 * Does nothing when `f` does not exist.
 *
 * @param f file or directory to remove
 * @throws FileNotFoundException if `f` or one of its descendants cannot be deleted
 */
private def delete(f: File): Unit = {
  if (f.exists()) {
    if (f.isDirectory()) {
      // listFiles() returns null when the listing fails (e.g. an I/O error);
      // Option(...) turns that into "no children" instead of a NullPointerException.
      Option(f.listFiles()).foreach(_.foreach(delete))
    }
    if (!f.delete()) {
      throw new FileNotFoundException("Failed to delete file: " + f)
    }
  }
}
test("test persistence of XGBoostEstimator") {
val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "multi:softmax", "num_class" -> "6")
val xgbEstimator = new XGBoostEstimator(paramMap)
xgbEstimator.write.overwrite().save("./testxgbEst")
val loadedxgbEstimator = XGBoostEstimator.read.load("./testxgbEst")