Alien-XGBoost
view release on metacpan or search on metacpan
xgboost/python-package/xgboost/core.py view on Meta::CPAN
# coding: utf-8
# pylint: disable=too-many-arguments, too-many-branches, invalid-name
# pylint: disable=too-many-branches, too-many-lines, W0141
"""Core XGBoost Library."""
from __future__ import absolute_import
import sys
import os
import ctypes
import collections
import re
import numpy as np
import scipy.sparse
from .libpath import find_lib_path
from .compat import STRING_TYPES, PY3, DataFrame, py_str, PANDAS_INSTALLED
# c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
# It is bound here to ctypes' unsigned 64-bit integer type.
c_bst_ulong = ctypes.c_uint64
class XGBoostError(Exception):
    """Error thrown by xgboost trainer.

    Adds no behaviour of its own on top of ``Exception``; it exists so
    callers can catch xgboost failures by type.
    """
class EarlyStopException(Exception):
    """Exception to signal early stopping.

    Parameters
    ----------
    best_iteration : int
        The best iteration stopped.

    Notes
    -----
    The value is stored on the instance as ``best_iteration``; the base
    ``Exception`` is initialised without arguments, so ``args`` is empty.
    """

    def __init__(self, best_iteration):
        # Record the payload first; the base initialiser takes no arguments
        # and does not touch this attribute.
        self.best_iteration = best_iteration
        super(EarlyStopException, self).__init__()
# Callback environment used by callbacks: a plain named tuple whose fields
# are listed below in positional order.
CallbackEnv = collections.namedtuple(
    "XGBoostCallbackEnv",
    (
        "model",
        "cvfolds",
        "iteration",
        "begin_iteration",
        "end_iteration",
        "rank",
        "evaluation_result_list",
    ),
)
def from_pystr_to_cstr(data):
    """Convert a list of Python str to C pointer

    Parameters
    ----------
    data : list
        list of str. Text strings are encoded as UTF-8; elements that are
        already byte strings are passed through unchanged.

    Returns
    -------
    pointers : ctypes array of c_char_p
        one entry per element of ``data``, in the same order.

    Raises
    ------
    NotImplementedError
        if ``data`` is not a list (no other container is handled yet).
    """
    if not isinstance(data, list):
        # only list input is implemented so far
        raise NotImplementedError

    # ``bytes`` is an alias of ``str`` on Python 2, so this one test leaves
    # native byte strings alone and encodes text on both major versions,
    # without needing a version switch or the Python-2-only ``unicode`` name.
    encoded = [d if isinstance(d, bytes) else d.encode('utf-8') for d in data]

    pointers = (ctypes.c_char_p * len(encoded))()
    pointers[:] = encoded
    return pointers
def from_cstr_to_pystr(data, length):
    """Revert C pointer to Python str

    Parameters
    ----------
    data : ctypes pointer
        pointer to data
    length : ctypes pointer
        pointer to length of data

    Returns
    -------
    res : list
        one decoded string per entry, ``length.value`` entries in total.
        Pure-ASCII entries come back as the native ``str`` type; anything
        else is decoded as UTF-8 text.
    """
    res = []
    for i in range(length.value):
        raw = data[i]
        try:
            # str() keeps ASCII entries as the native string type on both
            # Python 2 and Python 3.
            res.append(str(raw.decode('ascii')))
        except UnicodeDecodeError:
            # decode() already yields the text type of the running
            # interpreter, so no further cast is needed.
            res.append(raw.decode('utf-8'))
    return res
def _load_lib():
    """Load xgboost Library.

    Returns
    -------
    lib : ctypes library handle or None
        the library loaded from the first path reported by
        ``find_lib_path``, or None when no path is reported.
    """
    candidates = find_lib_path()
    if len(candidates) == 0:
        return None

    handle = ctypes.cdll.LoadLibrary(candidates[0])
    # The error getter returns a C string, not ctypes' default int.
    handle.XGBGetLastError.restype = ctypes.c_char_p
    return handle
# load the XGBoost library globally
# NOTE: _load_lib() returns None when find_lib_path() reports no paths,
# so _LIB can be None after import.
_LIB = _load_lib()
def _check_call(ret):
    """Check the return value of C API call

    This function will raise exception when error occurs.
    Wrap every API call with this function

    Parameters
    ----------
    ret : int
        return value from API calls

    Raises
    ------
    XGBoostError
        if ``ret`` is non-zero; the message is taken from the library's
        ``XGBGetLastError``.
    """
    if ret != 0:
        # XGBGetLastError is declared with restype c_char_p, which ctypes
        # hands back as a bytes object on Python 3. Route it through the
        # py_str helper imported from .compat (expected to produce a native
        # str) so the exception message is readable text, not b'...'.
        raise XGBoostError(py_str(_LIB.XGBGetLastError()))
def ctypes2numpy(cptr, length, dtype):
    """Convert a ctypes pointer array to a numpy array.

    Parameters
    ----------
    cptr : ctypes.POINTER(ctypes.c_float)
        source pointer; any other pointer type is rejected.
    length : int
        number of items in the resulting array.
    dtype : numpy dtype
        element type of the resulting array.

    Returns
    -------
    out : numpy.ndarray
        freshly allocated 1-D array holding a copy of the data.

    Raises
    ------
    RuntimeError
        if ``cptr`` is not a float pointer or the copy reports failure.
    """
    float_ptr_type = ctypes.POINTER(ctypes.c_float)
    if not isinstance(cptr, float_ptr_type):
        raise RuntimeError('expected float pointer')

    out = np.zeros(length, dtype=dtype)
    # NOTE(review): the byte count follows the item size of ``dtype`` while
    # the pointer check is fixed to c_float -- confirm callers only pass
    # dtypes of matching width.
    byte_count = length * out.strides[0]
    copied = ctypes.memmove(out.ctypes.data, cptr, byte_count)
    if not copied:
        raise RuntimeError('memmove failed')
    return out
def ctypes2buffer(cptr, length):
    """Convert ctypes pointer to buffer type.

    Parameters
    ----------
    cptr : ctypes.POINTER(ctypes.c_char)
        source pointer; any other pointer type is rejected.
    length : int
        number of bytes to copy.

    Returns
    -------
    out : bytearray
        new buffer of ``length`` bytes holding a copy of the data.

    Raises
    ------
    RuntimeError
        if ``cptr`` is not a char pointer or the copy reports failure.
    """
    char_ptr_type = ctypes.POINTER(ctypes.c_char)
    if not isinstance(cptr, char_ptr_type):
        raise RuntimeError('expected char pointer')

    out = bytearray(length)
    # View the bytearray's own storage as a C char array so memmove can
    # write straight into it.
    target = (ctypes.c_char * length).from_buffer(out)
    copied = ctypes.memmove(target, cptr, length)
    if not copied:
        raise RuntimeError('memmove failed')
    return out
def c_str(string):
    """Convert a python string to cstring.

    The text is encoded as UTF-8 and wrapped in a ``ctypes.c_char_p``.
    """
    utf8_bytes = string.encode('utf-8')
    return ctypes.c_char_p(utf8_bytes)
def c_array(ctype, values):
    """Build a ctypes array of element type ``ctype`` from ``values``.

    The array has exactly ``len(values)`` slots, filled in order.
    """
    array_type = ctype * len(values)
    return array_type(*values)
# Maps a pandas dtype name to the feature-type tag used for that column.
# Any dtype name missing from this table is treated as unsupported by the
# pandas helpers below.
# NOTE(review): 'i' for bool presumably stands for "indicator" -- confirm.
PANDAS_DTYPE_MAPPER = {
    # signed integers
    'int8': 'int',
    'int16': 'int',
    'int32': 'int',
    'int64': 'int',
    # unsigned integers
    'uint8': 'int',
    'uint16': 'int',
    'uint32': 'int',
    'uint64': 'int',
    # floating point
    'float16': 'float',
    'float32': 'float',
    'float64': 'float',
    # boolean
    'bool': 'i',
}
def _maybe_pandas_data(data, feature_names, feature_types):
    """ Extract internal data from pd.DataFrame for DMatrix data

    Parameters
    ----------
    data : object
        candidate input; anything that is not a DataFrame is returned
        untouched together with the other two arguments.
    feature_names : object or None
        names supplied by the caller; when None they are taken from the
        frame's columns.
    feature_types : object or None
        types supplied by the caller; when None they are derived from the
        column dtypes through PANDAS_DTYPE_MAPPER.

    Returns
    -------
    (data, feature_names, feature_types) : tuple
        for DataFrame input, ``data`` is the frame's values cast to float.

    Raises
    ------
    ValueError
        if any column has a dtype whose name is not in PANDAS_DTYPE_MAPPER.
    """
    if not isinstance(data, DataFrame):
        return data, feature_names, feature_types

    data_dtypes = data.dtypes

    # Column labels are not necessarily strings (integer labels are common),
    # so convert them before joining; otherwise str.join raises TypeError
    # and hides the intended ValueError.
    bad_fields = [str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
                  if dtype.name not in PANDAS_DTYPE_MAPPER]
    if bad_fields:
        msg = ('DataFrame.dtypes for data must be int, float or bool.\n'
               'Did not expect the data types in fields ')
        raise ValueError(msg + ', '.join(bad_fields))

    if feature_names is None:
        feature_names = data.columns.format()

    if feature_types is None:
        feature_types = [PANDAS_DTYPE_MAPPER[dtype.name] for dtype in data_dtypes]

    data = data.values.astype('float')
    return data, feature_names, feature_types
def _maybe_pandas_label(label):
    """ Extract internal data from pd.DataFrame for DMatrix label

    A DataFrame must have a single column whose dtype name appears in
    PANDAS_DTYPE_MAPPER; its values are returned cast to float. Any other
    object is returned unchanged.

    Raises
    ------
    ValueError
        if the frame has more than one column or an unsupported dtype.
    """
    if not isinstance(label, DataFrame):
        # pd.Series can be passed to xgb as it is
        return label

    if len(label.columns) > 1:
        raise ValueError('DataFrame for label cannot have multiple columns')

    for column_dtype in label.dtypes:
        if column_dtype.name not in PANDAS_DTYPE_MAPPER:
            raise ValueError('DataFrame.dtypes for label must be int, float or bool')

    return label.values.astype('float')
class DMatrix(object):
"""Data Matrix used in XGBoost.
DMatrix is a internal data structure that used by XGBoost
( run in 0.405 second using v1.01-cache-2.11-cpan-63c85eba8c4 )