Alien-XGBoost

 view release on metacpan or  search on metacpan

xgboost/python-package/xgboost/core.py  view on Meta::CPAN

# coding: utf-8
# pylint: disable=too-many-arguments, too-many-branches, invalid-name
# pylint: disable=too-many-branches, too-many-lines, W0141
"""Core XGBoost Library."""
from __future__ import absolute_import

import sys
import os
import ctypes
import collections
import re

import numpy as np
import scipy.sparse

from .libpath import find_lib_path

from .compat import STRING_TYPES, PY3, DataFrame, py_str, PANDAS_INSTALLED

# c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
# (bound here to ctypes' unsigned 64-bit integer type).
c_bst_ulong = ctypes.c_uint64


class XGBoostError(Exception):
    """Exception type raised for errors reported by the xgboost trainer."""


class EarlyStopException(Exception):
    """Raised to signal that training should stop early.

    Parameters
    ----------
    best_iteration : int
        The best iteration; stored on the instance as ``best_iteration``.
    """
    def __init__(self, best_iteration):
        # Record the payload first; the base class is initialised without
        # arguments, so ``args`` stays empty.
        self.best_iteration = best_iteration
        super(EarlyStopException, self).__init__()


# Environment record passed to callbacks (an immutable named tuple).
CallbackEnv = collections.namedtuple(
    "XGBoostCallbackEnv",
    "model cvfolds iteration begin_iteration end_iteration rank "
    "evaluation_result_list")


def from_pystr_to_cstr(data):
    """Convert a list of Python str to C pointer

    Text elements are encoded as UTF-8; elements that are already byte
    strings are passed through unchanged.

    Parameters
    ----------
    data : list
        list of str (text or byte strings)

    Returns
    -------
    pointers : ctypes array of c_char_p
        One entry per element of ``data``, in the same order.

    Raises
    ------
    NotImplementedError
        If ``data`` is not a list.
    """
    if not isinstance(data, list):
        # Only list input is supported for now.
        raise NotImplementedError

    # ``bytes`` is an alias of ``str`` on Python 2, so this single test lets
    # byte strings through on both major versions while text objects
    # (Python 2 ``unicode`` / Python 3 ``str``) get encoded.  The previous
    # Python 3 path called ``bytes(d, 'utf-8')``, which raises TypeError when
    # ``d`` is already bytes.
    encoded = [d if isinstance(d, bytes) else d.encode('utf-8') for d in data]
    pointers = (ctypes.c_char_p * len(encoded))()
    pointers[:] = encoded
    return pointers


def from_cstr_to_pystr(data, length):
    """Revert C pointer to Python str

    Parameters
    ----------
    data : ctypes pointer
        pointer to data
    length : ctypes pointer
        pointer to length of data; its ``.value`` gives the number of
        entries read from ``data``

    Returns
    -------
    res : list
        One decoded string per entry of ``data``.
    """
    res = []
    for i in range(length.value):
        # ASCII is a subset of UTF-8, so a single UTF-8 decode covers both
        # the pure-ASCII and the non-ASCII case.
        text = data[i].decode('utf-8')
        try:
            # Python 3: ``str(text)`` is the text itself.
            # Python 2: yields a native byte ``str`` for pure-ASCII text.
            res.append(str(text))
        except UnicodeEncodeError:
            # Python 2 only: non-ASCII text cannot become a native ``str``,
            # so it is kept as the decoded unicode object.
            res.append(text)
    return res


def _load_lib():
    """Locate and load the xgboost shared library.

    Returns
    -------
    lib : ctypes library handle or None
        ``None`` when ``find_lib_path()`` yields no candidates; otherwise the
        handle for the first candidate, with ``XGBGetLastError`` configured
        to return ``c_char_p``.
    """
    candidates = find_lib_path()
    if len(candidates) > 0:
        handle = ctypes.cdll.LoadLibrary(candidates[0])
        # Declare the return type so the error message comes back as a
        # C string rather than the default int.
        handle.XGBGetLastError.restype = ctypes.c_char_p
        return handle
    return None


# Load the XGBoost library once, globally, when this module is imported.
# NOTE: _load_lib() returns None when find_lib_path() yields no candidates,
# so _LIB can be None here.
_LIB = _load_lib()


def _check_call(ret):
    """Check the return value of C API call

    This function will raise exception when error occurs.
    Wrap every API call with this function

    Parameters
    ----------
    ret : int
        return value from API calls
    """
    if ret != 0:
        raise XGBoostError(_LIB.XGBGetLastError())


def ctypes2numpy(cptr, length, dtype):
    """Convert a ctypes pointer array to a numpy array.

    Parameters
    ----------
    cptr : ctypes.POINTER(ctypes.c_float)
        pointer to the first of ``length`` C floats
    length : int
        number of elements to copy
    dtype : numpy dtype
        dtype of the returned array

    Returns
    -------
    res : numpy.ndarray
        A new one-dimensional array of ``length`` elements holding a copy of
        the data, with the requested ``dtype``.

    Raises
    ------
    RuntimeError
        If ``cptr`` is not a float pointer or the copy fails.
    """
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
        raise RuntimeError('expected float pointer')
    # The source buffer holds C floats (4 bytes each), so always copy it as
    # float32.  Sizing the copy by the requested dtype's item size would read
    # past the end of the buffer for wider dtypes and reinterpret the bits.
    raw = np.zeros(length, dtype=np.float32)
    nbytes = raw.nbytes
    # Nothing to copy for an empty array; skip touching the pointer at all.
    if nbytes and not ctypes.memmove(raw.ctypes.data, cptr, nbytes):
        raise RuntimeError('memmove failed')
    # No extra copy is made when the caller asked for float32.
    return raw.astype(dtype, copy=False)


def ctypes2buffer(cptr, length):
    """Copy ``length`` bytes from a ctypes char pointer into a bytearray.

    Raises RuntimeError when ``cptr`` is not a ``POINTER(c_char)`` or when
    the copy reports failure.
    """
    if isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
        out = bytearray(length)
        # Expose the bytearray's storage as a ctypes array so memmove can
        # write straight into it.
        view = (ctypes.c_char * length).from_buffer(out)
        copied = ctypes.memmove(view, cptr, length)
        if copied:
            return out
        raise RuntimeError('memmove failed')
    raise RuntimeError('expected char pointer')


def c_str(string):
    """Convert a python string to cstring.

    Parameters
    ----------
    string : str or bytes
        Text is encoded as UTF-8; byte strings are used as they are.

    Returns
    -------
    cstr : ctypes.c_char_p
    """
    # ``bytes`` is ``str`` on Python 2, so native byte strings skip the
    # encode step on both versions.  Calling ``.encode`` on them would raise
    # AttributeError on Python 3 and, for non-ASCII bytes, UnicodeDecodeError
    # on Python 2.
    if not isinstance(string, bytes):
        string = string.encode('utf-8')
    return ctypes.c_char_p(string)


def c_array(ctype, values):
    """Build a ctypes array of element type ``ctype`` from ``values``.

    The array has ``len(values)`` elements, initialised in order from
    ``values``.
    """
    array_type = ctype * len(values)
    return array_type(*values)


# Feature-type string used for each supported pandas dtype name; a dtype
# whose name is not a key here is rejected by the pandas helpers below.
PANDAS_DTYPE_MAPPER = {
    # signed integers
    'int8': 'int',
    'int16': 'int',
    'int32': 'int',
    'int64': 'int',
    # unsigned integers
    'uint8': 'int',
    'uint16': 'int',
    'uint32': 'int',
    'uint64': 'int',
    # floating point
    'float16': 'float',
    'float32': 'float',
    'float64': 'float',
    # boolean
    'bool': 'i',
}


def _maybe_pandas_data(data, feature_names, feature_types):
    """ Extract internal data from pd.DataFrame for DMatrix data

    Parameters
    ----------
    data : object
        Input data; anything that is not a DataFrame is returned untouched.
    feature_names : list or None
        Taken from the DataFrame columns when None.
    feature_types : list or None
        Derived from the column dtypes via PANDAS_DTYPE_MAPPER when None.

    Returns
    -------
    (data, feature_names, feature_types) : tuple
        For DataFrame input, ``data`` is the frame's values cast to float.

    Raises
    ------
    ValueError
        If any column has a dtype that is not listed in PANDAS_DTYPE_MAPPER.
    """

    if not isinstance(data, DataFrame):
        return data, feature_names, feature_types

    data_dtypes = data.dtypes
    # Single pass: collect the labels of every column with an unsupported dtype.
    bad_fields = [data.columns[i] for i, dtype in enumerate(data_dtypes)
                  if dtype.name not in PANDAS_DTYPE_MAPPER]
    if bad_fields:
        msg = """DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields """
        # Column labels need not be strings (e.g. integer labels); joining
        # them raw would raise TypeError and mask this ValueError.  '%s'
        # formatting is used instead of str() so Python 2 unicode labels
        # survive unchanged.
        raise ValueError(msg + ', '.join('%s' % (field,) for field in bad_fields))

    if feature_names is None:
        feature_names = data.columns.format()

    if feature_types is None:
        feature_types = [PANDAS_DTYPE_MAPPER[dtype.name] for dtype in data_dtypes]

    data = data.values.astype('float')

    return data, feature_names, feature_types


def _maybe_pandas_label(label):
    """ Pull the raw float values out of a single-column DataFrame label

    Raises ValueError when the DataFrame has more than one column or a
    column dtype that is not listed in PANDAS_DTYPE_MAPPER.
    """

    # Anything that is not a DataFrame (pd.Series included) is handed back
    # exactly as it came in.
    if not isinstance(label, DataFrame):
        return label

    if len(label.columns) > 1:
        raise ValueError('DataFrame for label cannot have multiple columns')

    for dtype in label.dtypes:
        if dtype.name not in PANDAS_DTYPE_MAPPER:
            raise ValueError('DataFrame.dtypes for label must be int, float or bool')

    return label.values.astype('float')


class DMatrix(object):
    """Data Matrix used in XGBoost.

    DMatrix is a internal data structure that used by XGBoost



( run in 0.405 second using v1.01-cache-2.11-cpan-63c85eba8c4 )